Example 1
    def test_predict_mocked(self, rf_mock):
        """Use mock to count the number of calls to _predict"""
        class SideEffect(object):
            def __init__(self):
                self.counter = 0

            def __call__(self, X):
                self.counter += 1
                # Return mean and variance
                return self.counter, self.counter

        rf_mock.side_effect = SideEffect()

        rs = np.random.RandomState(1)
        X = rs.rand(20, 10)
        Y = rs.rand(10, 1)
        model = RandomForestWithInstances(np.zeros((10, ), dtype=np.uint))
        model.train(X[:10], Y[:10])
        m_hat, v_hat = model.predict(X[10:])
        self.assertEqual(m_hat.shape, (10, 1))
        self.assertEqual(v_hat.shape, (10, 1))
        self.assertEqual(rf_mock.call_count, 10)
        for i in range(10):
            self.assertEqual(m_hat[i], i + 1)
            self.assertEqual(v_hat[i], i + 1)
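Tests like Example 1 receive an extra `rf_mock` argument, which implies the method was wrapped in a `unittest.mock.patch` decorator that got dropped when the snippet was extracted. A minimal sketch of the presumed wiring, assuming (based on the docstring) that the patch target is `RandomForestWithInstances._predict`:

# Minimal sketch of the decorator presumably stripped from Example 1 during
# extraction; the patch target is inferred from the docstring and is an
# assumption about the original test file, not a verified excerpt.
import unittest
from unittest import mock

from smac.epm.rf_with_instances import RandomForestWithInstances


class TestRFWithInstances(unittest.TestCase):

    @mock.patch.object(RandomForestWithInstances, '_predict')
    def test_predict_mocked(self, rf_mock):
        ...  # body as shown in Example 1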
Example 2
    def test_predict_with_actual_values(self):
        print()
        X = np.array([[0., 0., 0.], [0., 0., 1.], [0., 1., 0.], [0., 1., 1.],
                      [1., 0., 0.], [1., 0., 1.], [1., 1., 0.], [1., 1., 1.]],
                     dtype=np.float64)
        y = np.array(
            [[.1], [.2], [9], [9.2], [100.], [100.2], [109.], [109.2]],
            dtype=np.float64)
        # print(X.shape, y.shape)
        model = RandomForestWithInstances(types=np.array([0, 0, 0],
                                                         dtype=np.uint),
                                          bounds=np.array([(0, np.nan),
                                                           (0, np.nan),
                                                           (0, np.nan)],
                                                          dtype=object),
                                          instance_features=None,
                                          seed=12345)
        model.train(np.vstack((X, X, X, X, X, X, X, X)),
                    np.vstack((y, y, y, y, y, y, y, y)))
        # for idx, x in enumerate(X):
        #     print(model.rf.all_leaf_values(x))
        #     print(x, model.predict(np.array([x]))[0], y[idx])

        y_hat, _ = model.predict(X)
        for y_i, y_hat_i in zip(
                y.reshape((1, -1)).flatten(),
                y_hat.reshape((1, -1)).flatten()):
            # print(y_i, y_hat_i)
            self.assertAlmostEqual(y_i, y_hat_i, delta=0.1)
Example 3
    def test_predict_with_actual_values(self):
        X = np.array([
            [0., 0., 0.],
            [0., 0., 1.],
            [0., 1., 0.],
            [0., 1., 1.],
            [1., 0., 0.],
            [1., 0., 1.],
            [1., 1., 0.],
            [1., 1., 1.]], dtype=np.float64)
        y = np.array([
            [.1],
            [.2],
            [9],
            [9.2],
            [100.],
            [100.2],
            [109.],
            [109.2]], dtype=np.float64)
        model = RandomForestWithInstances(
            configspace=self._get_cs(3),
            types=np.array([0, 0, 0], dtype=np.uint),
            bounds=[(0, np.nan), (0, np.nan), (0, np.nan)],
            instance_features=None,
            seed=12345,
            ratio_features=1.0,
        )
        model.train(np.vstack((X, X, X, X, X, X, X, X)), np.vstack((y, y, y, y, y, y, y, y)))

        y_hat, _ = model.predict(X)
        for y_i, y_hat_i in zip(y.reshape((1, -1)).flatten(), y_hat.reshape((1, -1)).flatten()):
            self.assertAlmostEqual(y_i, y_hat_i, delta=0.1)
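Example 3 and several later examples pass `configspace=self._get_cs(3)`, a test helper that is not shown on this page. A purely hypothetical sketch of what such a helper could look like (names and ranges are assumptions, not the real test code):

# Hypothetical sketch of the _get_cs(n) helper referenced by this and later
# examples: a ConfigurationSpace with n continuous hyperparameters. Names and
# ranges are assumptions; the real helper in the SMAC test suite may differ.
from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import UniformFloatHyperparameter


def _get_cs(n_dimensions):
    cs = ConfigurationSpace(seed=1)
    for i in range(n_dimensions):
        cs.add_hyperparameter(
            UniformFloatHyperparameter('x%d' % i, lower=0., upper=1.))
    return cs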
Example 4
    def test_predict_marginalized_over_instances_mocked(self, rf_mock):
        """Use mock to count the number of calls to predict()"""
        class SideEffect(object):
            def __call__(self, X):
                # Numpy array of the numbers 0 to X.shape[0] - 1
                rval = np.array(list(range(X.shape[0]))).reshape((-1, 1))
                # Return mean and variance
                return rval, rval

        rf_mock.side_effect = SideEffect()

        rs = np.random.RandomState(1)
        F = rs.rand(10, 5)

        model = RandomForestWithInstances(
            configspace=self._get_cs(10),
            types=np.zeros((15, ), dtype=np.uint),
            instance_features=F,
            bounds=list(map(lambda x: (0, 10), range(10))),
            seed=1,
        )
        X = rs.rand(20, 10)
        F = rs.rand(10, 5)
        Y = rs.randint(1, size=(len(X) * len(F), 1)) * 1.
        X_ = rs.rand(200, 15)
        model.train(X_, Y)
        means, vars = model.predict_marginalized_over_instances(rs.rand(
            11, 10))
        # Expected to be 0 because predict() is bypassed: the trees are queried and unlogged manually
        self.assertEqual(rf_mock.call_count, 0)
        self.assertEqual(means.shape, (11, 1))
        self.assertEqual(vars.shape, (11, 1))
        for i in range(11):
            self.assertEqual(means[i], 0.)
            self.assertEqual(vars[i], 1.e-10)
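Example 4 asserts that `predict()` is never called by `predict_marginalized_over_instances`, because SMAC walks the individual trees instead. Shape-wise, marginalization means pairing each configuration with every instance-feature row and averaging; a rough, hedged sketch of that idea (plain averaging over `model.predict()`, only an approximation of what SMAC actually does):

# Rough sketch of what "marginalized over instances" means shape-wise: each
# configuration is paired with every instance-feature row and the predictions
# are averaged. SMAC's real implementation descends into the individual trees
# (hence call_count == 0 above); this naive version is for illustration only.
import numpy as np


def predict_marginalized_naive(model, X_configs, instance_features):
    means = np.zeros((X_configs.shape[0], 1))
    for i, x in enumerate(X_configs):
        # tile the configuration next to every instance-feature row
        X_full = np.hstack(
            (np.tile(x, (instance_features.shape[0], 1)), instance_features))
        mu, _ = model.predict(X_full)
        means[i] = mu.mean()
    return means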
Example 5
 def test_predict(self):
     rs = np.random.RandomState(1)
     X = rs.rand(20, 10)
     Y = rs.rand(10, 1)
     model = RandomForestWithInstances(np.zeros((10, ), dtype=np.uint))
     model.train(X[:10], Y[:10])
     m_hat, v_hat = model.predict(X[10:])
     self.assertEqual(m_hat.shape, (10, 1))
     self.assertEqual(v_hat.shape, (10, 1))
Example 6
    def test_predict_marginalized_over_instances_no_features(self, rf_mock):
        """The RF should fall back to the regular predict() method."""

        rs = np.random.RandomState(1)
        X = rs.rand(20, 10)
        Y = rs.rand(10, 1)
        model = RandomForestWithInstances(np.zeros((10, ), dtype=np.uint))
        model.train(X[:10], Y[:10])
        model.predict(X[10:])
        self.assertEqual(rf_mock.call_count, 1)
Example 7
    def test_rf_on_sklearn_data(self):
        import sklearn.datasets
        X, y = sklearn.datasets.load_boston(return_X_y=True)
        rs = np.random.RandomState(1)

        types = np.zeros(X.shape[1])
        bounds = [(np.min(X[:, i]), np.max(X[:, i]))
                  for i in range(X.shape[1])]

        cv = sklearn.model_selection.KFold(shuffle=True,
                                           random_state=rs,
                                           n_splits=2)

        for do_log in [False, True]:
            if do_log:
                targets = np.log(y)
                model = RandomForestWithInstances(
                    configspace=self._get_cs(X.shape[1]),
                    types=types,
                    bounds=bounds,
                    seed=1,
                    ratio_features=1.0,
                    pca_components=100,
                    log_y=True,
                )
                maes = [0.43169704431695493156, 0.4267519520332511912]
            else:
                targets = y
                model = RandomForestWithInstances(
                    configspace=self._get_cs(X.shape[1]),
                    types=types,
                    bounds=bounds,
                    seed=1,
                    ratio_features=1.0,
                    pca_components=100,
                )
                maes = [9.3298376833224042496, 9.348010654109179346]

            for i, (train_split, test_split) in enumerate(cv.split(X,
                                                                   targets)):
                X_train = X[train_split]
                y_train = targets[train_split]
                X_test = X[test_split]
                y_test = targets[test_split]
                model.train(X_train, y_train)
                y_hat, mu_hat = model.predict(X_test)
                mae = np.mean(np.abs(y_hat - y_test), dtype=np.float128)
                self.assertAlmostEqual(
                    mae,
                    maes[i],
                    msg=('Do log: %s, iteration %i' % (str(do_log), i)),
                    # We observe a difference of around 0.00017
                    # in github actions if doing log
                    places=3 if do_log else 7)
Example 8
 def test__predict(self):
     rs = np.random.RandomState(1)
     X = rs.rand(20, 10)
     Y = rs.rand(10, 1)
     model = RandomForestWithInstances(np.zeros((10, ), dtype=np.uint))
     model.train(X[:10], Y[:10])
     m_hat, v_hat = model._predict(X[10])
     self.assertIsInstance(m_hat, float)
     self.assertIsInstance(v_hat, float)
     self.assertRaisesRegexp(
         ValueError, r'Buffer has wrong number of '
         r'dimensions \(expected 1, got 2\)', model._predict, X[10:])
Example 9
 def test_predict(self):
     rs = np.random.RandomState(1)
     X = rs.rand(20, 10)
     Y = rs.rand(10, 1)
     model = RandomForestWithInstances(
         types=np.zeros((10, ), dtype=np.uint),
         bounds=list(map(lambda x: (0, 10), range(10))),
     )
     model.train(X[:10], Y[:10])
     m_hat, v_hat = model.predict(X[10:])
     self.assertEqual(m_hat.shape, (10, 1))
     self.assertEqual(v_hat.shape, (10, 1))
Example 10
    def test_predict_marginalized_over_instances(self):
        rs = np.random.RandomState(1)
        X = rs.rand(20, 10)
        F = rs.rand(10, 5)
        Y = rs.rand(len(X) * len(F), 1)
        X_ = rs.rand(200, 15)

        model = RandomForestWithInstances(np.zeros((15, ), dtype=np.uint),
                                          instance_features=F)
        model.train(X_, Y)
        means, vars = model.predict_marginalized_over_instances(X)
        self.assertEqual(means.shape, (20, 1))
        self.assertEqual(vars.shape, (20, 1))
Example 11
    def test_with_ordinal(self):
        cs = smac.configspace.ConfigurationSpace()
        _ = cs.add_hyperparameter(
            CategoricalHyperparameter('a', [0, 1], default_value=0))
        _ = cs.add_hyperparameter(
            OrdinalHyperparameter('b', [0, 1], default_value=1))
        _ = cs.add_hyperparameter(
            UniformFloatHyperparameter('c',
                                       lower=0.,
                                       upper=1.,
                                       default_value=1))
        _ = cs.add_hyperparameter(
            UniformIntegerHyperparameter('d',
                                         lower=0,
                                         upper=10,
                                         default_value=1))
        cs.seed(1)

        feat_array = np.array([0, 0, 0]).reshape(1, -1)
        types, bounds = get_types(cs, feat_array)
        model = RandomForestWithInstances(
            configspace=cs,
            types=types,
            bounds=bounds,
            instance_features=feat_array,
            seed=1,
            ratio_features=1.0,
            pca_components=9,
        )
        self.assertEqual(bounds[0][0], 2)
        self.assertTrue(bounds[0][1] is np.nan)
        self.assertEqual(bounds[1][0], 0)
        self.assertEqual(bounds[1][1], 1)
        self.assertEqual(bounds[2][0], 0.)
        self.assertEqual(bounds[2][1], 1.)
        self.assertEqual(bounds[3][0], 0.)
        self.assertEqual(bounds[3][1], 1.)
        X = np.array(
            [[0., 0., 0., 0., 0., 0., 0.], [0., 0., 1., 0., 0., 0., 0.],
             [0., 1., 0., 9., 0., 0., 0.], [0., 1., 1., 4., 0., 0., 0.]],
            dtype=np.float64)
        y = np.array([0, 1, 2, 3], dtype=np.float64)

        X_train = np.vstack((X, X, X, X, X, X, X, X, X, X))
        y_train = np.vstack((y, y, y, y, y, y, y, y, y, y))

        model.train(X_train, y_train.reshape((-1, 1)))
        mean, _ = model.predict(X)
        for idx, m in enumerate(mean):
            self.assertAlmostEqual(y[idx], m, delta=0.05)
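The bound assertions in Example 11 follow from how `get_types` encodes the ConfigurationSpace: a categorical hyperparameter is represented by its number of choices (hence `(2, nan)` for the two-valued `a`), while ordinal and numerical ones get type 0 and unit-interval bounds. A small sketch; the import path of `get_types` has moved between SMAC versions, so `smac.epm.util_funcs` below is an assumption:

# Small sketch of how get_types() encodes a ConfigurationSpace, matching the
# bound assertions above. The import path is an assumption; it differs
# between SMAC versions (smac.epm.util_funcs is used in some releases).
from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import (CategoricalHyperparameter,
                                         UniformFloatHyperparameter)
from smac.epm.util_funcs import get_types

cs = ConfigurationSpace()
cs.add_hyperparameter(CategoricalHyperparameter('a', [0, 1]))
cs.add_hyperparameter(UniformFloatHyperparameter('c', lower=0., upper=1.))

types, bounds = get_types(cs, None)
# types[0] == 2 -> categorical with two choices, bounds[0] == (2, nan)
# types[1] == 0 -> continuous, bounds[1] == (0.0, 1.0)
print(types, bounds)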
Example 12
    def test_predict_marginalized_over_instances_no_features(self, rf_mock):
        """The RF should fall back to the regular predict() method."""

        rs = np.random.RandomState(1)
        X = rs.rand(20, 10)
        Y = rs.rand(10, 1)
        model = RandomForestWithInstances(
            configspace=self._get_cs(10),
            types=np.zeros((10, ), dtype=np.uint),
            bounds=list(map(lambda x: (0, 10), range(10))),
            seed=1,
        )
        model.train(X[:10], Y[:10])
        model.predict(X[10:])
        self.assertEqual(rf_mock.call_count, 1)
Example 13
    def test_train_with_pca(self):
        rs = np.random.RandomState(1)
        X = rs.rand(20, 20)
        F = rs.rand(10, 10)
        Y = rs.rand(20, 1)
        model = RandomForestWithInstances(
            types=np.zeros((20, ), dtype=np.uint),
            bounds=list(map(lambda x: (0, 10), range(10))),
            pca_components=2,
            instance_features=F,
        )
        model.train(X, Y)

        self.assertEqual(model.n_params, 10)
        self.assertEqual(model.n_feats, 10)
        self.assertIsNotNone(model.pca)
        self.assertIsNotNone(model.scaler)
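Example 13 checks that, when `pca_components` is smaller than the number of instance features, the model fits a scaler and a PCA on the feature matrix during training (`model.scaler`, `model.pca`). The sketch below only illustrates that preprocessing idea and is not SMAC's actual implementation:

# Illustration of the preprocessing implied by model.scaler / model.pca in
# the test above: instance features are rescaled and reduced to
# pca_components dimensions before training. Not SMAC's actual code.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

rs = np.random.RandomState(1)
F = rs.rand(10, 10)  # 10 instances with 10 raw features each

scaler = MinMaxScaler()
pca = PCA(n_components=2)
F_reduced = pca.fit_transform(scaler.fit_transform(F))
print(F_reduced.shape)  # (10, 2)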
Example 14
    def test_predict_marginalized_over_instances(self):
        rs = np.random.RandomState(1)
        X = rs.rand(20, 10)
        F = rs.rand(10, 5)
        Y = rs.rand(len(X) * len(F), 1)
        X_ = rs.rand(200, 15)

        model = RandomForestWithInstances(
            configspace=self._get_cs(10),
            types=np.zeros((15, ), dtype=np.uint),
            instance_features=F,
            bounds=list(map(lambda x: (0, 10), range(10))),
            seed=1,
        )
        model.train(X_, Y)
        means, vars = model.predict_marginalized_over_instances(X)
        self.assertEqual(means.shape, (20, 1))
        self.assertEqual(vars.shape, (20, 1))
Example 15
    def get_pred_surface(self, rh, X_scaled, conf_list: list,
                         contour_step_size):
        """fit epm on the scaled input dimension and
        return data to plot a contour plot of the empirical performance

        Parameters
        ----------
        rh: RunHistory
            runhistory
        X_scaled: np.array
            configurations in scaled 2dim
        conf_list: list
            list of Configuration objects

        Returns
        -------
        contour_data: (np.array, np.array, np.array)
            x, y, Z for contour plots
        """
        # use PCA to reduce features to also at most 2 dims
        scen = copy.deepcopy(self.scenario)  # pca changes feats
        if scen.feature_array.shape[1] > 2:
            self.logger.debug(
                "Use PCA to reduce features to from %d dim to 2 dim",
                scen.feature_array.shape[1])
            # perform PCA
            insts = scen.feature_dict.keys()
            feature_array = np.array([scen.feature_dict[i] for i in insts])
            feature_array = StandardScaler().fit_transform(feature_array)
            feature_array = PCA(n_components=2).fit_transform(feature_array)
            # inject in scenario-object
            scen.feature_array = feature_array
            scen.feature_dict = dict([(inst, feature_array[idx, :])
                                      for idx, inst in enumerate(insts)])
            scen.n_features = 2

        # convert the data to train EPM on 2-dim featurespace (for contour-data)
        self.logger.debug("Convert data for epm.")
        X, y, types = convert_data_for_epm(scenario=scen,
                                           runhistory=rh,
                                           logger=self.logger)
        types = np.array(np.zeros((2 + scen.feature_array.shape[1])),
                         dtype=np.uint)
        num_params = len(scen.cs.get_hyperparameters())

        # impute missing values in configs and insert MDS'ed (2dim) configs to the right positions
        conf_dict = {}
        for idx, c in enumerate(conf_list):
            conf_list[idx] = impute_inactive_values(c)
            conf_dict[str(conf_list[idx].get_array())] = X_scaled[idx, :]

        X_trans = []
        for x in X:
            x_scaled_conf = conf_dict[str(x[:num_params])]
            # append scaled config + pca'ed features (total of 4 values) per config/feature-sample
            X_trans.append(
                np.concatenate((x_scaled_conf, x[num_params:]), axis=0))
        X_trans = np.array(X_trans)

        self.logger.debug("Train random forest for contour-plot.")
        bounds = np.array([(0, np.nan), (0, np.nan)], dtype=object)
        model = RandomForestWithInstances(types=types,
                                          bounds=bounds,
                                          instance_features=np.array(
                                              scen.feature_array),
                                          ratio_features=1.0)

        start = time.time()
        model.train(X_trans, y)
        self.logger.debug("Fitting random forest took %f time",
                          time.time() - start)

        x_min, x_max = X_scaled[:, 0].min() - 1, X_scaled[:, 0].max() + 1
        y_min, y_max = X_scaled[:, 1].min() - 1, X_scaled[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, contour_step_size),
                             np.arange(y_min, y_max, contour_step_size))

        self.logger.debug("x_min: %f, x_max: %f, y_min: %f, y_max: %f", x_min,
                          x_max, y_min, y_max)
        self.logger.debug(
            "Predict on %d samples in grid to get surface (step-size: %f)",
            np.c_[xx.ravel(), yy.ravel()].shape[0], contour_step_size)

        start = time.time()
        Z, _ = model.predict_marginalized_over_instances(np.c_[xx.ravel(),
                                                               yy.ravel()])
        Z = Z.reshape(xx.shape)
        self.logger.debug("Predicting random forest took %f time",
                          time.time() - start)

        return xx, yy, Z
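The contour-data construction at the end of Example 15 is the standard meshgrid → ravel → predict → reshape pattern. A minimal standalone version, with a toy function standing in for the model so the sketch runs on its own:

# Minimal standalone version of the meshgrid -> ravel -> predict -> reshape
# pattern used for the contour data above; a toy quadratic stands in for the
# model so the snippet runs on its own.
import numpy as np

step = 0.5
xx, yy = np.meshgrid(np.arange(-1.0, 1.0, step), np.arange(-1.0, 1.0, step))
grid = np.c_[xx.ravel(), yy.ravel()]  # one row per grid point
Z = (grid ** 2).sum(axis=1)           # stand-in for model predictions
Z = Z.reshape(xx.shape)               # back to the 2-D grid for contour plotting
print(xx.shape, yy.shape, Z.shape)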
Example 16
    def run(self, save_fn: str = None):
        '''
            forward selection on SMAC's EPM (RF) wrt configuration space
            to minimize the out-of-bag error returned by the RF

            Parameters
            ----------
            save_fn:str
                file name to save plot

            Returns
            -------
            list 
                tuples of parameter name and oob score
        '''

        importance_tuples = []
        X = self.X
        y = self.Y

        param_ids = list(range(len(self.params)))
        used = []
        # always use all features
        used.extend(range(len(self.params), len(self.types)))

        pca = PCA(n_components=min(7, len(self.types) - len(self.params)))
        self.scen.feature_array = pca.fit_transform(self.scen.feature_array)

        for _ in range(self._MAX_P):
            scores = []
            for p in param_ids:

                self.logger.debug(self.params[p])
                used.append(p)
                X_l = X[:, used]

                model = RandomForestWithInstances(self.types[used],
                                                  self.scen.feature_array)
                model.rf.compute_oob_error = True

                start = time.time()
                model.train(X_l, y)
                self.logger.debug(
                    "End Fit RF (sec %.2f; oob: %.4f)" %
                    (time.time() - start, model.rf.out_of_bag_error()))

                #==============================================================
                # start = time.time()
                # rf = RandomForestRegressor(n_estimators=30,
                #                            min_samples_split=3,
                #                            min_samples_leaf=3,
                #                            max_features=math.ceil(
                #                                (5. / 6.) * X_l.shape[1]),
                #                            max_leaf_nodes=1000,
                #                            max_depth=20, oob_score=True)
                # rf.fit(X_l, y.ravel())
                # self.logger.debug("End Fit Sklearn RF (sec %.2f, oob: %.4f))" % (
                #     time.time() - start, rf.oob_score_))
                #==============================================================

                score = model.rf.out_of_bag_error()
                scores.append(score)
                used.pop()

            best_indx = np.argmin(scores)
            best_score = scores[best_indx]
            p = param_ids.pop(best_indx)
            used.append(p)

            self.logger.info("%s : %.4f (OOB)" %
                             (self.params[p].name, best_score))
            importance_tuples.append((self.params[p].name, best_score))

        self.plot_importance(importance_tuples=importance_tuples,
                             save_fn=save_fn)
        return importance_tuples
Example 17
class AbstractEvaluator(object):
    """
    Abstract implementation of Importance evaluator
    """
    def __init__(self,
                 scenario: Scenario,
                 cs: ConfigurationSpace,
                 model: RandomForestWithInstances,
                 to_evaluate: int,
                 rng,
                 verbose: bool = True,
                 **kwargs):
        self._logger = None
        self.scenario = scenario
        self.cs = cs
        self.model = model  # SMAC model
        self.rng = rng
        self.verbose = verbose

        if self.model is not None:
            if 'X' in kwargs and 'y' in kwargs:
                self._train_model(kwargs['X'], kwargs['y'], **kwargs)
            if 'features' in kwargs:
                self.features = kwargs['features']
            else:
                self.features = self.model.instance_features

            self.X = self.model.X
            self.y = self.model.y
            self.types = self.model.types
            self.bounds = self.model.bounds
        self._to_eval = to_evaluate
        if to_evaluate <= 0:
            self.to_evaluate = len(self.cs.get_hyperparameters())
        elif to_evaluate >= len(self.cs.get_hyperparameters()):
            self.to_evaluate = len(self.cs.get_hyperparameters())
        else:
            self.to_evaluate = to_evaluate  # num of parameters to evaluate

        self.evaluated_parameter_importance = OrderedDict()
        self.name = 'Base'

        self.IMPORTANCE_THRESHOLD = 0.05
        self.AXIS_FONT = {'family': 'monospace'}
        self.LABEL_FONT = {'family': 'sans-serif'}
        self.LINE_FONT = {'lw': 4, 'color': (0.125, 0.125, 0.125)}
        self.area_color = (0.25, 0.25, 0.45)
        self.unimportant_area_color = (0.125, 0.125, 0.225)
        self.MAX_PARAMS_TO_PLOT = 15

    @abc.abstractclassmethod
    def run(self) -> OrderedDict:
        raise NotImplementedError

    @abc.abstractclassmethod
    def plot_result(self, name=None):
        raise NotImplementedError

    def _train_model(self, X, y, **kwargs):
        self.model.train(X, y, **kwargs)

    def __str__(self):
        tmp = 'Parameter Importance Evaluation Method %s\n' % self.name
        tmp += '{:^15s}: {:<8s}\n'.format('Parameter', 'Value')
        for key in self.evaluated_parameter_importance:
            value = self.evaluated_parameter_importance[key]
            tmp += '{:>15s}: {:<3.4f}\n'.format(key, value)
        return tmp

    @property
    def logger(self):
        return self._logger

    @logger.setter
    def logger(self, value):
        self._logger = logging.getLogger(value)

    def _refit_model(self, types, bounds, X, y):
        """
        Easily allows for refitting of the model.

        Parameters
        ----------
        types: list
            SMAC EPM types
        bounds: list
            bounds corresponding to the types
        X: ndarray
            X matrix
        y: ndarray
            corresponding y vector
        """
        # We need a fake config space to bypass imputation of inactive values in the random forest implementation
        fake_cs = ConfigurationSpace(name="fake-cs-for-configurator-footprint")
        # We need to add fake hyperparameters
        fake_cs.add_hyperparameters([
            UniformFloatHyperparameter('fake-%s' % i,
                                       lower=0.,
                                       upper=100000.,
                                       default_value=0.,
                                       log=False) for i in range(len(types))
        ])

        self.model = RandomForestWithInstances(fake_cs,
                                               types,
                                               bounds,
                                               seed=12345,
                                               do_bootstrapping=True)
        self.model.rf_opts.compute_oob_error = True
        self.model.train(X, y)
Example 18
    def validate_epm(
        self,
        config_mode: Union[str, typing.List[Configuration]] = 'def',
        instance_mode: Union[str, typing.List[str]] = 'test',
        repetitions: int = 1,
        runhistory: typing.Optional[RunHistory] = None,
        output_fn: typing.Optional[str] = None,
        reuse_epm: bool = True,
    ) -> RunHistory:
        """
        Use EPM to predict costs/runtimes for unknown config/inst-pairs.

        side effect: if output is specified, saves runhistory to specified
        output directory.

        Parameters
        ----------
        output_fn: str
            path to runhistory to be saved. if the suffix is not '.json', will
            be interpreted as directory and filename will be
            'validated_runhistory_EPM.json'
        config_mode: str or list<Configuration>
            string or directly a list of Configuration, string from [def, inc, def+inc, wallclock_time, cpu_time, all].
            time evaluates at cpu- or wallclock-timesteps of:
            [max_time/2^0, max_time/2^1, max_time/2^2, ..., default] with max_time being the highest recorded time
        instance_mode: str or list<str>
            what instances to use for validation, either from
            [train, test, train+test] or directly a list of instances
        repetitions: int
            number of repetitions in nondeterministic algorithms
        runhistory: RunHistory
            optional, RunHistory-object to reuse runs
        reuse_epm: bool
            if true (and if `self.epm`), reuse epm to validate runs

        Returns
        -------
        runhistory: RunHistory
            runhistory with predicted runs
        """
        if not isinstance(runhistory, RunHistory) and (self.epm is None
                                                       or not reuse_epm):
            raise ValueError(
                "No runhistory specified for validating with EPM!")
        elif not reuse_epm or self.epm is None:
            # Create RandomForest
            types, bounds = get_types(
                self.scen.cs, self.scen.feature_array
            )  # type: ignore[attr-defined] # noqa F821
            epm = RandomForestWithInstances(
                configspace=self.scen.
                cs,  # type: ignore[attr-defined] # noqa F821
                types=types,
                bounds=bounds,
                instance_features=self.scen.feature_array,
                seed=self.rng.randint(MAXINT),
                ratio_features=1.0,
            )
            # Use imputor if objective is runtime
            imputor = None
            impute_state = None
            impute_censored_data = False
            if self.scen.run_obj == 'runtime':
                threshold = self.scen.cutoff * self.scen.par_factor  # type: ignore[attr-defined] # noqa F821
                imputor = RFRImputator(
                    rng=self.rng,
                    cutoff=self.scen.
                    cutoff,  # type: ignore[attr-defined] # noqa F821
                    threshold=threshold,
                    model=epm)
                impute_censored_data = True
                impute_state = [StatusType.CAPPED]
                success_states = [
                    StatusType.SUCCESS,
                ]
            else:
                success_states = [
                    StatusType.SUCCESS, StatusType.CRASHED, StatusType.MEMOUT
                ]

            # Transform training data (from given rh)
            rh2epm = RunHistory2EPM4Cost(
                num_params=len(self.scen.cs.get_hyperparameters()
                               ),  # type: ignore[attr-defined] # noqa F821
                scenario=self.scen,
                rng=self.rng,
                impute_censored_data=impute_censored_data,
                imputor=imputor,
                impute_state=impute_state,
                success_states=success_states)
            assert runhistory is not None  # please mypy
            X, y = rh2epm.transform(runhistory)
            self.logger.debug("Training model with data of shape X: %s, y:%s",
                              str(X.shape), str(y.shape))
            # Train random forest
            epm.train(X, y)
        else:
            epm = typing.cast(RandomForestWithInstances, self.epm)

        # Predict desired runs
        runs, rh_epm = self._get_runs(config_mode, instance_mode, repetitions,
                                      runhistory)

        feature_array_size = len(self.scen.cs.get_hyperparameters()
                                 )  # type: ignore[attr-defined] # noqa F821
        if self.scen.feature_array is not None:
            feature_array_size += self.scen.feature_array.shape[1]

        X_pred = np.empty((len(runs), feature_array_size))
        for idx, run in enumerate(runs):
            if self.scen.feature_array is not None and run.inst is not None:
                X_pred[idx] = np.hstack([
                    convert_configurations_to_array([run.config])[0],
                    self.scen.feature_dict[run.inst]
                ])
            else:
                X_pred[idx] = convert_configurations_to_array([run.config])[0]
        self.logger.debug("Predicting desired %d runs, data has shape %s",
                          len(runs), str(X_pred.shape))

        y_pred = epm.predict(X_pred)
        self.epm = epm

        # Add runs to runhistory
        for run, pred in zip(runs, y_pred[0]):
            rh_epm.add(
                config=run.config,
                cost=float(pred),
                time=float(pred),
                status=StatusType.SUCCESS,
                instance_id=run.inst,
                seed=-1,
                additional_info={"additional_info": "ESTIMATED USING EPM!"})

        if output_fn:
            self._save_results(rh_epm,
                               output_fn,
                               backup_fn="validated_runhistory_EPM.json")
        return rh_epm
Example 19
class SMBO(BaseSolver):
    def __init__(self,
                 scenario,
                 tae_runner=None,
                 acquisition_function=None,
                 model=None,
                 runhistory2epm=None,
                 stats=None,
                 rng=None):
        '''
        Interface that contains the main Bayesian optimization loop

        Parameters
        ----------
        scenario: smac.scenario.scenario.Scenario
            Scenario object
        tae_runner: object
            object that implements the following method to call the target
            algorithm (or any other arbitrary function):
            run(self, config)
            If not set, it will be initialized with the tae.ExecuteTARunOld()
        acquisition_function : AcquisitionFunction
            Object that implements the AbstractAcquisitionFunction. Will use
            EI if not set.
        model : object
            Model that implements train() and predict(). Will use a
            RandomForest if not set.
        runhistory2epm : RunHistory2EMP
            Object that implements the AbstractRunHistory2EPM. If None,
            will use RunHistory2EPM4Cost if objective is cost or
            RunHistory2EPM4LogCost if objective is runtime.
        stats: Stats
            optional stats object
        rng: numpy.random.RandomState
            Random number generator
        '''

        if stats:
            self.stats = stats
        else:
            self.stats = Stats(scenario)

        self.runhistory = RunHistory()

        self.logger = logging.getLogger("smbo")

        if rng is None:
            self.num_run = np.random.randint(1234567980)
            self.rng = np.random.RandomState(seed=self.num_run)
        elif isinstance(rng, int):
            self.num_run = rng
            self.rng = np.random.RandomState(seed=rng)
        elif isinstance(rng, np.random.RandomState):
            self.num_run = rng.randint(1234567980)
            self.rng = rng
        else:
            raise TypeError('Unknown type %s for argument rng. Only accepts '
                            'None, int or np.random.RandomState' %
                            str(type(rng)))

        self.scenario = scenario
        self.config_space = scenario.cs
        self.traj_logger = TrajLogger(output_dir=self.scenario.output_dir,
                                      stats=self.stats)

        self.types = get_types(self.config_space, scenario.feature_array)
        if model is None:
            self.model = RandomForestWithInstances(
                self.types,
                scenario.feature_array,
                seed=self.rng.randint(1234567980))
        else:
            self.model = model

        if acquisition_function is None:
            self.acquisition_func = EI(self.model)
        else:
            self.acquisition_func = acquisition_function

        self.local_search = LocalSearch(self.acquisition_func,
                                        self.config_space)
        self.incumbent = None

        if tae_runner is None:
            self.executor = ExecuteTARunOld(ta=scenario.ta,
                                            stats=self.stats,
                                            run_obj=scenario.run_obj,
                                            par_factor=scenario.par_factor)
        else:
            self.executor = tae_runner

        self.inten = Intensifier(
            executor=self.executor,
            stats=self.stats,
            traj_logger=self.traj_logger,
            instances=self.scenario.train_insts,
            cutoff=self.scenario.cutoff,
            deterministic=self.scenario.deterministic,
            run_obj_time=self.scenario.run_obj == "runtime",
            instance_specifics=self.scenario.instance_specific)

        num_params = len(self.config_space.get_hyperparameters())

        self.objective = average_cost
        if self.scenario.run_obj == "runtime":

            if runhistory2epm is None:
                # if we log the performance data,
                # the RFRImputator will already get
                # log transform data from the runhistory
                cutoff = np.log10(self.scenario.cutoff)
                threshold = np.log10(self.scenario.cutoff *
                                     self.scenario.par_factor)

                imputor = RFRImputator(cs=self.config_space,
                                       rs=self.rng,
                                       cutoff=cutoff,
                                       threshold=threshold,
                                       model=self.model,
                                       change_threshold=0.01,
                                       max_iter=10)
                self.rh2EPM = RunHistory2EPM4LogCost(scenario=self.scenario,
                                                     num_params=num_params,
                                                     success_states=[
                                                         StatusType.SUCCESS,
                                                     ],
                                                     impute_censored_data=True,
                                                     impute_state=[
                                                         StatusType.TIMEOUT,
                                                     ],
                                                     imputor=imputor)
            else:
                self.rh2EPM = runhistory2epm

        elif self.scenario.run_obj == 'quality':
            if runhistory2epm is None:
                self.rh2EPM = RunHistory2EPM4Cost\
                    (scenario=self.scenario, num_params=num_params,
                     success_states=[StatusType.SUCCESS, ],
                     impute_censored_data=False, impute_state=None)
            else:
                self.rh2EPM = runhistory2epm

        else:
            raise ValueError('Unknown run objective: %s. Should be either '
                             'quality or runtime.' % self.scenario.run_obj)

    def run_initial_design(self):
        '''
            performs target algorithm runs for an initial design;
            default implementation: running the default configuration on
                                    a random instance-seed pair
            Side effect: adds runs to self.runhistory

            Returns
            -------
            incumbent: Configuration()
                initial incumbent configuration
        '''

        default_conf = self.config_space.get_default_configuration()
        self.incumbent = default_conf

        # add this incumbent right away to have an entry to time point 0
        self.traj_logger.add_entry(train_perf=2**31,
                                   incumbent_id=1,
                                   incumbent=self.incumbent)

        rand_inst_id = self.rng.randint(0, len(self.scenario.train_insts))
        # ignore instance specific values
        rand_inst = self.scenario.train_insts[rand_inst_id]

        if self.scenario.deterministic:
            initial_seed = 0
        else:
            initial_seed = random.randint(0, MAXINT)

        status, cost, runtime, additional_info = self.executor.start(
            default_conf,
            instance=rand_inst,
            cutoff=self.scenario.cutoff,
            seed=initial_seed,
            instance_specific=self.scenario.instance_specific.get(
                rand_inst, "0"))

        if status in [StatusType.CRASHED, StatusType.ABORT]:
            self.logger.critical("First run crashed -- Abort")
            sys.exit(1)

        self.runhistory.add(config=default_conf,
                            cost=cost,
                            time=runtime,
                            status=status,
                            instance_id=rand_inst,
                            seed=initial_seed,
                            additional_info=additional_info)
        default_inst_seeds = set(
            self.runhistory.get_runs_for_config(default_conf))
        default_perf = self.objective(default_conf, self.runhistory,
                                      default_inst_seeds)
        self.runhistory.update_cost(default_conf, default_perf)

        self.stats.inc_changed += 1  # first incumbent

        self.traj_logger.add_entry(train_perf=default_perf,
                                   incumbent_id=self.stats.inc_changed,
                                   incumbent=self.incumbent)

        return default_conf

    def run(self, max_iters=10):
        '''
        Runs the Bayesian optimization loop for max_iters iterations

        Parameters
        ----------
        max_iters: int
            The maximum number of iterations

        Returns
        ----------
        incumbent: np.array(1, H)
            The best found configuration
        '''
        self.stats.start_timing()

        #self.runhistory = RunHisory()

        self.incumbent = self.run_initial_design()

        # Main BO loop
        iteration = 1
        while True:
            if self.scenario.shared_model:
                pSMAC.read(run_history=self.runhistory,
                           output_directory=self.scenario.output_dir,
                           configuration_space=self.config_space,
                           logger=self.logger)

            start_time = time.time()
            X, Y = self.rh2EPM.transform(self.runhistory)

            self.logger.debug("Search for next configuration")
            # get all found configurations sorted according to acq
            challengers = self.choose_next(X, Y)

            time_spend = time.time() - start_time
            logging.debug(
                "Time spend to choose next configurations: %.2f sec" %
                (time_spend))

            self.logger.debug("Intensify")

            self.incumbent, inc_perf = self.inten.intensify(
                challengers=challengers,
                incumbent=self.incumbent,
                run_history=self.runhistory,
                objective=self.objective,
                time_bound=max(0.01, time_spend))

            # TODO: Write run history into database
            if self.scenario.shared_model:
                pSMAC.write(run_history=self.runhistory,
                            output_directory=self.scenario.output_dir,
                            num_run=self.num_run)

            if iteration == max_iters:
                break

            iteration += 1

            logging.debug(
                "Remaining budget: %f (wallclock), %f (ta costs), %f (target runs)"
                % (self.stats.get_remaing_time_budget(),
                   self.stats.get_remaining_ta_budget(),
                   self.stats.get_remaining_ta_runs()))

            if self.stats.is_budget_exhausted():
                break

            self.stats.print_stats(debug_out=True)

        return self.incumbent

    def choose_next(self,
                    X,
                    Y,
                    num_interleaved_random=1010,
                    num_configurations_by_random_search_sorted=1000,
                    num_configurations_by_local_search=10):
        """Choose next candidate solution with Bayesian optimization.

        Parameters
        ----------
        X : (N, D) numpy array
            Each row contains a configuration and one set of
            instance features.
        Y : (N, O) numpy array
            The function values for each configuration instance pair.

        Returns
        -------
        list
            List of 2020 suggested configurations to evaluate.
        """
        self.model.train(X, Y)

        if self.runhistory.empty():
            incumbent_value = 0.0
        elif self.incumbent is None:
            # TODO try to calculate an incumbent from the runhistory!
            incumbent_value = 0.0
        else:
            incumbent_value = self.runhistory.get_cost(self.incumbent)

        self.acquisition_func.update(model=self.model, eta=incumbent_value)

        # Remove dummy acquisition function value
        next_configs_by_random_search = [
            x[1] for x in self._get_next_by_random_search(
                num_points=num_interleaved_random)
        ]

        # Get configurations sorted by EI
        next_configs_by_random_search_sorted = \
            self._get_next_by_random_search(
                num_configurations_by_random_search_sorted, _sorted=True)
        next_configs_by_local_search = \
            self._get_next_by_local_search(num_configurations_by_local_search)

        next_configs_by_acq_value = next_configs_by_random_search_sorted + \
            next_configs_by_local_search
        next_configs_by_acq_value.sort(reverse=True, key=lambda x: x[0])
        self.logger.debug(
            "First 10 acq func values of selected configurations: %s" %
            (str([_[0] for _ in next_configs_by_acq_value[:10]])))
        next_configs_by_acq_value = [_[1] for _ in next_configs_by_acq_value]

        challengers = list(
            itertools.chain(*zip(next_configs_by_acq_value,
                                 next_configs_by_random_search)))
        return challengers

    def _get_next_by_random_search(self, num_points=1000, _sorted=False):
        """Get candidate solutions via local search.

        Parameters
        ----------
        num_points : int, optional (default=1000)
            Number of configurations to sample and return.

        _sorted : bool, optional (default=False)
            Whether to sort the candidate solutions by acquisition function
            value.

        Returns
        -------
        list : (acquisition value, Candidate solutions)
        """

        rand_configs = self.config_space.sample_configuration(size=num_points)
        if _sorted:
            imputed_rand_configs = map(ConfigSpace.util.impute_inactive_values,
                                       rand_configs)
            imputed_rand_configs = [
                x.get_array() for x in imputed_rand_configs
            ]
            imputed_rand_configs = np.array(imputed_rand_configs,
                                            dtype=np.float64)
            acq_values = self.acquisition_func(imputed_rand_configs)
            # From here
            # http://stackoverflow.com/questions/20197990/how-to-make-argsort-result-to-be-random-between-equal-values
            random = self.rng.rand(len(acq_values))
            # Last column is primary sort key!
            indices = np.lexsort((random.flatten(), acq_values.flatten()))
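            # lexsort sorts ascending with acq_values as the primary key and
            # the random vector breaking ties, so indices[::-1] below returns
            # configurations in decreasing order of acquisition value.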

            for i in range(len(rand_configs)):
                rand_configs[i].origin = 'Random Search (sorted)'

            # Cannot use zip here because the indices array cannot index the
            # rand_configs list, because the second is a pure python list
            return [(acq_values[ind][0], rand_configs[ind])
                    for ind in indices[::-1]]
        else:
            for i in range(len(rand_configs)):
                rand_configs[i].origin = 'Random Search'
            return [(0, rand_configs[i]) for i in range(len(rand_configs))]

    def _get_next_by_local_search(self, num_points=10):
        """Get candidate solutions via local search.

        In case acquisition function values tie, these will be broken randomly.

        Parameters
        ----------
        num_points : int, optional (default=10)
            Number of local searches and returned values.

        Returns
        -------
        list : (acquisition value, Candidate solutions),
               ordered by their acquisition function value
        """
        configs_acq = []

        # Start N local search from different random start points
        for i in range(num_points):
            if i == 0 and self.incumbent is not None:
                start_point = self.incumbent
            else:
                start_point = self.config_space.sample_configuration()

            configuration, acq_val = self.local_search.maximize(start_point)

            configuration.origin = 'Local Search'
            configs_acq.append((acq_val[0][0], configuration))

        # shuffle for random tie-break
        random.shuffle(configs_acq, self.rng.rand)

        # sort according to acq value
        # and return n best configurations
        configs_acq.sort(reverse=True, key=lambda x: x[0])

        return configs_acq
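At the end of `choose_next` in Example 19, acquisition-sorted configurations are interleaved with purely random ones via `itertools.chain(*zip(...))`. A tiny standalone illustration of that interleaving:

# Tiny illustration of the interleaving used at the end of choose_next in
# Example 19: acquisition-sorted candidates alternate with random ones.
import itertools

by_acq_value = ['acq_1', 'acq_2', 'acq_3']
by_random = ['rand_1', 'rand_2', 'rand_3']

challengers = list(itertools.chain(*zip(by_acq_value, by_random)))
print(challengers)  # ['acq_1', 'rand_1', 'acq_2', 'rand_2', 'acq_3', 'rand_3']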
Example 20
    def get_pred_surface(self, X_scaled, conf_list: list):
        '''
            fit epm on the scaled input dimension and
            return data to plot a contour plot

            Parameters
            ----------
            X_scaled: np.array
                configurations in scaled 2dim
            conf_list: list
                list of Configuration objects

            Returns
            -------
            np.array, np.array, np.array
                x,y,Z for contour plots

        '''

        # use PCA to reduce features to also at most 2 dims
        n_feats = self.scenario.feature_array.shape[1]
        if n_feats > 2:
            self.logger.debug("Use PCA to reduce features to 2dim")
            insts = self.scenario.feature_dict.keys()
            feature_array = np.array([self.scenario.feature_dict[inst] for inst in insts])
            ss = StandardScaler()
            self.scenario.feature_array = ss.fit_transform(feature_array)
            pca = PCA(n_components=2)
            feature_array = pca.fit_transform(feature_array)
            n_feats = feature_array.shape[1]
            self.scenario.feature_array = feature_array
            self.scenario.feature_dict = dict([(inst, feature_array[idx,:]) for idx, inst in enumerate(insts)])
            self.scenario.n_features = 2

        # Create new rh with only wanted configs
        new_rh = RunHistory(average_cost)
        for rh in self.runhistories:
            for key, value in rh.data.items():
                config = rh.ids_config[key.config_id]
                if config in self.configs_to_plot:
                    config_id, instance, seed = key
                    cost, time, status, additional_info = value
                    new_rh.add(config, cost, time, status, instance_id=instance,
                               seed=seed, additional_info=additional_info)
        self.relevant_rh = new_rh

        X, y, types = convert_data(scenario=self.scenario,
                                   runhistory=new_rh)

        types = np.array(np.zeros((2+n_feats)), dtype=np.uint)

        num_params = len(self.scenario.cs.get_hyperparameters())

        # impute missing values in configs
        conf_dict = {}
        for idx, c in enumerate(conf_list):
            conf_list[idx] = impute_inactive_values(c)
            conf_dict[str(conf_list[idx].get_array())] = X_scaled[idx, :]

        X_trans = []
        for x in X:
            x_scaled_conf = conf_dict[str(x[:num_params])]
            x_new = np.concatenate(
                        (x_scaled_conf, x[num_params:]), axis=0)
            X_trans.append(x_new)
        X_trans = np.array(X_trans)

        bounds = np.array([(0, np.nan), (0, np.nan)], dtype=object)
        model = RandomForestWithInstances(types=types, bounds=bounds,
                                          instance_features=np.array(self.scenario.feature_array),
                                          ratio_features=1.0)

        model.train(X_trans, y)

        self.logger.debug("RF fitted")

        plot_step = self.contour_step_size

        x_min, x_max = X_scaled[:, 0].min() - 1, X_scaled[:, 0].max() + 1
        y_min, y_max = X_scaled[:, 1].min() - 1, X_scaled[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                             np.arange(y_min, y_max, plot_step))

        self.logger.debug("x_min: %f, x_max: %f, y_min: %f, y_max: %f" %(x_min, x_max, y_min, y_max))

        self.logger.debug("Predict on %d samples in grid to get surface" %(np.c_[xx.ravel(), yy.ravel()].shape[0]))
        Z, _ = model.predict_marginalized_over_instances(
            np.c_[xx.ravel(), yy.ravel()])

        Z = Z.reshape(xx.shape)

        return xx, yy, Z
Example 21
    def plot_cost_over_time(self,
                            rh,
                            traj,
                            output="performance_over_time.png",
                            validator=None):
        """ Plot performance over time, using all trajectory entries
            with max_time = wallclock_limit or (if inf) the highest
            recorded time

            Parameters
            ----------
            rh: RunHistory
                runhistory to use
            traj: List
                trajectory to take times/incumbents from
            output: str
                path to output-png
            epm: RandomForestWithInstances
                empirical performance model (expected to be trained on all runs)
        """
        self.logger.debug("Estimating costs over time for best run.")
        validator.traj = traj  # set trajectory
        time, configs = [], []

        for entry in traj:
            time.append(entry["wallclock_time"])
            configs.append(entry["incumbent"])

        self.logger.debug("Using %d samples (%d distinct) from trajectory.",
                          len(time), len(set(configs)))

        if validator.epm:  # not log as validator epm is trained on cost, not log cost
            epm = validator.epm
        else:
            self.logger.debug(
                "No EPM passed! Training new one from runhistory.")
            # Train random forest and transform training data (from given rh)
            # Not using validator because we want to plot uncertainties
            rh2epm = RunHistory2EPM4Cost(num_params=len(
                self.scenario.cs.get_hyperparameters()),
                                         scenario=self.scenario)
            X, y = rh2epm.transform(rh)
            self.logger.debug("Training model with data of shape X: %s, y:%s",
                              str(X.shape), str(y.shape))

            types, bounds = get_types(self.scenario.cs,
                                      self.scenario.feature_array)
            epm = RandomForestWithInstances(
                types=types,
                bounds=bounds,
                instance_features=self.scenario.feature_array,
                #seed=self.rng.randint(MAXINT),
                ratio_features=1.0)
            epm.train(X, y)

        ## not necessary right now since the EPM only knows the features
        ## of the training instances
        # use only training instances
        #=======================================================================
        # if self.scenario.feature_dict:
        #     feat_array = []
        #     for inst in self.scenario.train_insts:
        #         feat_array.append(self.scenario.feature_dict[inst])
        #     backup_features_epm = epm.instance_features
        #     epm.instance_features = np.array(feat_array)
        #=======================================================================

        # predict performance for all configurations in trajectory
        config_array = convert_configurations_to_array(configs)
        mean, var = epm.predict_marginalized_over_instances(config_array)

        #=======================================================================
        # # restore feature array in epm
        # if self.scenario.feature_dict:
        #     epm.instance_features = backup_features_epm
        #=======================================================================

        mean = mean[:, 0]
        var = var[:, 0]
        uncertainty_upper = mean + np.sqrt(var)
        uncertainty_lower = mean - np.sqrt(var)
        if self.scenario.run_obj == 'runtime':  # We have to clip at 0 as we want to put y on the logscale
            uncertainty_lower[uncertainty_lower < 0] = 0
            uncertainty_upper[uncertainty_upper < 0] = 0

        # plot
        fig = plt.figure()
        ax = fig.add_subplot(111)

        ax.set_ylabel('performance')
        ax.set_xlabel('time [sec]')
        ax.plot(time, mean, 'r-', label="estimated performance")
        ax.fill_between(time,
                        uncertainty_upper,
                        uncertainty_lower,
                        alpha=0.8,
                        label="standard deviation")
        ax.set_xscale("log", nonposx='clip')
        if self.scenario.run_obj == 'runtime':
            ax.set_yscale('log')

        # ax.set_ylim(min(mean)*0.8, max(mean)*1.2)
        # start after 1% of the configuration budget
        ax.set_xlim(min(time) + (max(time) - min(time)) * 0.01, max(time))

        ax.legend()
        plt.tight_layout()
        fig.savefig(output)
        plt.close(fig)
Example 22
class FeatureForwardSelector():
    """ Inspired by forward selection of ParameterImportance-package. """
    def __init__(self, scenario, runhistory, to_evaluate: int = 3, rng=None):
        """
        Constructor
        :parameter:
        scenario
            SMAC scenario object
        to_evaluate
            int. Indicates for how many parameters the Importance values have to be computed
        """
        self.logger = logging.getLogger(self.__module__ + '.' +
                                        self.__class__.__name__)
        self.rng = rng
        if rng is None:
            self.rng = np.random.RandomState(42)

        self.scenario = copy.deepcopy(scenario)
        self.cs = scenario.cs
        self.rh = runhistory
        self.to_evaluate = to_evaluate

        self.MAX_SAMPLES = 100000

        self.model = None

    def run(self):
        """
        Implementation of the forward selection loop.
        Uses SMACs EPM (RF) wrt the feature space to minimize the OOB error.

        Returns
        -------
        feature_importance: OrderedDict
            dict_keys (first key -> most important) -> OOB error
        """
        parameters = [p.name for p in self.scenario.cs.get_hyperparameters()]
        self.logger.debug("Parameters: %s", parameters)

        rh2epm = RunHistory2EPM4Cost(scenario=self.scenario,
                                     num_params=len(parameters),
                                     success_states=[
                                         StatusType.SUCCESS, StatusType.CAPPED,
                                         StatusType.CRASHED
                                     ],
                                     impute_censored_data=False,
                                     impute_state=None)

        X, y = rh2epm.transform(self.rh)

        # reduce sample size to speedup computation
        if X.shape[0] > self.MAX_SAMPLES:
            idx = self.rng.choice(X.shape[0],
                                  size=self.MAX_SAMPLES,
                                  replace=False)
            X = X[idx, :]
            y = y[idx]

        self.logger.debug(
            "Shape of X: %s, of y: %s, #parameters: %s, #feats: %s", X.shape,
            y.shape, len(parameters), len(self.scenario.feature_names))
        names = copy.deepcopy(self.scenario.feature_names)
        self.logger.debug("Features: %s", names)

        # columns 0..len(parameters)-1 hold the hyperparameters; feature columns follow
        used = list(range(0, len(parameters)))
        feat_ids = {f: i for i, f in enumerate(names, len(used))}
        ids_feat = {i: f for f, i in feat_ids.items()}
        self.logger.debug("Used: %s", used)
        evaluated_feature_importance = OrderedDict()

        types, bounds = get_types(self.scenario.cs,
                                  self.scenario.feature_array)

        last_error = np.inf

        for _round in range(self.to_evaluate):  # Main Loop
            errors = []
            for f in names:
                i = feat_ids[f]
                self.logger.debug('Evaluating %s', f)
                used.append(i)
                self.logger.debug(
                    'Used features: %s',
                    str([ids_feat[j] for j in used[len(parameters):]]))

                start = time.time()
                self._refit_model(types[sorted(used)], bounds,
                                  X[:, sorted(used)], y)  # refit the model every round
                errors.append(self.model.rf.out_of_bag_error())
                used.pop()
                self.logger.debug('Refitted RF (sec %.2f; error: %.4f)' %
                                  (time.time() - start, errors[-1]))
            else:
                # for-else: executed after all single features were tried;
                # this evaluates the baseline of adding no further feature
                self.logger.debug('Evaluating None')
                start = time.time()
                self._refit_model(types[sorted(used)], bounds,
                                  X[:, sorted(used)], y)  # refit the model every round
                errors.append(self.model.rf.out_of_bag_error())
                self.logger.debug('Refitted RF (sec %.2f; error: %.4f)' %
                                  (time.time() - start, errors[-1]))
                if _round == 0:
                    evaluated_feature_importance['None'] = errors[-1]
            best_idx = np.argmin(errors)
            lowest_error = errors[best_idx]

            if best_idx == len(errors) - 1:
                self.logger.info('Best thing to do is add nothing')
                best_feature = 'None'
                # evaluated_feature_importance[best_feature] = lowest_error
                break
            elif lowest_error >= last_error:
                break
            else:
                last_error = lowest_error
                best_feature = names.pop(best_idx)
                used.append(feat_ids[best_feature])

            self.logger.debug('%s: %.4f' % (best_feature, lowest_error))
            evaluated_feature_importance[best_feature] = lowest_error

        self.logger.debug(evaluated_feature_importance)
        self.evaluated_feature_importance = evaluated_feature_importance
        return evaluated_feature_importance

    def _refit_model(self, types, bounds, X, y):
        """
        Easily allows for refitting of the model.

        Parameters
        ----------
        types: list
            SMAC EPM types
        X:ndarray
            X matrix
        y:ndarray
            corresponding y vector
        """
        # take at most 80% of the data points per tree to ensure enough
        # held-out data for the OOB error
        self.model = RandomForestWithInstances(self.cs,
                                               types=types,
                                               bounds=bounds,
                                               seed=self.rng.randint(MAXINT),
                                               do_bootstrapping=True,
                                               n_points_per_tree=int(X.shape[0] * 0.8))
        self.model.rf_opts.compute_oob_error = True
        self.model.train(X, y)

    def _plot_result(self, output_fn, bar=True):
        """
            plot oob score as bar charts
            Parameters
            ----------
            name
                file name to save plot
        """

        fig, ax = plt.subplots()
        features = list(self.evaluated_feature_importance.keys())
        errors = list(self.evaluated_feature_importance.values())
        max_to_plot = min(len(errors), 5)

        ind = np.arange(len(errors))
        if bar:
            ax.bar(ind, errors, color=(0.25, 0.25, 0.45))
        else:
            ax.plot(ind, errors, lw=4, color=(0.125, 0.125, 0.125))

        ax.set_ylabel('error', size='24', family='sans-serif')
        if bar:
            ax.set_xticks(ind)
            ax.set_xlim(-.5, max_to_plot - 0.5)
        else:
            ax.set_xticks(ind)
            ax.set_xlim(0, max_to_plot - 1)
        ax.set_xticklabels(features,
                           rotation=30,
                           ha='right',
                           size='10',
                           family='monospace')
        ax.xaxis.grid(True)
        ax.yaxis.grid(True)

        plt.tight_layout()

        out_dir = os.path.dirname(output_fn)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        fig.savefig(output_fn)
        return output_fn

    def plot_result(self, output_fn=None):
        plot_paths = []
        plot_paths.append(self._plot_result(output_fn + '-barplot.png', True))
        plot_paths.append(self._plot_result(output_fn + '-chng.png', False))
        plt.close('all')
        self.logger.debug('Saved plot as %s-[barplot|chng].png' % output_fn)
        return plot_paths
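For illustration only, a compact, self-contained sketch of the same forward-selection idea using scikit-learn's out-of-bag score as an error proxy instead of SMAC's EPM; `X`, `y` and `feature_names` are assumed inputs, and the parameter/feature column split is simplified away.

import numpy as np
from sklearn.ensemble import RandomForestRegressor

def forward_select(X, y, feature_names, to_evaluate=3, seed=42):
    """Greedy forward selection that minimizes a random-forest OOB error."""
    remaining = list(range(len(feature_names)))
    selected, importance = [], {}
    last_error = np.inf
    for _ in range(to_evaluate):
        errors = []
        for i in remaining:
            cols = sorted(selected + [i])
            rf = RandomForestRegressor(n_estimators=50, bootstrap=True,
                                       oob_score=True, random_state=seed)
            rf.fit(X[:, cols], y.ravel())
            errors.append(1.0 - rf.oob_score_)   # turn OOB R^2 into an error
        best = int(np.argmin(errors))
        if errors[best] >= last_error:
            break                                # no feature improves the error
        last_error = errors[best]
        feat = remaining.pop(best)
        selected.append(feat)
        importance[feature_names[feat]] = last_error
    return importance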
Example 23
class Validator(object):
    """
    Validator for the output of SMAC-scenarios.
    Evaluates specified configurations on specified instances.
    """
    def __init__(self,
                 scenario: Scenario,
                 trajectory: list,
                 rng: Union[np.random.RandomState, int] = None):
        """
        Construct Validator for given scenario and trajectory.

        Parameters
        ----------
        scenario: Scenario
            scenario object for cutoff, instances, features and specifics
        trajectory: trajectory-list
            trajectory to take incumbent(s) from
        rng: np.random.RandomState or int
            Random number generator or seed
        """
        self.logger = logging.getLogger(self.__module__ + "." +
                                        self.__class__.__name__)

        self.traj = trajectory
        self.scen = scenario
        self.epm = None

        if isinstance(rng, np.random.RandomState):
            self.rng = rng
        elif isinstance(rng, int):
            self.rng = np.random.RandomState(seed=rng)
        else:
            self.logger.debug('no seed given, using default seed of 1')
            num_run = 1
            self.rng = np.random.RandomState(seed=num_run)

    def _save_results(self, rh: RunHistory, output_fn, backup_fn=None):
        """ Helper to save results to file

        Parameters
        ----------
        rh: RunHistory
            runhistory to save
        output_fn: str
            if ends on '.json': filename to save history to
            else: directory to save runhistory to (filename is backup_fn)
        backup_fn: str
            if output_fn does not end on '.json', treat output_fn as dir and
            append backup_fn as filename (if output_fn ends on '.json', this
            argument is ignored)
        """
        if output_fn == "":
            self.logger.info(
                "No output specified, validated runhistory not saved.")
            return
        # Check if a folder or a file is specified as output
        if not output_fn.endswith('.json'):
            output_dir = output_fn
            output_fn = os.path.join(output_dir, backup_fn)
            self.logger.debug("Output is \"%s\", changing to \"%s\"!",
                              output_dir, output_fn)
        base = os.path.split(output_fn)[0]
        if not base == "" and not os.path.exists(base):
            self.logger.debug("Folder (\"%s\") doesn't exist, creating.", base)
            os.makedirs(base)
        rh.save_json(output_fn)
        self.logger.info("Saving validation-results in %s", output_fn)

    def validate(
        self,
        config_mode: Union[str, typing.List[Configuration]] = 'def',
        instance_mode: Union[str, typing.List[str]] = 'test',
        repetitions: int = 1,
        n_jobs: int = 1,
        backend: str = 'threading',
        runhistory: RunHistory = None,
        tae: ExecuteTARun = None,
        output_fn: str = "",
    ) -> RunHistory:
        """
        Validate configs on instances and save result in runhistory.
        If a runhistory is provided as input, it should have been generated on the same or comparable hardware.

        side effect: if output is specified, saves runhistory to specified
        output directory.

        Parameters
        ----------
        config_mode: str or list<Configuration>
            string or directly a list of Configuration.
            string from [def, inc, def+inc, wallclock_time, cpu_time, all].
            time evaluates at cpu- or wallclock-timesteps of:
            [max_time/2^0, max_time/2^1, max_time/2^2, ..., default]
            with max_time being the highest recorded time
        instance_mode: str or list<str>
            what instances to use for validation, either from
            [train, test, train+test] or directly a list of instances
        repetitions: int
            number of repetitions in nondeterministic algorithms
        n_jobs: int
            number of parallel processes used by joblib
        backend: str
            what backend joblib should use for parallel runs
        runhistory: RunHistory
            optional, RunHistory-object to reuse runs
        tae: ExecuteTARun
            tae to be used. if None, will initialize ExecuteTARunOld
        output_fn: str
            path to runhistory to be saved. if the suffix is not '.json', will
            be interpreted as directory and filename will be
            'validated_runhistory.json'

        Returns
        -------
        runhistory: RunHistory
            runhistory with validated runs
        """
        self.logger.debug(
            "Validating configs '%s' on instances '%s', repeating %d times"
            " with %d parallel runs on backend '%s'.", config_mode,
            instance_mode, repetitions, n_jobs, backend)

        # Get all runs to be evaluated as list
        runs, validated_rh = self._get_runs(config_mode, instance_mode,
                                            repetitions, runhistory)

        # Create new Stats without limits
        inf_scen = Scenario({
            'run_obj': self.scen.run_obj,
            'cutoff_time': self.scen.cutoff,
            'output_dir': ""
        })
        inf_stats = Stats(inf_scen)
        inf_stats.start_timing()

        # Create TAE
        if not tae:
            tae = ExecuteTARunOld(ta=self.scen.ta,
                                  stats=inf_stats,
                                  run_obj=self.scen.run_obj,
                                  par_factor=self.scen.par_factor,
                                  cost_for_crash=self.scen.cost_for_crash)
        else:
            # Inject endless-stats
            tae.stats = inf_stats

        # Validate!
        run_results = self._validate_parallel(tae, runs, n_jobs, backend)

        # tae returns (status, cost, runtime, additional_info)
        # Add runs to RunHistory
        idx = 0
        for result in run_results:
            validated_rh.add(config=runs[idx].config,
                             cost=result[1],
                             time=result[2],
                             status=result[0],
                             instance_id=runs[idx].inst,
                             seed=runs[idx].seed,
                             additional_info=result[3])
            idx += 1

        if output_fn:
            self._save_results(validated_rh,
                               output_fn,
                               backup_fn="validated_runhistory.json")
        return validated_rh

    def _validate_parallel(self, tae: ExecuteTARun, runs: typing.List[_Run],
                           n_jobs: int, backend: str):
        """
        Validate runs with joblibs Parallel-interface

        Parameters
        ----------
        tae: ExecuteTARun
            tae to be used for validation
        runs: list<_Run>
            list with _Run-objects
            [_Run(config=CONFIG1,inst=INSTANCE1,seed=SEED1,inst_specs=INST_SPECIFICS1), ...]
        n_jobs: int
            number of cpus to use for validation (-1 to use all)
        backend: str
            what backend to use for parallelization

        Returns
        -------
        run_results: list<tuple(tae-returns)>
            results as returned by tae
        """
        # Runs with parallel
        run_results = Parallel(n_jobs=n_jobs, backend=backend)(
            delayed(_unbound_tae_starter)(tae,
                                          run.config,
                                          run.inst,
                                          self.scen.cutoff,
                                          run.seed,
                                          run.inst_specs,
                                          capped=False) for run in runs)
        return run_results

    def validate_epm(
        self,
        config_mode: Union[str, typing.List[Configuration]] = 'def',
        instance_mode: Union[str, typing.List[str]] = 'test',
        repetitions: int = 1,
        runhistory: RunHistory = None,
        output_fn="",
        reuse_epm=True,
    ) -> RunHistory:
        """
        Use EPM to predict costs/runtimes for unknown config/inst-pairs.

        side effect: if output is specified, saves runhistory to specified
        output directory.

        Parameters
        ----------
        output_fn: str
            path to runhistory to be saved. if the suffix is not '.json', will
            be interpreted as directory and filename will be
            'validated_runhistory_EPM.json'
        config_mode: str or list<Configuration>
            string or directly a list of Configuration, string from [def, inc, def+inc, wallclock_time, cpu_time, all].
            time evaluates at cpu- or wallclock-timesteps of:
            [max_time/2^0, max_time/2^1, max_time/2^2, ..., default] with max_time being the highest recorded time
        instance_mode: str or list<str>
            what instances to use for validation, either from
            [train, test, train+test] or directly a list of instances
        repetitions: int
            number of repetitions in nondeterministic algorithms
        runhistory: RunHistory
            optional, RunHistory-object to reuse runs
        reuse_epm: bool
            if true (and if `self.epm`), reuse epm to validate runs

        Returns
        -------
        runhistory: RunHistory
            runhistory with predicted runs
        """
        if not isinstance(runhistory, RunHistory) and (self.epm is None
                                                       or reuse_epm is False):
            raise ValueError(
                "No runhistory specified for validating with EPM!")
        elif reuse_epm is False or self.epm is None:
            # Create RandomForest
            types, bounds = get_types(self.scen.cs, self.scen.feature_array)
            self.epm = RandomForestWithInstances(
                types=types,
                bounds=bounds,
                instance_features=self.scen.feature_array,
                seed=self.rng.randint(MAXINT),
                ratio_features=1.0)
            # Use imputor if objective is runtime
            imputor = None
            impute_state = None
            impute_censored_data = False
            if self.scen.run_obj == 'runtime':
                threshold = self.scen.cutoff * self.scen.par_factor
                imputor = RFRImputator(rng=self.rng,
                                       cutoff=self.scen.cutoff,
                                       threshold=threshold,
                                       model=self.epm)
                impute_censored_data = True
                impute_state = [StatusType.CAPPED]
            # Transform training data (from given rh)
            rh2epm = RunHistory2EPM4Cost(
                num_params=len(self.scen.cs.get_hyperparameters()),
                scenario=self.scen,
                rng=self.rng,
                impute_censored_data=impute_censored_data,
                imputor=imputor,
                impute_state=impute_state)
            X, y = rh2epm.transform(runhistory)
            self.logger.debug("Training model with data of shape X: %s, y:%s",
                              str(X.shape), str(y.shape))
            # Train random forest
            self.epm.train(X, y)

        # Predict desired runs
        runs, rh_epm = self._get_runs(config_mode, instance_mode, repetitions,
                                      runhistory)

        feature_array_size = len(self.scen.cs.get_hyperparameters())
        if self.scen.feature_array is not None:
            feature_array_size += self.scen.feature_array.shape[1]

        X_pred = np.empty((len(runs), feature_array_size))
        for idx, run in enumerate(runs):
            if self.scen.feature_array is not None and run.inst is not None:
                X_pred[idx] = np.hstack([
                    convert_configurations_to_array([run.config])[0],
                    self.scen.feature_dict[run.inst]
                ])
            else:
                X_pred[idx] = convert_configurations_to_array([run.config])[0]
        self.logger.debug("Predicting desired %d runs, data has shape %s",
                          len(runs), str(X_pred.shape))

        y_pred = self.epm.predict(X_pred)

        # Add runs to runhistory
        for run, pred in zip(runs, y_pred[0]):
            rh_epm.add(
                config=run.config,
                cost=float(pred),
                time=float(pred),
                status=StatusType.SUCCESS,
                instance_id=run.inst,
                seed=-1,
                additional_info={"additional_info": "ESTIMATED USING EPM!"})

        if output_fn:
            self._save_results(rh_epm,
                               output_fn,
                               backup_fn="validated_runhistory_EPM.json")
        return rh_epm

    def _get_runs(
        self,
        configs: Union[str, typing.List[Configuration]],
        insts: Union[str, typing.List[str]],
        repetitions: int = 1,
        runhistory: RunHistory = None,
    ) -> typing.Tuple[typing.List[_Run], RunHistory]:
        """
        Generate list of SMAC-TAE runs to be executed. This means
        combinations of configs with all instances on a certain number of seeds.

        side effect: Adds runs that don't need to be reevaluated to self.rh!

        Parameters
        ----------
        configs: str or list<Configuration>
            string or directly a list of Configuration
            str from [def, inc, def+inc, wallclock_time, cpu_time, all]
                time evaluates at cpu- or wallclock-timesteps of:
                [max_time/2^0, max_time/2^1, max_time/2^2, ..., default]
                with max_time being the highest recorded time
        insts: str or list<str>
            what instances to use for validation, either from
            [train, test, train+test] or directly a list of instances
        repetitions: int
            number of seeds per instance/config-pair to be evaluated
        runhistory: RunHistory
            optional, try to reuse this runhistory and save some runs

        Returns
        -------
        runs: list<_Run>
            list with _Runs
            [_Run(config=CONFIG1,inst=INSTANCE1,seed=SEED1,inst_specs=INST_SPECIFICS1),
             _Run(config=CONFIG2,inst=INSTANCE2,seed=SEED2,inst_specs=INST_SPECIFICS2),
             ...]
        """
        # Get relevant configurations and instances
        if isinstance(configs, str):
            configs = self._get_configs(configs)
        if isinstance(insts, str):
            insts = self._get_instances(insts)

        # If no instances are given, fix the instances to one "None" instance
        if not insts:
            insts = [None]
        # If algorithm is deterministic, fix repetitions to 1
        if self.scen.deterministic and repetitions != 1:
            self.logger.warning(
                "Specified %d repetitions, but fixing to 1, "
                "because algorithm is deterministic.", repetitions)
            repetitions = 1

        # Extract relevant information from given runhistory
        inst_seed_config = self._process_runhistory(configs, insts, runhistory)

        # Now create the actual run-list
        runs = []
        # Counter for runs without the need of recalculation
        runs_from_rh = 0
        # If we reuse runs, we want to return them as well
        new_rh = RunHistory(average_cost)

        for i in sorted(insts):
            for rep in range(repetitions):
                # First, find a seed and add all the data we can take from the
                # given runhistory to "our" validation runhistory.
                configs_evaluated = []
                if runhistory and i in inst_seed_config:
                    # Choose seed based on most often evaluated inst-seed-pair
                    seed, configs_evaluated = inst_seed_config[i].pop(0)
                    # Delete inst if all seeds are used
                    if not inst_seed_config[i]:
                        inst_seed_config.pop(i)
                    # Add runs to runhistory
                    for c in configs_evaluated[:]:
                        runkey = RunKey(runhistory.config_ids[c], i, seed)
                        cost, time, status, additional_info = runhistory.data[
                            runkey]
                        if status in [
                                StatusType.CRASHED, StatusType.ABORT,
                                StatusType.CAPPED
                        ]:
                            # Not properly executed target algorithm runs should be repeated
                            configs_evaluated.remove(c)
                            continue
                        new_rh.add(c,
                                   cost,
                                   time,
                                   status,
                                   instance_id=i,
                                   seed=seed,
                                   additional_info=additional_info)
                        runs_from_rh += 1
                else:
                    # If no runhistory or no entries for instance, get new seed
                    seed = self.rng.randint(MAXINT)

                # We now have a seed and add all configs that are not already
                # evaluated on that seed to the runs-list. This way, we
                # guarantee the same inst-seed-pairs for all configs.
                for config in [c for c in configs if c not in configs_evaluated]:
                    # Only use specifics if specific exists, else use string "0"
                    specs = self.scen.instance_specific[
                        i] if i and i in self.scen.instance_specific else "0"
                    runs.append(
                        _Run(config=config,
                             inst=i,
                             seed=seed,
                             inst_specs=specs))

        self.logger.info(
            "Collected %d runs from %d configurations on %d "
            "instances with %d repetitions. Reusing %d runs from "
            "given runhistory.", len(runs), len(configs), len(insts),
            repetitions, runs_from_rh)

        return runs, new_rh

    def _process_runhistory(self, configs: typing.List[Configuration],
                            insts: typing.List[str], runhistory: RunHistory):
        """
        Processes runhistory from self._get_runs by extracting already evaluated
        (relevant) config-inst-seed tuples.

        Parameters
        ----------
        configs: list(Configuration)
            list of configs of interest
        insts: list(str)
            list of instances of interest
        runhistory: RunHistory
            runhistory to extract runs from

        Returns
        -------
        inst_seed_config: dict<str : list(tuple(int, tuple(configs)))>
            dictionary mapping instances to a list of tuples of already used
            seeds and the configs that this inst-seed-pair has been evaluated
            on, sorted by the number of configs
        """
        # We want to reuse seeds that have been used on most configurations
        # To this end, we create a dictionary as {instances:{seed:[configs]}}
        # Like this we can easily retrieve the most used instance-seed pairs to
        # minimize the number of runs to be evaluated
        inst_seed_config = {}
        if runhistory:
            relevant = dict()
            for key in runhistory.data:
                if (runhistory.ids_config[key.config_id] in configs
                        and key.instance_id in insts):
                    relevant[key] = runhistory.data[key]

            # Change data-structure to {instances:[(seed1, (configs)), (seed2, (configs), ... ]}
            # to make most used seed easily accessible, we sort after length of configs
            for key in relevant:
                inst, seed = key.instance_id, key.seed
                config = runhistory.ids_config[key.config_id]
                if inst in inst_seed_config:
                    if seed in inst_seed_config[inst]:
                        inst_seed_config[inst][seed].append(config)
                    else:
                        inst_seed_config[inst][seed] = [config]
                else:
                    inst_seed_config[inst] = {seed: [config]}

            inst_seed_config = {
                i: sorted([(seed, list(inst_seed_config[i][seed]))
                           for seed in inst_seed_config[i]],
                          key=lambda x: len(x[1]))
                for i in inst_seed_config
            }
        return inst_seed_config

    def _get_configs(self, mode: str) -> typing.List[Configuration]:
        """
        Return desired configs

        Parameters
        ----------
        mode: str
            str from [def, inc, def+inc, wallclock_time, cpu_time, all]
                time evaluates at cpu- or wallclock-timesteps of:
                [max_time/2^0, max_time/2^1, max_time/2^2, ..., default]
                with max_time being the highest recorded time

        Returns
        -------
        configs: list<Configuration>
            list with desired configurations
        """
        # Add desired configs
        configs = []
        mode = mode.lower()
        if mode not in [
                'def', 'inc', 'def+inc', 'wallclock_time', 'cpu_time', 'all'
        ]:
            raise ValueError(
                "%s not a valid option for config_mode in validation." % mode)
        if mode == "def" or mode == "def+inc":
            configs.append(self.scen.cs.get_default_configuration())
        if mode == "inc" or mode == "def+inc":
            configs.append(self.traj[-1]["incumbent"])
        if mode in ["wallclock_time", "cpu_time"]:
            # get highest time-entry and add entries from there
            # not using wallclock_limit in case it's inf
            if (mode == "wallclock_time"
                    and np.isfinite(self.scen.wallclock_limit)):
                max_time = self.scen.wallclock_limit
            elif (mode == "cpu_time"
                  and np.isfinite(self.scen.algo_runs_timelimit)):
                max_time = self.scen.algo_runs_timelimit
            else:
                max_time = self.traj[-1][mode]
            counter = 2**0
            for entry in self.traj[::-1]:
                if (entry[mode] <= max_time / counter
                        and entry["incumbent"] not in configs):
                    configs.append(entry["incumbent"])
                    counter *= 2
            if self.traj[0]["incumbent"] not in configs:
                configs.append(self.traj[0]["incumbent"])  # add first incumbent
        if mode == "all":
            for entry in self.traj:
                if entry["incumbent"] not in configs:
                    configs.append(entry["incumbent"])
        self.logger.debug("Gathered %d configurations for mode %s.",
                          len(configs), mode)
        return configs

    def _get_instances(self, mode: str) -> typing.List[str]:
        """
        Get desired instances

        Parameters
        ----------
        mode: str
            what instances to use for validation, from [train, test, train+test]

        Returns
        -------
        instances: list<str>
            instances to be used
        """
        instance_mode = mode.lower()
        if instance_mode not in ['train', 'test', 'train+test']:
            raise ValueError(
                "%s not a valid option for instance_mode in validation." %
                mode)

        # If a specific instance mode is requested, make sure the scenario actually provides those instances
        if ((instance_mode == 'train' and self.scen.train_insts == [None]) or
            (instance_mode == 'test' and self.scen.test_insts == [None])):
            self.logger.warning(
                "Instance mode is set to %s, but there are no "
                "%s-instances specified in the scenario. Setting instance mode to"
                "\"train+test\"!", instance_mode, instance_mode)
            instance_mode = 'train+test'

        instances = []
        if ((instance_mode == 'train' or instance_mode == 'train+test')
                and not self.scen.train_insts == [None]):
            instances.extend(self.scen.train_insts)
        if ((instance_mode == 'test' or instance_mode == 'train+test')
                and not self.scen.test_insts == [None]):
            instances.extend(self.scen.test_insts)
        return instances
Example 24
    def _get_mean_var_time(self, validator, traj, use_epm, rh):
        """
        Parameters
        ----------
        validator: Validator
            validator (smac-based)
        traj: List[Configuraton]
            trajectory to set in validator
        use_epm: bool
            validated or not (no need to use epm if validated)
        rh: RunHistory
            ??

        Returns
        -------
        mean, var

        times: List[float]
            times to plot (x-values)
        configs

        """
        # TODO kinda important: docstrings, what is this function doing?
        if validator:
            validator.traj = traj  # set trajectory
        time, configs = [], []

        if use_epm and not self.block_epm:
            for entry in traj:
                time.append(entry["wallclock_time"])
                configs.append(entry["incumbent"])
                # self.logger.debug('Time: %d Runs: %d', time[-1], len(rh.get_runs_for_config(configs[-1])))

            self.logger.debug(
                "Using %d samples (%d distinct) from trajectory.", len(time),
                len(set(configs)))

            # Initialize EPM
            if validator.epm:  # not log as validator epm is trained on cost, not log cost
                epm = validator.epm
            else:
                self.logger.debug(
                    "No EPM passed! Training new one from runhistory.")
                # Train random forest and transform training data (from given rh)
                # Not using validator because we want to plot uncertainties
                rh2epm = RunHistory2EPM4Cost(num_params=len(
                    self.scenario.cs.get_hyperparameters()),
                                             scenario=self.scenario)
                X, y = rh2epm.transform(rh)
                self.logger.debug(
                    "Training model with data of shape X: %s, y: %s",
                    str(X.shape), str(y.shape))

                types, bounds = get_types(self.scenario.cs,
                                          self.scenario.feature_array)
                epm = RandomForestWithInstances(
                    self.scenario.cs,
                    types=types,
                    bounds=bounds,
                    seed=self.rng.randint(MAXINT),
                    instance_features=self.scenario.feature_array,
                    ratio_features=1.0)
                epm.train(X, y)
            config_array = convert_configurations_to_array(configs)
            mean, var = epm.predict_marginalized_over_instances(config_array)
            # We don't want to show the uncertainty of the model but the uncertainty
            # over multiple optimizer runs; that variance is computed in an outer loop.
            var = np.zeros(mean.shape)
        else:
            mean, var = [], []
            for entry in traj:
                #self.logger.debug(entry)
                time.append(entry["wallclock_time"])
                configs.append(entry["incumbent"])
                costs = _cost(configs[-1], rh,
                              rh.get_runs_for_config(configs[-1]))
                # self.logger.debug(len(costs), time[-1]
                if not costs:
                    time.pop()
                else:
                    mean.append(np.mean(costs))
                    var.append(0)  # No variance over instances
            mean = np.array(mean).reshape(-1, 1)
            var = np.array(var).reshape(-1, 1)
        return mean, var, time, configs
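A hedged sketch of the "outer loop" mentioned in the EPM branch above: the plotted uncertainty comes from the spread over several optimizer runs rather than from the model. `means_per_run` is an assumed list of per-run mean arrays already interpolated onto a common time grid.

import numpy as np

means_per_run = np.asarray(means_per_run)   # shape (n_runs, n_timesteps)
mean_over_runs = means_per_run.mean(axis=0)
var_over_runs = means_per_run.var(axis=0)   # this spread is what gets plotted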
Example 25
            best_indices = np.argsort(meta_predictions)[-250:][::-1]
            observed_y = y[test_index][best_indices]
        else:
            # Do Blended BO for 250 iterations
            observed_X = []
            observed_y = []
            observed_i = []

            surpassed = None
            for iteration in range(0, 250):

                # We need to have observed at least 3 items for the model to be able to predict
                surr_predictions = np.zeros_like(test_index)
                if iteration > 2 and alpha < 1:
                    surr_estimator.train(
                        np.array(observed_X).astype(float),
                        np.array(observed_y))
                    mu, var = surr_estimator.predict(
                        np.array(surr_X.iloc[test_index]).astype(float))
                    mu = mu.reshape(-1)
                    var = var.reshape(-1)
                    sigma = np.sqrt(var)
                    diff = mu - np.max(observed_y)
                    Z = diff / sigma
                    ei = diff * norm.cdf(Z) + sigma * norm.pdf(Z)
                    surr_predictions = ei

                    # surr_predictions = surr_estimator.predict(np.array(surr_X.iloc[test_index]).astype(float))
                # print(iteration, "\t", np.std(surr_predictions), "\t", np.std(meta_predictions))

                m_corr, m_pvalue = kendalltau(meta_predictions, y[test_index])
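A self-contained version of the expected-improvement acquisition used in the loop above (maximization form), guarding against zero predictive variance; `mu`, `var` and `best_observed` are placeholders.

import numpy as np
from scipy.stats import norm

def expected_improvement(mu, var, best_observed):
    """EI for maximization under a Gaussian posterior: E[max(f(x) - best, 0)]."""
    sigma = np.sqrt(var)
    diff = mu - best_observed
    with np.errstate(divide='ignore', invalid='ignore'):
        Z = diff / sigma
        ei = diff * norm.cdf(Z) + sigma * norm.pdf(Z)
    ei[sigma == 0.0] = 0.0   # no predictive uncertainty -> no expected improvement
    return ei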
Example 26
    def get_pred_surface(self, rh, X_scaled, conf_list: list,
                         contour_step_size):
        """fit epm on the scaled input dimension and
        return data to plot a contour plot of the empirical performance

        Parameters
        ----------
        rh: RunHistory
            runhistory
        X_scaled: np.array
            configurations in scaled 2dim
        conf_list: list
            list of Configuration objects
        contour_step_size: float
            step-size for contour

        Returns
        -------
        contour_data: (np.array, np.array, np.array)
            x, y, Z for contour plots
        """
        # use PCA to reduce the instance features to at most 2 dimensions as well
        scen = copy.deepcopy(self.scenario)  # pca changes feats
        if scen.feature_array.shape[1] > 2:
            self.logger.debug(
                "Use PCA to reduce features to from %d dim to 2 dim",
                scen.feature_array.shape[1])
            # perform PCA
            insts = scen.feature_dict.keys()
            feature_array = np.array([scen.feature_dict[i] for i in insts])
            feature_array = StandardScaler().fit_transform(feature_array)
            feature_array = PCA(n_components=2).fit_transform(feature_array)
            # inject in scenario-object
            scen.feature_array = feature_array
            scen.feature_dict = dict([(inst, feature_array[idx, :])
                                      for idx, inst in enumerate(insts)])
            scen.n_features = 2

        # convert the data to train EPM on 2-dim featurespace (for contour-data)
        self.logger.debug("Convert data for epm.")
        X, y, types = convert_data_for_epm(scenario=scen,
                                           runhistory=rh,
                                           impute_inactive_parameters=True,
                                           logger=self.logger)
        types = np.array(np.zeros((2 + scen.feature_array.shape[1])),
                         dtype=np.uint)
        num_params = len(scen.cs.get_hyperparameters())

        # impute missing values in configs and insert MDS'ed (2dim) configs to the right positions
        conf_dict = {}
        # Remove forbidden clauses (this is necessary to enable the impute_inactive_values-method, see #226)
        cs_no_forbidden = copy.deepcopy(conf_list[0].configuration_space)
        cs_no_forbidden.forbidden_clauses = []
        for idx, c in enumerate(conf_list):
            c.configuration_space = cs_no_forbidden
            conf_list[idx] = impute_inactive_values(c)
            conf_dict[str(conf_list[idx].get_array())] = X_scaled[idx, :]

        # Debug compare elements:
        c1, c2 = {str(z) for z in X}, {str(z) for z in conf_dict.keys()}
        self.logger.debug(
            "{} elements not in both sets, {} elements in both sets, X (len {}) and conf_dict (len {}) "
            "(might be a problem related to forbidden clauses?)".format(
                len(c1 ^ c2), len(c1 & c2), len(c1), len(c2)))
        # self.logger.debug("Elements: {}".format(str(c1 ^ c2)))

        # X_trans is the same as X, but with configs scaled to 2 dims and the
        # instance features reduced to 2 dims (4 values per row instead of M)
        X_trans = []
        for x in X:
            x_scaled_conf = conf_dict[str(x[:num_params])]
            # append scaled config + pca'ed features (total of 4 values) per config/feature-sample
            X_trans.append(
                np.concatenate((x_scaled_conf, x[num_params:]), axis=0))
        X_trans = np.array(X_trans)

        self.logger.debug(
            "Train random forest for contour-plot. Shape of X: {}, shape of X_trans: {}"
            .format(X.shape, X_trans.shape))
        self.logger.debug("Faking configspace to be able to train rf...")
        # We need to fake the config-space to bypass the imputation of inactive values in the random-forest implementation
        fake_cs = ConfigurationSpace(name="fake-cs-for-configurator-footprint")

        bounds = np.array([(0, np.nan), (0, np.nan)], dtype=object)
        model = RandomForestWithInstances(fake_cs,
                                          types,
                                          bounds,
                                          seed=self.rng.randint(MAXINT),
                                          instance_features=np.array(
                                              scen.feature_array),
                                          ratio_features=1.0)

        start = time.time()
        model.train(X_trans, y)
        self.logger.debug("Fitting random forest took %f time",
                          time.time() - start)

        x_min, x_max = X_scaled[:, 0].min() - 1, X_scaled[:, 0].max() + 1
        y_min, y_max = X_scaled[:, 1].min() - 1, X_scaled[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, contour_step_size),
                             np.arange(y_min, y_max, contour_step_size))

        self.logger.debug("x_min: %f, x_max: %f, y_min: %f, y_max: %f", x_min,
                          x_max, y_min, y_max)
        self.logger.debug(
            "Predict on %d samples in grid to get surface (step-size: %f)",
            np.c_[xx.ravel(), yy.ravel()].shape[0], contour_step_size)

        start = time.time()
        Z, _ = model.predict_marginalized_over_instances(np.c_[xx.ravel(),
                                                               yy.ravel()])
        Z = Z.reshape(xx.shape)
        self.logger.debug("Predicting random forest took %f time",
                          time.time() - start)

        return xx, yy, Z
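A hypothetical follow-up showing how the returned grid could be rendered as a contour plot; `xx`, `yy`, `Z` and `X_scaled` are assumed to come from a call to get_pred_surface as above.

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
contour = ax.contourf(xx, yy, Z, levels=20)
fig.colorbar(contour, ax=ax, label='predicted cost')
ax.scatter(X_scaled[:, 0], X_scaled[:, 1], c='white', s=8, label='configurations')
ax.legend()
fig.savefig('configurator_footprint_surface.png')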