def test_predict_marginalized_over_instances_mocked(self, rf_mock):
        """Use mock to count the number of calls to predict()"""
        class SideEffect(object):
            def __call__(self, X):
                # Numpy array of number 0 to X.shape[0]
                rval = np.array(list(range(X.shape[0]))).reshape((-1, 1))
                # Return mean and variance
                return rval, rval

        rf_mock.side_effect = SideEffect()

        rs = np.random.RandomState(1)
        F = rs.rand(10, 5)

        model = RandomForestWithInstances(
            configspace=self._get_cs(10),
            types=np.zeros((15, ), dtype=np.uint),
            instance_features=F,
            bounds=list(map(lambda x: (0, 10), range(10))),
            seed=1,
        )
        X = rs.rand(20, 10)
        F = rs.rand(10, 5)
        Y = rs.randint(1, size=(len(X) * len(F), 1)) * 1.
        X_ = rs.rand(200, 15)
        model.train(X_, Y)
        means, vars = model.predict_marginalized_over_instances(rs.rand(
            11, 10))
        # expected to be 0 as the predict is replaced by manual unloggin the trees
        self.assertEqual(rf_mock.call_count, 0)
        self.assertEqual(means.shape, (11, 1))
        self.assertEqual(vars.shape, (11, 1))
        for i in range(11):
            self.assertEqual(means[i], 0.)
            self.assertEqual(vars[i], 1.e-10)
Beispiel #2
0
    def test_predict_marginalized_over_instances_mocked(self, rf_mock):
        """Use mock to count the number of calls to predict()"""
        class SideEffect(object):
            def __call__(self, X):
                # Numpy array of number 0 to X.shape[0]
                rval = np.array(list(range(X.shape[0]))).reshape((-1, 1))
                # Return mean and variance
                return rval, rval

        rf_mock.side_effect = SideEffect()

        rs = np.random.RandomState(1)
        F = rs.rand(10, 5)

        model = RandomForestWithInstances(np.zeros((15, ), dtype=np.uint),
                                          instance_features=F,
                                          bounds=np.array(list(
                                              map(lambda x: (0, 10),
                                                  range(10))),
                                                          dtype=object))
        means, vars = model.predict_marginalized_over_instances(rs.rand(
            11, 10))
        self.assertEqual(rf_mock.call_count, 11)
        self.assertEqual(means.shape, (11, 1))
        self.assertEqual(vars.shape, (11, 1))
        for i in range(11):
            self.assertEqual(means[i], 4.5)
            self.assertEqual(vars[i], 4.5)
Beispiel #3
0
    def test_predict_marginalized_over_instances(self):
        rs = np.random.RandomState(1)
        X = rs.rand(20, 10)
        F = rs.rand(10, 5)
        Y = rs.rand(len(X) * len(F), 1)
        X_ = rs.rand(200, 15)

        model = RandomForestWithInstances(np.zeros((15, ), dtype=np.uint),
                                          instance_features=F)
        model.train(X_, Y)
        means, vars = model.predict_marginalized_over_instances(X)
        self.assertEqual(means.shape, (20, 1))
        self.assertEqual(vars.shape, (20, 1))
    def test_predict_marginalized_over_instances(self):
        rs = np.random.RandomState(1)
        X = rs.rand(20, 10)
        F = rs.rand(10, 5)
        Y = rs.rand(len(X) * len(F), 1)
        X_ = rs.rand(200, 15)

        model = RandomForestWithInstances(
            configspace=self._get_cs(10),
            types=np.zeros((15, ), dtype=np.uint),
            instance_features=F,
            bounds=list(map(lambda x: (0, 10), range(10))),
            seed=1,
        )
        model.train(X_, Y)
        means, vars = model.predict_marginalized_over_instances(X)
        self.assertEqual(means.shape, (20, 1))
        self.assertEqual(vars.shape, (20, 1))
Beispiel #5
0
    def _get_mean_var_time(self, validator, traj, use_epm, rh):
        """
        Parameters
        ----------
        validator: Validator
            validator (smac-based)
        traj: List[Configuraton]
            trajectory to set in validator
        use_epm: bool
            validated or not (no need to use epm if validated)
        rh: RunHistory
            ??

        Returns
        -------
        mean, var

        times: List[float]
            times to plot (x-values)
        configs

        """
        # TODO kinda important: docstrings, what is this function doing?
        if validator:
            validator.traj = traj  # set trajectory
        time, configs = [], []

        if use_epm and not self.block_epm:
            for entry in traj:
                time.append(entry["wallclock_time"])
                configs.append(entry["incumbent"])
                # self.logger.debug('Time: %d Runs: %d', time[-1], len(rh.get_runs_for_config(configs[-1])))

            self.logger.debug(
                "Using %d samples (%d distinct) from trajectory.", len(time),
                len(set(configs)))

            # Initialize EPM
            if validator.epm:  # not log as validator epm is trained on cost, not log cost
                epm = validator.epm
            else:
                self.logger.debug(
                    "No EPM passed! Training new one from runhistory.")
                # Train random forest and transform training data (from given rh)
                # Not using validator because we want to plot uncertainties
                rh2epm = RunHistory2EPM4Cost(num_params=len(
                    self.scenario.cs.get_hyperparameters()),
                                             scenario=self.scenario)
                X, y = rh2epm.transform(rh)
                self.logger.debug(
                    "Training model with data of shape X: %s, y: %s",
                    str(X.shape), str(y.shape))

                types, bounds = get_types(self.scenario.cs,
                                          self.scenario.feature_array)
                epm = RandomForestWithInstances(
                    self.scenario.cs,
                    types=types,
                    bounds=bounds,
                    seed=self.rng.randint(MAXINT),
                    instance_features=self.scenario.feature_array,
                    ratio_features=1.0)
                epm.train(X, y)
            config_array = convert_configurations_to_array(configs)
            mean, var = epm.predict_marginalized_over_instances(config_array)
            var = np.zeros(mean.shape)
            # We don't want to show the uncertainty of the model but uncertainty over multiple optimizer runs
            # This variance is computed in an outer loop.
        else:
            mean, var = [], []
            for entry in traj:
                #self.logger.debug(entry)
                time.append(entry["wallclock_time"])
                configs.append(entry["incumbent"])
                costs = _cost(configs[-1], rh,
                              rh.get_runs_for_config(configs[-1]))
                # self.logger.debug(len(costs), time[-1]
                if not costs:
                    time.pop()
                else:
                    mean.append(np.mean(costs))
                    var.append(0)  # No variance over instances
            mean, var = np.array(mean).reshape(-1, 1), np.array(var).reshape(
                -1, 1)
        return mean, var, time, configs
    def get_pred_surface(self, rh, X_scaled, conf_list: list,
                         contour_step_size):
        """fit epm on the scaled input dimension and
        return data to plot a contour plot of the empirical performance

        Parameters
        ----------
        rh: RunHistory
            runhistory
        X_scaled: np.array
            configurations in scaled 2dim
        conf_list: list
            list of Configuration objects

        Returns
        -------
        contour_data: (np.array, np.array, np.array)
            x, y, Z for contour plots
        """
        # use PCA to reduce features to also at most 2 dims
        scen = copy.deepcopy(self.scenario)  # pca changes feats
        if scen.feature_array.shape[1] > 2:
            self.logger.debug(
                "Use PCA to reduce features to from %d dim to 2 dim",
                scen.feature_array.shape[1])
            # perform PCA
            insts = scen.feature_dict.keys()
            feature_array = np.array([scen.feature_dict[i] for i in insts])
            feature_array = StandardScaler().fit_transform(feature_array)
            feature_array = PCA(n_components=2).fit_transform(feature_array)
            # inject in scenario-object
            scen.feature_array = feature_array
            scen.feature_dict = dict([(inst, feature_array[idx, :])
                                      for idx, inst in enumerate(insts)])
            scen.n_features = 2

        # convert the data to train EPM on 2-dim featurespace (for contour-data)
        self.logger.debug("Convert data for epm.")
        X, y, types = convert_data_for_epm(scenario=scen,
                                           runhistory=rh,
                                           logger=self.logger)
        types = np.array(np.zeros((2 + scen.feature_array.shape[1])),
                         dtype=np.uint)
        num_params = len(scen.cs.get_hyperparameters())

        # impute missing values in configs and insert MDS'ed (2dim) configs to the right positions
        conf_dict = {}
        for idx, c in enumerate(conf_list):
            conf_list[idx] = impute_inactive_values(c)
            conf_dict[str(conf_list[idx].get_array())] = X_scaled[idx, :]

        X_trans = []
        for x in X:
            x_scaled_conf = conf_dict[str(x[:num_params])]
            # append scaled config + pca'ed features (total of 4 values) per config/feature-sample
            X_trans.append(
                np.concatenate((x_scaled_conf, x[num_params:]), axis=0))
        X_trans = np.array(X_trans)

        self.logger.debug("Train random forest for contour-plot.")
        bounds = np.array([(0, np.nan), (0, np.nan)], dtype=object)
        model = RandomForestWithInstances(types=types,
                                          bounds=bounds,
                                          instance_features=np.array(
                                              scen.feature_array),
                                          ratio_features=1.0)

        start = time.time()
        model.train(X_trans, y)
        self.logger.debug("Fitting random forest took %f time",
                          time.time() - start)

        x_min, x_max = X_scaled[:, 0].min() - 1, X_scaled[:, 0].max() + 1
        y_min, y_max = X_scaled[:, 1].min() - 1, X_scaled[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, contour_step_size),
                             np.arange(y_min, y_max, contour_step_size))

        self.logger.debug("x_min: %f, x_max: %f, y_min: %f, y_max: %f", x_min,
                          x_max, y_min, y_max)
        self.logger.debug(
            "Predict on %d samples in grid to get surface (step-size: %f)",
            np.c_[xx.ravel(), yy.ravel()].shape[0], contour_step_size)

        start = time.time()
        Z, _ = model.predict_marginalized_over_instances(np.c_[xx.ravel(),
                                                               yy.ravel()])
        Z = Z.reshape(xx.shape)
        self.logger.debug("Predicting random forest took %f time",
                          time.time() - start)

        return xx, yy, Z
    def get_pred_surface(self, rh, X_scaled, conf_list: list,
                         contour_step_size):
        """fit epm on the scaled input dimension and
        return data to plot a contour plot of the empirical performance

        Parameters
        ----------
        rh: RunHistory
            runhistory
        X_scaled: np.array
            configurations in scaled 2dim
        conf_list: list
            list of Configuration objects
        contour_step_size: float
            step-size for contour

        Returns
        -------
        contour_data: (np.array, np.array, np.array)
            x, y, Z for contour plots
        """
        # use PCA to reduce features to also at most 2 dims
        scen = copy.deepcopy(self.scenario)  # pca changes feats
        if scen.feature_array.shape[1] > 2:
            self.logger.debug(
                "Use PCA to reduce features to from %d dim to 2 dim",
                scen.feature_array.shape[1])
            # perform PCA
            insts = scen.feature_dict.keys()
            feature_array = np.array([scen.feature_dict[i] for i in insts])
            feature_array = StandardScaler().fit_transform(feature_array)
            feature_array = PCA(n_components=2).fit_transform(feature_array)
            # inject in scenario-object
            scen.feature_array = feature_array
            scen.feature_dict = dict([(inst, feature_array[idx, :])
                                      for idx, inst in enumerate(insts)])
            scen.n_features = 2

        # convert the data to train EPM on 2-dim featurespace (for contour-data)
        self.logger.debug("Convert data for epm.")
        X, y, types = convert_data_for_epm(scenario=scen,
                                           runhistory=rh,
                                           impute_inactive_parameters=True,
                                           logger=self.logger)
        types = np.array(np.zeros((2 + scen.feature_array.shape[1])),
                         dtype=np.uint)
        num_params = len(scen.cs.get_hyperparameters())

        # impute missing values in configs and insert MDS'ed (2dim) configs to the right positions
        conf_dict = {}
        # Remove forbidden clauses (this is necessary to enable the impute_inactive_values-method, see #226)
        cs_no_forbidden = copy.deepcopy(conf_list[0].configuration_space)
        cs_no_forbidden.forbidden_clauses = []
        for idx, c in enumerate(conf_list):
            c.configuration_space = cs_no_forbidden
            conf_list[idx] = impute_inactive_values(c)
            conf_dict[str(conf_list[idx].get_array())] = X_scaled[idx, :]

        # Debug compare elements:
        c1, c2 = {str(z) for z in X}, {str(z) for z in conf_dict.keys()}
        self.logger.debug(
            "{} elements not in both sets, {} elements in both sets, X (len {}) and conf_dict (len {}) "
            "(might be a problem related to forbidden clauses?)".format(
                len(c1 ^ c2), len(c1 & c2), len(c1 ^ c2), len(c1), len(c2)))
        # self.logger.debug("Elements: {}".format(str(c1 ^ c2)))

        X_trans = [
        ]  # X_trans is the same as X but with reduced 2-dim features (so shape is (N, 2) instead of (N, M))
        for x in X:
            x_scaled_conf = conf_dict[str(x[:num_params])]
            # append scaled config + pca'ed features (total of 4 values) per config/feature-sample
            X_trans.append(
                np.concatenate((x_scaled_conf, x[num_params:]), axis=0))
        X_trans = np.array(X_trans)

        self.logger.debug(
            "Train random forest for contour-plot. Shape of X: {}, shape of X_trans: {}"
            .format(X.shape, X_trans.shape))
        self.logger.debug("Faking configspace to be able to train rf...")
        # We need to fake config-space bypass imputation of inactive values in random forest implementation
        fake_cs = ConfigurationSpace(name="fake-cs-for-configurator-footprint")

        bounds = np.array([(0, np.nan), (0, np.nan)], dtype=object)
        model = RandomForestWithInstances(fake_cs,
                                          types,
                                          bounds,
                                          seed=self.rng.randint(MAXINT),
                                          instance_features=np.array(
                                              scen.feature_array),
                                          ratio_features=1.0)

        start = time.time()
        model.train(X_trans, y)
        self.logger.debug("Fitting random forest took %f time",
                          time.time() - start)

        x_min, x_max = X_scaled[:, 0].min() - 1, X_scaled[:, 0].max() + 1
        y_min, y_max = X_scaled[:, 1].min() - 1, X_scaled[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, contour_step_size),
                             np.arange(y_min, y_max, contour_step_size))

        self.logger.debug("x_min: %f, x_max: %f, y_min: %f, y_max: %f", x_min,
                          x_max, y_min, y_max)
        self.logger.debug(
            "Predict on %d samples in grid to get surface (step-size: %f)",
            np.c_[xx.ravel(), yy.ravel()].shape[0], contour_step_size)

        start = time.time()
        Z, _ = model.predict_marginalized_over_instances(np.c_[xx.ravel(),
                                                               yy.ravel()])
        Z = Z.reshape(xx.shape)
        self.logger.debug("Predicting random forest took %f time",
                          time.time() - start)

        return xx, yy, Z
Beispiel #8
0
    def get_pred_surface(self, X_scaled, conf_list: list):
        '''
            fit epm on the scaled input dimension and
            return data to plot a contour plot

            Parameters
            ----------
            X_scaled: np.array
                configurations in scaled 2dim
            conf_list: list
                list of Configuration objects

            Returns
            -------
            np.array, np.array, np.array
                x,y,Z for contour plots

        '''

        # use PCA to reduce features to also at most 2 dims
        n_feats = self.scenario.feature_array.shape[1]
        if n_feats > 2:
            self.logger.debug("Use PCA to reduce features to 2dim")
            insts = self.scenario.feature_dict.keys()
            feature_array = np.array([self.scenario.feature_dict[inst] for inst in insts])
            ss = StandardScaler()
            self.scenario.feature_array = ss.fit_transform(feature_array)
            pca = PCA(n_components=2)
            feature_array = pca.fit_transform(feature_array)
            n_feats = feature_array.shape[1]
            self.scenario.feature_array = feature_array
            self.scenario.feature_dict = dict([(inst, feature_array[idx,:]) for idx, inst in enumerate(insts)])
            self.scenario.n_features = 2

        # Create new rh with only wanted configs
        new_rh = RunHistory(average_cost)
        for rh in self.runhistories:
            for key, value in rh.data.items():
                config = rh.ids_config[key.config_id]
                if config in self.configs_to_plot:
                    config_id, instance, seed = key
                    cost, time, status, additional_info = value
                    new_rh.add(config, cost, time, status, instance_id=instance,
                               seed=seed, additional_info=additional_info)
        self.relevant_rh = new_rh

        X, y, types = convert_data(scenario=self.scenario,
                                   runhistory=new_rh)

        types = np.array(np.zeros((2+n_feats)), dtype=np.uint)

        num_params = len(self.scenario.cs.get_hyperparameters())

        # impute missing values in configs
        conf_dict = {}
        for idx, c in enumerate(conf_list):
            conf_list[idx] = impute_inactive_values(c)
            conf_dict[str(conf_list[idx].get_array())] = X_scaled[idx, :]

        X_trans = []
        for x in X:
            x_scaled_conf = conf_dict[str(x[:num_params])]
            x_new = np.concatenate(
                        (x_scaled_conf, x[num_params:]), axis=0)
            X_trans.append(x_new)
        X_trans = np.array(X_trans)

        bounds = np.array([(0, np.nan), (0, np.nan)], dtype=object)
        model = RandomForestWithInstances(types=types, bounds=bounds,
                                          instance_features=np.array(self.scenario.feature_array),
                                          ratio_features=1.0)

        model.train(X_trans, y)

        self.logger.debug("RF fitted")

        plot_step = self.contour_step_size

        x_min, x_max = X_scaled[:, 0].min() - 1, X_scaled[:, 0].max() + 1
        y_min, y_max = X_scaled[:, 1].min() - 1, X_scaled[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                             np.arange(y_min, y_max, plot_step))

        self.logger.debug("x_min: %f, x_max: %f, y_min: %f, y_max: %f" %(x_min, x_max, y_min, y_max))

        self.logger.debug("Predict on %d samples in grid to get surface" %(np.c_[xx.ravel(), yy.ravel()].shape[0]))
        Z, _ = model.predict_marginalized_over_instances(
            np.c_[xx.ravel(), yy.ravel()])

        Z = Z.reshape(xx.shape)

        return xx, yy, Z
Beispiel #9
0
    def plot_cost_over_time(self,
                            rh,
                            traj,
                            output="performance_over_time.png",
                            validator=None):
        """ Plot performance over time, using all trajectory entries
            with max_time = wallclock_limit or (if inf) the highest
            recorded time

            Parameters
            ----------
            rh: RunHistory
                runhistory to use
            traj: List
                trajectory to take times/incumbents from
            output: str
                path to output-png
            epm: RandomForestWithInstances
                emperical performance model (expecting trained on all runs)
        """
        self.logger.debug("Estimating costs over time for best run.")
        validator.traj = traj  # set trajectory
        time, configs = [], []

        for entry in traj:
            time.append(entry["wallclock_time"])
            configs.append(entry["incumbent"])

        self.logger.debug("Using %d samples (%d distinct) from trajectory.",
                          len(time), len(set(configs)))

        if validator.epm:  # not log as validator epm is trained on cost, not log cost
            epm = validator.epm
        else:
            self.logger.debug(
                "No EPM passed! Training new one from runhistory.")
            # Train random forest and transform training data (from given rh)
            # Not using validator because we want to plot uncertainties
            rh2epm = RunHistory2EPM4Cost(num_params=len(
                self.scenario.cs.get_hyperparameters()),
                                         scenario=self.scenario)
            X, y = rh2epm.transform(rh)
            self.logger.debug("Training model with data of shape X: %s, y:%s",
                              str(X.shape), str(y.shape))

            types, bounds = get_types(self.scenario.cs,
                                      self.scenario.feature_array)
            epm = RandomForestWithInstances(
                types=types,
                bounds=bounds,
                instance_features=self.scenario.feature_array,
                #seed=self.rng.randint(MAXINT),
                ratio_features=1.0)
            epm.train(X, y)

        ## not necessary right now since the EPM only knows the features
        ## of the training instances
        # use only training instances
        #=======================================================================
        # if self.scenario.feature_dict:
        #     feat_array = []
        #     for inst in self.scenario.train_insts:
        #         feat_array.append(self.scenario.feature_dict[inst])
        #     backup_features_epm = epm.instance_features
        #     epm.instance_features = np.array(feat_array)
        #=======================================================================

        # predict performance for all configurations in trajectory
        config_array = convert_configurations_to_array(configs)
        mean, var = epm.predict_marginalized_over_instances(config_array)

        #=======================================================================
        # # restore feature array in epm
        # if self.scenario.feature_dict:
        #     epm.instance_features = backup_features_epm
        #=======================================================================

        mean = mean[:, 0]
        var = var[:, 0]
        uncertainty_upper = mean + np.sqrt(var)
        uncertainty_lower = mean - np.sqrt(var)
        if self.scenario.run_obj == 'runtime':  # We have to clip at 0 as we want to put y on the logscale
            uncertainty_lower[uncertainty_lower < 0] = 0
            uncertainty_upper[uncertainty_upper < 0] = 0

        # plot
        fig = plt.figure()
        ax = fig.add_subplot(111)

        ax.set_ylabel('performance')
        ax.set_xlabel('time [sec]')
        ax.plot(time, mean, 'r-', label="estimated performance")
        ax.fill_between(time,
                        uncertainty_upper,
                        uncertainty_lower,
                        alpha=0.8,
                        label="standard deviation")
        ax.set_xscale("log", nonposx='clip')
        if self.scenario.run_obj == 'runtime':
            ax.set_yscale('log')

        # ax.set_ylim(min(mean)*0.8, max(mean)*1.2)
        # start after 1% of the configuration budget
        ax.set_xlim(min(time) + (max(time) - min(time)) * 0.01, max(time))

        ax.legend()
        plt.tight_layout()
        fig.savefig(output)
        plt.close(fig)