Example #1
    def _train(self, X: np.ndarray, y: np.ndarray):
        """Trains the random forest on X and y.

        Parameters
        ----------
        X : np.ndarray [n_samples, n_features (config + instance features)]
            Input data points.
        y : np.ndarray [n_samples, ]
            The corresponding target values.

        Returns
        -------
        self
        """

        self.X = X
        self.y = y.flatten()

        if self.n_points_per_tree <= 0:
            self.rf_opts.num_data_points_per_tree = self.X.shape[0]
        else:
            self.rf_opts.num_data_points_per_tree = self.n_points_per_tree
        self.rf = regression.binary_rss_forest()
        self.rf.options = self.rf_opts
        data = self._init_data_container(self.X, self.y)
        self.rf.fit(data, rng=self.rng)
        return self
Example #2
    def _train(self, X: np.ndarray, y: np.ndarray) -> 'RandomForestWithInstancesHPO':
        """Trains the random forest on X and y.

        Parameters
        ----------
        X : np.ndarray [n_samples, n_features (config + instance features)]
            Input data points.
        y : np.ndarray [n_samples, ]
            The corresponding target values.

        Returns
        -------
        self
        """

        X = self._impute_inactive(X)
        self.X = X
        self.y = y.flatten()

        cfg = self._get_configuration_space()

        # Draw n_iters random configurations (default configuration first) and
        # keep the best according to n_splits-fold CV
        best_error = None
        best_config = None
        if X.shape[0] > 3:
            for i in range(self.n_iters):
                if i == 0:
                    configuration = cfg.get_default_configuration()
                else:
                    configuration = cfg.sample_configuration()
                n_splits = min(X.shape[0], self.n_splits)
                kf = KFold(n_splits=n_splits)
                error = 0.0
                for train_index, test_index in kf.split(X):
                    error += self._eval_rf(
                        c=configuration,
                        X=X[train_index, :],
                        y=y[train_index],
                        X_test=X[test_index, :],
                        y_test=y[test_index],
                    )
                self.logger.debug(error)
                if best_error is None or error < best_error:
                    best_config = configuration
                    best_error = error
        else:
            best_config = cfg.get_default_configuration()

        self.rf_opts = self._set_conf(
            c=best_config, n_features=self.X.shape[1], num_data_points=X.shape[0],
        )
        self._set_hypers(best_config)

        self.logger.debug("Use %s" % str(self.rf_opts))
        self.rf = regression.binary_rss_forest()
        self.rf.options = self.rf_opts
        data = self._init_data_container(self.X, self.y)
        self.rf.fit(data, rng=self.rng)

        return self
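The loop above is plain random search scored by k-fold cross-validation: the default configuration is tried first, then random samples, and the candidate with the lowest summed CV error wins. A minimal standalone sketch of that pattern, assuming hypothetical sample_config and evaluate callables in place of cfg.sample_configuration and _eval_rf:

import numpy as np
from sklearn.model_selection import KFold

def random_search_cv(sample_config, evaluate, X, y, default=None,
                     n_iters=50, n_splits=10):
    # Random search: score each candidate by its summed CV loss, keep the best.
    best_error, best_config = None, None
    for i in range(n_iters):
        config = default if (i == 0 and default is not None) else sample_config()
        # Cap the number of folds at the number of samples (must be >= 2).
        kf = KFold(n_splits=min(X.shape[0], n_splits))
        error = sum(evaluate(config, X[tr], y[tr], X[te], y[te])
                    for tr, te in kf.split(X))
        if best_error is None or error < best_error:
            best_error, best_config = error, config
    return best_config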
Example #3
    def __init__(self,
                 num_trees=30,
                 do_bootstrapping=True,
                 n_points_per_tree=0,
                 rng=None):
        """
        Interface for the random_forest_run library to model the
        objective function with a random forest.

        Parameters
        ----------
        num_trees: int
            The number of trees in the random forest.
        do_bootstrapping: bool
            Turns on / off bootstrapping in the random forest.
        n_points_per_tree: int
            Number of data points per tree. If set to 0, all data points are used in each tree.
        rng: np.random.RandomState
            Random number generator
        """

        if rng is None:
            self.rng = np.random.RandomState()
        else:
            self.rng = rng

        self.reg_rng = reg.default_random_engine(self.rng.randint(1000))

        self.n_points_per_tree = n_points_per_tree

        self.rf = reg.binary_rss_forest()
        self.rf.options.num_trees = num_trees

        self.rf.options.do_bootstrapping = do_bootstrapping
        self.rf.options.num_data_points_per_tree = n_points_per_tree
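For context, fitting the forest configured by this constructor requires wrapping the training data in a pyrfr data container first. A minimal end-to-end sketch with made-up data; the container, options, fit, and predict_mean_var calls mirror the other examples on this page:

import numpy as np
import pyrfr.regression as reg

X = np.random.rand(20, 2)
y = np.random.rand(20)

# Wrap the training data in a pyrfr data container.
data = reg.default_data_container(X.shape[1])
for row_X, row_y in zip(X, y):
    data.add_data_point(row_X, row_y)

rf = reg.binary_rss_forest()
rf.options.num_trees = 30
rf.options.do_bootstrapping = True
rf.options.num_data_points_per_tree = X.shape[0]  # explicitly use all points per tree

rf.fit(data, reg.default_random_engine(1))
mean, var = rf.predict_mean_var(X[0])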
Example #4
    def __init__(self, num_trees=30,
                 do_bootstrapping=True,
                 n_points_per_tree=0,
                 rng=None):
        """
        Interface for the random_forest_run library to model the
        objective function with a random forest.

        Parameters
        ----------
        num_trees: int
            The number of trees in the random forest.
        do_bootstrapping: bool
            Turns on / off bootstrapping in the random forest.
        n_points_per_tree: int
            Number of data points per tree. If set to 0, all data points are used in each tree.
        rng: np.random.RandomState
            Random number generator
        """

        if rng is None:
            self.rng = np.random.RandomState()
        else:
            self.rng = rng

        self.reg_rng = reg.default_random_engine(self.rng.randint(1000))

        self.n_points_per_tree = n_points_per_tree

        self.rf = reg.binary_rss_forest()
        self.rf.options.num_trees = num_trees

        self.rf.options.do_bootstrapping = do_bootstrapping
        self.rf.options.num_data_points_per_tree = n_points_per_tree
Example #5
    def test_prediction(self):
        the_forest = reg.binary_rss_forest()

        the_forest.options.num_trees = 64
        the_forest.options.do_bootstrapping = True
        the_forest.options.num_data_points_per_tree = 200

        self.assertEqual(the_forest.options.num_trees, 64)
        self.assertTrue(the_forest.options.do_bootstrapping)
        self.assertEqual(the_forest.options.num_data_points_per_tree, 200)

        the_forest.fit(self.data, self.rng)

        the_forest.predict(self.data.retrieve_data_point(0))
Example #6
    def __init__(self,
                 X_init: np.ndarray,
                 Y_init: np.ndarray,
                 num_trees: int = 30,
                 do_bootstrapping: bool = True,
                 n_points_per_tree: int = 0,
                 seed: int = None) -> None:
        """
        Interface to random forests for Bayesian optimization based on the pyrfr package, which,
        thanks to its random splitting, gives better uncertainty estimates than the sklearn random forest.

        Dependencies:
            AutoML rfr (https://github.com/automl/random_forest_run)

        :param X_init: Initial input data points to train the model
        :param Y_init: Initial target values
        :param num_trees: Specifies the number of trees to build the random forest
        :param do_bootstrapping: Defines whether bootstrapping is used for the individual trees
        :param n_points_per_tree: Specifies the number of points for each individual tree (0 means no restriction)
        :param seed: Used to seed the random number generator for the random forest (None means random seed)
        """
        super().__init__()

        # Set random number generator for the random forest
        if seed is None:
            seed = np.random.randint(10000)
        self.reg_rng = reg.default_random_engine(seed)

        self.n_points_per_tree = n_points_per_tree

        self.rf = reg.binary_rss_forest()
        self.rf.options.num_trees = num_trees

        self.rf.options.do_bootstrapping = do_bootstrapping

        self.rf.options.num_data_points_per_tree = n_points_per_tree

        self._X = X_init
        self._Y = Y_init

        if self.n_points_per_tree == 0:
            self.rf.options.num_data_points_per_tree = X_init.shape[0]

        data = reg.default_data_container(self._X.shape[1])

        for row_X, row_y in zip(X_init, Y_init):
            data.add_data_point(row_X, row_y)

        self.rf.fit(data, self.reg_rng)
Example #7
	def setUp(self):
		data_set_prefix = '${CMAKE_SOURCE_DIR}/test_data_sets/'
		self.data = reg.default_data_container(64)
		self.data.import_csv_files(data_set_prefix+'features13.csv', data_set_prefix+'responses13.csv')
		

		self.forest = reg.binary_rss_forest()
		self.forest.options.num_trees = 64
		self.forest.options.do_bootstrapping = True
		self.forest.options.num_data_points_per_tree = 200

		self.assertEqual(self.forest.options.num_trees, 64)
		self.assertTrue (self.forest.options.do_bootstrapping)
		self.assertEqual(self.forest.options.num_data_points_per_tree, 200)

		self.rng = reg.default_random_engine(1)
Example #8
    def test_first_nearest_neightbor(self):
        # If no bootstrapping is done, each tree gets all the data points.
        # If additionally all features are used for every split and all data
        # points are unique, a single tree will perfectly recall the data.
        the_forest = reg.binary_rss_forest()
        the_forest.options.num_trees = 1
        the_forest.options.do_bootstrapping = False
        the_forest.options.num_data_points_per_tree = self.data.num_data_points()
        the_forest.options.tree_opts.max_features = self.data.num_features()

        the_forest.fit(self.data, self.rng)

        self.assertEqual(the_forest.num_trees(), 1)
        for i in range(self.data.num_data_points()):
            self.assertEqual(
                the_forest.predict(self.data.retrieve_data_point(i)),
                self.data.response(i))
Example #9
    def _eval_rf(
        self,
        c: Configuration,
        X: np.ndarray,
        y: np.ndarray,
        X_test: np.ndarray,
        y_test: np.ndarray,
    ) -> float:
        """Evaluate random forest configuration on train/test data.

        Parameters
        ----------
        c : Configuration
            Random forest configuration to evaluate on the train/test data
        X : np.ndarray [n_samples, n_features (config + instance features)]
            Training features
        y : np.ndarray [n_samples, ]
            Training targets
        X_test : np.ndarray [n_samples, n_features (config + instance features)]
            Validation features
        y_test : np.ndarray [n_samples, ]
            Validation targets

        Returns
        -------
        float
        """
        opts = self._set_conf(c,
                              n_features=X.shape[1],
                              num_data_points=X.shape[0])
        rng = regression.default_random_engine(1)
        rf = regression.binary_rss_forest()
        rf.options = opts
        data = self._init_data_container(X, y)
        rf.fit(data, rng=rng)

        loss = 0
        for row, lab in zip(X_test, y_test):
            m, v = rf.predict_mean_var(row)
            std = max(1e-8, np.sqrt(v))
            nllh = -scst.norm(loc=m, scale=std).logpdf(lab)
            loss += nllh

        return loss
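The loss computed above is the summed Gaussian negative log-likelihood of the held-out targets under each point's predictive mean and variance, i.e. per point nllh = 0.5*log(2*pi*v) + (y - m)**2 / (2*v). A quick sanity check of the scipy call against that closed form, with toy numbers:

import numpy as np
import scipy.stats as scst

m, v, lab = 1.0, 0.25, 1.5          # predictive mean/variance and true target
std = max(1e-8, np.sqrt(v))
nllh_scipy = -scst.norm(loc=m, scale=std).logpdf(lab)
nllh_closed = 0.5 * np.log(2 * np.pi * v) + (lab - m) ** 2 / (2 * v)
assert np.isclose(nllh_scipy, nllh_closed)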
Example #10
    def __init__(self,
                 num_trees=30,
                 do_bootstrapping=True,
                 n_points_per_tree=0,
                 compute_oob_error=False,
                 return_total_variance=True,
                 rng=None):
        """
        Interface for the random_forest_run library to model the
        objective function with a random forest.

        Parameters
        ----------
        num_trees: int
            The number of trees in the random forest.
        do_bootstrapping: bool
            Turns on / off bootstrapping in the random forest.
        n_points_per_tree: int
            Number of data points per tree. If set to 0, all data points are used in each tree.
        compute_oob_error: bool
            Turns on / off calculation of out-of-bag error. Default: False
        return_total_variance: bool
            If True, return the law-of-total-variance estimate (mean of variances
            + variance of means); if False, return only the explained variance
            (variance of means). Default: True
        rng: np.random.RandomState
            Random number generator
        """

        if rng is None:
            self.rng = np.random.RandomState()
        else:
            self.rng = rng

        self.reg_rng = reg.default_random_engine(self.rng.randint(1000))

        self.n_points_per_tree = n_points_per_tree

        self.rf = reg.binary_rss_forest()
        self.rf.options.num_trees = num_trees
        self.rf.options.do_bootstrapping = do_bootstrapping
        self.rf.options.num_data_points_per_tree = n_points_per_tree
        self.rf.options.compute_oob_error = compute_oob_error
        self.rf.options.compute_law_of_total_variance = return_total_variance
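The return_total_variance flag refers to the law of total variance across the trees' predictive distributions: Var[y] = E[Var[y | tree]] + Var[E[y | tree]]. A toy illustration of the two estimates, with hypothetical per-tree means and variances:

import numpy as np

tree_means = np.array([1.0, 1.2, 0.8, 1.1])     # per-tree predictive means
tree_vars = np.array([0.04, 0.05, 0.03, 0.06])  # per-tree predictive variances

explained_variance = tree_means.var()                  # variance of means only
total_variance = tree_vars.mean() + tree_means.var()   # law of total variance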
Example #11
    def setUp(self):
        self.X = [[0., 0., 0.], [0., 0., 0.], [0., 0., 0.], [0., 0., 1.],
                  [0., 0., 1.], [0., 0., 1.], [0., 1., 0.], [0., 1., 0.],
                  [0., 1., 0.], [0., 1., 1.], [0., 1., 1.], [0., 1., 1.],
                  [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 1.],
                  [1., 0., 1.], [1., 0., 1.], [1., 1., 0.], [1., 1., 0.],
                  [1., 1., 0.], [1., 1., 1.], [1., 1., 1.], [1., 1., 1.]]
        self.y = [[50], [50], [50], [.2], [.2], [.2], [9], [9], [9], [9.2],
                  [9.2], [9.2], [500], [500], [500], [10.2], [10.2], [10.2],
                  [109.], [109.], [109.], [100], [100], [100]]
        self.y_dual = list(map(lambda x: [math.log10(x[0]), x[0]], self.y))
        bounds = [(0, float('nan')), (0, float('nan')), (0, float('nan'))]

        def init_data(X, y, bounds):
            data = reg.default_data_container(len(X[0]))

            for i, (mn, mx) in enumerate(bounds):
                if math.isnan(mx):
                    data.set_type_of_feature(i, mn)
                else:
                    data.set_bounds_of_feature(i, mn, mx)

            for row_X, row_y in zip(X, y):
                data.add_data_point(row_X, row_y)
            return data

        self.data = init_data(self.X, self.y, bounds)
        self.data_dual = init_data(self.X, self.y_dual, bounds)

        self.forest = reg.binary_rss_forest()
        self.forest.options.num_trees = 64
        self.forest.options.do_bootstrapping = True
        self.forest.options.num_data_points_per_tree = 200
        self.forest.options.compute_law_of_total_variance = True

        self.assertEqual(self.forest.options.num_trees, 64)
        self.assertTrue(self.forest.options.do_bootstrapping)
        self.assertEqual(self.forest.options.num_data_points_per_tree, 200)
        self.assertTrue(self.forest.options.compute_law_of_total_variance)

        self.rng = reg.default_random_engine(1)
Example #12
    def test_pickling(self):

        the_forest = reg.binary_rss_forest()
        the_forest.options.num_trees = 16
        the_forest.options.do_bootstrapping = True
        the_forest.options.num_data_points_per_tree = self.data.num_data_points()

        self.assertEqual(the_forest.options.num_trees, 16)

        the_forest.fit(self.data, self.rng)

        with tempfile.NamedTemporaryFile(mode='w+b', delete=False) as f:
            fname = f.name
            pickle.dump(the_forest, f)

        with open(fname, 'r+b') as fh:
            a_second_forest = pickle.load(fh)
        os.remove(fname)

        for i in range(self.data.num_data_points()):
            d = self.data.retrieve_data_point(i)
            self.assertEqual(the_forest.predict(d), a_second_forest.predict(d))
Example #13
if __name__ == "__main__":
    cs = ConfigurationSpace()
    learning_rate = UniformFloatHyperparameter("learning_rate", 1e-4, 5e-3, default_value=3e-4)
    cs.add_hyperparameter(learning_rate)

    n_layer1 = UniformIntegerHyperparameter("n_layer1", 5, 50, default_value=32)
    cs.add_hyperparameter(n_layer1)

    n_layer2 = UniformIntegerHyperparameter("n_layer2", 30, 80, default_value=64)
    cs.add_hyperparameter(n_layer2)

    batch_size = UniformIntegerHyperparameter("batch_size", 10, 500, default_value=200)
    cs.add_hyperparameter(batch_size)

    types, bounds = get_types(cs)
    reg = regression.binary_rss_forest()
    rf_opts = regression.forest_opts()
    rf_opts.num_trees = 10
    rf_opts.do_bootstrapping = True

    model = RandomForestWithInstances(types=types, bounds=bounds)
    x = np.array([[0.78105907, 0.33860037, 0.72826097, 0.02941158],
                  [0.81160897, 0.63147998, 0.72826097, 0.04901943],
                  [0.27800406, 0.36616871, 0.16304333, 0.24509794],
                  [0.41242362, 0.37351241, 0.11956505, 0.4607843],
                  [0.70162934, 0.15819312, 0.51086957, 0.10784298],
                  [0.53869654, 0.86662495, 0.27173903, 0.22549009],
                  [0.53665988, 0.68576624, 0.81521753, 0.06862728],
                  [0.72199594, 0.18900731, 0.75000011, 0.36274504]], dtype=np.float64)
    y = np.array([0.544481, 2.34456, 0.654629, 0.576376, 0.603501, 0.506214, 0.416664, 0.483639])
    print(x.dtype)
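The example stops before the model is actually used. A hedged continuation under the usual SMAC EPM interface; train and predict are assumed here as the public counterparts of the _train method shown in Examples #1 and #2:

    # Assumed public API: train/predict wrapping the internal _train shown above.
    model.train(x, y)
    mean, var = model.predict(x)
    print(mean.shape, var.shape)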
Example #14
num_points = 8

features = np.array([np.linspace(-1, 1, num_points)]).transpose()
x2 = np.array([np.linspace(-1, 1, 100)]).transpose()
responses = np.exp(-np.power(features / 0.3, 2)).flatten() + 0.05 * np.random.randn(features.shape[0])

data = reg.default_data_container(1)

for f, r in zip(features, responses):
    data.add_data_point(f, r)

rng = reg.default_random_engine()

# create an instance of a regression forest using binary splits and the RSS loss
the_forest = reg.binary_rss_forest()
the_forest.options.num_trees = 64
the_forest.options.num_data_points_per_tree = num_points
the_forest.options.tree_opts.min_samples_in_leaf = 1

the_forest.fit(data, rng)

fig, (ax1, ax2, ax3) = plt.subplots(3, sharex=True)

predictions = np.array([the_forest.predict_mean_var(x) for x in x2])
ax1.fill_between(x2[:, 0],
                 predictions[:, 0] - predictions[:, 1],
                 predictions[:, 0] + predictions[:, 1],
                 alpha=0.3)
ax1.plot(x2, predictions[:, 0])
ax1.scatter(features, responses)
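As excerpted, the script creates three axes but only draws into ax1, and note that the shaded band is mean ± variance rather than mean ± standard deviation (predict_mean_var returns the variance). At minimum, rendering the figure would normally follow:

# Render the figure (assuming an interactive backend); plt.savefig(...) is
# the non-interactive alternative.
plt.show()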