Example #1
0
 def test_projection(self, ):
     # Test the projection functionality of forests: fitting on two identical
     # copies of the outcome and projecting with weights (1/2, 1/2) must give
     # exactly the same mean and variance as the plain single-outcome fit.
     config = self._get_base_config()
     config['honest'] = True
     config['max_depth'] = 0
     config['inference'] = True
     config['n_estimators'] = 100
     config['subforest_size'] = 2
     config['max_samples'] = .5
     config['n_jobs'] = 1
     n_features = 2
     n = 100
     random_state = 123
     X, y, _ = self._get_regression_data(n, n_features, random_state)
     # Scope the print options to this test: np.set_printoptions would change
     # numpy's global formatting for every test that runs afterwards, while the
     # context manager restores the previous settings on exit. The options only
     # influence how arrays are rendered in assertion failure messages.
     with np.printoptions(precision=10, suppress=True):
         # Baseline: prediction and variance for the original outcome.
         forest = RegressionForest(**config).fit(X, y)
         mean, var = forest.predict_and_var(X)
         mean = mean.flatten()
         var = var.flatten()
         # Duplicate the outcome side by side and refit with the same config.
         # NOTE(review): assumes y has shape (n, 1) so that the stacked outcome
         # has the two columns the (n, 2) projector expects -- confirm.
         y = np.hstack([y, y])
         forest = RegressionForest(**config).fit(X, y)
         # Equal weights of 1/2 on each of the two (identical) outcomes.
         projector = np.ones((X.shape[0], 2)) / 2.0
         mean_proj, var_proj = forest.predict_projection_and_var(X, projector)
         np.testing.assert_array_equal(mean_proj, mean)
         np.testing.assert_array_equal(var_proj, var)
         # The stand-alone projection methods must agree with the joint call.
         np.testing.assert_array_equal(
             var_proj, forest.predict_projection_var(X, projector))
         np.testing.assert_array_equal(mean_proj,
                                       forest.predict_projection(X, projector))
Example #2
0
    def test_pickling(self,):
        # Test that a warm-started forest survives a joblib dump/load round
        # trip: the reloaded forest must report the same number of estimators
        # and produce the same predictions.
        import os
        import tempfile

        n_features = 2
        n = 10
        random_state = 123
        X, y, _ = self._get_regression_data(n, n_features, random_state)

        # Grow the forest in two steps (4 trees, then 8) via warm_start.
        forest = RegressionForest(n_estimators=4, warm_start=True, random_state=123).fit(X, y)
        forest.n_estimators = 8
        forest.fit(X, y)
        pred1 = forest.predict(X)

        # Serialize into a throw-away directory instead of the current working
        # directory, so the test leaves no 'forest.jbl' artifact behind and
        # cannot collide with another run writing the same file name.
        with tempfile.TemporaryDirectory() as tmp_dir:
            dump_path = os.path.join(tmp_dir, 'forest.jbl')
            joblib.dump(forest, dump_path)
            loaded_forest = joblib.load(dump_path)
            np.testing.assert_equal(loaded_forest.n_estimators, forest.n_estimators)
            np.testing.assert_allclose(loaded_forest.predict(X), pred1)
def monte_carlo():
    """Run a Monte Carlo study of RegressionForest intervals and save plots.

    For 100 repetitions, draws n=5000 standard-normal samples with d=5
    features and outcome ``y = X[:, 0] + noise``, fits a 1000-tree
    ``RegressionForest`` and evaluates point predictions and 95% intervals
    on a fixed test set whose first feature sweeps a grid over [-1, 1].
    Because the outcome is the first feature plus noise, the grid value
    itself is the target the intervals should cover.

    Three figures are written to ``figures/honestforest``: ``coverage.png``
    (fraction of repetitions whose interval contains the target),
    ``rmse.png`` (root mean squared error of the point prediction) and
    ``length.png`` (mean interval length), all as functions of the grid.
    """
    n = 5000
    d = 5
    n_test = 1000
    n_reps = 100
    x_grid = np.linspace(-1, 1, n_test)
    # Test points: the first feature walks the grid, the rest are random.
    X_test = np.hstack(
        [x_grid.reshape(-1, 1),
         np.random.normal(size=(n_test, d - 1))])
    coverage = []
    exp_dict = {'point': [], 'low': [], 'up': []}
    for it in range(n_reps):
        print(it)
        X = np.random.normal(0, 1, size=(n, d))
        y = X[:, 0] + np.random.normal(size=(n, ))
        est = RegressionForest(n_estimators=1000, verbose=1)
        est.fit(X, y)
        # Flatten every output to 1-D before comparing with the 1-D grid.
        # If predictions carry a trailing output axis, i.e. shape (n_test, 1),
        # then `low <= x_grid` would broadcast to an (n_test, n_test) matrix
        # and silently compare every test point against every grid value.
        # np.ravel is a no-op when the outputs are already 1-D.
        point = np.ravel(est.predict(X_test))
        low, up = est.predict_interval(X_test, alpha=0.05)
        low = np.ravel(low)
        up = np.ravel(up)
        coverage.append((low <= x_grid) & (x_grid <= up))
        exp_dict['point'].append(point)
        exp_dict['low'].append(low)
        exp_dict['up'].append(up)

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    # and creates the intermediate 'figures' directory in the same call.
    out_dir = os.path.join('figures', 'honestforest')
    os.makedirs(out_dir, exist_ok=True)

    plt.figure()
    plt.plot(x_grid, np.mean(coverage, axis=0))
    plt.savefig(os.path.join(out_dir, 'coverage.png'))
    plt.close()  # release the figure instead of leaving it open in pyplot

    plt.figure()
    plt.plot(x_grid,
             np.sqrt(np.mean((np.array(exp_dict['point']) - x_grid)**2,
                             axis=0)),
             label='RMSE')
    plt.legend()  # the label is only rendered when a legend is drawn
    plt.savefig(os.path.join(out_dir, 'rmse.png'))
    plt.close()

    plt.figure()
    plt.plot(x_grid,
             np.mean(np.array(exp_dict['up']) - np.array(exp_dict['low']),
                     axis=0),
             label='length')
    plt.legend()
    plt.savefig(os.path.join(out_dir, 'length.png'))
    plt.close()
Example #4
0
    def test_warm_start(self, ):
        # A forest grown incrementally with warm_start (4 trees, then 8) must
        # match a forest that is fit with 8 trees in one go: same predictions,
        # same subsample indices and same per-tree random states.
        X, y, _ = self._get_regression_data(100, 2, 123)

        def snapshot(fitted):
            # Collect everything that is compared between the two forests.
            return (fitted.predict(X),
                    fitted.get_subsample_inds(),
                    [member.random_state for member in fitted])

        for use_inference in (True, False):
            shared = dict(inference=use_inference,
                          warm_start=True,
                          random_state=123)

            # Incremental route: fit 4 trees, raise the target to 8, refit.
            grown = RegressionForest(n_estimators=4, **shared).fit(X, y)
            grown.n_estimators = 8
            grown.fit(X, y)
            grown_pred, grown_inds, grown_states = snapshot(grown)

            # Direct route: all 8 trees in a single fit.
            direct = RegressionForest(n_estimators=8, **shared).fit(X, y)
            direct_pred, direct_inds, direct_states = snapshot(direct)

            np.testing.assert_allclose(grown_pred, direct_pred)
            np.testing.assert_allclose(grown_inds, direct_inds)
            np.testing.assert_allclose(grown_states, direct_states)
Example #5
0
    def test_raise_exceptions(self, ):
        # Test that we raise ValueError in mishandled situations: inconsistent
        # data, invalid forest settings and invalid tree hyperparameters.
        n_features = 2
        n = 10
        random_state = 123
        X, y, _ = self._get_regression_data(n, n_features, random_state)
        # y truncated to 4 rows while X keeps all n rows
        with np.testing.assert_raises(ValueError):
            RegressionForest(n_estimators=20).fit(X, y[:4])
        # n_estimators=20 combined with subforest_size=3
        # (presumably rejected because 20 is not a multiple of 3 -- confirm)
        with np.testing.assert_raises(ValueError):
            RegressionForest(n_estimators=20,
                             subforest_size=3).fit(X, y)
        # inference=True combined with max_samples=.6
        with np.testing.assert_raises(ValueError):
            RegressionForest(n_estimators=20,
                             inference=True,
                             max_samples=.6).fit(X, y)
        # integer max_samples=20 while only n=10 samples are available
        with np.testing.assert_raises(ValueError):
            RegressionForest(n_estimators=20,
                             max_samples=20).fit(X, y)
        # fractional max_samples above 1
        with np.testing.assert_raises(ValueError):
            RegressionForest(n_estimators=20,
                             max_samples=1.2).fit(X, y)
        # Flipping `inference` between warm-started fits. The initial fit is
        # deliberately done outside the assert_raises block: only the re-fit
        # is expected to raise, and a ValueError from the set-up fit must fail
        # the test rather than make it pass for the wrong reason.
        forest = RegressionForest(n_estimators=4,
                                  warm_start=True,
                                  inference=True).fit(X, y)
        forest.inference = False
        forest.n_estimators = 8
        with np.testing.assert_raises(ValueError):
            forest.fit(X, y)
        # Invalid CausalForest hyperparameters, one per block so that a
        # failure points at the offending setting.
        with np.testing.assert_raises(ValueError):
            CausalForest(n_estimators=4,
                         criterion='peculiar').fit(X, y, y)
        with np.testing.assert_raises(ValueError):
            CausalForest(n_estimators=4, max_depth=-1).fit(X, y, y)
        with np.testing.assert_raises(ValueError):
            CausalForest(n_estimators=4,
                         min_samples_split=-1).fit(X, y, y)
        with np.testing.assert_raises(ValueError):
            CausalForest(n_estimators=4,
                         min_samples_leaf=-1).fit(X, y, y)
        with np.testing.assert_raises(ValueError):
            CausalForest(n_estimators=4,
                         min_weight_fraction_leaf=-1.0).fit(X, y, y)
        with np.testing.assert_raises(ValueError):
            CausalForest(n_estimators=4,
                         min_var_fraction_leaf=-1.0).fit(X, y, y)
        # max_features=10 while the data has only n_features=2 columns
        with np.testing.assert_raises(ValueError):
            CausalForest(n_estimators=4, max_features=10).fit(X, y, y)
        with np.testing.assert_raises(ValueError):
            CausalForest(n_estimators=4,
                         min_balancedness_tol=.55).fit(X, y, y)
Example #6
0
 def test_non_standard_input(self, ):
     # The estimator must accept tuples, lists and pandas objects and give the
     # same predictions as when it is fit on the original arrays.
     X, y, _ = self._get_regression_data(100, 2, 123)

     def fit_forest(features, targets):
         # Identical settings and seed every time, so only the input container
         # type differs between fits.
         return RegressionForest(n_estimators=20, n_jobs=1,
                                 random_state=123).fit(features, targets)

     reference = fit_forest(X, y).predict(X)

     # (features to fit on, targets to fit on, features to predict on)
     variants = [
         (tuple(X), tuple(y), tuple(X)),
         (list(X), list(y), list(X)),
         (pd.DataFrame(X), pd.DataFrame(y), pd.DataFrame(X)),
         (pd.DataFrame(X), pd.Series(y.ravel()), pd.DataFrame(X)),
     ]
     for features, targets, query in variants:
         refit = fit_forest(features, targets)
         np.testing.assert_allclose(reference, refit.predict(query))
Example #7
0
    def test_var(self, ):
        # Test that the estimator calculates the variance correctly: first the
        # consistency of the variance/interval API, then the accuracy of the
        # estimates against empirical moments of the data.
        config = self._get_base_config()
        for key, value in (('honest', True),
                           ('max_depth', 0),
                           ('inference', True),
                           ('n_estimators', 1000),
                           ('subforest_size', 2),
                           ('max_samples', .5),
                           ('n_jobs', 1)):
            config[key] = value
        n_features = 2
        seed = 123

        def mean_and_scaled_var(values):
            # Empirical mean and the variance divided by the number of rows.
            return np.mean(values), np.var(values) / values.shape[0]

        # --- API consistency ---
        X, y, _ = self._get_regression_data(100, n_features, seed)
        forest = RegressionForest(**config).fit(X, y)
        alpha = .1
        mean, var = forest.predict_and_var(X)
        center = mean[:, 0]
        stderr = np.sqrt(var[:, 0, 0])
        # Normal-quantile bounds built by hand from the mean and variance.
        lb = scipy.stats.norm.ppf(alpha / 2, loc=center,
                                  scale=stderr).reshape(-1, 1)
        ub = scipy.stats.norm.ppf(1 - alpha / 2, loc=center,
                                  scale=stderr).reshape(-1, 1)

        np.testing.assert_allclose(var, forest.predict_var(X))
        interval_lb, interval_ub = forest.predict_interval(X, alpha=alpha)
        np.testing.assert_allclose(lb, interval_lb)
        np.testing.assert_allclose(ub, interval_ub)
        joint_mean, joint_lb, joint_ub = forest.predict(X,
                                                        interval=True,
                                                        alpha=alpha)
        np.testing.assert_allclose(mean, joint_mean)
        np.testing.assert_allclose(lb, joint_lb)
        np.testing.assert_allclose(ub, joint_ub)
        np.testing.assert_allclose(stderr, forest.prediction_stderr(X)[:, 0])

        # --- accuracy with max_depth=0: compare with the moments of all of y ---
        for n_samples in (10, 100, 1000, 10000):
            X, y, _ = self._get_regression_data(n_samples, n_features, seed)
            forest = RegressionForest(**config).fit(X, y)
            est_mean, est_var = forest.predict_and_var(X[:1])
            ref_mean, ref_var = mean_and_scaled_var(y)
            np.testing.assert_allclose(est_mean, ref_mean, atol=0.05)
            np.testing.assert_allclose(est_var, ref_var, atol=0.05, rtol=.1)

        # --- accuracy with max_depth=1 on step data: compare each side of the
        # first feature with the moments of the matching part of y ---
        config['max_depth'] = 1
        for n_samples, query_thr, ref_thr in ((1000, .5, .25),
                                              (10000, .05, .05)):
            X, y, _ = self._get_step_regression_data(n_samples, n_features,
                                                     seed)
            forest = RegressionForest(**config).fit(X, y)
            first_feature = X[:, 0]
            upper_queries = X[first_feature > query_thr]
            lower_queries = X[first_feature < -query_thr]
            est_up_mean, est_up_var = forest.predict_and_var(upper_queries)
            est_low_mean, est_low_var = forest.predict_and_var(lower_queries)
            ref_up_mean, ref_up_var = mean_and_scaled_var(
                y[first_feature > ref_thr])
            ref_low_mean, ref_low_var = mean_and_scaled_var(
                y[first_feature < -ref_thr])
            np.testing.assert_allclose(est_up_mean, ref_up_mean, atol=0.07)
            np.testing.assert_allclose(est_up_var, ref_up_var,
                                       atol=0.0, rtol=.25)
            np.testing.assert_allclose(est_low_mean, ref_low_mean, atol=0.07)
            np.testing.assert_allclose(est_low_var, ref_low_var,
                                       atol=0.0, rtol=.25)
Example #8
0
    def test_regression_tree_internals(self):
        # Inspect the first tree of forests fit on a tiny data set: its node
        # arrays, predictions, importances, and how each stopping parameter
        # changes the structure that is grown.
        base_config = self._get_base_config()
        n, n_features = 10, 2
        random_state = 123
        X, y, _ = self._get_regression_data(n, n_features, random_state)

        # Expected (feature, threshold) node arrays of the three structures
        # that occur below (-2 presumably marks leaf nodes, as in scikit-learn
        # trees -- confirm).
        two_level = ([0, 0, -2, -2, 0, -2, -2],
                     [4.5, 2.5, -2, -2, 7.5, -2, -2])
        stump = ([0, -2, -2], [4.5, -2, -2])
        single_leaf = ([-2], [-2])

        def first_tree_with(**overrides):
            # Fit a forest on a private copy of the base config with the given
            # settings changed, and hand back the tree of its first estimator.
            cfg = deepcopy(base_config)
            for key, value in overrides.items():
                cfg[key] = value
            return RegressionForest(**cfg).fit(X, y)[0].tree_

        def check_structure(fitted_tree, expected):
            features, thresholds = expected
            np.testing.assert_array_equal(fitted_tree.feature,
                                          np.array(features))
            np.testing.assert_array_equal(fitted_tree.threshold,
                                          np.array(thresholds))

        def first_feature_only(amount):
            # Importance vector with `amount` on feature 0 and zeros elsewhere.
            vec = np.zeros(X.shape[1])
            vec[0] = amount
            return vec

        # ---- base configuration ----
        tree = RegressionForest(**base_config).fit(X, y)[0].tree_
        check_structure(tree, two_level)
        less = X[:, tree.feature[0]] < tree.threshold[0]
        less_again = less & (X[:, tree.feature[1]] < tree.threshold[1])
        np.testing.assert_array_almost_equal(
            tree.value.flatten()[:3],
            np.array([np.mean(y), np.mean(y[less]), np.mean(y[less_again])]),
            decimal=5)
        np.testing.assert_array_almost_equal(tree.predict(X), y, decimal=5)
        # These calls are only checked for running without raising.
        tree.predict_precond(X)
        tree.predict_jac(X)
        tree.predict_precond_and_jac(X)

        # testing importances
        var_all = np.var(y)
        var_less = np.var(y[less])
        np.testing.assert_array_almost_equal(
            tree.compute_feature_importances(normalize=False),
            first_feature_only(var_all),
            decimal=5)
        np.testing.assert_array_almost_equal(
            tree.compute_feature_importances(normalize=False, max_depth=0),
            first_feature_only(var_all - var_less),
            decimal=5)
        np.testing.assert_array_almost_equal(
            tree.compute_feature_importances(normalize=False,
                                             max_depth=1,
                                             depth_decay=1.0),
            first_feature_only(var_all - var_less + .5 * var_less),
            decimal=5)

        # testing heterogeneity importances
        root_term = 5 * 5 * (np.mean(y[less]) - np.mean(y[~less]))**2 / 100
        depth_one_term = .5 * (2 * 2 * 3 * (1)**2 / 5) / 10
        expected_het = first_feature_only(root_term)
        np.testing.assert_array_almost_equal(
            tree.compute_feature_heterogeneity_importances(normalize=False,
                                                           max_depth=0),
            expected_het,
            decimal=5)
        # The expectation is accumulated in place: the same term is added once
        # for the max_depth=1 call and once more for the unrestricted call.
        expected_het[0] += depth_one_term
        np.testing.assert_array_almost_equal(
            tree.compute_feature_heterogeneity_importances(normalize=False,
                                                           max_depth=1,
                                                           depth_decay=1.0),
            expected_het,
            decimal=5)
        expected_het[0] += depth_one_term
        np.testing.assert_array_almost_equal(
            tree.compute_feature_heterogeneity_importances(normalize=False),
            expected_het,
            decimal=5)

        # ---- Testing that all parameters do what they are supposed to ----
        check_structure(first_tree_with(min_samples_leaf=5), stump)

        tree = first_tree_with(min_samples_split=11)
        check_structure(tree, single_leaf)
        overall_mean = np.mean(y)
        np.testing.assert_array_almost_equal(tree.predict(X),
                                             overall_mean,
                                             decimal=5)
        np.testing.assert_array_almost_equal(tree.predict_full(X),
                                             overall_mean,
                                             decimal=5)

        tree = first_tree_with(min_weight_fraction_leaf=.5)
        check_structure(tree, stump)
        # testing predict, apply and decision path
        less = X[:, tree.feature[0]] < tree.threshold[0]
        expected_pred = np.zeros((X.shape[0], 1))
        expected_pred[less] = np.mean(y[less])
        expected_pred[~less] = np.mean(y[~less])
        np.testing.assert_array_almost_equal(tree.predict(X),
                                             expected_pred,
                                             decimal=5)
        np.testing.assert_array_almost_equal(tree.predict_full(X),
                                             expected_pred,
                                             decimal=5)
        expected_path = np.zeros((X.shape[0], len(tree.feature)))
        expected_path[less, :] = np.array([1, 1, 0])
        expected_path[~less, :] = np.array([1, 0, 1])
        np.testing.assert_array_equal(
            tree.decision_path(X).todense(), expected_path)
        expected_leaves = np.zeros(X.shape[0])
        expected_leaves[less] = 1
        expected_leaves[~less] = 2
        np.testing.assert_array_equal(tree.apply(X), expected_leaves)
        np.testing.assert_array_equal(tree.compute_feature_importances(),
                                      first_feature_only(1))

        check_structure(first_tree_with(min_balancedness_tol=0.), stump)
        check_structure(first_tree_with(min_balancedness_tol=0.1), two_level)
        check_structure(first_tree_with(max_depth=1), stump)
        check_structure(first_tree_with(min_impurity_decrease=0.9999), stump)
        check_structure(first_tree_with(min_impurity_decrease=1.0001),
                        single_leaf)