def test_projection(self, ):
    """Check the projection API of ``RegressionForest``.

    One forest is fit on ``y`` and a second on ``np.hstack([y, y])``.  The
    second forest is queried through a projector whose entries all equal
    0.5, and its projected mean and variance are asserted to equal the
    flattened mean and variance of the first forest.  The combined
    ``predict_projection_and_var`` call is also asserted to agree with the
    separate ``predict_projection`` and ``predict_projection_var`` calls.
    """
    np.set_printoptions(precision=10, suppress=True)
    settings = self._get_base_config()
    settings.update({
        'honest': True,
        'max_depth': 0,
        'inference': True,
        'n_estimators': 100,
        'subforest_size': 2,
        'max_samples': .5,
        'n_jobs': 1,
    })
    n_samples, n_features, seed = 100, 2, 123
    X, y, truth = self._get_regression_data(n_samples, n_features, seed)

    # Reference quantities from the forest fit on the original target.
    single = RegressionForest(**settings).fit(X, y)
    ref_mean, ref_var = single.predict_and_var(X)
    ref_mean, ref_var = ref_mean.flatten(), ref_var.flatten()

    # Forest fit on the duplicated target, queried through the projector.
    doubled_y = np.hstack([y, y])
    truth = np.hstack([truth, truth])
    doubled = RegressionForest(**settings).fit(X, doubled_y)
    weights = np.full((X.shape[0], 2), 1 / 2.0)
    proj_mean, proj_var = doubled.predict_projection_and_var(X, weights)

    np.testing.assert_array_equal(proj_mean, ref_mean)
    np.testing.assert_array_equal(proj_var, ref_var)
    np.testing.assert_array_equal(
        proj_var, doubled.predict_projection_var(X, weights))
    np.testing.assert_array_equal(
        proj_mean, doubled.predict_projection(X, weights))
def test_pickling(self,):
    """Check that a warm-started forest survives a joblib round trip.

    A forest is fit with 4 estimators, grown to 8 via ``warm_start`` and
    then dumped to and reloaded from disk.  The reloaded forest must report
    the same ``n_estimators`` and produce the same predictions.

    The dump goes into a temporary directory that is removed afterwards, so
    the test leaves no ``forest.jbl`` file behind in the working directory.
    """
    import os
    import tempfile

    n_features = 2
    n = 10
    random_state = 123
    X, y, _ = self._get_regression_data(n, n_features, random_state)

    forest = RegressionForest(n_estimators=4, warm_start=True,
                              random_state=123).fit(X, y)
    forest.n_estimators = 8
    forest.fit(X, y)
    pred1 = forest.predict(X)

    with tempfile.TemporaryDirectory() as tmp_dir:
        dump_path = os.path.join(tmp_dir, 'forest.jbl')
        joblib.dump(forest, dump_path)
        loaded_forest = joblib.load(dump_path)

    np.testing.assert_equal(loaded_forest.n_estimators, forest.n_estimators)
    np.testing.assert_allclose(loaded_forest.predict(X), pred1)
def monte_carlo():
    """Run a Monte Carlo study of ``RegressionForest`` interval coverage.

    For 100 replications, data are drawn with ``y = X[:, 0] + noise``, a
    1000-tree forest is fit, and point predictions plus 95% intervals
    (``alpha=0.05``) are evaluated on a fixed test set whose first feature
    sweeps ``x_grid`` over ``[-1, 1]``.  Since the first feature is the
    noiseless part of ``y``, ``x_grid`` itself is used as the target value
    at each test point.

    Three figures are written under ``figures/honestforest``: pointwise
    coverage (``coverage.png``), pointwise RMSE (``rmse.png``) and mean
    interval length (``length.png``).
    """
    n = 5000
    d = 5
    n_grid = 1000
    x_grid = np.linspace(-1, 1, n_grid)
    X_test = np.hstack([x_grid.reshape(-1, 1),
                        np.random.normal(size=(n_grid, d - 1))])
    coverage = []
    exp_dict = {'point': [], 'low': [], 'up': []}
    for it in range(100):
        print(it)
        X = np.random.normal(0, 1, size=(n, d))
        y = X[:, 0] + np.random.normal(size=(n, ))
        est = RegressionForest(n_estimators=1000, verbose=1)
        est.fit(X, y)
        # Flatten every output to 1-D before comparing with the 1-D grid.
        # A column-shaped (m, 1) result would otherwise broadcast against
        # ``x_grid`` into an (m, m) array and corrupt all three curves.
        point = np.ravel(est.predict(X_test))
        low, up = (np.ravel(bound)
                   for bound in est.predict_interval(X_test, alpha=0.05))
        coverage.append((low <= x_grid) & (x_grid <= up))
        exp_dict['point'].append(point)
        exp_dict['low'].append(low)
        exp_dict['up'].append(up)

    # ``exist_ok`` creates intermediate directories and avoids the
    # check-then-create race of ``os.path.exists`` + ``os.makedirs``.
    out_dir = os.path.join('figures', 'honestforest')
    os.makedirs(out_dir, exist_ok=True)

    plt.figure()
    plt.plot(x_grid, np.mean(coverage, axis=0))
    plt.savefig(os.path.join(out_dir, 'coverage.png'))
    plt.close()  # release the figure once it is on disk

    plt.figure()
    plt.plot(x_grid,
             np.sqrt(np.mean((np.array(exp_dict['point']) - x_grid)**2, axis=0)),
             label='RMSE')
    plt.savefig(os.path.join(out_dir, 'rmse.png'))
    plt.close()

    plt.figure()
    plt.plot(x_grid,
             np.mean(np.array(exp_dict['up']) - np.array(exp_dict['low']), axis=0),
             label='length')
    plt.savefig(os.path.join(out_dir, 'length.png'))
    plt.close()
def test_warm_start(self, ):
    """Check that warm-starting from 4 to 8 trees matches a direct 8-tree fit.

    For both ``inference=True`` and ``inference=False``, a forest grown
    incrementally (4 estimators, then 8 with ``warm_start``) must yield the
    same predictions, subsample indices and per-tree random states as a
    forest fit with 8 estimators from the start, given the same seed.
    """
    n_samples, n_features, seed = 100, 2, 123
    X, y, _ = self._get_regression_data(n_samples, n_features, seed)

    def snapshot(fitted):
        # Collect the quantities that must agree between the two forests.
        predictions = fitted.predict(X)
        subsample_inds = fitted.get_subsample_inds()
        tree_states = [member.random_state for member in fitted]
        return predictions, subsample_inds, tree_states

    for inference in (True, False):
        incremental = RegressionForest(n_estimators=4, inference=inference,
                                       warm_start=True, random_state=123).fit(X, y)
        incremental.n_estimators = 8
        incremental.fit(X, y)
        grown = snapshot(incremental)

        direct = RegressionForest(n_estimators=8, inference=inference,
                                  warm_start=True, random_state=123).fit(X, y)
        fresh = snapshot(direct)

        for from_grown, from_fresh in zip(grown, fresh):
            np.testing.assert_allclose(from_grown, from_fresh)
def test_raise_exceptions(self, ):
    """Check that mishandled inputs and settings raise ``ValueError``.

    Covers, in order: a target with fewer rows than ``X``, a list of
    ``RegressionForest`` keyword combinations, switching ``inference`` off
    before a warm-start refit, and a list of ``CausalForest`` keyword
    values.  Every case is expected to raise ``ValueError``.
    """
    n_samples, n_features, seed = 10, 2, 123
    X, y, truth = self._get_regression_data(n_samples, n_features, seed)

    # Target truncated to 4 rows while X keeps all of its rows.
    with np.testing.assert_raises(ValueError):
        RegressionForest(n_estimators=20).fit(X, y[:4])

    rejected_regression_kwargs = [
        {'subforest_size': 3},
        {'inference': True, 'max_samples': .6},
        {'max_samples': 20},
        {'max_samples': 1.2},
    ]
    for extra in rejected_regression_kwargs:
        with np.testing.assert_raises(ValueError):
            RegressionForest(n_estimators=20, **extra).fit(X, y)

    # Changing ``inference`` between warm-start fits.
    with np.testing.assert_raises(ValueError):
        forest = RegressionForest(n_estimators=4, warm_start=True,
                                  inference=True).fit(X, y)
        forest.inference = False
        forest.n_estimators = 8
        forest.fit(X, y)

    rejected_causal_kwargs = [
        {'criterion': 'peculiar'},
        {'max_depth': -1},
        {'min_samples_split': -1},
        {'min_samples_leaf': -1},
        {'min_weight_fraction_leaf': -1.0},
        {'min_var_fraction_leaf': -1.0},
        {'max_features': 10},
        {'min_balancedness_tol': .55},
    ]
    for extra in rejected_causal_kwargs:
        with np.testing.assert_raises(ValueError):
            CausalForest(n_estimators=4, **extra).fit(X, y, y)
def test_non_standard_input(self, ):
    """Check that tuples, lists and pandas objects are accepted as input.

    A reference prediction is produced from a forest fit on the arrays
    returned by ``_get_regression_data``.  The same data are then supplied
    as tuples, lists, ``DataFrame``/``DataFrame`` and ``DataFrame``/``Series``
    and each resulting prediction must match the reference.
    """
    n_samples, n_features, seed = 100, 2, 123
    X, y, truth = self._get_regression_data(n_samples, n_features, seed)

    def new_forest():
        # Same hyper-parameters and seed for every variant.
        return RegressionForest(n_estimators=20, n_jobs=1, random_state=123)

    expected = new_forest().fit(X, y).predict(X)

    # (features for fit, target for fit, features for predict)
    variants = [
        (tuple(X), tuple(y), tuple(X)),
        (list(X), list(y), list(X)),
        (pd.DataFrame(X), pd.DataFrame(y), pd.DataFrame(X)),
        (pd.DataFrame(X), pd.Series(y.ravel()), pd.DataFrame(X)),
    ]
    for train_X, train_y, query_X in variants:
        fitted = new_forest().fit(train_X, train_y)
        np.testing.assert_allclose(expected, fitted.predict(query_X))
def test_var(self, ):
    """Check that ``RegressionForest`` calculates variance correctly.

    Three parts:

    1. API consistency: ``predict_var``, ``predict_interval``,
       ``predict(..., interval=True)`` and ``prediction_stderr`` must agree
       with ``predict_and_var`` and with normal quantiles built from it.
    2. Accuracy with ``max_depth=0``: the predicted mean and variance at
       the first sample are compared with ``mean(y)`` and ``var(y) / n``.
    3. Accuracy with ``max_depth=1`` on step-regression data: predictions
       for samples whose first feature lies beyond a threshold on either
       side are compared with the sample moments of ``y`` on that side.
    """
    settings = self._get_base_config()
    settings.update({
        'honest': True,
        'max_depth': 0,
        'inference': True,
        'n_estimators': 1000,
        'subforest_size': 2,
        'max_samples': .5,
        'n_jobs': 1,
    })
    n_features = 2
    seed = 123

    # --- part 1: API consistency ---
    X, y, truth = self._get_regression_data(100, n_features, seed)
    forest = RegressionForest(**settings).fit(X, y)
    alpha = .1
    mean, var = forest.predict_and_var(X)
    center = mean[:, 0]
    stderr = np.sqrt(var[:, 0, 0])
    lb = scipy.stats.norm.ppf(alpha / 2, loc=center, scale=stderr).reshape(-1, 1)
    ub = scipy.stats.norm.ppf(1 - alpha / 2, loc=center, scale=stderr).reshape(-1, 1)

    np.testing.assert_allclose(var, forest.predict_var(X))

    interval_lb, interval_ub = forest.predict_interval(X, alpha=alpha)
    np.testing.assert_allclose(lb, interval_lb)
    np.testing.assert_allclose(ub, interval_ub)

    predicted, predicted_lb, predicted_ub = forest.predict(X, interval=True, alpha=alpha)
    np.testing.assert_allclose(mean, predicted)
    np.testing.assert_allclose(lb, predicted_lb)
    np.testing.assert_allclose(ub, predicted_ub)

    np.testing.assert_allclose(stderr, forest.prediction_stderr(X)[:, 0])

    # --- part 2: accuracy against the sample mean ---
    for n_samples in (10, 100, 1000, 10000):
        X, y, truth = self._get_regression_data(n_samples, n_features, seed)
        forest = RegressionForest(**settings).fit(X, y)
        est_mean, est_var = forest.predict_and_var(X[:1])
        ref_mean = np.mean(y)
        ref_var = np.var(y) / y.shape[0]
        np.testing.assert_allclose(est_mean, ref_mean, atol=0.05)
        np.testing.assert_allclose(est_var, ref_var, atol=0.05, rtol=.1)

    # --- part 3: accuracy on either side of a step ---
    def sample_moments(values):
        # Mean of the values and the variance of that mean.
        return np.mean(values), np.var(values) / values.shape[0]

    settings['max_depth'] = 1
    for n_samples, query_thr, ref_thr in [(1000, .5, .25), (10000, .05, .05)]:
        X, y, truth = self._get_step_regression_data(n_samples, n_features, seed)
        forest = RegressionForest(**settings).fit(X, y)
        first_col = X[:, 0]

        est_pos_mean, est_pos_var = forest.predict_and_var(X[first_col > query_thr])
        est_neg_mean, est_neg_var = forest.predict_and_var(X[first_col < -query_thr])

        ref_pos_mean, ref_pos_var = sample_moments(y[first_col > ref_thr])
        ref_neg_mean, ref_neg_var = sample_moments(y[first_col < -ref_thr])

        np.testing.assert_allclose(est_pos_mean, ref_pos_mean, atol=0.07)
        np.testing.assert_allclose(est_pos_var, ref_pos_var, atol=0.0, rtol=.25)
        np.testing.assert_allclose(est_neg_mean, ref_neg_mean, atol=0.07)
        np.testing.assert_allclose(est_neg_var, ref_neg_var, atol=0.0, rtol=.25)
def test_regression_tree_internals(self):
    """Check the internals of the first tree of a ``RegressionForest``.

    Using 10 samples with 2 features, the test asserts the expected
    ``feature`` and ``threshold`` arrays, node values, predictions, feature
    importances and heterogeneity importances for the base configuration.
    It then overrides one configuration entry at a time and asserts the
    resulting tree structure, plus ``predict``, ``predict_full``,
    ``decision_path`` and ``apply`` for a single-split tree.
    """
    base_config = self._get_base_config()
    n_samples, n_features, seed = 10, 2, 123
    X, y, truth = self._get_regression_data(n_samples, n_features, seed)

    def first_tree(**overrides):
        # Fit a forest on a private copy of the base config and return
        # the underlying tree of its first estimator.
        config = deepcopy(base_config)
        config.update(overrides)
        return RegressionForest(**config).fit(X, y)[0].tree_

    def check_structure(fitted_tree, features, thresholds):
        np.testing.assert_array_equal(fitted_tree.feature, np.array(features))
        np.testing.assert_array_equal(fitted_tree.threshold, np.array(thresholds))

    def importance_vector(first_value):
        # Expected importances: everything on feature 0, zero elsewhere.
        vec = np.zeros(X.shape[1])
        vec[0] = first_value
        return vec

    # Expected (feature, threshold) layouts reused below.
    full_split = ([0, 0, -2, -2, 0, -2, -2], [4.5, 2.5, -2, -2, 7.5, -2, -2])
    single_split = ([0, -2, -2], [4.5, -2, -2])
    no_split = ([-2], [-2])

    # ---- base configuration ----
    tree = RegressionForest(**base_config).fit(X, y)[0].tree_
    check_structure(tree, *full_split)

    root_left = X[:, tree.feature[0]] < tree.threshold[0]
    root_left_left = root_left & (X[:, tree.feature[1]] < tree.threshold[1])
    np.testing.assert_array_almost_equal(
        tree.value.flatten()[:3],
        np.array([np.mean(y), np.mean(y[root_left]), np.mean(y[root_left_left])]),
        decimal=5)
    np.testing.assert_array_almost_equal(tree.predict(X), y, decimal=5)
    # These calls are only exercised; their results are not checked.
    tree.predict_precond(X)
    tree.predict_jac(X)
    tree.predict_precond_and_jac(X)

    less = X[:, tree.feature[0]] < tree.threshold[0]

    # testing importances
    var_all = np.var(y)
    var_left = np.var(y[less])
    np.testing.assert_array_almost_equal(
        tree.compute_feature_importances(normalize=False),
        importance_vector(var_all), decimal=5)
    np.testing.assert_array_almost_equal(
        tree.compute_feature_importances(normalize=False, max_depth=0),
        importance_vector(var_all - var_left), decimal=5)
    np.testing.assert_array_almost_equal(
        tree.compute_feature_importances(normalize=False, max_depth=1, depth_decay=1.0),
        importance_vector(var_all - var_left + .5 * (var_left)), decimal=5)

    # testing heterogeneity importances
    depth0_het = 5 * 5 * (np.mean(y[less]) - np.mean(y[~less]))**2 / 100
    child_increment = .5 * (2 * 2 * 3 * (1)**2 / 5) / 10
    depth1_het = depth0_het + child_increment
    full_het = depth1_het + child_increment
    np.testing.assert_array_almost_equal(
        tree.compute_feature_heterogeneity_importances(normalize=False, max_depth=0),
        importance_vector(depth0_het), decimal=5)
    np.testing.assert_array_almost_equal(
        tree.compute_feature_heterogeneity_importances(normalize=False, max_depth=1,
                                                       depth_decay=1.0),
        importance_vector(depth1_het), decimal=5)
    np.testing.assert_array_almost_equal(
        tree.compute_feature_heterogeneity_importances(normalize=False),
        importance_vector(full_het), decimal=5)

    # ---- Testing that all parameters do what they are supposed to ----
    check_structure(first_tree(min_samples_leaf=5), *single_split)

    tree = first_tree(min_samples_split=11)
    check_structure(tree, *no_split)
    np.testing.assert_array_almost_equal(tree.predict(X), np.mean(y), decimal=5)
    np.testing.assert_array_almost_equal(tree.predict_full(X), np.mean(y), decimal=5)

    tree = first_tree(min_weight_fraction_leaf=.5)
    check_structure(tree, *single_split)

    # testing predict, apply and decision path
    less = X[:, tree.feature[0]] < tree.threshold[0]
    expected_pred = np.zeros((X.shape[0], 1))
    expected_pred[less] = np.mean(y[less])
    expected_pred[~less] = np.mean(y[~less])
    np.testing.assert_array_almost_equal(tree.predict(X), expected_pred, decimal=5)
    np.testing.assert_array_almost_equal(tree.predict_full(X), expected_pred, decimal=5)

    expected_path = np.zeros((X.shape[0], len(tree.feature)))
    expected_path[less, :] = np.array([1, 1, 0])
    expected_path[~less, :] = np.array([1, 0, 1])
    np.testing.assert_array_equal(tree.decision_path(X).todense(), expected_path)

    expected_leaf = np.where(less, 1.0, 2.0)
    np.testing.assert_array_equal(tree.apply(X), expected_leaf)

    np.testing.assert_array_equal(tree.compute_feature_importances(),
                                  importance_vector(1))

    remaining_cases = [
        ({'min_balancedness_tol': 0.}, single_split),
        ({'min_balancedness_tol': 0.1}, full_split),
        ({'max_depth': 1}, single_split),
        ({'min_impurity_decrease': 0.9999}, single_split),
        ({'min_impurity_decrease': 1.0001}, no_split),
    ]
    for overrides, expected_layout in remaining_cases:
        check_structure(first_tree(**overrides), *expected_layout)