def run(size=100, alpha=0.95, beta=2.0, n_trees=50):
    """Fit a BART model to a noisy sine curve and print the hold-out RMSE.

    Generates ``size`` evenly spaced points on [0, 5], uses ``sin(x)`` plus
    Gaussian noise (standard deviation 0.1) as the target, holds out 33% of
    the rows as a test set, fits the model on the remainder, shows a scatter
    plot of actual against predicted test values and prints the test RMSE.

    Args:
        size: Number of data points to generate.
        alpha: Passed to ``SklearnModel`` as ``alpha``.
        beta: Passed to ``SklearnModel`` as ``beta``.
        n_trees: Passed to ``SklearnModel`` as ``n_trees``.

    Returns:
        None. The RMSE is printed, not returned.
    """
    import warnings

    # Escalate UserWarnings to exceptions so they abort the run instead of
    # scrolling past unnoticed.
    warnings.simplefilter("error", UserWarning)

    x = np.linspace(0, 5, size)
    X = pd.DataFrame(x)
    y = np.random.normal(0, 0.1, size=size) + np.sin(x)

    model = SklearnModel(
        n_samples=100,
        n_burn=50,
        n_trees=n_trees,
        alpha=alpha,
        beta=beta,
        n_jobs=1,
        n_chains=1,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42, shuffle=True
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    plt.scatter(y_test, y_pred)
    plt.show()

    # Root *mean* squared error: averaging (rather than summing) the squared
    # errors keeps the figure comparable across different test-set sizes.
    rmse = np.sqrt(np.mean(np.square(y_test - y_pred)))
    print(rmse)
def run(alpha, beta, n_trees, size=100):
    """Fit a BART model to a noisy sine curve and show its diagnostics.

    The target is ``sin(x)`` plus Gaussian noise (standard deviation 0.1) on
    ``size`` evenly spaced points in [0, 5]. After fitting, the stored
    un-normalized target and the model's predictions are plotted together,
    followed by the tree-depth, feature-split-proportion and QQ plots.

    Args:
        alpha: Passed to ``SklearnModel`` as ``alpha``.
        beta: Passed to ``SklearnModel`` as ``beta``.
        n_trees: Passed to ``SklearnModel`` as ``n_trees``.
        size: Number of data points to generate.

    Returns:
        Tuple ``(model, x, y)``: the fitted model, the 1-D input grid and the
        noisy target.
    """
    import warnings

    # Any UserWarning raised from here on becomes an exception.
    warnings.simplefilter("error", UserWarning)

    grid = np.linspace(0, 5, size)
    features = pd.DataFrame(grid)
    noise = np.random.normal(0, 0.1, size=size)
    target = noise + np.sin(grid)

    bart = SklearnModel(
        n_samples=500,
        n_burn=100,
        n_trees=n_trees,
        alpha=alpha,
        beta=beta,
        n_jobs=1,
        n_chains=1,
    )
    bart.fit(features, target)

    # Overlay the training target and the in-sample prediction.
    plt.plot(bart.data.unnormalized_y)
    plt.plot(bart.predict())
    plt.show()

    for diagnostic in (plot_tree_depth, plot_feature_split_proportions, plot_qq):
        diagnostic(bart)

    # Disabled null-distribution check:
    # null_distr = null_feature_split_proportions_distribution(model, X, y)
    # print(null_distr)
    return bart, grid, target
def run(alpha, beta, n_trees, size=100):
    """Fit a BART model with a custom tree sampler to a noisy sine curve.

    The target is ``sin(x)`` plus Gaussian noise (standard deviation 1.0) on
    ``size`` evenly spaced points in [0, 5]. The model is built with the tree
    sampler returned by ``get_tree_sampler(0.5, 0.5)``. The raw target and
    the model's prediction on the training inputs are plotted together.

    Args:
        alpha: Passed to ``SklearnModel`` as ``alpha``.
        beta: Passed to ``SklearnModel`` as ``beta``.
        n_trees: Passed to ``SklearnModel`` as ``n_trees``.
        size: Number of data points to generate.

    Returns:
        Tuple ``(model, x, y)``: the fitted model, the 1-D input grid and the
        noisy target.
    """
    import warnings

    # Any UserWarning raised from here on becomes an exception.
    warnings.simplefilter("error", UserWarning)

    grid = np.linspace(0, 5, size)
    features = pd.DataFrame(grid)
    noise = np.random.normal(0, 1.0, size=size)
    target = noise + np.sin(grid)

    from bartpy.samplers.unconstrainedtree.treemutation import get_tree_sampler

    sampler = get_tree_sampler(0.5, 0.5)
    bart = SklearnModel(
        n_samples=50,
        n_burn=50,
        n_trees=n_trees,
        alpha=alpha,
        beta=beta,
        n_jobs=1,
        n_chains=1,
        tree_sampler=sampler,
    )
    bart.fit(features, target)

    plt.plot(target)
    plt.plot(bart.predict(features))
    plt.show()

    # Diagnostics currently disabled:
    # plot_tree_depth(model)
    # plot_feature_split_proportions(model)
    # plot_qq(model)
    # null_distr = null_feature_split_proportions_distribution(model, X, y)
    # print(null_distr)
    return bart, grid, target
def plot_homoskedasity_diagnostics(model: SklearnModel, ax=None):
    """Draw a fitted-values-versus-residuals regression plot for ``model``.

    Args:
        model: Fitted model; its stored training covariates
            (``model.data.X.values``) are used for both the predictions and
            the residuals.
        ax: Matplotlib axes to draw on. When ``None`` a new 5x5 inch figure
            with a single axes is created.

    Returns:
        The axes the plot was drawn on.
    """
    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=(5, 5))

    covariates = model.data.X.values
    # x/y are passed by keyword: recent seaborn releases accept them only as
    # keyword arguments (the first positional slot is ``data``).
    sns.regplot(
        x=model.predict(covariates),
        y=model.residuals(covariates),
        ax=ax,
    )
    ax.set_title("Fitted Values V Residuals")
    ax.set_xlabel("Fitted Value")
    ax.set_ylabel("Residual")
    return ax
def run(alpha, beta, n_trees):
    """Fit a BART model to 3000 noisy sine samples and show its diagnostics.

    The target is ``sin(x)`` plus Gaussian noise (standard deviation 0.1) on
    3000 evenly spaced points in [0, 5]. After fitting, the stored
    un-normalized target and the prediction on the training inputs are
    plotted together, followed by the tree-depth and feature-split plots
    (both fed ``model.model_samples``) and the QQ plot.

    Args:
        alpha: Passed to ``SklearnModel`` as ``alpha``.
        beta: Passed to ``SklearnModel`` as ``beta``.
        n_trees: Passed to ``SklearnModel`` as ``n_trees``.

    Returns:
        Tuple ``(model, x, y)``: the fitted model, the 1-D input grid and the
        noisy target.
    """
    n_points = 3000

    grid = np.linspace(0, 5, n_points)
    features = pd.DataFrame(grid)
    noise = np.random.normal(0, 0.1, size=n_points)
    target = noise + np.sin(grid)

    bart = SklearnModel(
        n_samples=50,
        n_burn=50,
        n_trees=n_trees,
        alpha=alpha,
        beta=beta,
    )
    bart.fit(features, target)

    # Overlay the training target and the in-sample prediction.
    plt.plot(bart.data.unnormalized_y)
    plt.plot(bart.predict(features))
    plt.show()

    plot_tree_depth(bart.model_samples)
    plot_feature_split_proportions(bart.model_samples)
    plot_qq(bart)

    # Disabled null-distribution check:
    # null_distr = null_feature_split_proportions_distribution(model, X, y)
    # print(null_distr)
    return bart, grid, target
def run(alpha, beta, n_trees, n_regressors):
    """Fit a BART model to a synthetic linear problem and plot the fit.

    Draws 10000 rows of standard-normal regressors, overwrites column 1 of
    the first 5000 rows with the constant 4, and builds the target as the
    regressors weighted by random coefficients (uniform on [-2, 2]) plus
    Gaussian noise (standard deviation 0.1). The fitted model's predictions
    are scatter-plotted against the target.

    Args:
        alpha: Passed to ``SklearnModel`` as ``alpha``.
        beta: Passed to ``SklearnModel`` as ``beta``.
        n_trees: Passed to ``SklearnModel`` as ``n_trees``.
        n_regressors: Number of regressor columns. Column index 1 is written
            to, so this must be at least 2.

    Returns:
        Tuple ``(model, x, y)``: the fitted model, the regressor matrix and
        the target vector.
    """
    n_rows = 10000

    # Draw order (coefficients, regressors, noise) is kept so that a seeded
    # global RNG produces the same data.
    coefficients = np.random.uniform(-2, 2, size=n_regressors)
    regressors = np.random.normal(0, 1, size=n_rows * n_regressors)
    regressors = regressors.reshape(n_rows, n_regressors)
    regressors[:5000, 1] = 4
    frame = pd.DataFrame(regressors)

    noise = np.random.normal(0, 0.1, size=n_rows)
    linear_part = np.array(frame.multiply(coefficients, axis=1).sum(axis=1))
    target = noise + linear_part

    bart = SklearnModel(
        n_samples=200,
        n_burn=50,
        n_trees=n_trees,
        alpha=alpha,
        beta=beta,
    )
    bart.fit(frame, target)

    fitted = bart.predict()
    plt.scatter(target, fitted)
    plt.show()
    return bart, regressors, target
def predict(self, X: np.ndarray = None) -> np.ndarray:
    """Return the base estimator's prediction plus the BART prediction.

    Args:
        X: Covariates to predict for. When ``None``, ``self.data.X`` is used.

    Returns:
        The sum of ``self.base_estimator.predict(X)`` and
        ``SklearnModel.predict(self, X)``.
    """
    covariates = self.data.X if X is None else X
    base_part = self.base_estimator.predict(covariates)
    bart_part = SklearnModel.predict(self, covariates)
    return base_part + bart_part
# gs_xgb = GridSearchCV(estimator=pipe_bart,
#                       param_grid=params_bart,
#                       cv=loo)
# Fit grid search
# gs_xgb.fit(X_train, y_train.ravel())
# Best params
# print('Best params: %s' % gs_xgb.best_params_)
# Best training data accuracy
# print('Best training score: %.3f' % gs_xgb.best_score_)

# Manual grid search: fit one model per hyper-parameter combination and
# report the F1 score of the thresholded test-set predictions.
from itertools import product

# product() varies the last factor fastest, i.e. the same visiting order as
# the equivalent nested for-loops (cutoff outermost, sigma_a innermost).
for cutoff, n_chains, n_trees, n_burn, n_samples, sigma_b, sigma_a in product(
    [0.1, 0.5],
    [3, 4, 5],
    [25, 50, 100],
    [100, 200, 300],
    [100, 50, 200],
    [0.0001, 0.01, 0.001],
    [0.0001, 0.01, 0.001],
):
    # Every swept parameter is forwarded to the model; previously n_chains,
    # n_trees and n_burn were iterated over but never used, so identical
    # configurations were refitted many times.
    gs_xgb = SklearnModel(
        sigma_a=sigma_a,
        sigma_b=sigma_b,
        n_samples=n_samples,
        n_chains=n_chains,
        n_trees=n_trees,
        n_burn=n_burn,
    ).fit(X_train, y_train)
    y_pred = gs_xgb.predict(X_test)

    # Test data accuracy of the model for this combination
    # print('Test set score score for best params: %.3f ' % mean_squared_error(y_test, y_pred))
    print(
        'Test set score score for best params: %.3f '
        % f1_score(y_test, y_pred > cutoff))
    # Print the plain value of n_samples (not a one-element set literal).
    print("n samples", n_samples, "\n b ", sigma_b, "\na ", sigma_a,
          "\ncutoff ", cutoff)
def plot_residuals(model: SklearnModel):
    """Plot the stored un-normalized target minus the model's predictions.

    Args:
        model: Fitted model providing ``data.unnormalized_y`` and
            ``predict()``.
    """
    observed = model.data.unnormalized_y
    fitted = model.predict()
    plt.plot(observed - fitted)
    plt.show()
def plot_modelled_against_actual(model: SklearnModel):
    """Overlay the stored un-normalized target and the model's predictions.

    Args:
        model: Fitted model providing ``data.unnormalized_y`` and
            ``predict()``.
    """
    observed = model.data.unnormalized_y
    fitted = model.predict()
    for series in (observed, fitted):
        plt.plot(series)
    plt.show()
])

#----------------------------------------------------------------
#
# MODELS
#
#----------------------------------------------------------------
# Two independent models are fitted, one on (X0, Y0) and one on (X1, Y1).
# NOTE(review): presumably these are the control and treated subsets of a
# causal-inference data set -- confirm against the code that builds them.
n_trees = 100  # default is 200 trees
model0 = SklearnModel(n_trees=n_trees)  # Use default parameters
model0.fit(X0, Y0)  # Fit the model
model1 = SklearnModel(n_trees=n_trees)  # Use default parameters
model1.fit(X1, Y1)  # Fit the model

# Estimated effect on X: difference between the two models' predictions.
tau_hat = model1.predict(X) - model0.predict(X)
pehe_ = eval_pehe(tau_hat, Tau)

# Same difference of predictions on the X_val and X_test sets.
tau_hat_val = model1.predict(X_val) - model0.predict(X_val)
tau_hat_test = model1.predict(X_test) - model0.predict(X_test)
pehe_val = eval_pehe(tau_hat_val, Tau_val)
pehe_test = eval_pehe(tau_hat_test, Tau_test)

# Report the eval_pehe score for X, X_val and X_test, in that order.
print(pehe_)
print(pehe_val)
print(pehe_test)
def predict(self, X: np.ndarray = None):
    """Return the fitted stat model's prediction plus the BART prediction.

    Args:
        X: Covariates to predict for. When ``None``, ``self.data.X`` is used.

    Returns:
        The sum of ``self.stat_model_fit.predict(X)`` and
        ``SklearnModel.predict(self, X)``.
    """
    covariates = self.data.X if X is None else X
    stat_part = self.stat_model_fit.predict(covariates)
    bart_part = SklearnModel.predict(self, covariates)
    return stat_part + bart_part