def original_model_rmse(model: SklearnModel,
                        X: Union[pd.DataFrame, np.ndarray],
                        y: np.ndarray,
                        n_k_fold_splits: int) -> List[float]:
    """
    Calculate the RMSE of the original model
    Used as a benchmark to compare against the null

    Parameters
    ----------
    model: SklearnModel
    X: Union[pd.DataFrame, np.ndarray]
    y: np.ndarray
    n_k_fold_splits: int

    Returns
    -------
    List[float]
        List of the out-of-sample RMSEs for each fold of the covariate matrix
    """
    # KFold yields positional indices, so make sure X supports positional indexing
    X = np.asarray(X)
    kf = KFold(n_k_fold_splits, shuffle=True)
    base_line_rmses = []
    for train_index, test_index in kf.split(X):
        # Refit a fresh copy for each fold so folds don't share fitted state
        fold_model = deepcopy(model)
        fold_model.fit(X[train_index], y[train_index])
        base_line_rmses.append(fold_model.rmse(X[test_index], y[test_index]))
    return base_line_rmses
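# Usage sketch for original_model_rmse (illustrative data only; the model
# configuration mirrors the SklearnModel settings used in the snippets below):
#
#     X = np.random.normal(size=(500, 3))
#     y = X @ np.array([1.0, -0.5, 0.25]) + np.random.normal(0, 0.1, size=500)
#     rmses = original_model_rmse(SklearnModel(n_samples=50, n_burn=50), X, y, 5)
#     print(np.mean(rmses), np.std(rmses))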
def run(alpha, beta, n_trees, n_regressors, n_burn=50, n_samples=200, n_obsv=1000):
    b_true = np.random.uniform(-2, 2, size=n_regressors)
    x = np.random.normal(0, 1, size=n_obsv * n_regressors).reshape(n_obsv, n_regressors)
    x[:50, 1] = 4
    X = pd.DataFrame(x)
    y = np.random.normal(0, 0.1, size=n_obsv) + np.array(X.multiply(b_true, axis=1).sum(axis=1))
    model = SklearnModel(n_samples=n_samples,
                         n_burn=n_burn,
                         n_trees=n_trees,
                         alpha=alpha,
                         beta=beta,
                         n_jobs=1,
                         n_chains=1,
                         initializer=None,
                         store_acceptance_trace=False,
                         store_in_sample_predictions=False)
    model.fit(X, y)
    # predictions = model.predict()
    # plt.scatter(y, predictions)
    # plt.show()
    return model, x, y
def run(alpha, beta, n_trees, size=100): import warnings warnings.simplefilter("error", UserWarning) x = np.linspace(0, 5, size) X = pd.DataFrame(x) y = np.random.normal(0, 1.0, size=size) + np.sin(x) from bartpy.samplers.unconstrainedtree.treemutation import get_tree_sampler model = SklearnModel(n_samples=50, n_burn=50, n_trees=n_trees, alpha=alpha, beta=beta, n_jobs=1, n_chains=1, tree_sampler=get_tree_sampler(0.5, 0.5)) model.fit(X, y) plt.plot(y) plt.plot(model.predict(X)) plt.show() # plot_tree_depth(model) # plot_feature_split_proportions(model) # plot_qq(model) # null_distr = null_feature_split_proportions_distribution(model, X, y) # print(null_distr) return model, x, y
def run(alpha, beta, n_trees, size=100): import warnings warnings.simplefilter("error", UserWarning) x = np.linspace(0, 5, size) X = pd.DataFrame(x) y = np.random.normal(0, 0.1, size=size) + np.sin(x) model = SklearnModel(n_samples=500, n_burn=100, n_trees=n_trees, alpha=alpha, beta=beta, n_jobs=1, n_chains=1) model.fit(X, y) plt.plot(model.data.unnormalized_y) plt.plot(model.predict()) plt.show() plot_tree_depth(model) plot_feature_split_proportions(model) plot_qq(model) #null_distr = null_feature_split_proportions_distribution(model, X, y) #print(null_distr) return model, x, y
def run(size=100, alpha=0.95, beta=2.0, n_trees=50):
    import warnings
    warnings.simplefilter("error", UserWarning)
    x = np.linspace(0, 5, size)
    X = pd.DataFrame(x)
    y = np.random.normal(0, 0.1, size=size) + np.sin(x)
    model = SklearnModel(n_samples=100,
                         n_burn=50,
                         n_trees=n_trees,
                         alpha=alpha,
                         beta=beta,
                         n_jobs=1,
                         n_chains=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, shuffle=True)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    plt.scatter(y_test, y_pred)
    plt.show()
    # RMSE is the root of the *mean* squared error, not the root of the sum
    rmse = np.sqrt(np.mean(np.square(y_test - y_pred)))
    print(rmse)
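# Cross-check for the RMSE computed above: scikit-learn's mean_squared_error
# gives the same quantity (minimal sketch; the helper name rmse_check is ours,
# mean_squared_error is the standard sklearn metric):
def rmse_check(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    from sklearn.metrics import mean_squared_error
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))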
def plot_homoskedasticity_diagnostics(model: SklearnModel, ax=None):
    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=(5, 5))
    sns.regplot(model.predict(model.data.X.values),
                model.residuals(model.data.X.values),
                ax=ax)
    ax.set_title("Fitted Values vs Residuals")
    ax.set_xlabel("Fitted Value")
    ax.set_ylabel("Residual")
    return ax
def plot_qq(model: SklearnModel, ax=None) -> plt.Axes:
    if ax is None:
        fig, ax = plt.subplots(1, 1)
    residuals = model.residuals(model.data.X)
    sm.qqplot(residuals, fit=True, line="45", ax=ax)
    ax.set_title("QQ plot")
    return ax
def run(alpha, beta, n_trees):
    x = np.linspace(0, 5, 3000)
    X = pd.DataFrame(x)
    y = np.random.normal(0, 0.1, size=3000) + np.sin(x)
    model = SklearnModel(n_samples=50,
                         n_burn=50,
                         n_trees=n_trees,
                         alpha=alpha,
                         beta=beta)
    model.fit(X, y)
    plt.plot(model.data.unnormalized_y)
    plt.plot(model.predict(X))
    plt.show()
    plot_tree_depth(model.model_samples)
    plot_feature_split_proportions(model.model_samples)
    plot_qq(model)
    # null_distr = null_feature_split_proportions_distribution(model, X, y)
    # print(null_distr)
    return model, x, y
def run(alpha, beta, n_trees, n_regressors):
    b_true = np.random.uniform(-2, 2, size=n_regressors)
    x = np.random.normal(0, 1, size=10000 * n_regressors).reshape(10000, n_regressors)
    x[:5000, 1] = 4
    X = pd.DataFrame(x)
    y = np.random.normal(0, 0.1, size=10000) + np.array(X.multiply(b_true, axis=1).sum(axis=1))
    model = SklearnModel(n_samples=200,
                         n_burn=50,
                         n_trees=n_trees,
                         alpha=alpha,
                         beta=beta)
    model.fit(X, y)
    predictions = model.predict()
    plt.scatter(y, predictions)
    plt.show()
    return model, x, y
def convert_chains_models(model: SklearnModel,
                          X_s: List[np.ndarray],
                          y_s: List[np.ndarray],
                          chains: List[Chain]) -> List[SklearnModel]:
    n_chains = model.n_chains
    # Group the flat list of chains into consecutive blocks of n_chains,
    # one block per (X, y) dataset
    grouped_chains = []
    for i, x in enumerate(chains):
        if i % n_chains == 0:
            grouped_chains.append([])
        grouped_chains[-1].append(x)
    return [model.from_extract(chain, x, y)
            for (chain, (x, y)) in zip(grouped_chains, zip(X_s, y_s))]
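# Self-contained check of the grouping logic above (illustrative values only):
# with n_chains=2, a flat chain list [c0, c1, c2, c3] is grouped into
# [[c0, c1], [c2, c3]], and each group is paired with one (X, y) dataset.
def _group_chains_demo():
    chains = ["c0", "c1", "c2", "c3"]
    n_chains = 2
    grouped = []
    for i, c in enumerate(chains):
        if i % n_chains == 0:
            grouped.append([])
        grouped[-1].append(c)
    assert grouped == [["c0", "c1"], ["c2", "c3"]]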
def run(n: int = 10000, k_true: int = 3, k_null: int = 2):
    b_true = np.random.uniform(0.1, 2, size=k_true)  # low must come before high
    b_true = np.array(list(b_true) + [0.0] * k_null)
    print(b_true)
    x = np.random.normal(0, 1, size=n * (k_true + k_null)).reshape(n, k_true + k_null)
    X = pd.DataFrame(x)
    y = np.random.normal(0, 0.1, size=n) + np.array(X.multiply(b_true, axis=1).sum(axis=1))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, shuffle=True)
    model = SklearnModel(n_samples=50,
                         n_burn=50,
                         n_trees=20,
                         store_in_sample_predictions=False,
                         n_jobs=3,
                         n_chains=1)
    pipeline = make_pipeline(SelectNullDistributionThreshold(model, n_permutations=20), model)
    pipeline_model = pipeline.fit(X_train, y_train)
    threshold_step = pipeline_model.named_steps["selectnulldistributionthreshold"]
    print("Thresholds", threshold_step.thresholds)
    print("Feature Proportions", threshold_step.feature_proportions)
    print("Is Kept", threshold_step._get_support_mask())
    threshold_step.plot()
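# Sketch of the permutation idea behind SelectNullDistributionThreshold
# (illustrative only: feature_split_proportions stands in for whatever
# accessor exposes per-feature split proportions; it is not bartpy's API):
def null_proportions_sketch(model, X, y, n_permutations=20):
    from copy import deepcopy
    nulls = []
    for _ in range(n_permutations):
        permuted = deepcopy(model)
        # Shuffling y breaks any real X-y relationship, so splits on any
        # feature under the permuted fit reflect chance alone
        permuted.fit(X, np.random.permutation(y))
        nulls.append(feature_split_proportions(permuted))
    return nulls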
    results['t'].append(np.mean((cate_t - sim_test['tau']) ** 2))
    print('Cate_s')
    cate_s = s_learner(data, x_vars, x_test, base_learner)
    results['s'].append(np.mean((cate_s - sim_test['tau']) ** 2))
    print('Cate_x')
    cate_x = x_learner(data, x_vars, x_test, base_learner, base_learner_class)
    results['x'].append(np.mean((cate_x - sim_test['tau']) ** 2))
    return results


if __name__ == "__main__":
    base_learner_rf = RandomForestRegressor(n_estimators=1000, random_state=42)
    base_learner_gb = GradientBoostingRegressor(random_state=42)
    base_learner_bart = SklearnModel(n_trees=200, n_burn=1200, alpha=0.5, beta=1)
    gb = GradientBoostingClassifier(random_state=42)
    rf = RandomForestClassifier(n_estimators=500, random_state=42)
    lr = LogisticRegression()
    simulations = ['sim' + str(i) for i in range(1, 7)]
    for sim in simulations:
        print('Bart', sim)
        results = get_results(base_learner_gb, lr, sim)
        results_pd = pd.DataFrame(results)
        # TODO: change the save directory
        results_pd.to_csv('../Bart_{}.csv'.format(sim), index=None)
        plt.plot([i / 1000 for i in results['n']], results['t'], c='gray', marker='x')
def plot_qq(model: SklearnModel) -> None:
    residuals = model.residuals()
    fig = sm.qqplot(residuals, fit=True, line="45")
    plt.show()
def predict(self, X: np.ndarray = None) -> np.ndarray:
    if X is None:
        X = self.data.X
    # Combine the base estimator's prediction with BART's fit to the residuals
    sm_prediction = self.base_estimator.predict(X)
    bart_prediction = SklearnModel.predict(self, X)
    return sm_prediction + bart_prediction
def fit(self, X: np.ndarray, y: np.ndarray) -> 'ResidualBART':
    # Fit the base estimator first, then fit BART to its residuals
    self.base_estimator.fit(X, y)
    SklearnModel.fit(self, X, y - self.base_estimator.predict(X))
    return self
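# How the two ResidualBART methods above compose (a hedged sketch; the
# constructor signature is assumed, not taken from this snippet):
#
#     model = ResidualBART(base_estimator=LinearRegression())
#     model.fit(X, y)            # base fit on y, BART fit on y - base.predict(X)
#     preds = model.predict(X)   # base.predict(X) + BART correction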
# gs_xgb = GridSearchCV(estimator=pipe_bart,
#                       param_grid=params_bart,
#                       cv=loo)
# Fit grid search
# gs_xgb.fit(X_train, y_train.ravel())
# Best params
# print('Best params: %s' % gs_xgb.best_params_)
# Best training data accuracy
# print('Best training score: %.3f' % gs_xgb.best_score_)
# Predict on test data with best params
for cutoff in [0.1, 0.5]:
    for n_chains in [3, 4, 5]:
        for n_trees in [25, 50, 100]:
            for n_burn in [100, 200, 300]:
                for n_samples in [100, 50, 200]:
                    for sigma_b in [0.0001, 0.01, 0.001]:
                        for sigma_a in [0.0001, 0.01, 0.001]:
                            # Pass every looped hyperparameter to the model
                            gs_xgb = SklearnModel(sigma_a=sigma_a,
                                                  sigma_b=sigma_b,
                                                  n_samples=n_samples,
                                                  n_burn=n_burn,
                                                  n_trees=n_trees,
                                                  n_chains=n_chains).fit(X_train, y_train)
                            y_pred = gs_xgb.predict(X_test)
                            # Test data accuracy of model with best params
                            # print('Test set score for best params: %.3f' % mean_squared_error(y_test, y_pred))
                            print('Test set score for best params: %.3f' % f1_score(y_test, y_pred > cutoff))
                            print("n samples", n_samples,
                                  "\nb", sigma_b,
                                  "\na", sigma_a,
                                  "\ncutoff", cutoff)
def fit(self, X: np.ndarray, y: np.ndarray) -> 'OLS':
    # Fit the linear model first, then fit BART to its residuals
    self.stat_model_fit = self.stat_model(y, X).fit()
    SklearnModel.fit(self, X, self.stat_model_fit.resid)
    return self
def fit(self, X: pd.DataFrame, y: np.ndarray) -> 'OLS':
    self.stat_model_fit = self.stat_model(y, X).fit()
    print(self.stat_model_fit.resid)
    SklearnModel.fit(self, X, self.stat_model_fit.resid)
    return self
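# Usage sketch for the OLS + BART hybrid defined by the fit methods above and
# the predict method at the end of this section (assumes statsmodels' sm.OLS
# is passed as stat_model; the constructor wiring is assumed):
#
#     hybrid = OLS(stat_model=sm.OLS)
#     hybrid.fit(X, y)           # OLS on (X, y), then BART on the OLS residuals
#     preds = hybrid.predict(X)  # OLS prediction plus the BART correction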
def plot_residuals(model: SklearnModel):
    plt.plot(model.data.unnormalized_y - model.predict())
    plt.show()
def plot_modelled_against_actual(model: SklearnModel):
    plt.plot(model.data.unnormalized_y)
    plt.plot(model.predict())
    plt.show()
Y0_test = Y0_test.reshape(-1)
Y1_test = Y1_test.reshape(-1)

#----------------------------------------------------------------
#
# MODELS
#
#----------------------------------------------------------------

n_trees = 100  # default is 200 trees

model0 = SklearnModel(n_trees=n_trees)  # Use default parameters
model0.fit(X0, Y0)  # Fit the model on the arm-0 sample

model1 = SklearnModel(n_trees=n_trees)  # Use default parameters
model1.fit(X1, Y1)  # Fit the model on the arm-1 sample

# Estimated treatment effect: difference between the two arms' predictions
tau_hat = model1.predict(X) - model0.predict(X)
pehe_ = eval_pehe(tau_hat, Tau)

tau_hat_val = model1.predict(X_val) - model0.predict(X_val)
tau_hat_test = model1.predict(X_test) - model0.predict(X_test)
pehe_val = eval_pehe(tau_hat_val, Tau_val)
pehe_test = eval_pehe(tau_hat_test, Tau_test)
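# eval_pehe is not defined in this snippet. Under the usual definition, PEHE
# (Precision in Estimating Heterogeneous Effects) is the root mean squared
# error between estimated and true individual treatment effects; a sketch
# under that assumption (the name eval_pehe_sketch is ours):
def eval_pehe_sketch(tau_hat, tau_true):
    return float(np.sqrt(np.mean(np.square(tau_hat - tau_true))))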
def predict(self, X: np.ndarray = None):
    if X is None:
        X = self.data.X
    # Combine the linear model's prediction with BART's fit to its residuals
    sm_prediction = self.stat_model_fit.predict(X)
    bart_prediction = SklearnModel.predict(self, X)
    return sm_prediction + bart_prediction