Ejemplo n.º 1
0
def original_model_rmse(model: SklearnModel, X: Union[pd.DataFrame,
                                                      np.ndarray],
                        y: np.ndarray, n_k_fold_splits: int) -> List[float]:
    """
    Calculate the out of sample RMSE of the original model with K-fold CV.

    Used as a benchmark to compare against the null.

    Parameters
    ----------
    model: SklearnModel
        Model to evaluate. It is never fitted in place; every fold fits a
        fresh deep copy of it.
    X: Union[pd.DataFrame, np.ndarray]
        Covariate matrix, one row per observation.
    y: np.ndarray
        Target values aligned with the rows of ``X``.
    n_k_fold_splits: int
        Number of folds handed to ``KFold``.

    Returns
    -------
    List[float]
        List of the out of sample RMSEs for each fold of the covariate matrix
    """

    def _rows(data, index):
        # KFold yields positional indices. Plain ``data[index]`` on a pandas
        # object is label/column based, so pandas inputs need ``.iloc``.
        if isinstance(data, (pd.DataFrame, pd.Series)):
            return data.iloc[index]
        return data[index]

    kf = KFold(n_k_fold_splits, shuffle=True)

    base_line_rmses = []
    for train_index, test_index in kf.split(X):
        # Copy the pristine input model for every fold rather than re-copying
        # the copy fitted on the previous fold.
        fold_model = deepcopy(model)
        fold_model.fit(_rows(X, train_index), _rows(y, train_index))
        base_line_rmses.append(
            fold_model.rmse(_rows(X, test_index), _rows(y, test_index)))

    return base_line_rmses
Ejemplo n.º 2
0
def run(alpha,
        beta,
        n_trees,
        n_regressors,
        n_burn=50,
        n_samples=200,
        n_obsv=1000):
    """
    Fit a single-chain SklearnModel on synthetic linear data.

    The covariates are standard normal draws, except that column 1 of the
    first 50 rows is overwritten with the constant 4. The target is the
    covariates weighted by coefficients drawn uniformly from [-2, 2), plus
    normal noise with standard deviation 0.1.

    Returns the fitted model, the raw covariate array and the target.
    """
    b_true = np.random.uniform(-2, 2, size=n_regressors)

    flat_draws = np.random.normal(0, 1, size=n_obsv * n_regressors)
    x = flat_draws.reshape(n_obsv, n_regressors)
    x[:50, 1] = 4
    X = pd.DataFrame(x)

    noise = np.random.normal(0, 0.1, size=n_obsv)
    signal = np.array(X.multiply(b_true, axis=1).sum(axis=1))
    y = noise + signal

    settings = dict(n_samples=n_samples,
                    n_burn=n_burn,
                    n_trees=n_trees,
                    alpha=alpha,
                    beta=beta,
                    n_jobs=1,
                    n_chains=1,
                    initializer=None,
                    store_acceptance_trace=False,
                    store_in_sample_predictions=False)
    model = SklearnModel(**settings)
    model.fit(X, y)
    return model, x, y
Ejemplo n.º 3
0
def run(alpha, beta, n_trees, size=100):
    """
    Fit a SklearnModel with a custom tree sampler to a noisy sine curve.

    ``size`` evenly spaced points on [0, 5] are generated, the target is
    ``sin(x)`` plus unit-variance normal noise, and the observed target is
    plotted together with the model's predictions.

    Returns the fitted model, the x grid and the target.
    """
    import warnings

    # Any UserWarning raised from here on becomes an exception.
    warnings.simplefilter("error", UserWarning)

    x = np.linspace(0, 5, size)
    X = pd.DataFrame(x)
    noise = np.random.normal(0, 1.0, size=size)
    y = noise + np.sin(x)

    from bartpy.samplers.unconstrainedtree.treemutation import get_tree_sampler

    sampler = get_tree_sampler(0.5, 0.5)
    model = SklearnModel(n_samples=50,
                         n_burn=50,
                         n_trees=n_trees,
                         alpha=alpha,
                         beta=beta,
                         n_jobs=1,
                         n_chains=1,
                         tree_sampler=sampler)
    model.fit(X, y)

    # Observed series first, then the fitted series on the same axes.
    plt.plot(y)
    fitted = model.predict(X)
    plt.plot(fitted)
    plt.show()
    return model, x, y
Ejemplo n.º 4
0
def run(alpha, beta, n_trees, size=100):
    """
    Fit a SklearnModel to a noisy sine curve and show diagnostic plots.

    The target is ``sin(x)`` on ``size`` evenly spaced points in [0, 5] plus
    normal noise with standard deviation 0.1. After fitting, the stored
    unnormalized target and the in-sample predictions are plotted, followed
    by the tree depth, feature split proportion and QQ plots.

    Returns the fitted model, the x grid and the target.
    """
    import warnings

    # Any UserWarning raised from here on becomes an exception.
    warnings.simplefilter("error", UserWarning)

    x = np.linspace(0, 5, size)
    X = pd.DataFrame(x)
    noise = np.random.normal(0, 0.1, size=size)
    y = noise + np.sin(x)

    settings = dict(n_samples=500,
                    n_burn=100,
                    n_trees=n_trees,
                    alpha=alpha,
                    beta=beta,
                    n_jobs=1,
                    n_chains=1)
    model = SklearnModel(**settings)
    model.fit(X, y)

    observed = model.data.unnormalized_y
    plt.plot(observed)
    in_sample = model.predict()
    plt.plot(in_sample)
    plt.show()

    for diagnostic in (plot_tree_depth, plot_feature_split_proportions,
                       plot_qq):
        diagnostic(model)
    return model, x, y
Ejemplo n.º 5
0
def run(size=100, alpha=0.95, beta=2.0, n_trees=50):
    """
    Fit a SklearnModel on a train split of a noisy sine curve and report
    the held-out RMSE.

    The target is ``sin(x)`` on ``size`` evenly spaced points in [0, 5] plus
    normal noise with standard deviation 0.1. A third of the points are held
    out (fixed ``random_state``), predicted after fitting, scattered against
    the true values, and the root mean squared error is printed.
    """
    import warnings

    # Any UserWarning raised from here on becomes an exception.
    warnings.simplefilter("error", UserWarning)
    x = np.linspace(0, 5, size)
    X = pd.DataFrame(x)
    y = np.random.normal(0, 0.1, size=size) + np.sin(x)

    model = SklearnModel(n_samples=100,
                         n_burn=50,
                         n_trees=n_trees,
                         alpha=alpha,
                         beta=beta,
                         n_jobs=1,
                         n_chains=1)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42,
                                                        shuffle=True)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    plt.scatter(y_test, y_pred)
    plt.show()

    # RMSE averages the squared errors; summing them instead would make the
    # value grow with the number of test points.
    rmse = np.sqrt(np.mean(np.square(y_test - y_pred)))
    print(rmse)
Ejemplo n.º 6
0
def plot_homoskedasity_diagnostics(model: SklearnModel, ax=None):
    """
    Plot fitted values against residuals with a regression line.

    Parameters
    ----------
    model: SklearnModel
        Fitted model; its stored covariates (``model.data.X.values``) are
        used for both the predictions and the residuals.
    ax: optional
        Axes to draw on. A new 5x5 inch figure is created when omitted.

    Returns
    -------
    The axes that were drawn on.
    """
    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=(5, 5))
    covariates = model.data.X.values
    # x and y are passed by keyword: recent seaborn releases no longer accept
    # them positionally, while older releases accept the keyword form too.
    sns.regplot(x=model.predict(covariates),
                y=model.residuals(covariates),
                ax=ax)
    ax.set_title("Fitted Values V Residuals")
    ax.set_xlabel("Fitted Value")
    ax.set_ylabel("Residual")
    return ax
Ejemplo n.º 7
0
def plot_qq(model: SklearnModel, ax=None) -> "matplotlib.axes.Axes":
    """
    Draw a QQ plot of the model's residuals on its stored covariates.

    Parameters
    ----------
    model: SklearnModel
        Fitted model; residuals are computed on ``model.data.X``.
    ax: optional
        Axes to draw on. A new figure with a single axes is created when
        omitted.

    Returns
    -------
    The axes that were drawn on.
    """
    if ax is None:
        _, ax = plt.subplots(1, 1)
    residuals = model.residuals(model.data.X)
    sm.qqplot(residuals, fit=True, line="45", ax=ax)
    ax.set_title("QQ plot")
    return ax
Ejemplo n.º 8
0
def run(alpha, beta, n_trees):
    """
    Fit a SklearnModel to 3000 points of a noisy sine curve and plot it.

    The target is ``sin(x)`` on an even grid over [0, 5] plus normal noise
    with standard deviation 0.1. The stored unnormalized target and the
    predictions are plotted, followed by the tree depth, feature split
    proportion and QQ diagnostics.

    Returns the fitted model, the x grid and the target.
    """
    n_points = 3000
    x = np.linspace(0, 5, n_points)
    X = pd.DataFrame(x)
    noise = np.random.normal(0, 0.1, size=n_points)
    y = noise + np.sin(x)

    model = SklearnModel(n_samples=50,
                         n_burn=50,
                         n_trees=n_trees,
                         alpha=alpha,
                         beta=beta)
    model.fit(X, y)

    observed = model.data.unnormalized_y
    plt.plot(observed)
    fitted = model.predict(X)
    plt.plot(fitted)
    plt.show()

    # These two diagnostics receive the model samples; plot_qq takes the
    # model itself.
    plot_tree_depth(model.model_samples)
    plot_feature_split_proportions(model.model_samples)
    plot_qq(model)
    return model, x, y
Ejemplo n.º 9
0
def run(alpha, beta, n_trees, n_regressors):
    """
    Fit a SklearnModel on 10000 rows of synthetic linear data and scatter
    the target against the in-sample predictions.

    The covariates are standard normal draws, except that column 1 of the
    first 5000 rows is overwritten with the constant 4. The target is the
    covariates weighted by coefficients drawn uniformly from [-2, 2), plus
    normal noise with standard deviation 0.1.

    Returns the fitted model, the raw covariate array and the target.
    """
    n_rows = 10000
    b_true = np.random.uniform(-2, 2, size=n_regressors)

    flat_draws = np.random.normal(0, 1, size=n_rows * n_regressors)
    x = flat_draws.reshape(n_rows, n_regressors)
    x[:5000, 1] = 4
    X = pd.DataFrame(x)

    noise = np.random.normal(0, 0.1, size=n_rows)
    signal = np.array(X.multiply(b_true, axis=1).sum(axis=1))
    y = noise + signal

    settings = dict(n_samples=200,
                    n_burn=50,
                    n_trees=n_trees,
                    alpha=alpha,
                    beta=beta)
    model = SklearnModel(**settings)
    model.fit(X, y)

    predictions = model.predict()
    plt.scatter(y, predictions)
    plt.show()
    return model, x, y
Ejemplo n.º 10
0
def convert_chains_models(model: SklearnModel, X_s: List[np.ndarray],
                          y_s: List[np.ndarray],
                          chains: List[Chain]) -> List[SklearnModel]:
    """
    Rebuild one model per dataset from a flat list of chains.

    ``chains`` is cut into consecutive groups of ``model.n_chains`` entries
    (the last group may be shorter). Each group is paired, in order, with
    the matching entries of ``X_s`` and ``y_s`` and handed to
    ``model.from_extract``. Pairing stops at the shortest of the three
    sequences.
    """
    group_size = model.n_chains

    batches = []
    for position, chain in enumerate(chains):
        _, offset = divmod(position, group_size)
        if offset == 0:
            # First chain of a new group.
            batches.append([chain])
        else:
            batches[-1].append(chain)

    rebuilt = []
    for batch, features, target in zip(batches, X_s, y_s):
        rebuilt.append(model.from_extract(batch, features, target))
    return rebuilt
Ejemplo n.º 11
0
def run(n: int = 10000, k_true: int = 3, k_null: int = 2):
    """
    Demonstrate null-distribution feature selection on synthetic data.

    Builds ``n`` rows with ``k_true`` informative and ``k_null``
    uninformative (zero coefficient) standard normal covariates, fits a
    pipeline of ``SelectNullDistributionThreshold`` followed by the model on
    a train split, then prints the selector's thresholds, feature
    proportions and support mask and shows its plot.

    Parameters
    ----------
    n: int
        Number of observations to simulate.
    k_true: int
        Number of covariates with a non-zero coefficient.
    k_null: int
        Number of covariates with a zero coefficient.
    """
    # np.random.uniform expects low <= high; the reversed order is documented
    # as undefined behaviour, so the bounds are given in ascending order.
    b_true = np.random.uniform(0.1, 2, size=k_true)
    b_true = np.array(list(b_true) + [0.0] * k_null)
    print(b_true)

    n_features = k_true + k_null
    x = np.random.normal(0, 1, size=n * n_features).reshape(n, n_features)

    X = pd.DataFrame(x)
    y = np.random.normal(0, 0.1, size=n) + np.array(
        X.multiply(b_true, axis=1).sum(axis=1))

    # The test split is produced but not used further in this demo.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42,
                                                        shuffle=True)

    model = SklearnModel(n_samples=50,
                         n_burn=50,
                         n_trees=20,
                         store_in_sample_predictions=False,
                         n_jobs=3,
                         n_chains=1)

    pipeline = make_pipeline(
        SelectNullDistributionThreshold(model, n_permutations=20), model)
    pipeline_model = pipeline.fit(X_train, y_train)

    # make_pipeline names each step after its lowercased class name.
    selector = pipeline_model.named_steps["selectnulldistributionthreshold"]
    print("Thresholds", selector.thresholds)
    print("Feature Proportions", selector.feature_proportions)
    print("Is Kept", selector._get_support_mask())
    selector.plot()
Ejemplo n.º 12
0
        results['t'].append(np.mean((cate_t - sim_test['tau'])**2))
        print('Cate_s')
        cate_s = s_learner(data, x_vars, x_test, base_learner)
        results['s'].append(np.mean((cate_s - sim_test['tau'])**2))
        print('Cate_x')
        cate_x = x_learner(data, x_vars, x_test, base_learner,
                           base_learner_class)
        results['x'].append(np.mean((cate_x - sim_test['tau'])**2))
    return results


if __name__ == "__main__":
    # Regressor candidates for the base learner; only the gradient boosting
    # one is passed to get_results below.
    base_learner_rf = RandomForestRegressor(n_estimators=1000, random_state=42)
    base_learner_gb = GradientBoostingRegressor(random_state=42)
    base_learner_bart = SklearnModel(n_trees=200,
                                     n_burn=1200,
                                     alpha=0.5,
                                     beta=1)

    # Classifier candidates; only the logistic regression is used below.
    gb = GradientBoostingClassifier(random_state=42)
    rf = RandomForestClassifier(n_estimators=500, random_state=42)
    lr = LogisticRegression()

    # Simulation names 'sim1' through 'sim6'.
    simulations = ['sim{}'.format(i) for i in range(1, 7)]
    for sim in simulations:
        print('Bart', sim)
        # NOTE(review): the label and output file say "Bart" but the gradient
        # boosting learner is what gets passed here; confirm which is intended.
        results = get_results(base_learner_gb, lr, sim)
        results_pd = pd.DataFrame(results)
        # TODO: change the save directory
        output_path = '../Bart_{}.csv'.format(sim)
        results_pd.to_csv(output_path, index=None)

        # Plot results['t'] against results['n'] divided by 1000.
        scaled_n = [i / 1000 for i in results['n']]
        plt.plot(scaled_n, results['t'], c='gray', marker='x')
Ejemplo n.º 13
0
def plot_qq(model: SklearnModel) -> None:
    """Show a QQ plot of the model's residuals against a fitted 45-degree line."""
    sm.qqplot(model.residuals(), fit=True, line="45")
    plt.show()
Ejemplo n.º 14
0
 def predict(self, X: np.ndarray = None) -> np.ndarray:
     """
     Predict as the sum of the base estimator and the BART component.

     When ``X`` is omitted the covariates stored on ``self.data.X`` are used.
     """
     features = self.data.X if X is None else X
     base_part = self.base_estimator.predict(features)
     bart_part = SklearnModel.predict(self, features)
     return base_part + bart_part
Ejemplo n.º 15
0
 def fit(self, X: np.ndarray, y: np.ndarray) -> 'ResidualBART':
     """
     Fit the base estimator, then fit BART on what it leaves unexplained.

     The BART target is ``y`` minus the base estimator's predictions on
     ``X``. Returns ``self``.
     """
     self.base_estimator.fit(X, y)
     base_prediction = self.base_estimator.predict(X)
     leftover = y - base_prediction
     SklearnModel.fit(self, X, leftover)
     return self
Ejemplo n.º 16
0
#gs_xgb = GridSearchCV(estimator=pipe_bart,
#                     param_grid=params_bart,
#                    cv=loo)
# Fit grid search
#gs_xgb.fit(X_train, y_train.ravel())
# Best params
#print('Best params: %s' % gs_xgb.best_params_)
# Best training data accuracy
#print('Best training score: %.3f' % gs_xgb.best_score_)
# Predict on test data with best params
from itertools import product

# Manual grid search. Every hyperparameter that is iterated over is passed
# to the model, so each combination produces a genuinely different fit.
# The iteration order matches nested loops over the lists in the order given.
for n_chains, n_trees, n_burn, n_samples, sigma_b, sigma_a in product(
        [3, 4, 5],
        [25, 50, 100],
        [100, 200, 300],
        [100, 50, 200],
        [0.0001, 0.01, 0.001],
        [0.0001, 0.01, 0.001]):
    gs_xgb = SklearnModel(sigma_a=sigma_a,
                          sigma_b=sigma_b,
                          n_samples=n_samples,
                          n_burn=n_burn,
                          n_trees=n_trees,
                          n_chains=n_chains).fit(X_train, y_train)
    y_pred = gs_xgb.predict(X_test)

    # The cutoff only changes how predictions are thresholded, so one fit
    # is scored at every cutoff instead of being refitted per cutoff.
    for cutoff in [0.1, 0.5]:
        # Test data accuracy of the model for this parameter combination
        #print('Test set score score for best params: %.3f ' % mean_squared_error(y_test, y_pred))
        print('Test set score score for best params: %.3f ' %
              f1_score(y_test, y_pred > cutoff))
        print("n samples", n_samples, "\n b ", sigma_b, "\na ", sigma_a,
              "\ncutoff ", cutoff)
        print("n chains", n_chains, "\nn trees ", n_trees, "\nn burn ",
              n_burn)
Ejemplo n.º 17
0
 def fit(self, X: np.ndarray, y: np.ndarray) -> 'OLS':
     """
     Fit the stat model on ``(y, X)``, then fit BART on its residuals.

     The fitted stat model is kept on ``self.stat_model_fit``. Returns
     ``self``.
     """
     fitted_stat_model = self.stat_model(y, X).fit()
     self.stat_model_fit = fitted_stat_model
     SklearnModel.fit(self, X, fitted_stat_model.resid)
     return self
Ejemplo n.º 18
0
 def fit(self, X: pd.DataFrame, y: np.ndarray) -> 'OLS':
     """
     Fit the stat model on ``(y, X)``, print its residuals, then fit BART
     on those residuals.

     The fitted stat model is kept on ``self.stat_model_fit``. Returns
     ``self``.
     """
     fitted_stat_model = self.stat_model(y, X).fit()
     self.stat_model_fit = fitted_stat_model
     residuals = fitted_stat_model.resid
     print(residuals)
     SklearnModel.fit(self, X, residuals)
     return self
Ejemplo n.º 19
0
def plot_residuals(model: SklearnModel):
    """Plot the stored unnormalized target minus the in-sample predictions."""
    observed = model.data.unnormalized_y
    fitted = model.predict()
    plt.plot(observed - fitted)
    plt.show()
Ejemplo n.º 20
0
def plot_modelled_against_actual(model: SklearnModel):
    """Plot the stored unnormalized target and the in-sample predictions together."""
    actual = model.data.unnormalized_y
    plt.plot(actual)
    modelled = model.predict()
    plt.plot(modelled)
    plt.show()
Ejemplo n.º 21
0
# Flatten both test outcome arrays to one dimension.
Y0_test = Y0_test.reshape([-1])
Y1_test = Y1_test.reshape([-1])

#----------------------------------------------------------------
#
#     MODELS
#
#----------------------------------------------------------------

# Number of trees per model (the original author notes the default is 200).
n_trees = 100

# One model per (X0, Y0) / (X1, Y1) dataset, otherwise default parameters.
# NOTE(review): presumably the control and treated groups; confirm upstream.
model0 = SklearnModel(n_trees=n_trees)
model0.fit(X0, Y0)
model1 = SklearnModel(n_trees=n_trees)
model1.fit(X1, Y1)


def _prediction_gap(covariates):
    """Return model1's predictions minus model0's on the same covariates."""
    return model1.predict(covariates) - model0.predict(covariates)


tau_hat = _prediction_gap(X)
pehe_ = eval_pehe(tau_hat, Tau)

tau_hat_val = _prediction_gap(X_val)
tau_hat_test = _prediction_gap(X_test)

pehe_val = eval_pehe(tau_hat_val, Tau_val)
pehe_test = eval_pehe(tau_hat_test, Tau_test)
Ejemplo n.º 22
0
 def predict(self, X: np.ndarray = None):
     """
     Predict as the sum of the fitted stat model and the BART component.

     When ``X`` is omitted the covariates stored on ``self.data.X`` are used.
     """
     features = self.data.X if X is None else X
     stat_part = self.stat_model_fit.predict(features)
     bart_part = SklearnModel.predict(self, features)
     return stat_part + bart_part