Beispiel #1
0
def run(size=100, alpha=0.95, beta=2.0, n_trees=50):
    """Fit a ``SklearnModel`` to a noisy sine curve and print its hold-out RMSE.

    ``size`` evenly spaced points on [0, 5] are generated, the target is
    ``sin(x)`` plus Gaussian noise (standard deviation 0.1), and the data is
    split with ``test_size=0.33``.  The model is trained on the training part,
    a scatter plot of true versus predicted test targets is shown, and the
    root-mean-squared error on the test part is printed.

    Args:
        size: Number of data points to generate.
        alpha: Passed through to ``SklearnModel(alpha=...)``.
        beta: Passed through to ``SklearnModel(beta=...)``.
        n_trees: Passed through to ``SklearnModel(n_trees=...)``.

    Returns:
        None.  The error is printed rather than returned.

    Side effects:
        Installs a warnings filter that turns every ``UserWarning`` into an
        exception (the filter stays in place after the call), opens a
        matplotlib figure and writes to stdout.
    """
    import warnings

    # Any UserWarning raised from here on aborts the run instead of being
    # silently logged.
    warnings.simplefilter("error", UserWarning)
    x = np.linspace(0, 5, size)
    X = pd.DataFrame(x)
    y = np.random.normal(0, 0.1, size=size) + np.sin(x)

    model = SklearnModel(n_samples=100,
                         n_burn=50,
                         n_trees=n_trees,
                         alpha=alpha,
                         beta=beta,
                         n_jobs=1,
                         n_chains=1)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42,
                                                        shuffle=True)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    plt.scatter(y_test, y_pred)
    plt.show()

    # RMSE averages the squared errors before taking the root; summing them
    # (as the previous version did) yields a value that scales with the
    # number of test points and is not comparable across ``size`` values.
    rmse = np.sqrt(np.mean(np.square(y_test - y_pred)))
    print(rmse)
Beispiel #2
0
def run(alpha, beta, n_trees, size=100):
    """Fit a ``SklearnModel`` with a custom tree sampler to a noisy sine curve.

    The inputs are ``size`` evenly spaced points on [0, 5]; the target is
    ``sin(x)`` plus Gaussian noise with standard deviation 1.0.  After
    fitting on the full data set, the raw targets and the in-sample
    predictions are drawn on the same matplotlib axes.

    Args:
        alpha: Passed through to ``SklearnModel(alpha=...)``.
        beta: Passed through to ``SklearnModel(beta=...)``.
        n_trees: Passed through to ``SklearnModel(n_trees=...)``.
        size: Number of data points to generate.

    Returns:
        Tuple ``(model, x, y)``: the fitted model, the 1-D input grid and the
        noisy targets.

    Side effects:
        Installs a warnings filter that raises on every ``UserWarning`` and
        opens a matplotlib figure.
    """
    import warnings

    warnings.simplefilter("error", UserWarning)

    grid = np.linspace(0, 5, size)
    features = pd.DataFrame(grid)
    noise = np.random.normal(0, 1.0, size=size)
    target = noise + np.sin(grid)

    from bartpy.samplers.unconstrainedtree.treemutation import get_tree_sampler

    settings = dict(
        n_samples=50,
        n_burn=50,
        n_trees=n_trees,
        alpha=alpha,
        beta=beta,
        n_jobs=1,
        n_chains=1,
        tree_sampler=get_tree_sampler(0.5, 0.5),
    )
    model = SklearnModel(**settings)
    model.fit(features, target)

    # Overlay observed targets and fitted values.
    for series in (target, model.predict(features)):
        plt.plot(series)
    plt.show()

    # Optional diagnostics, currently disabled:
    #   plot_tree_depth(model)
    #   plot_feature_split_proportions(model)
    #   plot_qq(model)
    #   null_distr = null_feature_split_proportions_distribution(model, X, y)
    #   print(null_distr)
    return model, grid, target
Beispiel #3
0
def run(alpha,
        beta,
        n_trees,
        n_regressors,
        n_burn=50,
        n_samples=200,
        n_obsv=1000):
    """Fit a ``SklearnModel`` to synthetic linear data with one clamped column.

    Coefficients are drawn uniformly from [-2, 2) and the design matrix from a
    standard normal distribution.  The first 50 rows of column index 1 are
    overwritten with the constant 4, so ``n_regressors`` must be at least 2.
    The target is the linear combination of the columns plus Gaussian noise
    (standard deviation 0.1).

    Args:
        alpha: Passed through to ``SklearnModel(alpha=...)``.
        beta: Passed through to ``SklearnModel(beta=...)``.
        n_trees: Passed through to ``SklearnModel(n_trees=...)``.
        n_regressors: Number of columns in the design matrix.
        n_burn: Passed through to ``SklearnModel(n_burn=...)``.
        n_samples: Passed through to ``SklearnModel(n_samples=...)``.
        n_obsv: Number of rows in the design matrix.

    Returns:
        Tuple ``(model, x, y)``: the fitted model, the design matrix as a
        NumPy array and the target vector.
    """
    coefficients = np.random.uniform(-2, 2, size=n_regressors)

    shape = (n_obsv, n_regressors)
    design = np.random.normal(0, 1, size=n_obsv * n_regressors).reshape(shape)
    design[:50, 1] = 4  # clamp the head of the second column to a constant

    frame = pd.DataFrame(design)
    noise = np.random.normal(0, 0.1, size=n_obsv)
    signal = np.array(frame.multiply(coefficients, axis=1).sum(axis=1))
    target = noise + signal

    settings = dict(
        n_samples=n_samples,
        n_burn=n_burn,
        n_trees=n_trees,
        alpha=alpha,
        beta=beta,
        n_jobs=1,
        n_chains=1,
        initializer=None,
        store_acceptance_trace=False,
        store_in_sample_predictions=False,
    )
    model = SklearnModel(**settings)
    model.fit(frame, target)

    # Optional visual check, currently disabled:
    #   predictions = model.predict()
    #   plt.scatter(y, predictions)
    #   plt.show()
    return model, design, target
Beispiel #4
0
def run(alpha, beta, n_trees, size=100):
    """Fit a ``SklearnModel`` to a noisy sine curve and show diagnostic plots.

    The inputs are ``size`` evenly spaced points on [0, 5]; the target is
    ``sin(x)`` plus Gaussian noise with standard deviation 0.1.  After
    fitting, ``model.data.unnormalized_y`` and the output of
    ``model.predict()`` are plotted together, followed by the tree-depth,
    feature-split-proportion and QQ plots.

    Args:
        alpha: Passed through to ``SklearnModel(alpha=...)``.
        beta: Passed through to ``SklearnModel(beta=...)``.
        n_trees: Passed through to ``SklearnModel(n_trees=...)``.
        size: Number of data points to generate.

    Returns:
        Tuple ``(model, x, y)``: the fitted model, the 1-D input grid and the
        noisy targets.

    Side effects:
        Installs a warnings filter that raises on every ``UserWarning`` and
        opens matplotlib figures.
    """
    import warnings

    warnings.simplefilter("error", UserWarning)

    grid = np.linspace(0, 5, size)
    features = pd.DataFrame(grid)
    noise = np.random.normal(0, 0.1, size=size)
    target = noise + np.sin(grid)

    settings = dict(
        n_samples=500,
        n_burn=100,
        n_trees=n_trees,
        alpha=alpha,
        beta=beta,
        n_jobs=1,
        n_chains=1,
    )
    model = SklearnModel(**settings)
    model.fit(features, target)

    # Training targets as stored by the model, then the model's predictions.
    plt.plot(model.data.unnormalized_y)
    plt.plot(model.predict())
    plt.show()

    # Run each diagnostic in turn on the fitted model.
    for diagnostic in (plot_tree_depth, plot_feature_split_proportions, plot_qq):
        diagnostic(model)

    # Optional null-distribution check, currently disabled:
    #   null_distr = null_feature_split_proportions_distribution(model, X, y)
    #   print(null_distr)
    return model, grid, target
Beispiel #5
0
def run(alpha, beta, n_trees):
    """Fit a ``SklearnModel`` to 3000 noisy sine samples and plot diagnostics.

    The inputs are 3000 evenly spaced points on [0, 5]; the target is
    ``sin(x)`` plus Gaussian noise with standard deviation 0.1.  After
    fitting, the stored training targets and the predictions on the training
    inputs are plotted, followed by tree-depth and feature-split plots (both
    fed ``model.model_samples``) and a QQ plot of the model.

    Args:
        alpha: Passed through to ``SklearnModel(alpha=...)``.
        beta: Passed through to ``SklearnModel(beta=...)``.
        n_trees: Passed through to ``SklearnModel(n_trees=...)``.

    Returns:
        Tuple ``(model, x, y)``: the fitted model, the 1-D input grid and the
        noisy targets.
    """
    n_points = 3000

    grid = np.linspace(0, 5, n_points)
    features = pd.DataFrame(grid)
    noise = np.random.normal(0, 0.1, size=n_points)
    target = noise + np.sin(grid)

    model = SklearnModel(
        n_samples=50,
        n_burn=50,
        n_trees=n_trees,
        alpha=alpha,
        beta=beta,
    )
    model.fit(features, target)

    plt.plot(model.data.unnormalized_y)
    plt.plot(model.predict(features))
    plt.show()

    # The first two diagnostics take the posterior samples, the QQ plot takes
    # the model itself.
    plot_tree_depth(model.model_samples)
    plot_feature_split_proportions(model.model_samples)
    plot_qq(model)

    # Optional null-distribution check, currently disabled:
    #   null_distr = null_feature_split_proportions_distribution(model, X, y)
    #   print(null_distr)
    return model, grid, target
Beispiel #6
0
def run(alpha, beta, n_trees, n_regressors):
    """Fit a ``SklearnModel`` to 10000 rows of synthetic linear data and plot the fit.

    Coefficients are drawn uniformly from [-2, 2) and the design matrix from a
    standard normal distribution.  The first 5000 rows of column index 1 are
    overwritten with the constant 4, so ``n_regressors`` must be at least 2.
    The target is the linear combination of the columns plus Gaussian noise
    (standard deviation 0.1).  A scatter plot of targets against the output
    of ``model.predict()`` is shown after fitting.

    Args:
        alpha: Passed through to ``SklearnModel(alpha=...)``.
        beta: Passed through to ``SklearnModel(beta=...)``.
        n_trees: Passed through to ``SklearnModel(n_trees=...)``.
        n_regressors: Number of columns in the design matrix.

    Returns:
        Tuple ``(model, x, y)``: the fitted model, the design matrix as a
        NumPy array and the target vector.
    """
    n_rows = 10000
    n_clamped = 5000

    coefficients = np.random.uniform(-2, 2, size=n_regressors)

    shape = (n_rows, n_regressors)
    design = np.random.normal(0, 1, size=n_rows * n_regressors).reshape(shape)
    design[:n_clamped, 1] = 4  # first half of the second column is constant

    frame = pd.DataFrame(design)
    noise = np.random.normal(0, 0.1, size=n_rows)
    signal = np.array(frame.multiply(coefficients, axis=1).sum(axis=1))
    target = noise + signal

    settings = dict(
        n_samples=200,
        n_burn=50,
        n_trees=n_trees,
        alpha=alpha,
        beta=beta,
    )
    model = SklearnModel(**settings)
    model.fit(frame, target)

    fitted = model.predict()
    plt.scatter(target, fitted)
    plt.show()
    return model, design, target
Beispiel #7
0
def run(n: int = 10000, k_true: int = 3, k_null: int = 2):
    """Demonstrate null-distribution feature selection on synthetic linear data.

    ``k_true`` informative columns receive coefficients drawn uniformly from
    [0.1, 2) and ``k_null`` further columns receive a coefficient of exactly
    zero.  The design matrix is standard normal and the target is the linear
    combination plus Gaussian noise (standard deviation 0.1).  A pipeline of
    ``SelectNullDistributionThreshold`` followed by the model is fitted on
    the training part of a ``test_size=0.33`` split; the selector's
    thresholds, feature proportions and support mask are printed and its
    plot is shown.

    Args:
        n: Number of rows to generate.
        k_true: Number of columns with a non-zero coefficient.
        k_null: Number of columns with a zero coefficient.

    Returns:
        None.  Results are printed and plotted.
    """
    # Bounds are given as (low, high); the previous (2, 0.1) ordering relied
    # on behaviour NumPy documents as undefined when high < low.
    b_true = np.random.uniform(0.1, 2, size=k_true)
    b_true = np.concatenate([b_true, np.zeros(k_null)])
    print(b_true)

    n_features = k_true + k_null
    x = np.random.normal(0, 1, size=n * n_features).reshape(n, n_features)

    X = pd.DataFrame(x)
    y = np.random.normal(0, 0.1, size=n) + np.array(
        X.multiply(b_true, axis=1).sum(axis=1))

    # The test part of the split is not used below; only the training part
    # is fed to the pipeline.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42,
                                                        shuffle=True)

    model = SklearnModel(n_samples=50,
                         n_burn=50,
                         n_trees=20,
                         store_in_sample_predictions=False,
                         n_jobs=3,
                         n_chains=1)

    # The same model instance is handed to the selector and used as the
    # final pipeline step.
    pipeline = make_pipeline(
        SelectNullDistributionThreshold(model, n_permutations=20), model)
    pipeline_model = pipeline.fit(X_train, y_train)

    # Look the selector step up once instead of once per report line.
    selector = pipeline_model.named_steps["selectnulldistributionthreshold"]
    print("Thresholds", selector.thresholds)
    print("Feature Proportions", selector.feature_proportions)
    print("Is Kept", selector._get_support_mask())
    selector.plot()
Beispiel #8
0
# Disabled GridSearchCV variant, kept for reference:
#gs_xgb = GridSearchCV(estimator=pipe_bart,
#                     param_grid=params_bart,
#                    cv=loo)
# Fit grid search
#gs_xgb.fit(X_train, y_train.ravel())
# Best params
#print('Best params: %s' % gs_xgb.best_params_)
# Best training data accuracy
#print('Best training score: %.3f' % gs_xgb.best_score_)

# Manual hyper-parameter sweep.  Every combination of the model settings is
# fitted once on (X_train, y_train); the F1 score on the test set is then
# reported for each classification cutoff applied to the predictions.
from itertools import product

# product() varies the right-most list fastest, i.e. the same visiting order
# as nested for-loops written in this order.
for n_chains, n_trees, n_burn, n_samples, sigma_b, sigma_a in product(
        [3, 4, 5],
        [25, 50, 100],
        [100, 200, 300],
        [100, 50, 200],
        [0.0001, 0.01, 0.001],
        [0.0001, 0.01, 0.001]):
    # n_chains / n_trees / n_burn were previously iterated over but never
    # handed to the model, so identical configurations were refitted
    # repeatedly; they are now part of the configuration.
    gs_xgb = SklearnModel(sigma_a=sigma_a,
                          sigma_b=sigma_b,
                          n_samples=n_samples,
                          n_burn=n_burn,
                          n_trees=n_trees,
                          n_chains=n_chains).fit(X_train, y_train)
    y_pred = gs_xgb.predict(X_test)

    # The cutoff only affects how predictions are thresholded, so it is
    # evaluated on the already-fitted model instead of triggering a refit.
    for cutoff in [0.1, 0.5]:
        #print('Test set score score for best params: %.3f ' % mean_squared_error(y_test, y_pred))
        print('Test set score score for best params: %.3f ' %
              f1_score(y_test, y_pred > cutoff))
        # n_samples is printed as a plain value (it used to be wrapped in a
        # set literal and showed up as "{100}").
        print("n samples", n_samples, "\n b ", sigma_b,
              "\na ", sigma_a, "\ncutoff ", cutoff)
        print("n chains", n_chains, "\nn trees ", n_trees,
              "\nn burn ", n_burn)
Beispiel #9
0
        results['t'].append(np.mean((cate_t - sim_test['tau'])**2))
        print('Cate_s')
        cate_s = s_learner(data, x_vars, x_test, base_learner)
        results['s'].append(np.mean((cate_s - sim_test['tau'])**2))
        print('Cate_x')
        cate_x = x_learner(data, x_vars, x_test, base_learner,
                           base_learner_class)
        results['x'].append(np.mean((cate_x - sim_test['tau'])**2))
    return results


if __name__ == "__main__":
    # Candidate outcome regressors.
    base_learner_rf = RandomForestRegressor(n_estimators=1000, random_state=42)
    base_learner_gb = GradientBoostingRegressor(random_state=42)
    base_learner_bart = SklearnModel(
        n_trees=200, n_burn=1200, alpha=0.5, beta=1)

    # Candidate classifiers.
    gb = GradientBoostingClassifier(random_state=42)
    rf = RandomForestClassifier(n_estimators=500, random_state=42)
    lr = LogisticRegression()

    # Simulation identifiers 'sim1' .. 'sim6'.
    simulations = ['sim' + str(number) for number in range(1, 7)]

    for sim in simulations:
        print('Bart', sim)
        # NOTE(review): the progress label and the CSV name say "Bart", but
        # the gradient-boosting regressor is what gets passed here and
        # base_learner_bart is never used -- confirm which learner is intended.
        results = get_results(base_learner_gb, lr, sim)

        results_pd = pd.DataFrame(results)
        # TODO: change the save directory
        csv_path = '../Bart_{}.csv'.format(sim)
        results_pd.to_csv(csv_path, index=None)

        # Plot the 't' error series against results['n'] scaled down by 1000.
        scaled_n = [count / 1000 for count in results['n']]
        plt.plot(scaled_n, results['t'], c='gray', marker='x')
Beispiel #10
0
# Flatten both test-set outcome arrays to one dimension.
Y0_test = Y0_test.reshape(-1)
Y1_test = Y1_test.reshape(-1)

#----------------------------------------------------------------
#
#     MODELS
#
#----------------------------------------------------------------

n_trees = 100  # default is 200 trees


def _fit_outcome_model(features, outcomes):
    """Build a SklearnModel with ``n_trees`` trees and fit it on the given data."""
    fitted = SklearnModel(n_trees=n_trees)  # all other settings left at defaults
    fitted.fit(features, outcomes)
    return fitted


def _estimate_tau(features):
    """Return model1's prediction minus model0's prediction for ``features``."""
    return model1.predict(features) - model0.predict(features)


# One model per (X0, Y0) and (X1, Y1) data set.
model0 = _fit_outcome_model(X0, Y0)
model1 = _fit_outcome_model(X1, Y1)

# Estimated effect and its PEHE on the data set X / Tau.
tau_hat = _estimate_tau(X)
pehe_ = eval_pehe(tau_hat, Tau)

# Same evaluation on the validation and test sets.
tau_hat_val = _estimate_tau(X_val)
tau_hat_test = _estimate_tau(X_test)

pehe_val = eval_pehe(tau_hat_val, Tau_val)
pehe_test = eval_pehe(tau_hat_test, Tau_test)