Example #1
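A unit test that fits an NGBoost classifier (Bernoulli distribution, default tree learner) on the breast-cancer dataset and asserts a held-out ROC AUC of at least 0.95.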
 def test_classification(self):
     data, target = load_breast_cancer(return_X_y=True)
     x_train, x_test, y_train, y_test = train_test_split(data,
                                                         target,
                                                         test_size=0.2,
                                                         random_state=42)
     ngb = NGBoost(Base=default_tree_learner,
                   Dist=Bernoulli,
                   Score=MLE,
                   verbose=False)
     ngb.fit(x_train, y_train)
     preds = ngb.pred_dist(x_test)
     score = roc_auc_score(y_test, preds.prob)
     assert score >= 0.95
Example #2
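The companion regression test: an NGBoost regressor with a Normal distribution and natural gradient is fit on the Boston housing dataset and must reach a held-out MSE of at most 8.0.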
 def test_regression(self):
     # note: load_boston was removed in scikit-learn 1.2; this requires an older version
     data, target = load_boston(return_X_y=True)
     x_train, x_test, y_train, y_test = train_test_split(data,
                                                         target,
                                                         test_size=0.2,
                                                         random_state=42)
     ngb = NGBoost(Base=default_tree_learner,
                   Dist=Normal,
                   Score=MLE,
                   natural_gradient=True,
                   verbose=False)
     ngb.fit(x_train, y_train)
     preds = ngb.predict(x_test)
     score = mean_squared_error(y_test, preds)
     assert score <= 8.0
Example #3
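A survival-analysis helper that fits NGBoost to censored data and imputes the times of right-censored rows (Event == 0) from the fitted predictive distributions.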
def ngb_impute(estimator, X, Y):
    """Impute right-censored survival times in Y using NGBoost predictions."""
    # LEARNER is a module-level setting naming the base learner: "tree" or "linear".
    base_name_to_learner = {
        "tree": default_tree_learner,
        "linear": default_linear_learner,
    }

    ngb = NGBoost(Dist=estimator,
                  n_estimators=200,
                  learning_rate=.05,
                  natural_gradient=True,
                  verbose=False,
                  minibatch_frac=1.0,
                  Base=base_name_to_learner[LEARNER],
                  Score=MLE)

    train = ngb.fit(X, Y)
    Y_imputed = np.copy(Y)

    # Rows with Event == 0 are right-censored; their recorded Time is only a lower bound.
    cens_mask = (Y['Event'] == 0)
    min_vals = Y['Time'][cens_mask]
    pred_dists = train.pred_dist(X[cens_mask])

    # The loc parameter may be 2-D or 1-D depending on the distribution.
    try:
        outputs = pred_dists.loc[:, 0]
    except IndexError:
        outputs = pred_dists.loc

    # mus = pred_dists.loc
    # sigmas = pred_dists.scale
    # preds = cond_expectation(estimator, mus, sigmas, min_vals)

    # Predictions are on the log scale, so exponentiate before writing them back.
    Y_imputed['Time'][cens_mask] = np.exp(outputs)
    return Y_imputed
Example #4
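A fragment of a 1-D regression experiment: NGBoost is fit on polynomial-expanded training data, its calibration is checked on a wider test interval, and the staged predictive mean is plotted against the training points.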
    # Degree-1 PolynomialFeatures just prepends a bias column: columns are [1, x].
    poly_transform = PolynomialFeatures(1)
    x_tr = poly_transform.fit_transform(x_tr)

    ngb = NGBoost(
        Base=default_tree_learner,
        Dist=Normal,
        Score=MLE,
        n_estimators=args.n_estimators,
        learning_rate=args.lr,
        natural_gradient=args.natural,
        minibatch_frac=args.minibatch_frac,
        verbose=True,
    )

    ngb.fit(x_tr, y_tr)

    x_te, y_te, _ = gen_data(n=1000, bound=1.3)
    x_te = poly_transform.transform(x_te)
    preds = ngb.pred_dist(x_te)

    pctles, obs, _, _ = calibration_regression(preds, y_te)

    all_preds = ngb.staged_pred_dist(x_te)
    preds = all_preds[-1]
    plt.figure(figsize=(6, 3))
    plt.scatter(x_tr[:, 1], y_tr, color="black", marker=".", alpha=0.5)
    plt.plot(
        x_te[:, 1],
        preds.loc,
        color="black",
Example #5
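A fragment of a model-selection loop: staged validation predictions are scored by RMSE and NLL to choose the best boosting iteration before an optional full retrain.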

        X_train, X_val, y_train, y_val = train_test_split(X_trainall, y_trainall, test_size=0.2)
        
        y_true += list(y_test.flatten())

        ngb = NGBoost(Base=base_name_to_learner[args.base],
                      Dist=eval(args.distn),
                      Score=score_name_to_score[args.score](64),
                      n_estimators=args.n_est,
                      learning_rate=args.lr,
                      natural_gradient=args.natural,
                      minibatch_frac=args.minibatch_frac,
                      verbose=args.verbose)

        train_loss, val_loss = ngb.fit(X_train, y_train) #, X_val, y_val)

        y_preds = ngb.staged_predict(X_val)
        y_forecasts = ngb.staged_pred_dist(X_val)
        val_rmse = [mean_squared_error(y_pred, y_val) for y_pred in y_preds]
        val_nll = [-y_forecast.logpdf(y_val.flatten()).mean() for y_forecast in y_forecasts]
        best_itr = np.argmin(val_rmse) + 1  # immediately overwritten: NLL drives selection
        best_itr = np.argmin(val_nll) + 1

        full_retrain = True
        if full_retrain:
            ngb = NGBoost(Base=base_name_to_learner[args.base],
                      Dist=eval(args.distn),
                      Score=score_name_to_score[args.score](64),
                      n_estimators=args.n_est,
                      learning_rate=args.lr,
Example #6
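A minimal, complete classification script: train an NGBoost Bernoulli model on the breast-cancer dataset and print the test ROC AUC.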
from ngboost.ngboost import NGBoost
from ngboost.distns import Bernoulli
from ngboost.learners import default_tree_learner
from ngboost.scores import MLE

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

if __name__ == "__main__":

    X, Y = load_breast_cancer(return_X_y=True)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

    ngb = NGBoost(Base=default_tree_learner,
                  Dist=Bernoulli,
                  Score=MLE(),
                  verbose=True)
    ngb.fit(X_train, Y_train)

    preds = ngb.pred_dist(X_test)
    print("ROC:", roc_auc_score(Y_test, preds.prob))
Example #7
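A simulated censored-regression experiment: outcomes are right-censored by synthetic times T, a LogNormal model is fit with the MLE score, and R², prediction histograms, and calibration curves are inspected.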
    # Simulate censoring times T; C flags observations where T precedes the outcome.
    T = X @ np.ones((n, 1)) + 0.5 * np.random.randn(m, 1) + args.eps
    C = (T < Y).astype(int)

    print(X.shape, Y.shape, C.shape)
    print(f"Censorship: {np.mean(C):.2f}")

    X_tr, X_te, Y_tr, Y_te, T_tr, T_te, C_tr, C_te = train_test_split(
        X, Y, T, C, test_size=0.2)

    ngb = NGBoost(Dist=LogNormal,
                  n_estimators=args.n_estimators,
                  learning_rate=args.lr,
                  natural_gradient=False,
                  Base=default_linear_learner,
                  Score=MLE())
    train_losses = ngb.fit(X_tr, Y_join(np.exp(np.minimum(Y_tr, T_tr)), C_tr))

    preds = ngb.pred_dist(X_te)
    print(f"R2: {r2_score(Y_te, np.log(preds.mean()))}")

    plt.hist(preds.mean(), range=(-5, 5), bins=30, alpha=0.5, label="Pred")
    plt.hist(Y_te, range=(-5, 5), bins=30, alpha=0.5, label="True")
    plt.legend()
    plt.show()

    # since we simulated the data we fully observe all outcomes
    pctles, observed, slope, intercept = calibration_regression(preds, Y_te)
    plot_calibration_curve(pctles, observed)
    plt.show()

    pctles, observed, slope, intercept = calibration_time_to_event(
Example #8
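A synthetic linear-regression benchmark: NGBoost with a linear base learner is fit on simulated data and evaluated by test-set R².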
    argparser.add_argument("--distn", type=str, default="Normal")
    argparser.add_argument("--natural", action="store_true")
    argparser.add_argument("--score", type=str, default="CRPS")
    args = argparser.parse_args()

    np.random.seed(123)

    m, n = 1200, 50
    noise = np.random.randn(m, 1)
    beta1 = np.random.randn(n, 1)
    X = np.random.randn(m, n) / np.sqrt(n)
    Y = X @ beta1 + args.noise_lvl * noise
    print(X.shape, Y.shape)

    X_train, X_test = X[:1000, :], X[1000:, :]
    Y_train, Y_test = Y[:1000], Y[1000:]

    ngb = NGBoost(n_estimators=400,
                  learning_rate=args.lr,
                  Dist=Normal,
                  Base=default_linear_learner,
                  natural_gradient=args.natural,
                  minibatch_frac=1.0,
                  Score=eval(args.score)(),
                  verbose=True,
                  verbose_eval=10)

    losses = ngb.fit(X_train, Y_train)
    forecast = ngb.pred_dist(X_test)
    print("R2:", r2_score(Y_test, forecast.loc))
Example #9
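A heteroskedastic simulation: NGBoost is fit with the CRPS score, and the PIT histogram plus a kernel-density estimate of the predicted scale parameters are plotted; a second configuration follows in the truncated tail.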
    X = np.random.randn(m, n) / np.sqrt(n)
    # Y = X @ beta + 0.5 * noise
    Y = X @ beta1 + 0.5 * np.sqrt(np.exp(X @ beta2)) * noise
    print(X.shape, Y.shape)

    axis = np.linspace(0.0, 2, 200)
    plt.figure(figsize=(8, 3))

    ngb = NGBoost(n_estimators=100,
                  learning_rate=1.0,
                  Dist=Normal,
                  Base=default_linear_learner,
                  natural_gradient=True,
                  minibatch_frac=1.0,
                  Score=CRPS())
    ngb.fit(X, Y)
    preds = ngb.pred_dist(X)
    print(preds.scale.mean())
    print(preds.scale.std())
    pctles, observed, slope, intercept = calibration_regression(preds, Y)

    plt.subplot(1, 2, 1)
    plot_pit_histogram(pctles, observed, label="CRPS", linestyle="--")
    plt.subplot(1, 2, 2)
    plt.plot(axis,
             gaussian_kde(preds.scale)(axis),
             linestyle="--",
             color="black",
             label="CRPS")

    ngb = NGBoost(n_estimators=100,
Example #10
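A variant of the model-selection loop in Example #5: the best boosting iteration is taken from validation NLL (overriding the RMSE-based pick), followed by a full retrain.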
                                                          y_trainall,
                                                          test_size=0.2)

        y_true += list(y_test.flatten())

        ngb = NGBoost(Base=base_name_to_learner[args.base],
                      Dist=eval(args.distn),
                      Score=score_name_to_score[args.score],
                      n_estimators=args.n_est,
                      learning_rate=args.lr,
                      natural_gradient=args.natural,
                      minibatch_frac=args.minibatch_frac,
                      verbose=args.verbose)

        #train_loss, val_loss
        ngb.fit(X_train, y_train)  #, X_val, y_val)

        y_preds = ngb.staged_predict(X_val)
        y_forecasts = ngb.staged_pred_dist(X_val)
        val_rmse = [mean_squared_error(y_pred, y_val) for y_pred in y_preds]
        val_nll = [
            -y_forecast.logpdf(y_val.flatten()).mean()
            for y_forecast in y_forecasts
        ]
        best_itr = np.argmin(val_rmse) + 1  # immediately overwritten: NLL drives selection
        best_itr = np.argmin(val_nll) + 1

        full_retrain = True
        if full_retrain:
            ngb = NGBoost(Base=base_name_to_learner[args.base],
                          Dist=eval(args.distn),
Example #11
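Like Example #4: NGBoost is fit on polynomial-expanded 1-D data and the staged predictive mean is plotted over a wider test range.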
    x_tr, y_tr, _ = gen_data(n=50)

    poly_transform = PolynomialFeatures(1)
    x_tr = poly_transform.fit_transform(x_tr)

    ngb = NGBoost(Base=default_tree_learner,
                  Dist=Normal,
                  Score=MLE(),
                  n_estimators=args.n_estimators,
                  learning_rate=args.lr,
                  natural_gradient=args.natural,
                  minibatch_frac=args.minibatch_frac,
                  verbose=True)

    train_loss, val_loss = ngb.fit(x_tr, y_tr)

    x_te, y_te, _ = gen_data(n=1000, bound=1.3)
    x_te = poly_transform.transform(x_te)
    preds = ngb.pred_dist(x_te)

    pctles, obs, _, _ = calibration_regression(preds, y_te)

    all_preds = ngb.staged_pred_dist(x_te)
    preds = all_preds[-1]
    plt.figure(figsize=(6, 3))
    plt.scatter(x_tr[:, 1], y_tr, color="black", marker=".", alpha=0.5)
    plt.plot(x_te[:, 1],
             preds.loc,
             color="black",
             linestyle="-",
Example #12
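A survival benchmark fragment: NGBoost is fit on censored data, the censored concordance index is printed for both splits, and a scikit-survival gradient-boosting baseline (GBSA) is set up for comparison.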
                                                            Y,
                                                            test_size=0.2)
        X_train, X_val, Y_train, Y_val = train_test_split(X_train,
                                                          Y_train,
                                                          test_size=0.2)

        ngb = NGBoost(Dist=eval(args.distn),
                      n_estimators=args.n_est,
                      learning_rate=args.lr,
                      natural_gradient=args.natural,
                      verbose=args.verbose,
                      minibatch_frac=1.0,
                      Base=base_name_to_learner[args.base],
                      Score=eval(args.score)())

        train_losses = ngb.fit(X_train, Y_train)  #, X_val, Y_val)
        forecast = ngb.pred_dist(X_test)
        train_forecast = ngb.pred_dist(X_train)
        print('NGB score: %.4f (val), %.4f (train)' %
              (concordance_index_censored(Y_test['Event'], Y_test['Time'],
                                          -forecast.mean())[0],
               concordance_index_censored(Y_train['Event'], Y_train['Time'],
                                          -train_forecast.mean())[0]))
        #logger.tick(forecast, Y_test)

        ##
        ## sksurv
        ##
        gbsa = GBSA(n_estimators=args.n_est,
                    learning_rate=args.lr,
                    subsample=args.minibatch_frac,
Example #13
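A forecasting benchmark that times an MLP quantile regressor against NGBoost; the NGBoost predictive percentiles are extracted with ppf and both models are evaluated on the back-transformed scale.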
    start = datetime.now().timestamp()
    qreg = MLPQuantile()
    qreg.fit(X_train_std, y_train)
    preds = qreg.predict(X_test_std)
    end = datetime.now().timestamp()
    # exp(x) - 1 inverts a log1p transform, so models are scored on the original scale.
    results = evaluate(np.exp(preds) - 1, (np.exp(y_test) - 1).values)
    results["duration"] = end - start
    save_result([horizon, "MLP", results, 1], f"unit_{horizon}", folder)

    start = datetime.now().timestamp()
    ngb = NGBoost(Base=default_tree_learner, Dist=Normal, Score=MLE(),
                  natural_gradient=True, verbose=True, n_estimators=1500)
    ngb.fit(X_train_std, y_train.values)
    Y_dists = ngb.pred_dist(X_test_std)

    # Collect the 1st-99th predictive percentiles into an (n_samples, 99) array.
    a = pd.DataFrame()
    for i in np.arange(1, 100):
        a[i] = Y_dists.ppf(i / 100)
    preds = a.values

    end = datetime.now().timestamp()
    results = evaluate(np.exp(preds) - 1, (np.exp(y_test) - 1).values)
    results["duration"] = end - start
    save_result([horizon, "NGBOOST", results, 1], f"unit_{horizon}", folder)
Example #14
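A log-scale simulation: a Log-variant of the chosen distribution is fit with CRPS to exponentiated outcomes, then the plain distribution is refit on the log scale to compare PIT calibration.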
        noise = sp.stats.laplace.rvs(size=(m, 1))
    beta = np.random.randn(n, 1)
    X = np.random.randn(m, n) / np.sqrt(n)
    Y = np.exp(X @ beta + 0.5 * noise)
    print(X.shape, Y.shape)

    dist = eval("Log" + args.dist)

    ngb = NGBoost(n_estimators=50,
                  learning_rate=0.5,
                  Dist=dist,
                  Base=default_linear_learner,
                  natural_gradient=False,
                  minibatch_frac=1.0,
                  Score=CRPS())
    losses = ngb.fit(X, Y)

    preds = ngb.pred_dist(X)

    print(f"R2: {r2_score(Y, np.exp(preds.loc)):.4f}")
    pctles, observed, slope, intercept = calibration_regression(preds, Y)

    plt.figure(figsize=(8, 3))
    plt.subplot(1, 2, 1)
    plot_pit_histogram(pctles, observed)
    plt.title("Original scale")

    Y = np.log(Y)
    dist = eval(args.dist)

    ngb = NGBoost(n_estimators=50,
Example #15
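The same censored-regression setup as Example #7, but with a Laplace distribution and the MLE_SURV survival score.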
    # Synthetic censoring times T; C marks observations censored before the outcome.
    T = X @ np.ones((n, 1)) + 0.5 * np.random.randn(m, 1) + args.eps
    C = (T < Y).astype(int)

    print(X.shape, Y.shape, C.shape)
    print(f"Censorship: {np.mean(C):.2f}")

    X_tr, X_te, Y_tr, Y_te, T_tr, T_te, C_tr, C_te = train_test_split(
        X, Y, T, C, test_size=0.2)

    ngb = NGBoost(Dist=Laplace,
                  n_estimators=args.n_estimators,
                  learning_rate=args.lr,
                  natural_gradient=False,
                  Base=default_linear_learner,
                  Score=MLE_SURV())
    train_losses = ngb.fit(X_tr, np.c_[np.minimum(Y_tr, T_tr), C_tr])

    preds = ngb.pred_dist(X_te)
    print(f"R2: {r2_score(Y_te, preds.loc)}")

    plt.hist(preds.loc, range=(-5, 5), bins=30, alpha=0.5, label="Pred")
    plt.hist(Y_te, range=(-5, 5), bins=30, alpha=0.5, label="True")
    plt.legend()
    plt.show()

    # since we simulated the data we fully observe all outcomes
    pctles, observed, slope, intercept = calibration_regression(preds, Y_te)
    plot_calibration_curve(pctles, observed)
    print(f"== Mean SD: {preds.scale.mean()}")
    plt.show()