def test_fit(self):
    n_samples = 10000
    test_size = 0.2
    max_depth = 3
    lr = 0.1
    n_est = 100
    X, y = make_friedman1(n_samples=n_samples)
    n, m = X.shape
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size)
    model = GBM(distribution="gaussian",
                n_estimators=n_est,
                learning_rate=lr,
                max_depth=max_depth)
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)
    # The fitted model should beat the naive baseline that always
    # predicts the training-set mean.
    mse_gbm = np.mean((y_test - y_hat)**2)
    mse_baseline = np.mean((y_test - np.mean(y_train))**2)
    self.assertTrue(mse_gbm < mse_baseline)
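# A minimal sketch of a harness for the unit test above, assuming test_fit is
# meant to run as a unittest.TestCase method. The class name GBMTestCase and
# the bonsai import paths are illustrative assumptions, not from the source.
import unittest

import numpy as np
from sklearn.datasets import make_friedman1
from sklearn.model_selection import train_test_split

from bonsai.ensemble.gbm import GBM  # assumed import path


class GBMTestCase(unittest.TestCase):  # hypothetical class name

    test_fit = test_fit  # bind the module-level function above as a test method


if __name__ == "__main__":
    unittest.main()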
def test_classification():
    X, y = make_hastie_10_2(n_samples=1000)
    y[y < 0] = 0
    n, m = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
    models = {
        "palobst": PaloBoost(
            distribution="bernoulli",
            n_estimators=100,
            learning_rate=1.0,
            max_depth=4,
            subsample=0.5,
        ),
        "gbm": GBM(
            distribution="bernoulli",
            n_estimators=100,
            learning_rate=1.0,
            max_depth=4,
            subsample=0.5,
        ),
        "sklearn": GradientBoostingClassifier(
            n_estimators=100,
            learning_rate=1.0,
            max_depth=4,
            subsample=0.5,
        ),
    }
    print("\n")
    print("# Test Classification")
    print("-----------------------------------------------------")
    print(" model_name train_time predict_time auc ")
    print("-----------------------------------------------------")
    print(" {0:12} {1:12} {2:12} {3:.5f}".format("baseline", "-", "-", 0.5))
    for name, model in models.items():
        # Fit
        start = time.time()
        model.fit(X_train, y_train)
        time_fit = time.time() - start
        # Predict
        start = time.time()
        y_hat = model.predict_proba(X_test)[:, 1]
        time_pred = time.time() - start
        # Error
        auc = roc_auc_score(y_test, y_hat)
        print(" {0:12} {1:.5f} sec {2:.5f} sec {3:.5f}".format(
            name, time_fit, time_pred, auc))
    print("-----------------------------------------------------")
    print("\n")
def test_regression():
    X, y = make_friedman1(n_samples=100000, noise=5)
    n, m = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
    models = {
        "palobst": PaloBoost(
            distribution="gaussian",
            n_estimators=100,
            learning_rate=1.0,
            max_depth=4,
            subsample=0.5,
        ),
        "gbm": GBM(
            distribution="gaussian",
            n_estimators=100,
            learning_rate=1.0,
            max_depth=4,
            subsample=0.5,
        ),
        "sklearn": GradientBoostingRegressor(
            n_estimators=100,
            learning_rate=1.0,
            max_depth=4,
            subsample=0.5,
        ),
    }
    print("\n")
    print("# Test Regression")
    print("-----------------------------------------------------")
    print(" model_name train_time predict_time rmse ")
    print("-----------------------------------------------------")
    print(" {0:12} {1:12} {2:12} {3:.5f}".format("baseline", "-", "-",
                                                 np.std(y_test)))
    for name, model in models.items():
        # Fit
        start = time.time()
        model.fit(X_train, y_train)
        time_fit = time.time() - start
        # Predict
        start = time.time()
        y_hat = model.predict(X_test)
        time_pred = time.time() - start
        # Error
        rmse = np.sqrt(np.mean((y_test - y_hat)**2))
        print(" {0:12} {1:.5f} sec {2:.5f} sec {3:.5f}".format(
            name, time_fit, time_pred, rmse))
    print("-----------------------------------------------------")
    print("\n")
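# A minimal runner for the two benchmark functions above. The bonsai import
# paths are assumptions about the module layout; the scikit-learn and
# standard-library imports are the ones the functions actually use.
import time

import numpy as np
from sklearn.datasets import make_friedman1, make_hastie_10_2
from sklearn.ensemble import (GradientBoostingClassifier,
                              GradientBoostingRegressor)
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from bonsai.ensemble.gbm import GBM              # assumed import path
from bonsai.ensemble.paloboost import PaloBoost  # assumed import path

if __name__ == "__main__":
    test_classification()
    test_regression()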
def clstask(X, y, n_estimators, learning_rate, max_depth,
            n_btstrp, has_missing, test_size, add_noise):
    models = {
        "0. PaloBoost": PaloBoost(distribution="bernoulli",
                                  n_estimators=n_estimators,
                                  learning_rate=learning_rate,
                                  max_depth=max_depth,
                                  subsample=0.7),
        "1. SGTB-Bonsai": GBM(distribution="bernoulli",
                              n_estimators=n_estimators,
                              learning_rate=learning_rate,
                              max_depth=max_depth,
                              subsample=0.7),
        "2. XGBoost": XGBClassifier(n_estimators=n_estimators,
                                    learning_rate=learning_rate,
                                    max_depth=max_depth,
                                    subsample=0.7),
    }
    # Scikit-Learn's GBM is benchmarked only when the data has no missing values.
    if not has_missing:
        models["3. Scikit-Learn"] = GradientBoostingClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=0.7)
    perf_df = pd.DataFrame(columns=["model", "value", "n_est", "b_idx"])
    for idx in range(n_btstrp):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=idx)
        if add_noise:
            # Flip 20% of the training labels to inject label noise.
            n_train = y_train.shape[0]
            mask = np.random.rand(n_train) < 0.2
            y_train[mask] = 1 - y_train[mask]
        df = utils.get_cls_perf(models, X_train, y_train,
                                X_test, y_test, n_estimators)
        df["b_idx"] = idx
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
        perf_df = pd.concat([perf_df, df], sort=True)
    return perf_df
def regtask(X, y, n_estimators, learning_rate, max_depth,
            n_btstrp, has_missing, test_size):
    models = {
        "0. PaloBoost": PaloBoost(distribution="gaussian",
                                  n_estimators=n_estimators,
                                  learning_rate=learning_rate,
                                  max_depth=max_depth,
                                  subsample=0.7),
        "1. SGTB-Bonsai": GBM(distribution="gaussian",
                              n_estimators=n_estimators,
                              learning_rate=learning_rate,
                              max_depth=max_depth,
                              subsample=0.7),
        "2. XGBoost": XGBRegressor(n_estimators=n_estimators,
                                   learning_rate=learning_rate,
                                   max_depth=max_depth,
                                   subsample=0.7),
    }
    # Scikit-Learn's GBM is benchmarked only when the data has no missing values.
    if not has_missing:
        models["3. Scikit-Learn"] = GradientBoostingRegressor(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=0.7)
    perf_df = pd.DataFrame(columns=["model", "value", "n_est", "b_idx"])
    for idx in range(n_btstrp):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=idx)
        df = utils.get_reg_perf(models, X_train, y_train,
                                X_test, y_test, n_estimators)
        df["b_idx"] = idx
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
        perf_df = pd.concat([perf_df, df], sort=True)
    return perf_df
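# A usage sketch for clstask on a synthetic dataset. The hyperparameter values
# are illustrative only, and the module's own imports (np, pd, utils, and the
# model classes) are assumed to be in place.
from sklearn.datasets import make_hastie_10_2

if __name__ == "__main__":
    X, y = make_hastie_10_2(n_samples=2000)
    y[y < 0] = 0  # clstask expects {0, 1} labels
    perf = clstask(X, y,
                   n_estimators=100,
                   learning_rate=0.5,
                   max_depth=4,
                   n_btstrp=5,
                   has_missing=False,
                   test_size=0.3,
                   add_noise=True)
    # Best value per model across the bootstrap splits.
    print(perf.groupby("model")["value"].max())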
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("outfile", help="filename for performance (csv)")
    parser.add_argument("-n", type=int, default=200,
                        help="number of estimators")
    parser.add_argument("-lr", type=float, default=1.0, help="learning rate")
    parser.add_argument("-sub", type=float, default=0.7, help="subsample rate")
    parser.add_argument("-depth", type=int, default=5, help="max depth")
    args = parser.parse_args()

    # Parameters
    n_estimators = args.n
    learning_rate = args.lr  # 1.0, 0.5, 0.1
    # 30% training, 70% test - to highlight the overfitting aspect of the models
    test_size = 0.7
    subsample = args.sub
    max_depth = args.depth

    data = pd.read_csv("data/featureSet3_48.csv")
    outcomes = pd.read_csv("data/outcomes-a.txt")
    outcomes = outcomes[["RecordID", "In-hospital_death"]]
    data = pd.merge(data, outcomes, how="inner", on="RecordID")
    col_names = data.columns
    col_names_x = [
        cname for cname in col_names
        if cname not in ["RecordID", "Length_of_stay", "In-hospital_death"]
    ]
    X = pp.simple_pp(data[col_names_x]).values
    y = data["In-hospital_death"].values
    print("- Avg(y): {}, Std(y): {}".format(np.mean(y), np.std(y)))

    models = {
        "0. PaloBoost ": PaloBoost(
            distribution="bernoulli",
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
        "1. SGTB-Bonsai": GBM(
            distribution="bernoulli",
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
        "2. XGBoost ": XGBClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
    }
    boostPerf = pd.DataFrame(columns=[
        "0. PaloBoost ",
        "1. SGTB-Bonsai",
        "2. XGBoost ",
        "nEst",
        "idx",
    ])
    for idx in range(10):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=idx)
        perf_df = evalutils.get_cls_perf(models, X_train, y_train,
                                         X_test, y_test, n_estimators)
        perf_df["idx"] = idx
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
        boostPerf = pd.concat([boostPerf, perf_df])

    # Store the results to a file.
    boostPerf.to_csv(
        args.outfile + "_{0}_{1}_{2}_{3}.csv".format(
            n_estimators, learning_rate, max_depth, subsample),
        index=False,
    )

    # Report each model's best score within each split, averaged across splits.
    tmpDF = boostPerf.groupby(["idx"]).max()
    print(tmpDF.mean())
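# Example invocation of the script above (the script filename is assumed; the
# flags are the ones defined in its argument parser). With these arguments,
# the results land in results/mortality_200_0.1_5_0.7.csv:
#
#   python mortality_experiment.py results/mortality -n 200 -lr 0.1 -sub 0.7 -depth 5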
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("outfile", help="filename for performance (csv)")
    parser.add_argument("-n", type=int, default=200,
                        help="number of estimators")
    parser.add_argument("-lr", type=float, default=1.0, help="learning rate")
    parser.add_argument("-sub", type=float, default=0.7, help="subsample rate")
    parser.add_argument("-depth", type=int, default=5, help="max depth")
    args = parser.parse_args()

    # Parameters
    n_estimators = args.n
    learning_rate = args.lr  # 1.0, 0.5, 0.1
    # 30% training, 70% test - to highlight the overfitting aspect of the models
    test_size = 0.7
    subsample = args.sub
    max_depth = args.depth

    data = pd.read_csv("data/6Hr-data.csv")
    y = data["ca"].values
    X = pp.simple_pp(data.drop(columns="ca")).values
    print("- Avg(y): {}, Std(y): {}".format(np.mean(y), np.std(y)))

    models = {
        "0. PaloBoost ": PaloBoost(
            distribution="bernoulli",
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
        "1. SGTB-Bonsai": GBM(
            distribution="bernoulli",
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
        "2. XGBoost ": XGBClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
        "3. Scikit-Learn ": GradientBoostingClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
    }
    boostPerf = pd.DataFrame(columns=[
        "0. PaloBoost ",
        "1. SGTB-Bonsai",
        "2. XGBoost ",
        "3. Scikit-Learn ",
        "nEst",
        "idx",
    ])
    for idx in range(10):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=idx)
        perf_df = eval_utils.get_cls_perf(models, X_train, y_train,
                                          X_test, y_test, n_estimators)
        perf_df["idx"] = idx
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
        boostPerf = pd.concat([boostPerf, perf_df])

    # Store the results to a file.
    boostPerf.to_csv(
        args.outfile + "_{0}_{1}_{2}_{3}.csv".format(
            n_estimators, learning_rate, max_depth, subsample),
        index=False,
    )

    # Report each model's best score within each split, averaged across splits.
    tmpDF = boostPerf.groupby(["idx"]).max()
    print(tmpDF.mean())
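# Standard entry-point guard; the original excerpt does not show one, so this
# stub assumes the script is meant to be run directly.
if __name__ == "__main__":
    main()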