Example #1
    def test_rgs(self):

        np.random.seed(1)
        n_samples = 10000
        test_size = 0.2
        n_est = 100
        max_depth = 7
        lr = 0.1

        X, y = make_friedman1_poly(n_samples=n_samples) 
        X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                test_size=test_size)

        model_palo = PaloBoost(distribution="gaussian",
                                n_estimators=n_est,
                                learning_rate=lr,
                                max_depth=max_depth)
        model_sklr = GradientBoostingRegressor(
                            n_estimators=n_est, 
                            learning_rate=lr,
                            max_depth=max_depth)

        model_palo.fit(X_train, y_train)
        y_hat = model_palo.predict(X_test)
        rmse_palo = np.sqrt(np.mean((y_test - y_hat)**2))

        model_sklr.fit(X_train, y_train)
        y_hat = model_sklr.predict(X_test)
        rmse_sklr = np.sqrt(np.mean((y_test - y_hat)**2))

        self.assertTrue(rmse_palo < rmse_sklr)
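Both this test and the next one (test_cls) omit their module-level imports. A minimal sketch of what they presumably rely on; the PaloBoost import path and the synthetic-data helpers (make_friedman1_poly, make_hastie_11_2) come from the project's own code, so those paths below are assumptions:

import unittest
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import (GradientBoostingRegressor,
                              GradientBoostingClassifier)
from sklearn.metrics import roc_auc_score

# Assumed project-specific paths; the actual package layout may differ.
from bonsai.ensemble.paloboost import PaloBoost
from util_test import make_friedman1_poly, make_hastie_11_2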
Example #2
    def test_cls(self):

        np.random.seed(1)
        n_samples = 10000
        test_size = 0.2
        n_est = 100
        max_depth = 7
        lr = 0.1

        X, y = make_hastie_11_2(n_samples)
        X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                test_size=test_size)

        model_palo = PaloBoost(distribution="bernoulli",
                                n_estimators=n_est, 
                                learning_rate=lr,
                                max_depth=max_depth)
        model_sklr = GradientBoostingClassifier(
                                n_estimators=n_est, 
                                learning_rate=lr,
                                max_depth=max_depth)


        model_palo.fit(X_train, y_train)
        y_hat = model_palo.predict_proba(X_test)[:,1]
        auc_palo = roc_auc_score(y_test, y_hat)

        model_sklr.fit(X_train, y_train)
        y_hat = model_sklr.predict_proba(X_test)[:,1]
        auc_sklr = roc_auc_score(y_test, y_hat)

        self.assertTrue(auc_palo > auc_sklr)
Example #3
    def load(self, model):
        # NOTE: not yet
        self.calibrators = model["clb"]
        self.calibrate = model["calibrate"]
        self.distribution = model["distribution"]
        self.estimators = []
        for d in model["est"]:
            est = PaloBoost()
            est.load(d)
            self.estimators.append(est)
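The load() method above implies a matching serializer. A sketch of what the corresponding dump() might look like, inferred purely from the keys that load() reads; the per-estimator est.dump() call is likewise an assumption, mirroring est.load():

    def dump(self):
        # Sketch only: inverse of load() above; assumes PaloBoost offers dump().
        return {
            "clb": self.calibrators,
            "calibrate": self.calibrate,
            "distribution": self.distribution,
            "est": [est.dump() for est in self.estimators],
        }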
Example #4
    def fit(self, X, y):
        np.random.seed(self.random_state)
        n, m = X.shape
        idx = np.arange(n)
        self.estimators = []

        if (self.distribution == "bernoulli"
                and (np.sum(y) < 3 or np.sum(y) > n - 3)):
            logging.error(("the target (y) needs to have "
                           "at least three examples in each class"))
            return None

        i = 0
        while i < self.n_paloboost:
            mask = np.full(n, True)
            if self.block_size is not None:
                # Block bootstrap: one Boolean draw per block, repeated so that
                # contiguous blocks of rows are kept or dropped together.
                n_block = int(n / self.block_size) + 1
                mask_block = (np.random.rand(n_block) < self.subsample0)
                mask = np.repeat(mask_block, self.block_size)[:n]
            else:
                # Plain row-wise subsampling
                mask = (np.random.rand(n) < self.subsample0)

            X_i, y_i = X[mask, :], y[mask]
            X_j, y_j = X[~mask, :], y[~mask]

            # Redraw if either the in-bag or out-of-bag split ended up with a
            # single class; both are needed for Bernoulli fitting/calibration.
            if (self.distribution == "bernoulli"
                    and (np.unique(y_i).shape[0] == 1
                         or np.unique(y_j).shape[0] == 1)):
                continue

            est = PaloBoost(distribution=self.distribution,
                            learning_rate=self.learning_rate,
                            max_depth=self.max_depth,
                            n_estimators=self.n_estimators,
                            subsample=self.subsample1,
                            subsample_splts=self.subsample2,
                            random_state=i * self.n_estimators)
            est.fit(X_i, y_i)
            self.estimators.append(est)
            if self.feature_importances_ is None:
                self.feature_importances_ = est.feature_importances_
            else:
                self.feature_importances_ += est.feature_importances_

            if (self.distribution == "bernoulli" and self.calibrate):
                # Calibrate predicted probabilities on the out-of-bag rows
                # with isotonic regression.
                z_j = est.predict_proba(X_j)[:, 1]
                clb = IsotonicRegression(y_min=0,
                                         y_max=1,
                                         out_of_bounds="clip")
                clb.fit(z_j, y_j)
                self.calibrators.append(clb)
            i += 1

        self.feature_importances_ /= self.n_paloboost
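A small, self-contained sketch of the block-bootstrap masking used in fit() above: one Boolean draw per block is repeated block_size times, so whole contiguous blocks of rows are sampled or skipped together (the numbers here are illustrative only):

import numpy as np

n, block_size, subsample0 = 10, 3, 0.5           # illustrative values
n_block = int(n / block_size) + 1                # 4 blocks cover 10 rows
mask_block = np.random.rand(n_block) < subsample0
mask = np.repeat(mask_block, block_size)[:n]     # e.g. [T T T F F F T T T F]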
Example #5
def test_classification():

    X, y = make_hastie_10_2(n_samples=1000)
    y[y < 0] = 0  # map labels from {-1, +1} to {0, 1}
    n, m = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

    models = {
        "palobst":
        PaloBoost(
            distribution="bernoulli",
            n_estimators=100,
            learning_rate=1.0,
            max_depth=4,
            subsample=0.5,
        ),
        "gbm":
        GBM(
            distribution="bernoulli",
            n_estimators=100,
            learning_rate=1.0,
            max_depth=4,
            subsample=0.5,
        ),
        "sklearn":
        GradientBoostingClassifier(n_estimators=100,
                                   learning_rate=1.0,
                                   max_depth=4,
                                   subsample=0.5),
    }

    print("\n")
    print("# Test Classification")
    print("-----------------------------------------------------")
    print(" model_name     train_time     predict_time   auc    ")
    print("-----------------------------------------------------")
    print(" {0:12}   {1:12}   {2:12}   {3:.5f}".format("baseline", "-", "-",
                                                       0.5))

    for name, model in models.items():

        # Fit
        start = time.time()
        model.fit(X_train, y_train)
        time_fit = time.time() - start

        # Predict
        start = time.time()
        y_hat = model.predict_proba(X_test)[:, 1]
        time_pred = time.time() - start

        # Error
        auc = roc_auc_score(y_test, y_hat)

        print(" {0:12}   {1:.5f} sec    {2:.5f} sec    {3:.5f}".format(
            name, time_fit, time_pred, auc))

    print("-----------------------------------------------------")
    print("\n")
Example #6
def test_regression():

    X, y = make_friedman1(n_samples=100000, noise=5)
    n, m = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

    models = {
        "palobst":
        PaloBoost(
            distribution="gaussian",
            n_estimators=100,
            learning_rate=1.0,
            max_depth=4,
            subsample=0.5,
        ),
        "gbm":
        GBM(
            distribution="gaussian",
            n_estimators=100,
            learning_rate=1.0,
            max_depth=4,
            subsample=0.5,
        ),
        "sklearn":
        GradientBoostingRegressor(n_estimators=100,
                                  learning_rate=1.0,
                                  max_depth=4,
                                  subsample=0.5),
    }

    print("\n")
    print("# Test Regression")
    print("-----------------------------------------------------")
    print(" model_name     train_time     predict_time   rmse   ")
    print("-----------------------------------------------------")
    print(" {0:12}   {1:12}   {2:12}   {3:.5f}".format("baseline", "-", "-",
                                                       np.std(y_test)))

    for name, model in models.items():

        # Fit
        start = time.time()
        model.fit(X_train, y_train)
        time_fit = time.time() - start

        # Predict
        start = time.time()
        y_hat = model.predict(X_test)
        time_pred = time.time() - start

        # Error
        rmse = np.sqrt(np.mean((y_test - y_hat)**2))

        print(" {0:12}   {1:.5f} sec    {2:.5f} sec    {3:.5f}".format(
            name, time_fit, time_pred, rmse))

    print("-----------------------------------------------------")
    print("\n")
Example #7
def clstask(X, y, n_estimators, learning_rate, max_depth, n_btstrp,
            has_missing, test_size, add_noise):
    models = {
        "0. PaloBoost":
        PaloBoost(distribution="bernoulli",
                  n_estimators=n_estimators,
                  learning_rate=learning_rate,
                  max_depth=max_depth,
                  subsample=0.7),
        "1. SGTB-Bonsai":
        GBM(distribution="bernoulli",
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=0.7),
        "2. XGBoost":
        XGBClassifier(n_estimators=n_estimators,
                      learning_rate=learning_rate,
                      max_depth=max_depth,
                      subsample=0.7)
    }
    if not has_missing:
        models["3. Scikit-Learn"] = GradientBoostingClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=0.7)
    perf_df = pd.DataFrame(columns=["model", "value", "n_est", "b_idx"])
    for idx in range(n_btstrp):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=idx)
        if add_noise:
            # Inject label noise: flip 20% of the training labels
            n_train = y_train.shape[0]
            mask = np.random.rand(n_train) < 0.2
            y_train[mask] = 1 - y_train[mask]

        df = utils.get_cls_perf(models, X_train, y_train, X_test, y_test,
                                n_estimators)
        df['b_idx'] = idx
        perf_df = perf_df.append(df, sort=True)
    return perf_df
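DataFrame.append, used above to accumulate perf_df, was removed in pandas 2.0. A sketch of the same accumulation written with pd.concat instead (the loop body is abbreviated; everything else stays as in clstask above):

    frames = []
    for idx in range(n_btstrp):
        # ... same per-split fitting and evaluation as above ...
        df["b_idx"] = idx
        frames.append(df)
    perf_df = pd.concat(frames, sort=True)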
Example #8
def regtask(X, y, n_estimators, learning_rate, max_depth, n_btstrp,
            has_missing, test_size):
    models = {
        "0. PaloBoost":
        PaloBoost(distribution="gaussian",
                  n_estimators=n_estimators,
                  learning_rate=learning_rate,
                  max_depth=max_depth,
                  subsample=0.7),
        "1. SGTB-Bonsai":
        GBM(distribution="gaussian",
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=0.7),
        "2. XGBoost":
        XGBRegressor(n_estimators=n_estimators,
                     learning_rate=learning_rate,
                     max_depth=max_depth,
                     subsample=0.7)
    }
    if not has_missing:
        models["3. Scikit-Learn"] = GradientBoostingRegressor(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=0.7)

    perf_df = pd.DataFrame(columns=["model", "value", "n_est", "b_idx"])
    for idx in range(n_btstrp):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=idx)
        df = utils.get_reg_perf(models, X_train, y_train, X_test, y_test,
                                n_estimators)
        df['b_idx'] = idx
        perf_df = perf_df.append(df, sort=True)
    return perf_df
Example #9
def run_aux(learning_rate, max_depth, n_estimators=200):

    X, y = get_friedman()
    model = PaloBoost(distribution="gaussian",
                      n_estimators=n_estimators,
                      learning_rate=learning_rate,
                      max_depth=max_depth,
                      subsample=0.7)
    model.fit(X, y)
    prune_df = pd.DataFrame(model.get_prune_stats())
    prune_df.columns = ["iteration", "nodes_pre", "nodes_post"]
    lr_df = pd.DataFrame(model.get_lr_stats())
    lr_df.columns = ["iteration", "lr"]

    prune_df.to_csv("results/prune_{}_{}.csv".format(learning_rate, max_depth),
                    index=False)
    lr_df.to_csv("results/lr_{}_{}.csv".format(learning_rate, max_depth),
                 index=False)
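run_aux writes its CSVs into a results/ directory; a small sketch to make sure that directory exists before calling it (standard library only):

import os

os.makedirs("results", exist_ok=True)  # create the output directory if missing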
Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("outfile", help="filename for performance (csv)")
    parser.add_argument("-n",
                        type=int,
                        default=200,
                        help="number of estimators")
    parser.add_argument("-lr", type=float, default=1.0, help="learning rate")
    parser.add_argument("-sub", type=float, default=0.7, help="subsample rate")
    parser.add_argument("-depth", type=int, default=5, help="subsample rate")
    args = parser.parse_args()

    # Parameters
    n_estimators = args.n
    learning_rate = args.lr  # 1.0, 0.5, 0.1
    test_size = 0.7  # 30% training, 70% test - to highlight overfitting in the models
    subsample = args.sub
    max_depth = args.depth

    data = pd.read_csv("data/featureSet3_48.csv")
    outcomes = pd.read_csv("data/outcomes-a.txt")
    outcomes = outcomes[["RecordID", "In-hospital_death"]]
    data = pd.merge(data, outcomes, how="inner", on="RecordID")
    col_names = data.columns
    col_names_x = [
        cname for cname in col_names
        if cname not in ["RecordID", "Length_of_stay", "In-hospital_death"]
    ]
    X = pp.simple_pp(data[col_names_x]).values
    y = data["In-hospital_death"].values

    print("- Avg(y): {}, Std(y): {}".format(np.mean(y), np.std(y)))
    models = {
        "0. PaloBoost    ":
        PaloBoost(
            distribution="bernoulli",
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
        "1. SGTB-Bonsai":
        GBM(
            distribution="bernoulli",
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
        "2. XGBoost      ":
        XGBClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
    }
    boostPerf = pd.DataFrame(columns=[
        "0. PaloBoost    ",
        "1. SGTB-Bonsai",
        "2. XGBoost      ",
        "nEst",
        "idx",
    ])
    for idx in range(10):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=idx)
        perf_df = evalutils.get_cls_perf(models, X_train, y_train, X_test,
                                         y_test, n_estimators)
        perf_df["idx"] = idx
        boostPerf = boostPerf.append(perf_df)
    # store the results to a CSV file
    boostPerf.to_csv(
        (args.outfile + "_{0}_{1}_{2}_{3}.csv".format(
            n_estimators, learning_rate, max_depth, subsample)),
        index=False,
    )
    # report the best score reached in each split, then average over splits
    tmpDF = boostPerf.groupby(["idx"]).max()
    print(tmpDF.mean())
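The final aggregation above takes, for each train/test split (idx), the best value reached across the n_est checkpoints, and then averages those bests over splits. A toy illustration with made-up numbers and a single model column:

import pandas as pd

# Illustrative values only; the real frame has one column per model.
boostPerf = pd.DataFrame({
    "0. PaloBoost    ": [0.80, 0.83, 0.78, 0.81],
    "nEst":             [50, 100, 50, 100],
    "idx":              [0, 0, 1, 1],
})
best_per_split = boostPerf.groupby(["idx"]).max()  # best per split
print(best_per_split.mean())                       # averaged over splits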
Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("outfile", help="filename for performance (csv)")
    parser.add_argument("-n", type=int, default=200, help="number of estimators")
    parser.add_argument("-lr", type=float, default=1.0, help="learning rate")
    parser.add_argument("-sub", type=float, default=0.7, help="subsample rate")
    parser.add_argument("-depth", type=int, default=5, help="subsample rate")
    args = parser.parse_args()

    # Parameters
    n_estimators = args.n
    learning_rate = args.lr  # 1.0, 0.5, 0.1
    test_size = 0.7  # 30% training, 70% test - to highlight overfitting in the models
    subsample = args.sub
    max_depth = args.depth

    data = pd.read_csv("data/6Hr-data.csv")
    y = data["ca"].values
    X = pp.simple_pp(data.drop(columns="ca")).values

    print("- Avg(y): {}, Std(y): {}".format(np.mean(y), np.std(y)))

    models = {
        "0. PaloBoost    ": PaloBoost(
            distribution="bernoulli",
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
        "1. SGTB-Bonsai": GBM(
            distribution="bernoulli",
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
        "2. XGBoost      ": XGBClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
        "3. Scikit-Learn ": GradientBoostingClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
    }
    boostPerf = pd.DataFrame(
        columns=[
            "0. PaloBoost    ",
            "1. SGTB-Bonsai",
            "2. XGBoost      ",
            "3. Scikit-Learn ",
            "nEst",
            "idx",
        ]
    )
    for idx in range(10):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=idx
        )
        perf_df = eval_utils.get_cls_perf(
            models, X_train, y_train, X_test, y_test, n_estimators
        )
        perf_df["idx"] = idx
        boostPerf = boostPerf.append(perf_df)
    # store the results to a CSV file
    boostPerf.to_csv(
        (
            args.outfile
            + "_{0}_{1}_{2}_{3}.csv".format(
                n_estimators, learning_rate, max_depth, subsample
            )
        ),
        index=False,
    )
    # report the best score reached in each split, then average over splits
    tmpDF = boostPerf.groupby(["idx"]).max()
    print(tmpDF.mean())