def test_fit(self):
    n_samples = 10000
    test_size = 0.2
    max_depth = 3
    lr = 0.1
    n_est = 100
    X, y = make_friedman1(n_samples=n_samples)
    n, m = X.shape
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size)
    model = GBM(distribution="gaussian",
                n_estimators=n_est,
                learning_rate=lr,
                max_depth=max_depth)
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)
    # The fitted model should beat the naive baseline that always
    # predicts the training-set mean.
    mse_gbm = np.mean((y_test - y_hat)**2)
    mse_baseline = np.mean((y_test - np.mean(y_train))**2)
    self.assertTrue(mse_gbm < mse_baseline)
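# A minimal sketch of a harness for the unit test above, assuming test_fit is
# meant to run as a unittest.TestCase method. The class name GBMTestCase and
# the bonsai import paths are illustrative assumptions, not from the source.
import unittest

import numpy as np
from sklearn.datasets import make_friedman1
from sklearn.model_selection import train_test_split

from bonsai.ensemble.gbm import GBM  # assumed import path


class GBMTestCase(unittest.TestCase):  # hypothetical class name

    test_fit = test_fit  # bind the module-level function above as a test method


if __name__ == "__main__":
    unittest.main()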
def test_classification():
    X, y = make_hastie_10_2(n_samples=1000)
    y[y < 0] = 0
    n, m = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
    models = {
        "palobst": PaloBoost(
            distribution="bernoulli",
            n_estimators=100,
            learning_rate=1.0,
            max_depth=4,
            subsample=0.5,
        ),
        "gbm": GBM(
            distribution="bernoulli",
            n_estimators=100,
            learning_rate=1.0,
            max_depth=4,
            subsample=0.5,
        ),
        "sklearn": GradientBoostingClassifier(
            n_estimators=100,
            learning_rate=1.0,
            max_depth=4,
            subsample=0.5,
        ),
    }
    print("\n")
    print("# Test Classification")
    print("-----------------------------------------------------")
    print(" model_name train_time predict_time auc ")
    print("-----------------------------------------------------")
    print(" {0:12} {1:12} {2:12} {3:.5f}".format("baseline", "-", "-", 0.5))
    for name, model in models.items():
        # Fit
        start = time.time()
        model.fit(X_train, y_train)
        time_fit = time.time() - start
        # Predict
        start = time.time()
        y_hat = model.predict_proba(X_test)[:, 1]
        time_pred = time.time() - start
        # Error
        auc = roc_auc_score(y_test, y_hat)
        print(" {0:12} {1:.5f} sec {2:.5f} sec {3:.5f}".format(
            name, time_fit, time_pred, auc))
    print("-----------------------------------------------------")
    print("\n")
def test_regression():
    X, y = make_friedman1(n_samples=100000, noise=5)
    n, m = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
    models = {
        "palobst": PaloBoost(
            distribution="gaussian",
            n_estimators=100,
            learning_rate=1.0,
            max_depth=4,
            subsample=0.5,
        ),
        "gbm": GBM(
            distribution="gaussian",
            n_estimators=100,
            learning_rate=1.0,
            max_depth=4,
            subsample=0.5,
        ),
        "sklearn": GradientBoostingRegressor(
            n_estimators=100,
            learning_rate=1.0,
            max_depth=4,
            subsample=0.5,
        ),
    }
    print("\n")
    print("# Test Regression")
    print("-----------------------------------------------------")
    print(" model_name train_time predict_time rmse ")
    print("-----------------------------------------------------")
    print(" {0:12} {1:12} {2:12} {3:.5f}".format("baseline", "-", "-",
                                                 np.std(y_test)))
    for name, model in models.items():
        # Fit
        start = time.time()
        model.fit(X_train, y_train)
        time_fit = time.time() - start
        # Predict
        start = time.time()
        y_hat = model.predict(X_test)
        time_pred = time.time() - start
        # Error
        rmse = np.sqrt(np.mean((y_test - y_hat)**2))
        print(" {0:12} {1:.5f} sec {2:.5f} sec {3:.5f}".format(
            name, time_fit, time_pred, rmse))
    print("-----------------------------------------------------")
    print("\n")
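# A minimal runner for the two benchmark functions above. The bonsai import
# paths are assumptions about the module layout; the scikit-learn and
# standard-library imports are the ones the functions actually use.
import time

import numpy as np
from sklearn.datasets import make_friedman1, make_hastie_10_2
from sklearn.ensemble import (GradientBoostingClassifier,
                              GradientBoostingRegressor)
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from bonsai.ensemble.gbm import GBM              # assumed import path
from bonsai.ensemble.paloboost import PaloBoost  # assumed import path

if __name__ == "__main__":
    test_classification()
    test_regression()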
def clstask(X, y, n_estimators, learning_rate, max_depth,
            n_btstrp, has_missing, test_size, add_noise):
    models = {
        "0. PaloBoost": PaloBoost(distribution="bernoulli",
                                  n_estimators=n_estimators,
                                  learning_rate=learning_rate,
                                  max_depth=max_depth,
                                  subsample=0.7),
        "1. SGTB-Bonsai": GBM(distribution="bernoulli",
                              n_estimators=n_estimators,
                              learning_rate=learning_rate,
                              max_depth=max_depth,
                              subsample=0.7),
        "2. XGBoost": XGBClassifier(n_estimators=n_estimators,
                                    learning_rate=learning_rate,
                                    max_depth=max_depth,
                                    subsample=0.7),
    }
    # Scikit-Learn's GBM is benchmarked only when the data has no missing values.
    if not has_missing:
        models["3. Scikit-Learn"] = GradientBoostingClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=0.7)
    perf_df = pd.DataFrame(columns=["model", "value", "n_est", "b_idx"])
    for idx in range(n_btstrp):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=idx)
        if add_noise:
            # Flip 20% of the training labels to inject label noise.
            n_train = y_train.shape[0]
            mask = np.random.rand(n_train) < 0.2
            y_train[mask] = 1 - y_train[mask]
        df = utils.get_cls_perf(models, X_train, y_train,
                                X_test, y_test, n_estimators)
        df["b_idx"] = idx
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
        perf_df = pd.concat([perf_df, df], sort=True)
    return perf_df
def regtask(X, y, n_estimators, learning_rate, max_depth,
            n_btstrp, has_missing, test_size):
    models = {
        "0. PaloBoost": PaloBoost(distribution="gaussian",
                                  n_estimators=n_estimators,
                                  learning_rate=learning_rate,
                                  max_depth=max_depth,
                                  subsample=0.7),
        "1. SGTB-Bonsai": GBM(distribution="gaussian",
                              n_estimators=n_estimators,
                              learning_rate=learning_rate,
                              max_depth=max_depth,
                              subsample=0.7),
        "2. XGBoost": XGBRegressor(n_estimators=n_estimators,
                                   learning_rate=learning_rate,
                                   max_depth=max_depth,
                                   subsample=0.7),
    }
    # Scikit-Learn's GBM is benchmarked only when the data has no missing values.
    if not has_missing:
        models["3. Scikit-Learn"] = GradientBoostingRegressor(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=0.7)
    perf_df = pd.DataFrame(columns=["model", "value", "n_est", "b_idx"])
    for idx in range(n_btstrp):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=idx)
        df = utils.get_reg_perf(models, X_train, y_train,
                                X_test, y_test, n_estimators)
        df["b_idx"] = idx
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
        perf_df = pd.concat([perf_df, df], sort=True)
    return perf_df
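# A usage sketch for clstask on a synthetic dataset. The hyperparameter values
# are illustrative only, and the module's own imports (np, pd, utils, and the
# model classes) are assumed to be in place.
from sklearn.datasets import make_hastie_10_2

if __name__ == "__main__":
    X, y = make_hastie_10_2(n_samples=2000)
    y[y < 0] = 0  # clstask expects {0, 1} labels
    perf = clstask(X, y,
                   n_estimators=100,
                   learning_rate=0.5,
                   max_depth=4,
                   n_btstrp=5,
                   has_missing=False,
                   test_size=0.3,
                   add_noise=True)
    # Best value per model across the bootstrap splits.
    print(perf.groupby("model")["value"].max())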
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("outfile", help="filename for performance (csv)")
    parser.add_argument("-n", type=int, default=200,
                        help="number of estimators")
    parser.add_argument("-lr", type=float, default=1.0, help="learning rate")
    parser.add_argument("-sub", type=float, default=0.7, help="subsample rate")
    parser.add_argument("-depth", type=int, default=5, help="max depth")
    args = parser.parse_args()

    # Parameters
    n_estimators = args.n
    learning_rate = args.lr  # 1.0, 0.5, 0.1
    # 30% training, 70% test - to highlight the overfitting aspect of the models
    test_size = 0.7
    subsample = args.sub
    max_depth = args.depth

    data = pd.read_csv("data/featureSet3_48.csv")
    outcomes = pd.read_csv("data/outcomes-a.txt")
    outcomes = outcomes[["RecordID", "In-hospital_death"]]
    data = pd.merge(data, outcomes, how="inner", on="RecordID")
    col_names = data.columns
    col_names_x = [
        cname for cname in col_names
        if cname not in ["RecordID", "Length_of_stay", "In-hospital_death"]
    ]
    X = pp.simple_pp(data[col_names_x]).values
    y = data["In-hospital_death"].values
    print("- Avg(y): {}, Std(y): {}".format(np.mean(y), np.std(y)))

    models = {
        "0. PaloBoost ": PaloBoost(
            distribution="bernoulli",
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
        "1. SGTB-Bonsai": GBM(
            distribution="bernoulli",
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
        "2. XGBoost ": XGBClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
    }
    boostPerf = pd.DataFrame(columns=[
        "0. PaloBoost ",
        "1. SGTB-Bonsai",
        "2. XGBoost ",
        "nEst",
        "idx",
    ])
    for idx in range(10):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=idx)
        perf_df = evalutils.get_cls_perf(models, X_train, y_train,
                                         X_test, y_test, n_estimators)
        perf_df["idx"] = idx
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
        boostPerf = pd.concat([boostPerf, perf_df])

    # Store the results to a file.
    boostPerf.to_csv(
        args.outfile + "_{0}_{1}_{2}_{3}.csv".format(
            n_estimators, learning_rate, max_depth, subsample),
        index=False,
    )

    # Report each model's best score within each split, averaged across splits.
    tmpDF = boostPerf.groupby(["idx"]).max()
    print(tmpDF.mean())
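# Example invocation of the script above (the script filename is assumed; the
# flags are the ones defined in its argument parser). With these arguments,
# the results land in results/mortality_200_0.1_5_0.7.csv:
#
#   python mortality_experiment.py results/mortality -n 200 -lr 0.1 -sub 0.7 -depth 5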
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("outfile", help="filename for performance (csv)")
    parser.add_argument("-n", type=int, default=200,
                        help="number of estimators")
    parser.add_argument("-lr", type=float, default=1.0, help="learning rate")
    parser.add_argument("-sub", type=float, default=0.7, help="subsample rate")
    parser.add_argument("-depth", type=int, default=5, help="max depth")
    args = parser.parse_args()

    # Parameters
    n_estimators = args.n
    learning_rate = args.lr  # 1.0, 0.5, 0.1
    # 30% training, 70% test - to highlight the overfitting aspect of the models
    test_size = 0.7
    subsample = args.sub
    max_depth = args.depth

    data = pd.read_csv("data/6Hr-data.csv")
    y = data["ca"].values
    X = pp.simple_pp(data.drop(columns="ca")).values
    print("- Avg(y): {}, Std(y): {}".format(np.mean(y), np.std(y)))

    models = {
        "0. PaloBoost ": PaloBoost(
            distribution="bernoulli",
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
        "1. SGTB-Bonsai": GBM(
            distribution="bernoulli",
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
        "2. XGBoost ": XGBClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
        "3. Scikit-Learn ": GradientBoostingClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
    }
    boostPerf = pd.DataFrame(columns=[
        "0. PaloBoost ",
        "1. SGTB-Bonsai",
        "2. XGBoost ",
        "3. Scikit-Learn ",
        "nEst",
        "idx",
    ])
    for idx in range(10):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=idx)
        perf_df = eval_utils.get_cls_perf(models, X_train, y_train,
                                          X_test, y_test, n_estimators)
        perf_df["idx"] = idx
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
        boostPerf = pd.concat([boostPerf, perf_df])

    # Store the results to a file.
    boostPerf.to_csv(
        args.outfile + "_{0}_{1}_{2}_{3}.csv".format(
            n_estimators, learning_rate, max_depth, subsample),
        index=False,
    )

    # Report each model's best score within each split, averaged across splits.
    tmpDF = boostPerf.groupby(["idx"]).max()
    print(tmpDF.mean())
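# Standard entry-point guard; the original excerpt does not show one, so this
# stub assumes the script is meant to be run directly.
if __name__ == "__main__":
    main()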