# Assumed imports for this excerpt; the boosting estimators and BinMapper are
# taken to come from pygbm (an assumption based on the parameter names used
# below, e.g. l2_regularization / max_iter / validation_split), and the
# sklearn testing-utility paths assume an older sklearn version:
import os
import warnings

import numpy as np
import pandas as pd
import pytest
from numpy.testing import assert_allclose
from sklearn.datasets import make_regression
from sklearn.exceptions import SkipTestWarning
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import estimator_checks
from sklearn.utils.testing import SkipTest, assert_raises_regex

from pygbm import GradientBoostingClassifier, GradientBoostingRegressor
from pygbm.binning import BinMapper


def test_edm():
    # EDM dataset: the 'DFlow' and 'DGap' columns are the regression targets.
    df = pd.read_csv('edm.csv')
    target = df.loc[:, ['DFlow', 'DGap']]
    df.drop(target.columns, axis=1, inplace=True)
    df, target = df.to_numpy(), target.to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(
        df, target, test_size=0.5, random_state=42, shuffle=True)

    gb = GradientBoostingRegressor(
        l2_regularization=0.880826520747869,
        min_samples_leaf=12,
        learning_rate=0.22445307581959334,
        max_iter=279,
        n_iter_no_change=23,
    )

    # Standardize features and targets using training statistics only.
    scaler = StandardScaler()
    scaler.fit(X_train)
    train_scaled = scaler.transform(X_train)
    test_scaled = scaler.transform(X_test)
    scalery = StandardScaler()
    scalery.fit(y_train)
    y_train_scaled = scalery.transform(y_train)
    y_test_scaled = scalery.transform(y_test)

    gb.fit(train_scaled, y_train_scaled)
    y_preds = gb.predict(test_scaled)
    r2 = r2_score(y_test_scaled, y_preds, multioutput='uniform_average')
    print(r2)
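# The scale / fit / score sequence in test_edm is repeated nearly verbatim in
# the dataset tests below. A minimal sketch of a shared helper that could
# factor it out (the name `_scaled_r2` is hypothetical, not part of the
# original suite):
def _scaled_r2(gb, X_train, X_test, y_train, y_test):
    # Standardize features and targets using training statistics only.
    x_scaler = StandardScaler().fit(X_train)
    y_scaler = StandardScaler().fit(y_train)
    gb.fit(x_scaler.transform(X_train), y_scaler.transform(y_train))
    y_preds = gb.predict(x_scaler.transform(X_test))
    # Uniformly averaged R^2 over all targets, on the scaled test targets.
    return r2_score(y_scaler.transform(y_test), y_preds,
                    multioutput='uniform_average')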
def test_atp1d():
    # ATP1D dataset: target columns are prefixed with 'LBL'.
    df = pd.read_csv('atp1d.csv')
    target = df.loc[:, df.columns.str.startswith('LBL')]
    df.drop(target.columns, axis=1, inplace=True)
    df, target = df.to_numpy(), target.to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(
        df, target, test_size=0.5, random_state=42, shuffle=True)

    # Default hyperparameters; the tuned values are kept for reference.
    gb = GradientBoostingRegressor(
        # l2_regularization=0.003391634274257872,
        # min_samples_leaf=10,
        # learning_rate=0.1088115324113492,
        # max_iter=199,
        # n_iter_no_change=20,
    )

    scaler = StandardScaler()
    scaler.fit(X_train)
    train_scaled = scaler.transform(X_train)
    test_scaled = scaler.transform(X_test)
    scalery = StandardScaler()
    scalery.fit(y_train)
    y_train_scaled = scalery.transform(y_train)
    y_test_scaled = scalery.transform(y_test)

    gb.fit(train_scaled, y_train_scaled)
    y_preds = gb.predict(test_scaled)
    r2 = r2_score(y_test_scaled, y_preds, multioutput='uniform_average')
    print(r2)
def test_wq():
    df = pd.read_csv('water-quality.csv')
    target = df.loc[:, df.columns.str.startswith('x')]
    df.drop(target.columns, axis=1, inplace=True)
    df, target = df.to_numpy(), target.to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(
        df, target, test_size=0.5, random_state=42, shuffle=True)

    gb = GradientBoostingRegressor(
        l2_regularization=0.07509314619453317,
        min_samples_leaf=15,
        learning_rate=0.01948991297099692,
        max_iter=300,
        n_iter_no_change=17,
    )

    scaler = StandardScaler()
    scaler.fit(X_train)
    train_scaled = scaler.transform(X_train)
    test_scaled = scaler.transform(X_test)
    scalery = StandardScaler()
    scalery.fit(y_train)
    y_train_scaled = scalery.transform(y_train)
    y_test_scaled = scalery.transform(y_test)

    gb.fit(train_scaled, y_train_scaled)
    y_preds = gb.predict(test_scaled)
    r2 = r2_score(y_test_scaled, y_preds, multioutput='uniform_average')
    print(r2)
def test_atp7d():
    # ATP7D dataset: target columns are prefixed with 'LBL'.
    df = pd.read_csv('atp7d.csv')
    target = df.loc[:, df.columns.str.startswith('LBL')]
    df.drop(target.columns, axis=1, inplace=True)
    df, target = df.to_numpy(), target.to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(
        df, target, test_size=0.5, random_state=42, shuffle=True)

    # max_iter is set very high so that training length is governed by
    # early stopping (n_iter_no_change) on a 10% validation split.
    gb = GradientBoostingRegressor(
        scoring='neg_mean_absolute_error',
        l2_regularization=0.880826520747869,
        min_samples_leaf=12,
        learning_rate=0.22445307581959334,
        max_iter=9999,
        n_iter_no_change=100,
        validation_split=0.1,
        verbose=1,
    )

    # Scaling is disabled for this dataset; kept for reference.
    # scaler = StandardScaler()
    # scaler.fit(X_train)
    # train_scaled = scaler.transform(X_train)
    # test_scaled = scaler.transform(X_test)
    # scalery = StandardScaler()
    # scalery.fit(y_train)
    # y_train_scaled = scalery.transform(y_train)
    # y_test_scaled = scalery.transform(y_test)

    gb.fit(X_train, y_train)
    print('Fitting end')
    y_preds = gb.predict(X_test)
    r2 = r2_score(y_test, y_preds, multioutput='uniform_average')
    print(r2)
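# Since test_atp7d relies on early stopping to end training well before
# max_iter=9999, a sanity check after fitting could look like the sketch
# below (gb.n_iter_ is the fitted iteration count, as asserted on in
# test_early_stopping_regression further down); illustrative only:
#
#     assert gb.n_iter_ < gb.max_iter  # early stopping fired before max_iter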
def test_scm20d():
    df = pd.read_csv('scm20d.csv')
    target = df.loc[:, df.columns.str.contains('L')]
    df.drop(target.columns, axis=1, inplace=True)
    df, target = df.to_numpy(), target.to_numpy()
    c = 1503.0 / 7463.0
    X_train, X_test, y_train, y_test = train_test_split(
        df, target, test_size=c, random_state=42, shuffle=True)

    gb = GradientBoostingRegressor(
        l2_regularization=0.8640187696889217,
        min_samples_leaf=19,
        learning_rate=0.1164232801613771,
        max_iter=1998,
        n_iter_no_change=None,
    )

    scaler = StandardScaler()
    scaler.fit(X_train)
    train_scaled = scaler.transform(X_train)
    test_scaled = scaler.transform(X_test)
    scalery = StandardScaler()
    scalery.fit(y_train)
    y_train_scaled = scalery.transform(y_train)
    y_test_scaled = scalery.transform(y_test)

    gb.fit(train_scaled, y_train_scaled)
    y_preds = gb.predict(test_scaled)
    r2 = r2_score(y_test_scaled, y_preds, multioutput='uniform_average')
    print(r2)
def test_scm1d():
    df = pd.read_csv('scm1d.csv')
    target = df.loc[:, df.columns.str.contains('L')]
    df.drop(target.columns, axis=1, inplace=True)
    df, target = df.to_numpy(), target.to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(
        df, target, test_size=1658.0 / 8145.0, random_state=42, shuffle=True)

    gb = GradientBoostingRegressor(
        l2_regularization=0.07054193143238725,
        min_samples_leaf=23,
        learning_rate=0.12336530854190006,
        max_iter=1999,
        n_iter_no_change=None,
    )

    # scaler = StandardScaler()
    # scaler.fit(X_train)
    # train_scaled = scaler.transform(X_train)
    # test_scaled = scaler.transform(X_test)
    # scalery = StandardScaler()
    # scalery.fit(y_train)
    # y_train_scaled = scalery.transform(y_train)
    # y_test_scaled = scalery.transform(y_test)

    gb.fit(X_train, y_train)
    y_preds = gb.predict(X_test)
    r2 = r2_score(y_test, y_preds, multioutput='uniform_average')
    print(r2)
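# Note: train_test_split interprets a float test_size as a fraction of the
# *whole* dataset, so fractions like 1503.0 / 7463.0 above look like
# test/train ratios rather than test/total ones. A hypothetical helper for
# deriving the fraction from intended split sizes (illustrative only, not
# part of the original suite):
def _test_fraction(n_test, n_train):
    # e.g. _test_fraction(1503, 7463) ~= 0.168 of the full dataset,
    # if those numbers are the intended test/train row counts.
    return n_test / (n_test + n_train)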
def test_pre_binned_data():
    # Make sure that:
    # - training on numerical data and predicting on numerical data is the
    #   same as training on binned data and predicting on binned data
    # - training on numerical data and predicting on numerical data is the
    #   same as training on numerical data and predicting on binned data
    # - training on binned data and predicting on numerical data is not
    #   possible.
    X, y = make_regression(random_state=0)
    gbdt = GradientBoostingRegressor(scoring=None, random_state=0)
    mapper = BinMapper(random_state=0)
    X_binned = mapper.fit_transform(X)

    fit_num_pred_num = gbdt.fit(X, y).predict(X)
    fit_binned_pred_binned = gbdt.fit(X_binned, y).predict(X_binned)
    fit_num_pred_binned = gbdt.fit(X, y).predict(X_binned)

    assert_allclose(fit_num_pred_num, fit_binned_pred_binned)
    assert_allclose(fit_num_pred_num, fit_num_pred_binned)

    assert_raises_regex(
        ValueError,
        'This estimator was fitted with pre-binned data ',
        gbdt.fit(X_binned, y).predict, X
    )
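# A minimal sketch of the pre-binning step in isolation, assuming BinMapper
# discretizes each feature into small integer bin indices (at most 256 bins
# stored as uint8 -- an assumption about the implementation, not something
# this excerpt confirms):
def _binning_sketch():
    X, _ = make_regression(n_samples=200, n_features=5, random_state=0)
    X_binned = BinMapper(random_state=0).fit_transform(X)
    assert X_binned.shape == X.shape
    assert X_binned.dtype == np.uint8  # assumed bin-index dtype
    assert X_binned.max() <= 255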
# Illustrative parameter grid (assumed; the original decorator is not part of
# this excerpt). Each tuple is (scoring, validation_split, n_iter_no_change,
# tol):
@pytest.mark.parametrize(
    'scoring, validation_split, n_iter_no_change, tol', [
        ('neg_mean_squared_error', 0.1, 5, 1e-7),  # early stop on a scorer
        (None, 0.1, 5, 1e-7),                      # early stop on the loss
        (None, None, None, 1e-7),                  # no early stopping
    ])
def test_early_stopping_regression(scoring, validation_split,
                                   n_iter_no_change, tol):
    max_iter = 500
    X, y = make_regression(random_state=0)

    gb = GradientBoostingRegressor(verbose=1,  # just for coverage
                                   scoring=scoring,
                                   tol=tol,
                                   validation_split=validation_split,
                                   max_iter=max_iter,
                                   n_iter_no_change=n_iter_no_change,
                                   random_state=0)
    gb.fit(X, y)

    if n_iter_no_change is not None:
        # Early stopping must have ended training before max_iter.
        assert n_iter_no_change <= gb.n_iter_ < max_iter
    else:
        assert gb.n_iter_ == max_iter
                check.func is estimator_checks.check_classifiers_train):
            # same check, wrapped in a functools.partial object.
            continue
        try:
            check(name, estimator)
        except SkipTest as exception:
            # the only SkipTest thrown currently results from not
            # being able to import pandas.
            warnings.warn(str(exception), SkipTestWarning)


@pytest.mark.skipif(
    int(os.environ.get("NUMBA_DISABLE_JIT", 0)) == 1,
    reason="Potentially long")
@pytest.mark.parametrize('Estimator', (
    GradientBoostingRegressor(),
    GradientBoostingClassifier(n_iter_no_change=None, min_samples_leaf=5),))
def test_estimator_checks(Estimator):
    # Run the check_estimator() test suite on GBRegressor and GBClassifier.
    # Notes:
    # - Can't do early stopping with classifier because often
    #   validation_split=.1 leads to test_size=2 < n_classes and
    #   train_test_split raises an error.
    # - Also, need to set a low min_samples_leaf for
    #   check_classifiers_classes() to pass: with only 30 samples on the
    #   dataset, the root is never split with min_samples_leaf=20 and only
    #   the majority class is predicted.
    custom_check_estimator(Estimator)