Example #1
# Shared imports for these snippets (not shown on the original page).
# Judging by parameters such as l2_regularization, max_iter and
# validation_split, GradientBoostingRegressor is assumed to be pygbm's
# estimator rather than scikit-learn's.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from pygbm import GradientBoostingRegressor


def test_edm():
    df = pd.read_csv('edm.csv')
    target = df.loc[:, ['DFlow', 'DGap']]
    df.drop(target.columns, axis=1, inplace=True)
    df, target = df.to_numpy(), target.to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(df,
                                                        target,
                                                        test_size=0.5,
                                                        random_state=42,
                                                        shuffle=True)
    gb = GradientBoostingRegressor(
        l2_regularization=0.880826520747869,
        min_samples_leaf=12,
        learning_rate=0.22445307581959334,
        max_iter=279,
        n_iter_no_change=23,
    )
    # standardize both features and targets; the model is fit and the
    # R2 score computed entirely in scaled target space
    scaler = StandardScaler()
    scaler.fit(X_train)
    train_scaled = scaler.transform(X_train)
    test_scaled = scaler.transform(X_test)
    scalery = StandardScaler()
    scalery.fit(y_train)
    y_Train = scalery.transform(y_train)
    y_Test = scalery.transform(y_test)
    gb.fit(train_scaled, y_Train)
    y_preds = gb.predict(test_scaled)
    r2 = r2_score(y_Test, y_preds, multioutput='uniform_average')
    print(r2)
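Since both fitting and scoring above happen in standardized target space, and R2 is invariant to a per-output affine rescaling, the printed score also holds for the original units. If the predictions themselves are needed in original units, the fitted target scaler can be inverted; a minimal follow-up that could be appended to the function above:

    # optional: map predictions back to the original target units
    y_preds_orig = scalery.inverse_transform(y_preds)
    print(r2_score(y_test, y_preds_orig, multioutput='uniform_average'))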
Example #2
# (imports as in Example #1)
def test_atp1d():
    df = pd.read_csv('atp1d.csv')
    target = df.loc[:, df.columns.str.startswith('LBL')]
    df.drop(target.columns, axis=1, inplace=True)
    df, target = df.to_numpy(), target.to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(df,
                                                        target,
                                                        test_size=0.5,
                                                        random_state=42,
                                                        shuffle=True)
    gb = GradientBoostingRegressor(
        # l2_regularization=0.003391634274257872,
        # min_samples_leaf=10,
        # learning_rate=0.1088115324113492,
        # max_iter=199,
        # n_iter_no_change=20
    )
    scaler = StandardScaler()
    scaler.fit(X_train)
    train_scaled = scaler.transform(X_train)
    test_scaled = scaler.transform(X_test)
    scalery = StandardScaler()
    scalery.fit(y_train)
    y_Train = scalery.transform(y_train)
    y_Test = scalery.transform(y_test)
    gb.fit(train_scaled, y_Train)
    y_preds = gb.predict(test_scaled)
    for y in y_preds:
        print(y)
    r2 = r2_score(y_Test, y_preds, multioutput='uniform_average')
    print(r2)
Example #3
# (imports as in Example #1)
def test_wq():
    df = pd.read_csv('water-quality.csv')
    target = df.loc[:, df.columns.str.startswith('x')]
    df.drop(target.columns, axis=1, inplace=True)
    df, target = df.to_numpy(), target.to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(df,
                                                        target,
                                                        test_size=0.5,
                                                        random_state=42,
                                                        shuffle=True)
    gb = GradientBoostingRegressor(l2_regularization=0.07509314619453317,
                                   min_samples_leaf=15,
                                   learning_rate=0.01948991297099692,
                                   max_iter=300,
                                   n_iter_no_change=17)
    scaler = StandardScaler()
    scaler.fit(X_train)
    train_scaled = scaler.transform(X_train)
    test_scaled = scaler.transform(X_test)
    scalery = StandardScaler()
    scalery.fit(y_train)
    y_Train = scalery.transform(y_train)
    y_Test = scalery.transform(y_test)
    gb.fit(train_scaled, y_Train)
    y_preds = gb.predict(test_scaled)
    r2 = r2_score(y_Test, y_preds, multioutput='uniform_average')
    print(r2)
Example #4
# (imports as in Example #1)
def test_atp7d():
    df = pd.read_csv('atp7d.csv')
    target = df.loc[:, df.columns.str.startswith('LBL')]
    df.drop(target.columns, axis=1, inplace=True)
    df, target = df.to_numpy(), target.to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(df,
                                                        target,
                                                        test_size=0.5,
                                                        random_state=42,
                                                        shuffle=True)
    gb = GradientBoostingRegressor(
        scoring='neg_mean_absolute_error',
        l2_regularization=0.880826520747869,
        min_samples_leaf=12,
        learning_rate=0.22445307581959334,
        max_iter=9999,
        n_iter_no_change=100,
        validation_split=0.1,
        verbose=1,
    )
    # scaler = StandardScaler()
    # scaler.fit(X_train)
    # train_scaled = scaler.transform(X_train)
    # test_scaled = scaler.transform(X_test)
    # scalery = StandardScaler()
    # scalery.fit(y_train)
    # y_Train = scalery.transform(y_train)
    # y_Test = scalery.transform(y_test)
    gb.fit(X_train, y_train)
    print('Fitting end')
    y_preds = gb.predict(X_test)
    r2 = r2_score(y_test, y_preds, multioutput='uniform_average')
    print(r2)
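With max_iter=9999 and n_iter_no_change=100, this run is governed almost entirely by early stopping on the 10% validation split, scored with negative mean absolute error. The number of boosting iterations that actually ran can be inspected through the fitted n_iter_ attribute (the same attribute Example #8 asserts on):

    # how many iterations early stopping actually allowed
    print(gb.n_iter_)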
Example #5
# (imports as in Example #1)
def test_scm20d():
    df = pd.read_csv('scm20d.csv')
    target = df.loc[:, df.columns.str.contains('L')]
    df.drop(target.columns, axis=1, inplace=True)
    df, target = df.to_numpy(), target.to_numpy()
    # test fraction written as 1503/7463, apparently to mirror the
    # dataset's published train/test split sizes
    c = 1503.0 / 7463.0
    X_train, X_test, y_train, y_test = train_test_split(df,
                                                        target,
                                                        test_size=c,
                                                        random_state=42,
                                                        shuffle=True)
    gb = GradientBoostingRegressor(
        l2_regularization=0.8640187696889217,
        min_samples_leaf=19,
        learning_rate=0.1164232801613771,
        max_iter=1998,
        n_iter_no_change=None,
    )
    scaler = StandardScaler()
    scaler.fit(X_train)
    train_scaled = scaler.transform(X_train)
    test_scaled = scaler.transform(X_test)
    scalery = StandardScaler()
    scalery.fit(y_train)
    y_Train = scalery.transform(y_train)
    y_Test = scalery.transform(y_test)
    gb.fit(train_scaled, y_Train)
    y_preds = gb.predict(test_scaled)
    r2 = r2_score(y_Test, y_preds, multioutput='uniform_average')
    print(r2)
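The fraction assigned to c appears meant to pin the test set to a fixed number of rows. If that is the intent, train_test_split also accepts an integer test_size for an absolute count, which states it directly (the same applies to the 1658/8145 fraction in Example #6):

    # same split expressed as an absolute test-set size, assuming a
    # fixed 1503-row test set is the intent:
    X_train, X_test, y_train, y_test = train_test_split(
        df, target, test_size=1503, random_state=42, shuffle=True)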
Example #6
# (imports as in Example #1)
def test_scm1d():
    df = pd.read_csv('scm1d.csv')
    target = df.loc[:, df.columns.str.contains('L')]
    df.drop(target.columns, axis=1, inplace=True)
    df, target = df.to_numpy(), target.to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(df,
                                                        target,
                                                        test_size=1658.0 / 8145.0,
                                                        random_state=42,
                                                        shuffle=True)
    gb = GradientBoostingRegressor(
        l2_regularization=0.07054193143238725,
        min_samples_leaf=23,
        learning_rate=0.12336530854190006,
        max_iter=1999,
        n_iter_no_change=None,
    )
    # scaler = StandardScaler()
    # scaler.fit(X_train)
    # train_scaled = scaler.transform(X_train)
    # test_scaled = scaler.transform(X_test)
    # scalery = StandardScaler()
    # scalery.fit(y_train)
    # y_Train = scalery.transform(y_train)
    # y_Test = scalery.transform(y_test)
    gb.fit(X_train, y_train)
    y_preds = gb.predict(X_test)
    r2 = r2_score(y_test, y_preds, multioutput='uniform_average')
    print(r2)
Example #7
# (imports as in Example #1, plus the following; the BinMapper and
# test-helper import paths are assumed)
from sklearn.datasets import make_regression
from numpy.testing import assert_allclose
from sklearn.utils.testing import assert_raises_regex
from pygbm.binning import BinMapper


def test_pre_binned_data():
    # Make sure that:
    # - training on numerical data and predicting on numerical data is the
    #   same as training on binned data and predicting on binned data
    # - training on numerical data and predicting on numerical data is the
    #   same as training on numerical data and predicting on binned data
    # - training on binned data and predicting on numerical data is not
    #   possible.

    X, y = make_regression(random_state=0)
    gbdt = GradientBoostingRegressor(scoring=None, random_state=0)
    mapper = BinMapper(random_state=0)
    X_binned = mapper.fit_transform(X)

    fit_num_pred_num = gbdt.fit(X, y).predict(X)
    fit_binned_pred_binned = gbdt.fit(X_binned, y).predict(X_binned)
    fit_num_pred_binned = gbdt.fit(X, y).predict(X_binned)

    assert_allclose(fit_num_pred_num, fit_binned_pred_binned)
    assert_allclose(fit_num_pred_num, fit_num_pred_binned)

    assert_raises_regex(
        ValueError,
        'This estimator was fitted with pre-binned data ',
        gbdt.fit(X_binned, y).predict, X
    )
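BinMapper is assumed to be pygbm's feature binner: it discretizes each column into small integer bin indices chosen from quantiles, so the tree-growing code can operate on compact uint8 data. A rough standalone illustration of the idea in NumPy, not pygbm's actual implementation:

import numpy as np

def quantile_bin(X, n_bins=256):
    # discretize each feature into n_bins quantile bins (indices 0..n_bins-1)
    binned = np.zeros_like(X, dtype=np.uint8)
    for j in range(X.shape[1]):
        thresholds = np.quantile(X[:, j], np.linspace(0, 1, n_bins + 1)[1:-1])
        binned[:, j] = np.searchsorted(thresholds, X[:, j]).astype(np.uint8)
    return binned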
Example #8
# (imports as in Example #1, plus make_regression from sklearn.datasets)
def test_early_stopping_regression(scoring, validation_split,
                                   n_iter_no_change, tol):

    max_iter = 500

    X, y = make_regression(random_state=0)

    gb = GradientBoostingRegressor(verbose=1,  # just for coverage
                                   scoring=scoring,
                                   tol=tol,
                                   validation_split=validation_split,
                                   max_iter=max_iter,
                                   n_iter_no_change=n_iter_no_change,
                                   random_state=0)
    gb.fit(X, y)
    print(gb.predict(X))

    if n_iter_no_change is not None:
        assert n_iter_no_change <= gb.n_iter_ < max_iter
    else:
        assert gb.n_iter_ == max_iter
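The test above expects pytest to inject its four arguments; the @pytest.mark.parametrize decorator is not shown in the snippet. A sketch of a parametrization that would exercise both scorer-based and loss-based early stopping, with illustrative tuples rather than the project's originals:

@pytest.mark.parametrize('scoring, validation_split, n_iter_no_change, tol', [
    ('neg_mean_squared_error', 0.1, 5, 1e-7),   # score on a held-out split
    ('neg_mean_squared_error', None, 5, 1e-1),  # score on the training data
    (None, None, None, None),                   # early stopping disabled
])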
Example #9
# (this snippet needs: os, warnings, pytest, SkipTest and SkipTestWarning,
# sklearn.utils.estimator_checks, and both pygbm estimators; it begins
# mid-way through a check loop whose opening lines are not shown)
                check.func is estimator_checks.check_classifiers_train):
            continue  # same, wrapped in a functools.partial object.

        try:
            check(name, estimator)
        except SkipTest as exception:
            # the only SkipTest thrown currently results from not
            # being able to import pandas.
            warnings.warn(str(exception), SkipTestWarning)


@pytest.mark.skipif(
    int(os.environ.get("NUMBA_DISABLE_JIT", 0)) == 1,
    reason="Potentially long")
@pytest.mark.parametrize('Estimator', (
    GradientBoostingRegressor(),
    GradientBoostingClassifier(n_iter_no_change=None, min_samples_leaf=5),))
def test_estimator_checks(Estimator):
    # Run the check_estimator() test suite on GBRegressor and GBClassifier.

    # Notes:
    # - Can't do early stopping with classifier because often
    #   validation_split=.1 leads to test_size=2 < n_classes and
    #   train_test_split raises an error.
    # - Also, need to set a low min_samples_leaf for
    #   check_classifiers_classes() to pass: with only 30 samples on the
    #   dataset, the root is never split with min_samples_leaf=20 and only the
    #   majority class is predicted.
    custom_check_estimator(Estimator)