Ejemplo n.º 1
0
def test_feature_importance(X_y_regression):
    """Compare feature importances against a directly fitted LightGBM model.

    A reference ``lgbm.sklearn.LGBMRegressor`` (20 estimators, seed 0) and a
    default ``LightGBMRegressor`` are fitted on the same data; the
    component's ``feature_importance`` must match the reference
    ``feature_importances_`` to 3 decimal places.
    """
    features, target = X_y_regression

    component = LightGBMRegressor()

    # Reference values come straight from the underlying library estimator.
    reference = lgbm.sklearn.LGBMRegressor(n_estimators=20, random_state=0)
    reference.fit(features, target)
    expected_importance = reference.feature_importances_

    component.fit(features, target)
    actual_importance = component.feature_importance

    np.testing.assert_almost_equal(
        expected_importance, actual_importance, decimal=3
    )
Ejemplo n.º 2
0
def test_fit_predict_regression(X_y_regression):
    """Compare predictions against a directly fitted LightGBM model.

    Predictions of a default ``LightGBMRegressor`` (converted with
    ``to_series().values``) must match those of
    ``lgbm.sklearn.LGBMRegressor(n_estimators=20, random_state=0)`` fitted on
    the same data, to 5 decimal places.
    """
    features, target = X_y_regression

    # Reference predictions from the underlying library estimator.
    reference = lgbm.sklearn.LGBMRegressor(n_estimators=20, random_state=0)
    reference.fit(features, target)
    expected = reference.predict(features)

    component = LightGBMRegressor()
    component.fit(features, target)
    predictions = component.predict(features)
    actual = predictions.to_series().values

    np.testing.assert_almost_equal(expected, actual, decimal=5)
Ejemplo n.º 3
0
def test_lightgbm_multiindex(data_type, X_y_regression, make_data_type):
    """Fit and predict on features whose columns form a two-level MultiIndex.

    The feature columns are relabelled with ``(name, number)`` tuples, both
    inputs are converted through the ``make_data_type`` fixture, and the
    resulting predictions must not contain any null values.
    """
    features, target = X_y_regression
    features = pd.DataFrame(features)

    # One ('column_<i>', '<i>') tuple per existing column.
    n_columns = features.shape[1]
    tuple_labels = [
        ('column_{}'.format(num), '{}'.format(num)) for num in range(n_columns)
    ]
    features.columns = pd.MultiIndex.from_tuples(tuple_labels)

    features = make_data_type(data_type, features)
    target = make_data_type(data_type, target)

    regressor = LightGBMRegressor()
    regressor.fit(features, target)
    predicted = regressor.predict(features).to_series()
    assert not predicted.isnull().values.any()
Ejemplo n.º 4
0
def test_fit_string_features(X_y_regression):
    """Check that a constant string column is handled like an encoded one.

    A ``LightGBMRegressor`` fitted on data containing a string column must
    predict the same values (to 5 decimals) as a plain
    ``lgbm.sklearn.LGBMRegressor`` fitted on a copy where that column is
    replaced by ``0.0`` and declared as a categorical feature.
    """
    features, target = X_y_regression
    features = pd.DataFrame(features)
    features['string_col'] = 'abc'

    # lightGBM requires input args to be int, float, or bool, not string,
    # so the reference model gets a numerically encoded copy.
    encoded = features.copy()
    encoded['string_col'] = 0.0

    reference = lgbm.sklearn.LGBMRegressor(n_estimators=20, random_state=0)
    reference.fit(encoded, target, categorical_feature=['string_col'])
    expected = reference.predict(encoded)

    component = LightGBMRegressor()
    component.fit(features, target)
    actual = component.predict(features).to_series().values

    np.testing.assert_almost_equal(expected, actual, decimal=5)
Ejemplo n.º 5
0
def test_regression_rf(X_y_regression):
    """Exercise the ``"rf"`` boosting type with invalid and valid bagging.

    Building and fitting with ``bagging_fraction=1.01`` is expected to raise
    ``lgbm.basic.LightGBMError`` mentioning ``bagging_fraction``. With
    ``bagging_freq=0`` the fit must succeed, and the component's parameters
    must then report ``bagging_freq == 0`` and ``bagging_fraction == 0.9``.
    """
    features, target = X_y_regression

    invalid_kwargs = dict(
        boosting_type="rf", bagging_freq=1, bagging_fraction=1.01
    )
    # Construction stays inside the context manager, as either step may raise.
    with pytest.raises(lgbm.basic.LightGBMError, match="bagging_fraction"):
        rejected = LightGBMRegressor(**invalid_kwargs)
        rejected.fit(features, target)

    regressor = LightGBMRegressor(boosting_type="rf", bagging_freq=0)
    regressor.fit(features, target)
    params = regressor.parameters
    assert params['bagging_freq'] == 0
    assert params['bagging_fraction'] == 0.9
Ejemplo n.º 6
0
def test_correct_args(mock_predict, X_y_regression):
    """Check the frame handed to the mocked predict call.

    A string column and a categorical column, each holding two distinct
    values, are appended to the regression features. After fitting and
    predicting, the first positional argument recorded by ``mock_predict``
    must equal a frame whose two added columns hold numeric codes with
    ``category`` dtype and whose column labels are ``0..n-1``.

    NOTE(review): ``mock_predict`` is presumably injected by a patch
    decorator that is not visible in this snippet -- confirm.
    """
    X, y = X_y_regression
    X = pd.DataFrame(X)
    half = len(X) // 2

    # add object (string) and categorical data.
    # The second half is written through one ``iloc`` call on the frame.
    # The chained form ``X[col].iloc[...] = value`` assigns to an
    # intermediate object: it raises SettingWithCopyWarning and leaves ``X``
    # unchanged when pandas Copy-on-Write is active.
    X['string_col'] = 'abc'
    X.iloc[half:, X.columns.get_loc('string_col')] = 'cba'
    X['categorical_data'] = 'square'
    X.iloc[half:, X.columns.get_loc('categorical_data')] = 'circle'
    X['categorical_data'] = X['categorical_data'].astype('category')

    # create the expected result, which is a dataframe with int values in the categorical column and dtype=category
    X_expected = X.copy()
    X_expected = X_expected.replace(["abc", "cba"], [0.0, 1.0])
    X_expected = X_expected.replace(["square", "circle"], [1.0, 0.0])
    X_expected[['string_col', 'categorical_data'
                ]] = X_expected[['string_col',
                                 'categorical_data']].astype('category')

    # rename the columns to be the indices
    X_expected.columns = np.arange(X_expected.shape[1])

    clf = LightGBMRegressor()
    clf.fit(X, y)

    clf.predict(X)
    # First positional argument of the most recent mocked predict call.
    arg_X = mock_predict.call_args[0][0]
    assert_frame_equal(X_expected, arg_X)
Ejemplo n.º 7
0
def test_categorical_data_subset(mock_fit, mock_predict, X_y_regression):
    """Check categorical encoding when predicting on a subset of categories.

    The component is fitted on a frame whose ``feature_2`` column has the
    categories ``a``, ``b`` and ``c``, then asked to predict on rows that
    contain only ``c`` and ``a``. The frames recorded by ``mock_fit`` and
    ``mock_predict`` must carry the codes ``a -> 0.0``, ``b -> 1.0`` and
    ``c -> 2.0`` as ``category`` dtype, with integer column labels.

    NOTE(review): ``mock_fit`` and ``mock_predict`` are presumably injected
    by patch decorators not visible in this snippet; ``X_y_regression`` is
    requested but unused in the body.
    """
    X = pd.DataFrame({
        "feature_1": [0, 0, 1, 1, 0, 1],
        "feature_2": ["a", "a", "b", "b", "c", "c"]
    })
    y = pd.Series([1, 1, 0, 0, 0, 1])
    X_expected = pd.DataFrame({
        0: [0, 0, 1, 1, 0, 1],
        1: [0.0, 0.0, 1.0, 1.0, 2.0, 2.0]
    })
    # Replace the whole column by label. On pandas >= 2.0 the form
    # ``df.iloc[:, 1] = ...`` writes into the existing float column in place,
    # so the intended ``category`` dtype would be silently dropped.
    X_expected[1] = X_expected[1].astype('category')

    X_subset = pd.DataFrame({"feature_1": [1, 0], "feature_2": ["c", "a"]})
    X_expected_subset = pd.DataFrame({0: [1, 0], 1: [2.0, 0.0]})
    X_expected_subset[1] = X_expected_subset[1].astype('category')

    clf = LightGBMRegressor()
    clf.fit(X, y)
    # First positional argument of the most recent mocked fit call.
    arg_X = mock_fit.call_args[0][0]
    assert_frame_equal(X_expected, arg_X)

    # determine whether predict performs as expected with the subset of categorical data
    clf.predict(X_subset)
    arg_X = mock_predict.call_args[0][0]
    assert_frame_equal(X_expected_subset, arg_X)
Ejemplo n.º 8
0
def test_lightgbm_regressor_random_state_bounds_seed(X_y_regression):
    """Ensure fitting works at the smallest and largest supported seeds.

    A minimal model (one estimator, depth one) is fitted once with
    ``SEED_BOUNDS.min_bound`` and once with ``SEED_BOUNDS.max_bound`` as
    ``random_state``; the first fit must return a ``LightGBMRegressor``.
    """
    features, target = X_y_regression
    n_features = len(features[0])
    features = pd.DataFrame(
        features, columns=["col_{}".format(i) for i in range(n_features)]
    )
    target = pd.Series(target)

    low_seed_model = LightGBMRegressor(
        n_estimators=1, max_depth=1, random_state=SEED_BOUNDS.min_bound
    )
    fit_result = low_seed_model.fit(features, target)
    assert isinstance(fit_result, LightGBMRegressor)

    # The upper bound only needs to fit without raising.
    high_seed_model = LightGBMRegressor(
        n_estimators=1, max_depth=1, random_state=SEED_BOUNDS.max_bound
    )
    high_seed_model.fit(features, target)
Ejemplo n.º 9
0
def test_lightgbm_regressor_random_state_bounds_rng(X_y_regression):
    """Ensure fitting works when an RNG yields the extreme seed values.

    A ``RandomState`` subclass whose ``randint`` always returns a fixed
    value is passed as ``random_state``. Fitting must succeed when that
    value is ``LightGBMRegressor.SEED_MIN`` and when it is
    ``LightGBMRegressor.SEED_MAX``.
    """
    def make_mock_random_state(return_value):
        # RandomState whose randint ignores its bounds and returns a constant.
        class MockRandomState(np.random.RandomState):
            def randint(self, min_bound, max_bound):
                return return_value

        return MockRandomState()

    features, target = X_y_regression
    n_features = len(features[0])
    features = pd.DataFrame(
        features, columns=["col_{}".format(i) for i in range(n_features)]
    )
    target = pd.Series(target)

    # Same minimal model for both extremes; each fit only has to not raise.
    for forced_seed in (LightGBMRegressor.SEED_MIN, LightGBMRegressor.SEED_MAX):
        rng = make_mock_random_state(forced_seed)
        regressor = LightGBMRegressor(
            n_estimators=1, max_depth=1, random_state=rng
        )
        regressor.fit(features, target)
Ejemplo n.º 10
0
def test_regression_goss(X_y_regression):
    """Check the bagging parameters reported after a ``"goss"`` fit.

    After fitting with ``boosting_type="goss"`` the component's parameters
    must report ``bagging_freq == 0`` and ``bagging_fraction == 0.9``.
    """
    features, target = X_y_regression
    regressor = LightGBMRegressor(boosting_type="goss")
    regressor.fit(features, target)

    params = regressor.parameters
    assert params['bagging_freq'] == 0
    assert params['bagging_fraction'] == 0.9
Ejemplo n.º 11
0
def test_multiple_fit(mock_predict):
    """Check that separate fits encode categories independently.

    Two fresh ``LightGBMRegressor`` instances are fitted on different
    single-column categorical datasets. For each, the frame recorded by
    ``mock_predict`` must hold the expected float codes as ``category``
    dtype under column label ``0``.

    NOTE(review): ``mock_predict`` is presumably injected by a patch
    decorator that is not visible in this snippet -- confirm.
    """
    target = pd.Series([1, 1, 1, 1])

    # First dataset: categories a, b, c.
    first_train = pd.DataFrame({"feature": ["a", "b", "c", "c"]})
    first_query = pd.DataFrame({"feature": ["a", "a", "b", "c"]})
    first_expected = pd.DataFrame({0: [0.0, 0.0, 1.0, 2.0]},
                                  dtype='category')

    first_model = LightGBMRegressor()
    first_model.fit(first_train, target)
    first_model.predict(first_query)
    first_passed = mock_predict.call_args[0][0]
    assert_frame_equal(first_expected, first_passed)

    # Check if it will fit a different dataset with new variable
    second_train = pd.DataFrame({"feature": ["c", "b", "a", "d"]})
    second_query = pd.DataFrame({"feature": ["d", "c", "b", "a"]})
    second_expected = pd.DataFrame({0: [3.0, 2.0, 1.0, 0.0]},
                                   dtype='category')

    second_model = LightGBMRegressor()
    second_model.fit(second_train, target)
    second_model.predict(second_query)
    second_passed = mock_predict.call_args[0][0]
    assert_frame_equal(second_expected, second_passed)