def test_categorical_data_subset(mock_fit, mock_predict, mock_predict_proba,
                                 X_y_binary):
    """Check category codes stay consistent when predicting on a subset.

    Fits ``LightGBMClassifier`` on a frame containing a string column, then
    calls ``predict`` and ``predict_proba`` with rows that only use some of
    the fitted string values ("c" and "a"). The frames recorded by the mocks
    must carry the same float codes that were used at fit time
    (a -> 0.0, b -> 1.0, c -> 2.0), as ``category`` dtype, with columns
    renamed to positional indices.

    The ``mock_*`` arguments are mock objects whose ``call_args`` record what
    the component forwarded; what they patch is configured outside this
    function. ``X_y_binary`` is accepted but not used here.
    """
    X = pd.DataFrame({
        "feature_1": [0, 0, 1, 1, 0, 1],
        "feature_2": ["a", "a", "b", "b", "c", "c"]
    })
    y = pd.Series([1, 1, 0, 0, 0, 1])
    X_expected = pd.DataFrame({
        0: [0, 0, 1, 1, 0, 1],
        1: [0.0, 0.0, 1.0, 1.0, 2.0, 2.0]
    })
    # Replace the column outright. ``X_expected.iloc[:, 1] = ...`` writes the
    # values in place on recent pandas, which would leave the column as
    # float64 rather than converting it to category.
    X_expected[1] = X_expected[1].astype('category')

    X_subset = pd.DataFrame({"feature_1": [1, 0], "feature_2": ["c", "a"]})
    X_expected_subset = pd.DataFrame({0: [1, 0], 1: [2.0, 0.0]})
    X_expected_subset[1] = X_expected_subset[1].astype('category')

    clf = LightGBMClassifier()
    clf.fit(X, y)
    # first positional argument of the most recent recorded call
    arg_X = mock_fit.call_args[0][0]
    assert_frame_equal(X_expected, arg_X)

    # determine whether predict and predict_proba perform as expected with
    # the subset of categorical data
    clf.predict(X_subset)
    arg_X = mock_predict.call_args[0][0]
    assert_frame_equal(X_expected_subset, arg_X)

    clf.predict_proba(X_subset)
    arg_X = mock_predict_proba.call_args[0][0]
    assert_frame_equal(X_expected_subset, arg_X)
def test_multiple_fit(mock_fit, mock_predict, mock_predict_proba):
    """Check encoding of a string feature across two independent fits.

    Each round builds a fresh ``LightGBMClassifier``, fits it on a
    single-column string frame, and verifies that the frames recorded by the
    mocks for ``fit``, ``predict`` and ``predict_proba`` hold the expected
    float codes as a ``category`` column named ``0``. The second round's data
    contains an extra value ("d") not present in the first.
    """
    y = pd.Series([1] * 4)

    def as_category_frame(codes):
        # expected frames: one positional column of category dtype
        return pd.DataFrame({0: codes}, dtype='category')

    # (fit values, expected fit codes, predict values, expected predict codes)
    rounds = [
        (["a", "b", "c", "c"], [0.0, 1.0, 2.0, 2.0],
         ["a", "a", "b", "c"], [0.0, 0.0, 1.0, 2.0]),
        # Check if it will fit a different dataset with new variable
        (["c", "b", "a", "d"], [2.0, 1.0, 0.0, 3.0],
         ["d", "c", "b", "a"], [3.0, 2.0, 1.0, 0.0]),
    ]

    for fit_values, fit_codes, predict_values, predict_codes in rounds:
        X_fit = pd.DataFrame({"feature": fit_values})
        X_predict = pd.DataFrame({"feature": predict_values})
        fit_expected = as_category_frame(fit_codes)
        predict_expected = as_category_frame(predict_codes)

        estimator = LightGBMClassifier()
        estimator.fit(X_fit, y)
        assert_frame_equal(fit_expected, mock_fit.call_args[0][0])

        estimator.predict(X_predict)
        assert_frame_equal(predict_expected, mock_predict.call_args[0][0])

        estimator.predict_proba(X_predict)
        assert_frame_equal(predict_expected,
                           mock_predict_proba.call_args[0][0])
def test_correct_args(mock_fit, mock_predict, mock_predict_proba, X_y_binary):
    """Check the frames forwarded to fit/predict/predict_proba are encoded.

    Adds one string column and one categorical column to the binary fixture
    data, then verifies that the frame recorded by each mock has those two
    columns replaced by float codes with ``category`` dtype and has its
    columns renamed to positional indices.
    """
    X, y = X_y_binary
    X = pd.DataFrame(X)
    n_rows = len(X)
    half = n_rows // 2

    # add object (string) and categorical data. Each column is built in a
    # single assignment: writing through ``X[col].iloc[...] = value`` is
    # chained assignment, which pandas warns about and which does not modify
    # ``X`` under copy-on-write.
    X['string_col'] = ['abc'] * half + ['cba'] * (n_rows - half)
    X['categorical_data'] = ['square'] * half + ['circle'] * (n_rows - half)
    X['categorical_data'] = X['categorical_data'].astype('category')

    # create the expected result, which is a dataframe with float codes in
    # the string/categorical columns and dtype=category
    X_expected = X.copy()
    X_expected = X_expected.replace(["abc", "cba"], [0.0, 1.0])
    X_expected = X_expected.replace(["square", "circle"], [1.0, 0.0])
    encoded_cols = ['string_col', 'categorical_data']
    X_expected[encoded_cols] = X_expected[encoded_cols].astype('category')

    # rename the columns to be the indices
    X_expected.columns = np.arange(X_expected.shape[1])

    clf = LightGBMClassifier()
    clf.fit(X, y)
    arg_X = mock_fit.call_args[0][0]
    assert_frame_equal(X_expected, arg_X)

    clf.predict(X)
    arg_X = mock_predict.call_args[0][0]
    assert_frame_equal(X_expected, arg_X)

    clf.predict_proba(X)
    arg_X = mock_predict_proba.call_args[0][0]
    assert_frame_equal(X_expected, arg_X)
def test_binary_label_encoding(mock_predict, X_y_binary):
    """Smoke-test fit/predict with string and then float binary labels.

    There are no assertions; the test passes as long as ``fit`` and
    ``predict`` complete for each label encoding. The same estimator
    instance is refit for the second encoding.
    """
    X, y = X_y_binary
    y_numeric = pd.Series(y, dtype='int64')
    clf = LightGBMClassifier()

    label_maps = (
        {0: "no", 1: "yes"},    # string labels
        {0: 0.99, 1: 1.01},     # non-integer float labels
    )
    for label_map in label_maps:
        # ``replace`` returns a new Series, so ``y_numeric`` is left intact
        y_encoded = y_numeric.replace(label_map)
        clf.fit(X, y_encoded)
        clf.predict(X)
def test_multiclass_label(mock_predict, X_y_multi):
    """Smoke-test fit/predict with string labels for a three-class target.

    There are no assertions; the test passes as long as ``fit`` and
    ``predict`` complete.

    NOTE(review): a later function in this module is also named
    ``test_multiclass_label``. That later definition rebinds the name, so
    this one is not collected by pytest as written.
    """
    X, y = X_y_multi
    class_names = {0: "alright", 1: "better", 2: "great"}
    # ``replace`` returns a new Series holding the string labels
    y_alpha = pd.Series(y, dtype='int64').replace(class_names)

    estimator = LightGBMClassifier()
    estimator.fit(X, y_alpha)
    estimator.predict(X)
def test_fit_no_categories(mock_predict, mock_predict_proba, X_y_binary):
    """Check data without categorical columns is forwarded unchanged in value.

    After fitting on the binary fixture data, the frame recorded by each
    mock for ``predict`` and ``predict_proba`` must equal the input values,
    compared against a copy of the input with positional column labels.
    """
    X, y = X_y_binary
    expected = pd.DataFrame(X)
    expected.columns = np.arange(expected.shape[1])

    estimator = LightGBMClassifier(n_jobs=1)
    estimator.fit(X, y)

    checks = (("predict", mock_predict),
              ("predict_proba", mock_predict_proba))
    for method_name, mocked in checks:
        getattr(estimator, method_name)(X)
        # first positional argument of the most recent recorded call
        forwarded = mocked.call_args[0][0]
        np.testing.assert_array_equal(forwarded, expected)
def test_multiclass_label(mock_fit, mock_predict, X_y_multi):
    """Check string multiclass labels are passed to fit as integer codes.

    Fits on string labels derived from the integer target and asserts that
    the second positional argument recorded by ``mock_fit`` equals the
    original int64 target. ``predict`` is then called as a smoke test.

    NOTE(review): an earlier function in this module has the same name; this
    definition replaces it at import time.
    """
    X, y = X_y_multi
    y_numeric = pd.Series(y, dtype='int64')
    class_names = {0: "alright", 1: "better", 2: "great"}
    # ``replace`` returns a new Series, leaving ``y_numeric`` untouched
    y_alpha = y_numeric.replace(class_names)

    estimator = LightGBMClassifier()
    estimator.fit(X, y_alpha)

    # index 1 of the positional args is the target handed to fit
    forwarded_y = mock_fit.call_args[0][1]
    assert_series_equal(forwarded_y, y_numeric)

    estimator.predict(X)
def test_fit_string_features(X_y_binary):
    """Compare the component against raw LightGBM on data with a string column.

    The reference ``LGBMClassifier`` is trained on a copy of the data where
    the constant string column is replaced by ``0.0`` and declared as a
    categorical feature. The component is trained on the string data
    directly. Predictions and probabilities must agree to 5 decimals.
    """
    X, y = X_y_binary
    X = pd.DataFrame(X)
    X['string_col'] = 'abc'

    # lightGBM requires input args to be int, float, or bool, not string
    X_numeric = X.copy()
    X_numeric['string_col'] = 0.0

    reference = lgbm.sklearn.LGBMClassifier(random_state=0, n_jobs=1)
    reference.fit(X_numeric, y, categorical_feature=['string_col'])
    expected_pred = reference.predict(X_numeric)
    expected_proba = reference.predict_proba(X_numeric)

    component = LightGBMClassifier(n_jobs=1)
    component.fit(X, y)
    # unwrap the component's outputs to plain arrays for comparison
    actual_pred = component.predict(X).to_series().values
    actual_proba = component.predict_proba(X).to_dataframe().values

    np.testing.assert_almost_equal(expected_pred, actual_pred, decimal=5)
    np.testing.assert_almost_equal(expected_proba, actual_proba, decimal=5)
def test_fit_predict_multi(X_y_multi):
    """Compare the component against raw LightGBM on multiclass data.

    Both estimators are fitted on the same data; predictions and predicted
    probabilities must agree to 5 decimals.
    """
    X, y = X_y_multi

    sk_clf = lgbm.sklearn.LGBMClassifier(random_state=0)
    sk_clf.fit(X, y)
    y_pred_sk = sk_clf.predict(X)
    y_pred_proba_sk = sk_clf.predict_proba(X)

    clf = LightGBMClassifier()
    clf.fit(X, y)
    y_pred = clf.predict(X)
    y_pred_proba = clf.predict_proba(X)

    # Unwrap the component's outputs to plain arrays before comparing, the
    # same way the binary and string-feature tests in this module do.
    np.testing.assert_almost_equal(y_pred_sk,
                                   y_pred.to_series().values,
                                   decimal=5)
    np.testing.assert_almost_equal(y_pred_proba_sk,
                                   y_pred_proba.to_dataframe().values,
                                   decimal=5)
def test_fit_predict_binary(X_y_binary):
    """Compare the component against raw LightGBM on binary data.

    Both estimators are fitted on the same data; predictions and predicted
    probabilities must agree to 5 decimals.
    """
    X, y = X_y_binary

    reference = lgbm.sklearn.LGBMClassifier(random_state=0)
    reference.fit(X, y)
    expected_pred = reference.predict(X)
    expected_proba = reference.predict_proba(X)

    component = LightGBMClassifier()
    component.fit(X, y)
    # unwrap the component's outputs to plain arrays for comparison
    actual_pred = component.predict(X).to_series().values
    actual_proba = component.predict_proba(X).to_dataframe().values

    np.testing.assert_almost_equal(expected_pred, actual_pred, decimal=5)
    np.testing.assert_almost_equal(expected_proba, actual_proba, decimal=5)
def test_lightgbm_multiindex(data_type, X_y_binary, make_data_type):
    """Check fit/predict work on data whose columns are a MultiIndex.

    A categorical column is appended to the binary fixture data, the columns
    are relabelled with two-level tuples, and both ``X`` and ``y`` are
    converted with ``make_data_type(data_type, ...)``. The test then asserts
    that neither the predictions nor the predicted probabilities contain
    null values.
    """
    X, y = X_y_binary
    X = pd.DataFrame(X)

    # First half ones, remainder zeros. Floor division plus the remainder
    # gives exactly one value per row, including for an odd row count, where
    # truncating ``n / 2`` twice would come up one element short.
    n_rows = len(X)
    n_ones = n_rows // 2
    categorical_col = pd.Series([1] * n_ones + [0] * (n_rows - n_ones),
                                dtype='category')
    X['cat'] = categorical_col

    col_names = [('column_{}'.format(num), '{}'.format(num))
                 for num in range(len(X.columns))]
    X.columns = pd.MultiIndex.from_tuples(col_names)
    X = make_data_type(data_type, X)
    y = make_data_type(data_type, y)

    clf = LightGBMClassifier()
    clf.fit(X, y)
    y_pred = clf.predict(X)
    y_pred_proba = clf.predict_proba(X)
    assert not y_pred.to_series().isnull().values.any()
    assert not y_pred_proba.to_dataframe().isnull().values.any()