Example #1
0
def test_xgb_with_regression_datasets(data, num_actors, modin_type_y):
    """Compare vanilla xgboost against modin xgb on a regression dataset.

    Trains both implementations on the same train/test split and checks
    that the per-round ``rmse`` histories and the final train MSE agree
    within tolerance.
    """
    dataset, param = data
    num_round = 10

    X_df = pd.DataFrame(dataset.data)
    y_df = modin_type_y(dataset.target)
    # Split X and y in a single call so their rows stay aligned.  Two
    # independent train_test_split calls (the previous code) shuffle
    # each array with a different random permutation, silently
    # mismatching features and labels.
    X_train, X_test, y_train, y_test = train_test_split(X_df, y_df)

    train_xgb_dmatrix = xgboost.DMatrix(X_train, label=y_train)
    test_xgb_dmatrix = xgboost.DMatrix(X_test, label=y_test)

    train_mxgb_dmatrix = xgb.DMatrix(X_train, label=y_train)
    test_mxgb_dmatrix = xgb.DMatrix(X_test, label=y_test)

    evals_result_xgb = {}
    evals_result_mxgb = {}
    verbose_eval = False
    bst = xgboost.train(
        param,
        train_xgb_dmatrix,
        num_round,
        evals_result=evals_result_xgb,
        evals=[(train_xgb_dmatrix, "train"), (test_xgb_dmatrix, "test")],
        verbose_eval=verbose_eval,
    )
    modin_bst = xgb.train(
        param,
        train_mxgb_dmatrix,
        num_round,
        evals_result=evals_result_mxgb,
        evals=[(train_mxgb_dmatrix, "train"), (test_mxgb_dmatrix, "test")],
        num_actors=num_actors,
        verbose_eval=verbose_eval,
    )

    # `subset` (not `param`) so the loop no longer shadows the
    # hyperparameter dict unpacked from `data` above.
    for subset in ["train", "test"]:
        assert len(evals_result_xgb[subset]["rmse"]) == len(
            evals_result_mxgb[subset]["rmse"])
        for i in range(len(evals_result_xgb[subset]["rmse"])):
            np.testing.assert_allclose(
                evals_result_xgb[subset]["rmse"][i],
                evals_result_mxgb[subset]["rmse"][i],
                rtol=0.0007,
            )

    predictions = bst.predict(train_xgb_dmatrix)
    modin_predictions = modin_bst.predict(train_mxgb_dmatrix)

    val = mean_squared_error(y_train, predictions)
    modin_val = mean_squared_error(y_train, modin_predictions)

    np.testing.assert_allclose(val, modin_val, rtol=1.25e-05)
Example #2
0
def test_xgb_with_multiclass_classification_datasets(data, num_actors,
                                                     modin_type_y):
    """Check modin xgb against vanilla xgboost on a multiclass dataset:
    per-round ``mlogloss`` histories must be close and the final
    accuracy scores must match exactly.
    """
    dataset, base_param = data
    num_round = 10
    param = dict(base_param,
                 objective="multi:softprob",
                 eval_metric="mlogloss")

    X, y = dataset.data, dataset.target
    xgb_dmatrix = xgboost.DMatrix(X, label=y)

    modin_X = pd.DataFrame(X)
    modin_y = modin_type_y(y)
    mxgb_dmatrix = xgb.DMatrix(modin_X, label=modin_y)

    evals_result_xgb, evals_result_mxgb = {}, {}
    bst = xgboost.train(
        param,
        xgb_dmatrix,
        num_round,
        evals_result=evals_result_xgb,
        evals=[(xgb_dmatrix, "train")],
        verbose_eval=False,
    )
    modin_bst = xgb.train(
        param,
        mxgb_dmatrix,
        num_round,
        evals_result=evals_result_mxgb,
        evals=[(mxgb_dmatrix, "train")],
        num_actors=num_actors,
        verbose_eval=False,
    )

    history = evals_result_xgb["train"]["mlogloss"]
    modin_history = evals_result_mxgb["train"]["mlogloss"]
    assert len(history) == len(modin_history)
    for expected, actual in zip(history, modin_history):
        np.testing.assert_allclose(expected, actual, atol=0.009)

    predictions = bst.predict(xgb_dmatrix)
    modin_predictions = modin_bst.predict(mxgb_dmatrix)

    # Turn per-class probabilities into hard class labels (row-wise argmax).
    array_preds = np.asarray([np.argmax(row) for row in predictions])
    modin_array_preds = np.asarray(
        [np.argmax(row) for row in modin_predictions.to_numpy()])

    val = accuracy_score(y, array_preds)
    modin_val = accuracy_score(modin_y, modin_array_preds)

    np.testing.assert_allclose(val, modin_val)
Example #3
0
def test_feature_names():
    """Verify that explicit ``feature_names`` flow through DMatrix
    construction, training and prediction for both xgboost and modin,
    and that predicting on a DMatrix with default (different) names
    raises ``ValueError``.
    """
    dataset = load_breast_cancer()
    X, y = dataset.data, dataset.target
    names = [f"feat{i}" for i in range(X.shape[1])]

    check_dmatrix(
        X,
        y,
        feature_names=names,
    )

    dmatrix = xgb.DMatrix(X, label=y, feature_names=names)
    md_dmatrix = mxgb.DMatrix(pd.DataFrame(X),
                              label=pd.Series(y),
                              feature_names=names)

    # NOTE(review): "mlogloss" with a binary objective looks odd
    # (logloss is the usual binary metric) — kept as-is to preserve
    # behavior; confirm intent.
    params = {
        "objective": "binary:logistic",
        "eval_metric": "mlogloss",
    }

    booster = xgb.train(params, dmatrix, num_boost_round=10)
    md_booster = mxgb.train(params, md_dmatrix, num_boost_round=10)

    predictions = booster.predict(dmatrix)
    modin_predictions = md_booster.predict(md_dmatrix)

    preds = pandas.DataFrame(predictions).apply(np.round, axis=0)
    modin_preds = modin_predictions.apply(np.round, axis=0)

    accuracy = accuracy_score(y, preds)
    md_accuracy = accuracy_score(y, modin_preds)

    np.testing.assert_allclose(accuracy, md_accuracy, atol=0.005, rtol=0.002)

    # Different feature_names (default) must raise error in this case
    dm = xgb.DMatrix(X)
    md_dm = mxgb.DMatrix(pd.DataFrame(X))
    with pytest.raises(ValueError):
        booster.predict(dm)
    with pytest.raises(ValueError):
        # repr() forces materialization of the lazy Modin result.
        repr(md_booster.predict(md_dm))
Example #4
0
def test_xgb_with_binary_classification_datasets(data, num_actors,
                                                 modin_type_y):
    """Compare vanilla xgboost against modin xgb on a binary
    classification dataset: per-round metric histories must be close
    and the final accuracies must agree within tolerance.
    """
    dataset, param = data
    num_round = 10

    X = dataset.data
    y = dataset.target
    xgb_dmatrix = xgboost.DMatrix(X, label=y)

    modin_X = pd.DataFrame(X)
    modin_y = modin_type_y(y)
    mxgb_dmatrix = xgb.DMatrix(modin_X, label=modin_y)

    evals_result_xgb = {}
    evals_result_mxgb = {}
    verbose_eval = False
    bst = xgboost.train(
        param,
        xgb_dmatrix,
        num_round,
        evals_result=evals_result_xgb,
        evals=[(xgb_dmatrix, "train")],
        verbose_eval=verbose_eval,
    )
    modin_bst = xgb.train(
        param,
        mxgb_dmatrix,
        num_round,
        evals_result=evals_result_mxgb,
        evals=[(mxgb_dmatrix, "train")],
        num_actors=num_actors,
        verbose_eval=verbose_eval,
    )

    for par in param["eval_metric"]:
        # Compare the two implementations' histories — the previous code
        # compared evals_result_xgb against itself, making the length
        # check a no-op.
        assert len(evals_result_xgb["train"][par]) == len(
            evals_result_mxgb["train"][par])
        for i in range(len(evals_result_xgb["train"][par])):
            np.testing.assert_allclose(
                evals_result_xgb["train"][par][i],
                evals_result_mxgb["train"][par][i],
                atol=0.011,
            )

    predictions = bst.predict(xgb_dmatrix)
    modin_predictions = modin_bst.predict(mxgb_dmatrix)

    # Threshold probabilities at 0.5 via round to get hard labels.
    preds = pd.DataFrame(predictions).apply(lambda x: round(x))
    modin_preds = modin_predictions.apply(lambda x: round(x))

    val = accuracy_score(y, preds)
    modin_val = accuracy_score(modin_y, modin_preds)

    np.testing.assert_allclose(val, modin_val, atol=0.002, rtol=0.002)
Example #5
0
def check_dmatrix(data, label=None, **kwargs):
    """Construct a DMatrix with xgboost and with modin and verify they
    agree.

    If xgboost raises during construction, modin must raise a
    compatible exception type; otherwise the two matrices must match in
    row/column counts, feature names and feature types.
    """
    modin_data = pd.DataFrame(data)
    modin_label = None if label is None else pd.Series(label)
    try:
        dm = xgb.DMatrix(data, label=label, **kwargs)
    except Exception as xgb_exception:
        with pytest.raises(Exception) as mxgb_exception:
            mxgb.DMatrix(modin_data, label=modin_label, **kwargs)
        # Thrown exceptions are `XGBoostError`, which is a descendant of `ValueError`, and `ValueError`
        # for XGBoost and Modin, respectively,  so we intentionally use `xgb_exception`
        # as a first parameter of `isinstance` to pass the assertion
        assert isinstance(
            xgb_exception, type(mxgb_exception.value)
        ), "Got Modin Exception type {}, but xgboost Exception type {} was expected".format(
            type(mxgb_exception.value), type(xgb_exception))
    else:
        md_dm = mxgb.DMatrix(modin_data, label=modin_label, **kwargs)
        for accessor in ("num_row", "num_col"):
            assert getattr(md_dm, accessor)() == getattr(dm, accessor)()
        assert md_dm.feature_names == dm.feature_names
        assert md_dm.feature_types == dm.feature_types
Example #6
0
def test_invalid_input():
    """Non-DMatrix / non-DataFrame inputs must be rejected with
    ``AssertionError`` by DMatrix, train and predict.
    """
    raw_rows = [[1, 2.0, True], [2, 3.0, False]]
    # Check that DMatrix uses only DataFrame
    with pytest.raises(AssertionError):
        xgb.DMatrix(raw_rows, label=pd.Series([1, 2]))

    param = {}
    num_round = 2
    # Check that train uses only DMatrix
    with pytest.raises(AssertionError):
        xgb.train(param, raw_rows, num_round)

    # A proper DataFrame/DMatrix pair trains fine.
    df = pd.DataFrame(raw_rows, columns=["a", "b", "c"])
    modin_dtrain = xgb.DMatrix(df, label=pd.Series([1, 2]))
    modin_bst = xgb.train(param, modin_dtrain, num_round)

    # Check that predict uses only DMatrix
    with pytest.raises(AssertionError):
        modin_bst.predict([[1, 2.0, 3.3], [2, 3.0, 4.4]])
Example #7
0
def test_feature_weights():
    """``feature_weights`` set via ``set_info`` must round-trip
    identically through xgboost and modin DMatrix, including resetting
    with an empty array.
    """
    n_rows, n_cols = 10, 50
    weights = rng.uniform(size=n_cols)
    X = rng.randn(n_rows, n_cols)
    dm = xgb.DMatrix(X)
    md_dm = mxgb.DMatrix(pd.DataFrame(X))
    for matrix in (dm, md_dm):
        matrix.set_info(feature_weights=weights)
    np.testing.assert_allclose(dm.get_float_info("feature_weights"),
                               md_dm.get_float_info("feature_weights"))

    # Handle empty: an empty array clears the weights on both sides.
    for matrix in (dm, md_dm):
        matrix.set_info(feature_weights=np.empty((0, )))
    assert (dm.get_float_info("feature_weights").shape[0] ==
            md_dm.get_float_info("feature_weights").shape[0] == 0)
Example #8
0
def test_backend():
    # Smoke test: train on a minimal one-row DMatrix.  A ValueError is
    # deliberately tolerated (presumably raised when the active Modin
    # backend is unsupported — TODO confirm); any other exception type
    # propagates and fails the test.
    try:
        xgb.train({}, xgb.DMatrix(pd.DataFrame([0]), pd.DataFrame([0])))
    except ValueError:
        pass