def test_multiclass_classifier(loop):  # noqa
    """Compare dask-backed multiclass predictions with plain xgboost.

    Fits ``xgb.XGBClassifier`` and ``dxgb.XGBClassifier`` on the iris data,
    once from arrays and once from frames, and asserts that ``predict`` and
    ``predict_proba`` agree within each in-memory / dask pair.
    """
    iris = load_iris()
    X, y = iris.data, iris.target

    # Array flavour: the same data wrapped as dask arrays.
    dask_X = da.from_array(X, 5)
    dask_y = da.from_array(y, 5)

    # Frame flavour: pandas objects and their dask counterparts.
    frame = pd.DataFrame(X, columns=iris.feature_names)
    target = pd.Series(y, name="target")
    dask_frame = dd.from_pandas(frame, 2)
    dask_target = dd.from_pandas(target, 2)

    # One in-memory / dask model pair per flavour.
    array_local = xgb.XGBClassifier()
    array_dask = dxgb.XGBClassifier()
    frame_local = xgb.XGBClassifier()
    frame_dask = dxgb.XGBClassifier()

    with cluster() as (sched, [_, _]), Client(sched["address"], loop=loop):
        # fit
        array_local.fit(X, y)
        array_dask.fit(dask_X, dask_y, classes=[0, 1, 2])
        frame_local.fit(frame, target)
        frame_dask.fit(dask_frame, dask_target, classes=[0, 1, 2])

        # check: labels first, then probabilities, for each flavour
        comparisons = (
            (array_local, X, array_dask, dask_X),
            (frame_local, frame, frame_dask, dask_frame),
        )
        for local_clf, local_data, dask_clf, dask_data in comparisons:
            da.utils.assert_eq(
                local_clf.predict(local_data), dask_clf.predict(dask_data)
            )
            da.utils.assert_eq(
                local_clf.predict_proba(local_data),
                dask_clf.predict_proba(dask_data),
            )
async def test_predict_proba(c, s, a, b):
    """Check ``predict_proba`` on concrete data against the raw booster.

    For a dask array and then a dask dataframe, the classifier is fitted on
    the dask collection, its ``_Booster`` is awaited, and ``predict_proba``
    on the computed (in-memory) data must equal the booster's own
    ``predict`` on an ``xgb.DMatrix`` built from the same data.
    """
    X = da.random.random((50, 2), chunks=25)
    y = da.random.randint(0, 2, size=50, chunks=25)
    local_X = await c.compute(X)

    # array
    array_clf = dxgb.XGBClassifier()
    array_clf.fit(X, y, classes=[0, 1])
    array_booster = await array_clf._Booster

    np.testing.assert_array_equal(
        array_clf.predict_proba(local_X),
        array_booster.predict(xgb.DMatrix(local_X)),
    )

    # dataframe
    frame_X = dd.from_dask_array(X, columns=['A', 'B'])
    frame_y = dd.from_dask_array(y)
    local_frame = await c.compute(frame_X)

    frame_clf = dxgb.XGBClassifier()
    frame_clf.fit(frame_X, frame_y, classes=[0, 1])
    frame_booster = await frame_clf._Booster

    np.testing.assert_array_equal(
        frame_clf.predict_proba(local_frame),
        frame_booster.predict(xgb.DMatrix(local_frame)),
    )
def test_classifier_multi(kind, loop):  # noqa: F811
    """Exercise a three-class ``multi:softprob`` classifier on dask input.

    ``kind == "array"`` builds dask arrays from the module-level ``X`` and a
    fixed label vector; any other value builds dask dataframes from the
    module-level ``df`` and ``labels``.  Predictions and probabilities must
    be dask collections whose computed shapes are ``(10,)`` and ``(10, 3)``.
    """
    is_array = kind == "array"

    if is_array:
        X2 = da.from_array(X, 5)
        y2 = da.from_array(np.array([0, 1, 2, 0, 1, 2, 0, 0, 0, 1]), chunks=5)
    else:
        X2 = dd.from_pandas(df, npartitions=2)
        y2 = dd.from_pandas(labels, npartitions=2)

    with cluster() as (sched, [_, _]), Client(sched["address"], loop=loop):
        clf = dxgb.XGBClassifier(
            num_class=3,
            n_estimators=10,
            objective="multi:softprob",
        )
        clf.fit(X2, y2)

        predicted = clf.predict(X2)
        assert dask.is_dask_collection(predicted)
        # The lazy shape is only asserted for arrays.
        if is_array:
            assert predicted.shape == (10,)
        assert predicted.compute().shape == (10,)

        # proba
        probabilities = clf.predict_proba(X2)
        assert dask.is_dask_collection(probabilities)
        if is_array:
            assert probabilities.shape == (10, 3)
        assert probabilities.compute().shape == (10, 3)
def test_validation_weights_xgbclassifier(loop):  # noqa
    """Check that weighting the eval set changes the reported logloss.

    Trains ``dxgb.XGBClassifier`` twice on the same weighted training data:
    first with an unweighted ``eval_set``, then with
    ``sample_weight_eval_set`` supplied.  The first two ``"logloss"`` values
    recorded under ``"validation_0"`` must differ between the two runs.
    """
    from sklearn.datasets import make_hastie_10_2

    # prepare training and test data
    X, y = make_hastie_10_2(n_samples=2000, random_state=42)
    # Only the integer-encoded labels are used; the unique values are dropped.
    _, y = np.unique(y, return_inverse=True)

    param_dist = {
        "objective": "binary:logistic",
        "n_estimators": 2,
        "random_state": 123,
    }

    with cluster() as (s, [_, _]):
        with Client(s["address"], loop=loop):
            X_train, X_test = X[:1600], X[1600:]
            y_train, y_test = y[:1600], y[1600:]

            dX_train = da.from_array(X_train)
            dy_train = da.from_array(y_train)

            # instantiate model
            clf = dxgb.XGBClassifier(**param_dist)

            # train it using instance weights only in the training set;
            # seed first so the training weights (and therefore the fitted
            # model) are reproducible from run to run
            np.random.seed(0)
            weights_train = np.random.choice([1, 2], len(X_train))
            weights_train = da.from_array(weights_train)
            clf.fit(
                dX_train,
                dy_train,
                sample_weight=weights_train,
                eval_set=[(X_test, y_test)],
                eval_metric="logloss",
            )

            # evaluate logloss metric on test set *without* using weights
            evals_result_without_weights = clf.evals_result()
            logloss_without_weights = (
                evals_result_without_weights["validation_0"]["logloss"]
            )

            # now use weights for the test set
            np.random.seed(0)
            weights_test = np.random.choice([1, 2], len(X_test))
            clf.fit(
                dX_train,
                dy_train,
                sample_weight=weights_train,
                eval_set=[(X_test, y_test)],
                sample_weight_eval_set=[weights_test],
                eval_metric="logloss",
            )
            evals_result_with_weights = clf.evals_result()
            logloss_with_weights = (
                evals_result_with_weights["validation_0"]["logloss"]
            )

    # check that the logloss in the test set is actually different
    # when using weights than when not using them
    assert all(
        logloss_with_weights[i] != logloss_without_weights[i] for i in [0, 1]
    )
def __init__(self, client, random_seed=42, n_jobs=20, verbose=True):
    """Initialise the model around a ``dask_xgboost.XGBClassifier``.

    ``random_seed``, ``n_jobs`` and ``verbose`` are forwarded positionally
    to the parent initialiser; ``client`` is stored on the instance.
    """
    super(DaskModel, self).__init__(random_seed, n_jobs, verbose)

    # Estimator and the client handle it is paired with.
    self.model = dask_xgboost.XGBClassifier()
    self.client = client

    # Callables stored for later use: the train/test splitter from dask-ml
    # and the class's own ``_acc_score``.
    self.tts = dask_ml.model_selection.train_test_split
    self.scoring = DaskModel._acc_score
def test_classifier_different_chunks(loop):  # noqa
    """Fitting with differently chunked ``X`` and ``y`` raises ``ValueError``.

    The module-level ``X`` and ``y`` are wrapped as dask arrays with chunk
    sizes 5 and 4 respectively before being passed to ``fit``.
    """
    with cluster() as (sched, [_, _]), Client(sched["address"], loop=loop):
        clf = dxgb.XGBClassifier()
        features = da.from_array(X, 5)
        # Chunk size differs from the features on purpose.
        target = da.from_array(y, 4)

        with pytest.raises(ValueError):
            clf.fit(features, target)
def test_classifier_early_stopping(loop):  # noqa
    """Check ``best_score`` under different ``early_stopping_rounds``.

    Three ``dxgb.XGBClassifier`` instances are fitted on the two-class digits
    data with ``eval_metric="auc"`` and the held-out split as ``eval_set``.
    With 5 and 4 early-stopping rounds the best scores must be equal and not
    1; with 10 rounds the best score must be exactly 1.
    """
    # data
    # ``n_class`` is passed by keyword: recent scikit-learn releases declare
    # it keyword-only, so the positional form ``load_digits(2)`` fails there.
    digits = load_digits(n_class=2)
    X = digits["data"]
    y = digits["target"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    dX_train = da.from_array(X_train)
    dy_train = da.from_array(y_train)

    clf1 = dxgb.XGBClassifier()
    clf2 = dxgb.XGBClassifier()
    clf3 = dxgb.XGBClassifier()

    def _fit_with_early_stopping(clf, rounds):
        # Same training data, metric and eval set; only the patience varies.
        clf.fit(
            dX_train,
            dy_train,
            early_stopping_rounds=rounds,
            eval_metric="auc",
            eval_set=[(X_test, y_test)],
        )

    with cluster() as (s, [_, _]):
        with Client(s["address"], loop=loop):
            _fit_with_early_stopping(clf1, 5)
            _fit_with_early_stopping(clf2, 4)

            # should be the same
            assert clf1.best_score == clf2.best_score
            assert clf1.best_score != 1

            # check overfit
            _fit_with_early_stopping(clf3, 10)
            assert clf3.best_score == 1
def test_classifier_evals_result(loop):  # noqa
    """``evals_result`` from the dask classifier matches plain xgboost.

    Both classifiers are fitted on the module-level ``X`` and ``y`` (the dask
    one on chunked dask arrays) with ``eval_metric="rmse"`` and the in-memory
    data as the single eval set.
    """
    with cluster() as (sched, [_, _]), Client(sched["address"], loop=loop):
        dask_clf = dxgb.XGBClassifier()
        dask_X = da.from_array(X, 5)
        dask_y = da.from_array(y, 5)
        dask_clf.fit(dask_X, dask_y, eval_metric="rmse", eval_set=[(X, y)])
        dask_evals = dask_clf.evals_result()

        local_clf = xgb.XGBClassifier()
        local_clf.fit(X, y, eval_metric="rmse", eval_set=[(X, y)])

        assert_eq(dask_evals, local_clf.evals_result())
def test_classifier(loop):  # noqa
    """The dask classifier agrees with plain xgboost on ``X`` / ``y``.

    Fits ``dxgb.XGBClassifier`` on dask arrays built from the module-level
    ``X`` and ``y`` and ``xgb.XGBClassifier`` on the in-memory data, then
    compares feature importances (almost equal) and predictions.
    """
    with cluster() as (sched, [_, _]), Client(sched['address'], loop=loop):
        dask_clf = dxgb.XGBClassifier()
        dask_X = da.from_array(X, 5)
        dask_y = da.from_array(y, 5)
        dask_clf.fit(dask_X, dask_y)
        dask_pred = dask_clf.predict(dask_X)

        local_clf = xgb.XGBClassifier()
        local_clf.fit(X, y)

        np.testing.assert_array_almost_equal(
            dask_clf.feature_importances_,
            local_clf.feature_importances_,
        )
        assert_eq(dask_pred, local_clf.predict(X))
def test_classifier(loop):  # noqa
    """The dask classifier agrees with plain xgboost on two-class digits.

    Fits ``dxgb.XGBClassifier`` on dask arrays and ``xgb.XGBClassifier`` on
    the in-memory data, then compares feature importances (almost equal) and
    predictions.
    """
    # ``n_class`` is passed by keyword: recent scikit-learn releases declare
    # it keyword-only, so the positional form ``load_digits(2)`` fails there.
    digits = load_digits(n_class=2)
    X = digits["data"]
    y = digits["target"]

    with cluster() as (s, [_, _]):
        with Client(s["address"], loop=loop):
            # Distributed model, trained on dask views of the data.
            a = dxgb.XGBClassifier()
            X2 = da.from_array(X)
            y2 = da.from_array(y)
            a.fit(X2, y2)
            p1 = a.predict(X2)

            # In-memory reference model.
            b = xgb.XGBClassifier()
            b.fit(X, y)

            np.testing.assert_array_almost_equal(
                a.feature_importances_, b.feature_importances_
            )
            assert_eq(p1, b.predict(X))