Example #1
class TestGPULinear:
    @given(parameter_strategy, strategies.integers(10, 50),
           tm.dataset_strategy)
    @settings(deadline=None)
    def test_gpu_coordinate(self, param, num_rounds, dataset):
        assume(len(dataset.y) > 0)
        param['updater'] = 'gpu_coord_descent'
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(),
                              num_rounds)['train'][dataset.metric]
        note(result)
        assert tm.non_increasing(result)

    # The loss is not guaranteed to decrease at every iteration because of the
    # regularisation parameters. We test the weaker condition that the loss has
    # not increased between the first and last iteration.
    @given(parameter_strategy, strategies.integers(10, 50),
           tm.dataset_strategy, strategies.floats(1e-5, 1.0),
           strategies.floats(1e-5, 1.0))
    @settings(deadline=None)
    def test_gpu_coordinate_regularised(self, param, num_rounds, dataset,
                                        alpha, lambd):
        assume(len(dataset.y) > 0)
        param['updater'] = 'gpu_coord_descent'
        param['alpha'] = alpha
        param['lambda'] = lambd
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(),
                              num_rounds)['train'][dataset.metric]
        note(result)
        assert tm.non_increasing([result[0], result[-1]])

    @pytest.mark.skipif(**tm.no_cupy())
    def test_gpu_coordinate_from_cupy(self):
        # Training a linear model is quite expensive, so we don't include it in
        # test_from_cupy.py
        import cupy
        params = {
            'booster': 'gblinear',
            'updater': 'gpu_coord_descent',
            'n_estimators': 100
        }
        X, y = tm.get_california_housing()
        cpu_model = xgb.XGBRegressor(**params)
        cpu_model.fit(X, y)
        cpu_predt = cpu_model.predict(X)

        X = cupy.array(X)
        y = cupy.array(y)
        gpu_model = xgb.XGBRegressor(**params)
        gpu_model.fit(X, y)
        gpu_predt = gpu_model.predict(X)
        cupy.testing.assert_allclose(cpu_predt, gpu_predt)
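
# Note: `train_result` is not shown in these snippets. A minimal sketch of
# what such a helper might look like, assuming it simply wraps `xgb.train`
# and returns the recorded evaluation history:
import xgboost as xgb

def train_result(param, dmat, num_rounds):
    # Train on `dmat` and collect the per-iteration metric values computed
    # on the training data itself.
    result = {}
    xgb.train(param, dmat, num_rounds, evals=[(dmat, 'train')],
              verbose_eval=False, evals_result=result)
    return result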
Example #2
class TestDeviceQuantileDMatrix(unittest.TestCase):
    def test_dmatrix_numpy_init(self):
        data = np.random.randn(5, 5)
        with pytest.raises(AssertionError,
                           match='is not supported for DeviceQuantileDMatrix'):
            dm = xgb.DeviceQuantileDMatrix(data, np.ones(5, dtype=np.float64))

    @pytest.mark.skipif(**tm.no_cupy())
    def test_dmatrix_cupy_init(self):
        import cupy as cp
        data = cp.random.randn(5, 5)
        dm = xgb.DeviceQuantileDMatrix(data, cp.ones(5, dtype=np.float64))
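
# Note: the `**tm.no_cupy()` idiom expands into pytest.mark.skipif keyword
# arguments. A hedged sketch of such a helper (the real one lives in
# xgboost's testing utilities):
import importlib.util

def no_cupy():
    # Skip the test when cupy cannot be imported.
    return {
        'condition': importlib.util.find_spec('cupy') is None,
        'reason': 'cupy is not installed.',
    }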
Example #3
class TestFromArrayInterface:
    '''Tests for constructing DMatrix from CuPy arrays (data structures
    implementing the CUDA array interface).'''
    @pytest.mark.skipif(**tm.no_cupy())
    def test_simple_dmat_from_cupy(self):
        _test_from_cupy(xgb.DMatrix)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_device_dmat_from_cupy(self):
        _test_from_cupy(xgb.DeviceQuantileDMatrix)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_cupy_training_device_dmat(self):
        _test_cupy_training(xgb.DeviceQuantileDMatrix)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_cupy_training_simple_dmat(self):
        _test_cupy_training(xgb.DMatrix)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_cupy_metainfo_simple_dmat(self):
        _test_cupy_metainfo(xgb.DMatrix)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_cupy_metainfo_device_dmat(self):
        _test_cupy_metainfo(xgb.DeviceQuantileDMatrix)
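
# Note: `_test_from_cupy` is defined elsewhere in the test module. A
# hypothetical re-creation, assuming it only checks that a (Device)DMatrix
# can be built from cupy arrays with the expected shape:
import cupy as cp

def _test_from_cupy(DMatrixT):
    rows, cols = 100, 10
    X = cp.random.randn(rows, cols)
    y = cp.random.randn(rows)
    m = DMatrixT(X, y)
    assert m.num_row() == rows
    assert m.num_col() == cols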
Example #4
class TestFromCupy:
    '''Tests for constructing DMatrix from CuPy arrays (data structures
    implementing the CUDA array interface).'''
    @pytest.mark.skipif(**tm.no_cupy())
    def test_simple_dmat_from_cupy(self):
        _test_from_cupy(xgb.DMatrix)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_device_dmat_from_cupy(self):
        _test_from_cupy(xgb.DeviceQuantileDMatrix)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_cupy_training_device_dmat(self):
        _test_cupy_training(xgb.DeviceQuantileDMatrix)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_cupy_training_simple_dmat(self):
        _test_cupy_training(xgb.DMatrix)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_cupy_metainfo_simple_dmat(self):
        _test_cupy_metainfo(xgb.DMatrix)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_cupy_metainfo_device_dmat(self):
        _test_cupy_metainfo(xgb.DeviceQuantileDMatrix)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_dlpack_simple_dmat(self):
        import cupy as cp
        n = 100
        X = cp.random.random((n, 2))
        xgb.DMatrix(X.toDlpack())

    @pytest.mark.skipif(**tm.no_cupy())
    def test_dlpack_device_dmat(self):
        import cupy as cp
        n = 100
        X = cp.random.random((n, 2))
        xgb.DeviceQuantileDMatrix(X.toDlpack())
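
# Note: `_test_cupy_training` is likewise defined elsewhere. A hedged
# sketch, assuming it trains a few rounds on GPU data and checks that the
# training metric was recorded:
import cupy as cp
import xgboost as xgb

def _test_cupy_training(DMatrixT):
    X = cp.random.randn(1000, 10)
    y = cp.random.randn(1000)
    dtrain = DMatrixT(X, y)
    history = {}
    xgb.train({'tree_method': 'gpu_hist'}, dtrain, num_boost_round=10,
              evals=[(dtrain, 'train')], evals_result=history,
              verbose_eval=False)
    assert 'rmse' in history['train']  # default objective reports rmse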
Example #5
class TestDistributedGPU(unittest.TestCase):
    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_cudf())
    @pytest.mark.skipif(**tm.no_dask_cudf())
    @pytest.mark.skipif(**tm.no_dask_cuda())
    @pytest.mark.mgpu
    def test_dask_dataframe(self):
        with LocalCUDACluster() as cluster:
            with Client(cluster) as client:
                run_with_dask_dataframe(dxgb.DaskDMatrix, client)
                run_with_dask_dataframe(dxgb.DaskDeviceQuantileDMatrix, client)

    @given(parameter_strategy, strategies.integers(1, 20),
           tm.dataset_strategy)
    @settings(deadline=duration(seconds=120))
    @pytest.mark.mgpu
    def test_gpu_hist(self, params, num_rounds, dataset):
        with LocalCUDACluster(n_workers=2) as cluster:
            with Client(cluster) as client:
                run_gpu_hist(params, num_rounds, dataset, dxgb.DaskDMatrix,
                             client)
                run_gpu_hist(params, num_rounds, dataset,
                             dxgb.DaskDeviceQuantileDMatrix, client)

    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.mgpu
    def test_dask_array(self):
        with LocalCUDACluster() as cluster:
            with Client(cluster) as client:
                run_with_dask_array(dxgb.DaskDMatrix, client)
                run_with_dask_array(dxgb.DaskDeviceQuantileDMatrix, client)
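
# Note: `run_with_dask_array` is a helper from the distributed test module.
# A hypothetical sketch, assuming it trains on dask arrays through the given
# DMatrix type and checks that a booster comes back:
import cupy as cp
import dask.array as da
import xgboost as xgb
from xgboost import dask as dxgb

def run_with_dask_array(DMatrixT, client):
    # Move the partitions onto the GPU so both DaskDMatrix and
    # DaskDeviceQuantileDMatrix can consume them.
    X = da.random.random((1000, 10), chunks=(100, 10)).map_blocks(cp.asarray)
    y = da.random.random(1000, chunks=100).map_blocks(cp.asarray)
    m = DMatrixT(client, X, y)
    out = dxgb.train(client, {'tree_method': 'gpu_hist'}, m,
                     num_boost_round=4)
    assert isinstance(out['booster'], xgb.Booster)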

    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_dask_cuda())
    @pytest.mark.mgpu
    def test_empty_dmatrix(self):
        with LocalCUDACluster() as cluster:
            with Client(cluster) as client:
                parameters = {'tree_method': 'gpu_hist',
                              'debug_synchronize': True}
                run_empty_dmatrix_reg(client, parameters)
                run_empty_dmatrix_cls(client, parameters)

    def run_quantile(self, name):
        if sys.platform.startswith("win"):
            pytest.skip("Skipping dask tests on Windows")

        exe = None
        for possible_path in {'./testxgboost', './build/testxgboost',
                              '../build/testxgboost', '../gpu-build/testxgboost'}:
            if os.path.exists(possible_path):
                exe = possible_path
        assert exe, 'No testxgboost executable found.'
        test = "--gtest_filter=GPUQuantile." + name

        def runit(worker_addr, rabit_args):
            port = None
            # Set up the environment for running the C++ part.
            for arg in rabit_args:
                if arg.decode('utf-8').startswith('DMLC_TRACKER_PORT'):
                    port = arg.decode('utf-8')
            port = port.split('=')
            env = os.environ.copy()
            env[port[0]] = port[1]
            return subprocess.run([exe, test], env=env, stdout=subprocess.PIPE)

        with LocalCUDACluster() as cluster:
            with Client(cluster) as client:
                workers = list(dxgb._get_client_workers(client).keys())
                rabit_args = client.sync(dxgb._get_rabit_args, workers, client)
                futures = client.map(runit,
                                     workers,
                                     pure=False,
                                     workers=workers,
                                     rabit_args=rabit_args)
                results = client.gather(futures)
                for ret in results:
                    msg = ret.stdout.decode('utf-8')
                    assert msg.find('1 test from GPUQuantile') != -1, msg
                    assert ret.returncode == 0, msg

    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.mgpu
    @pytest.mark.gtest
    def test_quantile_basic(self):
        self.run_quantile('AllReduceBasic')

    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.mgpu
    @pytest.mark.gtest
    def test_quantile_same_on_all_workers(self):
        self.run_quantile('SameOnAllWorkers')
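
# Note: `parameter_strategy` is a hypothesis strategy defined in the test
# module. The exact keys and ranges are not shown here; an illustrative
# sketch of such a strategy could be:
from hypothesis import strategies

parameter_strategy = strategies.fixed_dictionaries({
    # Illustrative ranges only.
    'max_depth': strategies.integers(1, 11),
    'max_leaves': strategies.integers(0, 256),
    'min_child_weight': strategies.floats(0.5, 2.0),
    'reg_alpha': strategies.floats(0.0, 2.0),
    'reg_lambda': strategies.floats(1e-5, 2.0),
})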
Example #6
        Xy = xgb.DeviceQuantileDMatrix(X, y, enable_categorical=True)
        assert Xy.num_row() == 3
        assert Xy.num_col() == 1

        X = X["f0"]
        with pytest.raises(ValueError):
            xgb.DMatrix(X, y)

        Xy = xgb.DMatrix(X, y, enable_categorical=True)
        assert Xy.num_row() == 3
        assert Xy.num_col() == 1


@pytest.mark.skipif(**tm.no_cudf())
@pytest.mark.skipif(**tm.no_cupy())
@pytest.mark.skipif(**tm.no_sklearn())
@pytest.mark.skipif(**tm.no_pandas())
def test_cudf_training_with_sklearn():
    from cudf import DataFrame as df
    from cudf import Series as ss
    import pandas as pd
    np.random.seed(1)
    X = pd.DataFrame(np.random.randn(50, 10))
    y = pd.DataFrame((np.random.randn(50) > 0).astype(np.int8))
    weights = np.random.random(50) + 1.0
    cudf_weights = df.from_pandas(pd.DataFrame(weights))
    base_margin = np.random.random(50)
    cudf_base_margin = df.from_pandas(pd.DataFrame(base_margin))

    X_cudf = df.from_pandas(X)
Example #7
class TestGPUPredict:
    def test_predict(self):
        iterations = 10
        np.random.seed(1)
        test_num_rows = [10, 1000, 5000]
        test_num_cols = [10, 50, 500]
        # This test passes for tree_method=gpu_hist and tree_method=exact, but
        # for `hist` and `approx` the floating point error accumulates faster
        # and the test fails even when tol is set to 1e-4.  For `hist`, the
        # mismatch rate with 5000 rows is 0.04.
        for num_rows in test_num_rows:
            for num_cols in test_num_cols:
                dtrain = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                     label=[0, 1] * int(num_rows / 2))
                dval = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                   label=[0, 1] * int(num_rows / 2))
                dtest = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                    label=[0, 1] * int(num_rows / 2))
                watchlist = [(dtrain, 'train'), (dval, 'validation')]
                res = {}
                param = {
                    "objective": "binary:logistic",
                    "predictor": "gpu_predictor",
                    'eval_metric': 'logloss',
                    'tree_method': 'gpu_hist',
                    'max_depth': 1
                }
                bst = xgb.train(param,
                                dtrain,
                                iterations,
                                evals=watchlist,
                                evals_result=res)
                assert self.non_increasing(res["train"]["logloss"])
                gpu_pred_train = bst.predict(dtrain, output_margin=True)
                gpu_pred_test = bst.predict(dtest, output_margin=True)
                gpu_pred_val = bst.predict(dval, output_margin=True)

                param["predictor"] = "cpu_predictor"
                bst_cpu = xgb.train(param, dtrain, iterations, evals=watchlist)
                cpu_pred_train = bst_cpu.predict(dtrain, output_margin=True)
                cpu_pred_test = bst_cpu.predict(dtest, output_margin=True)
                cpu_pred_val = bst_cpu.predict(dval, output_margin=True)

                np.testing.assert_allclose(cpu_pred_train,
                                           gpu_pred_train,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_val,
                                           gpu_pred_val,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_test,
                                           gpu_pred_test,
                                           rtol=1e-6)

    def non_increasing(self, L):
        return all((y - x) < 0.001 for x, y in zip(L, L[1:]))

    # Test case for a bug where multiple batch predictions made on a
    # test set produce incorrect results
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_multi_predict(self):
        from sklearn.datasets import make_regression
        from sklearn.model_selection import train_test_split

        n = 1000
        X, y = make_regression(n, random_state=rng)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            random_state=123)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test)

        params = {}
        params["tree_method"] = "gpu_hist"

        params['predictor'] = "gpu_predictor"
        bst_gpu_predict = xgb.train(params, dtrain)

        params['predictor'] = "cpu_predictor"
        bst_cpu_predict = xgb.train(params, dtrain)

        predict0 = bst_gpu_predict.predict(dtest)
        predict1 = bst_gpu_predict.predict(dtest)
        cpu_predict = bst_cpu_predict.predict(dtest)

        assert np.allclose(predict0, predict1)
        assert np.allclose(predict0, cpu_predict)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_sklearn(self):
        m, n = 15000, 14
        tr_size = 2500
        X = np.random.rand(m, n)
        y = 200 * np.matmul(X, np.arange(-3, -3 + n))
        X_train, y_train = X[:tr_size, :], y[:tr_size]
        X_test, y_test = X[tr_size:, :], y[tr_size:]

        # First with cpu_predictor
        params = {
            'tree_method': 'gpu_hist',
            'predictor': 'cpu_predictor',
            'n_jobs': -1,
            'seed': 123
        }
        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        cpu_train_score = m.score(X_train, y_train)
        cpu_test_score = m.score(X_test, y_test)

        # Now with gpu_predictor
        params['predictor'] = 'gpu_predictor'

        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        gpu_train_score = m.score(X_train, y_train)
        gpu_test_score = m.score(X_test, y_test)

        assert np.allclose(cpu_train_score, gpu_train_score)
        assert np.allclose(cpu_test_score, gpu_test_score)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_inplace_predict_cupy(self):
        import cupy as cp
        cp.cuda.runtime.setDevice(0)
        rows = 1000
        cols = 10
        cp_rng = cp.random.RandomState(1994)
        cp.random.set_random_state(cp_rng)
        X = cp.random.randn(rows, cols)
        y = cp.random.randn(rows)

        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'},
                            dtrain,
                            num_boost_round=10)
        test = xgb.DMatrix(X[:10, ...])
        predt_from_array = booster.inplace_predict(X[:10, ...])
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_dense(x):
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        # Don't do this on Windows, see issue #5793
        if sys.platform.startswith("win"):
            pytest.skip(
                'Multi-threaded in-place prediction with cuPy is not working on Windows'
            )
        for i in range(10):
            run_threaded_predict(X, rows, predict_dense)
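
# Note: `run_threaded_predict` is defined elsewhere. A hypothetical sketch,
# assuming it submits row slices to a thread pool and asserts that every
# slice's in-place prediction matches the DMatrix-based one:
from concurrent.futures import ThreadPoolExecutor

def run_threaded_predict(X, rows, predict_func):
    per_thread = rows // 10
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            executor.submit(predict_func, X[i:i + per_thread])
            for i in range(0, rows, per_thread)
        ]
    for fut in futures:
        assert fut.result()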

    @pytest.mark.skipif(**tm.no_cudf())
    def test_inplace_predict_cudf(self):
        import cupy as cp
        import cudf
        import pandas as pd
        rows = 1000
        cols = 10
        rng = np.random.RandomState(1994)
        cp.cuda.runtime.setDevice(0)
        X = rng.randn(rows, cols)
        X = pd.DataFrame(X)
        y = rng.randn(rows)
        X = cudf.from_pandas(X)

        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'},
                            dtrain,
                            num_boost_round=10)
        test = xgb.DMatrix(X)
        predt_from_array = booster.inplace_predict(X)
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_df(x):
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        for i in range(10):
            run_threaded_predict(X, rows, predict_df)

    @given(strategies.integers(1, 10), tm.dataset_strategy,
           shap_parameter_strategy)
    @settings(deadline=None)
    def test_shap(self, num_rounds, dataset, param):
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w,
                                dataset.margin)
        shap = bst.predict(test_dmat, pred_contribs=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assume(len(dataset.y) > 0)
        assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1), margin,
                           1e-3, 1e-3)

    @given(strategies.integers(1, 10), tm.dataset_strategy,
           shap_parameter_strategy)
    @settings(deadline=None, max_examples=20)
    def test_shap_interactions(self, num_rounds, dataset, param):
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w,
                                dataset.margin)
        shap = bst.predict(test_dmat, pred_interactions=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assume(len(dataset.y) > 0)
        assert np.allclose(
            np.sum(shap, axis=(len(shap.shape) - 1, len(shap.shape) - 2)),
            margin, 1e-3, 1e-3)

    def test_predict_leaf_basic(self):
        gpu_leaf = run_predict_leaf('gpu_predictor')
        cpu_leaf = run_predict_leaf('cpu_predictor')
        np.testing.assert_equal(gpu_leaf, cpu_leaf)
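
# Note: `run_predict_leaf` is not shown here. A hedged sketch, assuming it
# trains a small deterministic model and returns the leaf indices produced
# by the requested predictor:
import numpy as np
import xgboost as xgb

def run_predict_leaf(predictor):
    rng = np.random.RandomState(1994)
    X, y = rng.randn(100, 4), rng.randn(100)
    m = xgb.DMatrix(X, y)
    booster = xgb.train({'predictor': predictor}, m, num_boost_round=4)
    return booster.predict(m, pred_leaf=True)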

    def run_predict_leaf_booster(self, param, num_rounds, dataset):
        param = dataset.set_params(param)
        m = dataset.get_dmat()
        booster = xgb.train(param,
                            dtrain=dataset.get_dmat(),
                            num_boost_round=num_rounds)
        booster.set_param({'predictor': 'cpu_predictor'})
        cpu_leaf = booster.predict(m, pred_leaf=True)

        booster.set_param({'predictor': 'gpu_predictor'})
        gpu_leaf = booster.predict(m, pred_leaf=True)

        np.testing.assert_equal(cpu_leaf, gpu_leaf)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None)
    def test_predict_leaf_gbtree(self, param, dataset):
        param['booster'] = 'gbtree'
        param['tree_method'] = 'gpu_hist'
        self.run_predict_leaf_booster(param, 10, dataset)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None)
    def test_predict_leaf_dart(self, param, dataset):
        param['booster'] = 'dart'
        param['tree_method'] = 'gpu_hist'
        self.run_predict_leaf_booster(param, 10, dataset)

    @pytest.mark.skipif(**tm.no_sklearn())
    @pytest.mark.skipif(**tm.no_pandas())
    @given(df=data_frames([
        column('x0', elements=strategies.integers(min_value=0, max_value=3)),
        column('x1', elements=strategies.integers(min_value=0, max_value=5))
    ],
                          index=range_indexes(min_size=20, max_size=50)))
    @settings(deadline=None)
    def test_predict_categorical_split(self, df):
        from sklearn.metrics import mean_squared_error

        df = df.astype('category')
        x0, x1 = df['x0'].to_numpy(), df['x1'].to_numpy()
        y = (x0 * 10 - 20) + (x1 - 2)
        dtrain = xgb.DMatrix(df, label=y, enable_categorical=True)

        params = {
            'tree_method': 'gpu_hist',
            'predictor': 'gpu_predictor',
            'max_depth': 3,
            'learning_rate': 1.0,
            'base_score': 0.0,
            'eval_metric': 'rmse'
        }

        eval_history = {}
        bst = xgb.train(params,
                        dtrain,
                        num_boost_round=5,
                        evals=[(dtrain, 'train')],
                        verbose_eval=False,
                        evals_result=eval_history)

        pred = bst.predict(dtrain)
        rmse = mean_squared_error(y_true=y, y_pred=pred, squared=False)
        np.testing.assert_almost_equal(rmse,
                                       eval_history['train']['rmse'][-1],
                                       decimal=5)
Example #8
class TestGPUUpdaters:
    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_gpu_hist(self, param, num_rounds, dataset):
        param["tree_method"] = "gpu_hist"
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result["train"][dataset.metric])

    def run_categorical_basic(self, rows, cols, rounds, cats):
        onehot, label = tm.make_categorical(rows, cols, cats, True)
        cat, _ = tm.make_categorical(rows, cols, cats, False)

        by_etl_results = {}
        by_builtin_results = {}

        parameters = {"tree_method": "gpu_hist", "predictor": "gpu_predictor"}

        m = xgb.DMatrix(onehot, label, enable_categorical=False)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=by_etl_results,
        )

        m = xgb.DMatrix(cat, label, enable_categorical=True)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=by_builtin_results,
        )

        # There are guidelines on how to specify tolerances when the outputs
        # are treated as random variables, but here the tree construction is
        # extremely sensitive to floating point errors.  A 1e-5 error in a
        # histogram bin can lead to an entirely different tree, so even though
        # the test is quite lenient, hypothesis can still pick up falsifying
        # examples from time to time.
        np.testing.assert_allclose(
            np.array(by_etl_results["Train"]["rmse"]),
            np.array(by_builtin_results["Train"]["rmse"]),
            rtol=1e-3,
        )
        assert tm.non_increasing(by_builtin_results["Train"]["rmse"])
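
# Note: `tm.make_categorical` is part of xgboost's testing utilities. A
# hypothetical sketch, assuming it returns either a one-hot encoded frame or
# a frame of pandas categorical columns, plus a numeric label:
import numpy as np
import pandas as pd

def make_categorical(n_samples, n_features, n_categories, onehot):
    rng = np.random.RandomState(1994)
    df = pd.DataFrame({
        f'f{i}': pd.Categorical(rng.randint(0, n_categories, size=n_samples))
        for i in range(n_features)
    })
    label = rng.randn(n_samples)
    if onehot:
        return pd.get_dummies(df), label
    return df, label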

    @given(strategies.integers(10, 400), strategies.integers(3, 8),
           strategies.integers(1, 2), strategies.integers(4, 7))
    @settings(deadline=None)
    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical(self, rows, cols, rounds, cats):
        self.run_categorical_basic(rows, cols, rounds, cats)

    def test_categorical_32_cat(self):
        '''32 hits the bound of the integer bitset, so it gets a special test.'''
        rows = 1000
        cols = 10
        cats = 32
        rounds = 4
        self.run_categorical_basic(rows, cols, rounds, cats)

    def test_invalid_categorical(self):
        import cupy as cp
        rng = np.random.default_rng()
        X = rng.normal(loc=0, scale=1, size=1000).reshape(100, 10)
        y = rng.normal(loc=0, scale=1, size=100)

        # The check is performed during sketching.
        Xy = xgb.DMatrix(X, y, feature_types=["c"] * 10)
        with pytest.raises(ValueError):
            xgb.train({"tree_method": "gpu_hist"}, Xy)

        X, y = cp.array(X), cp.array(y)
        with pytest.raises(ValueError):
            Xy = xgb.DeviceQuantileDMatrix(X, y, feature_types=["c"] * 10)

    @pytest.mark.skipif(**tm.no_cupy())
    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_gpu_hist_device_dmatrix(self, param, num_rounds, dataset):
        # We cannot handle an empty dataset yet
        assume(len(dataset.y) > 0)
        param['tree_method'] = 'gpu_hist'
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_device_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result['train'][dataset.metric])

    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_external_memory(self, param, num_rounds, dataset):
        # We cannot handle an empty dataset yet
        assume(len(dataset.y) > 0)
        param['tree_method'] = 'gpu_hist'
        param = dataset.set_params(param)
        m = dataset.get_external_dmat()
        external_result = train_result(param, m, num_rounds)
        del m
        gc.collect()
        assert tm.non_increasing(external_result['train'][dataset.metric])

    def test_empty_dmatrix_prediction(self):
        # FIXME(trivialfis): This should be done with all updaters
        kRows = 0
        kCols = 100

        X = np.empty((kRows, kCols))
        y = np.empty((kRows))

        dtrain = xgb.DMatrix(X, y)

        bst = xgb.train(
            {
                'verbosity': 2,
                'tree_method': 'gpu_hist',
                'gpu_id': 0
            },
            dtrain,
            verbose_eval=True,
            num_boost_round=6,
            evals=[(dtrain, 'Train')])

        kRows = 100
        X = np.random.randn(kRows, kCols)

        dtest = xgb.DMatrix(X)
        predictions = bst.predict(dtest)
        np.testing.assert_allclose(predictions, 0.5, 1e-6)

    @pytest.mark.mgpu
    @given(tm.dataset_strategy, strategies.integers(0, 10))
    @settings(deadline=None, max_examples=10)
    def test_specified_gpu_id_gpu_update(self, dataset, gpu_id):
        param = {'tree_method': 'gpu_hist', 'gpu_id': gpu_id}
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), 10)
        assert tm.non_increasing(result['train'][dataset.metric])
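
# Note: the `dataset` objects drawn from `tm.dataset_strategy` expose
# attributes such as X, y, w, margin and metric, plus helpers like
# set_params() and get_dmat(). A much-simplified, hypothetical sketch of
# such a wrapper:
import xgboost as xgb

class TestDataset:
    def __init__(self, name, X, y, objective, metric):
        self.name, self.X, self.y = name, X, y
        self.w, self.margin = None, None
        self.objective, self.metric = objective, metric

    def set_params(self, params):
        # Attach the dataset's objective/metric to the sampled parameters.
        params['objective'] = self.objective
        params['eval_metric'] = self.metric
        return params

    def get_dmat(self):
        return xgb.DMatrix(self.X, self.y, weight=self.w,
                           base_margin=self.margin)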
Example #9
class TestDistributedGPU:
    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_cudf())
    @pytest.mark.skipif(**tm.no_dask_cudf())
    @pytest.mark.skipif(**tm.no_dask_cuda())
    @pytest.mark.mgpu
    def test_dask_dataframe(self, local_cuda_cluster: LocalCUDACluster) -> None:
        with Client(local_cuda_cluster) as client:
            run_with_dask_dataframe(dxgb.DaskDMatrix, client)
            run_with_dask_dataframe(dxgb.DaskDeviceQuantileDMatrix, client)

    @given(
        params=parameter_strategy,
        num_rounds=strategies.integers(1, 20),
        dataset=tm.dataset_strategy,
    )
    @settings(deadline=duration(seconds=120), suppress_health_check=suppress)
    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_dask_cuda())
    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.parametrize(
        "local_cuda_cluster", [{"n_workers": 2}], indirect=["local_cuda_cluster"]
    )
    @pytest.mark.mgpu
    def test_gpu_hist(
        self,
        params: Dict,
        num_rounds: int,
        dataset: tm.TestDataset,
        local_cuda_cluster: LocalCUDACluster,
    ) -> None:
        with Client(local_cuda_cluster) as client:
            run_gpu_hist(params, num_rounds, dataset, dxgb.DaskDMatrix, client)
            run_gpu_hist(
                params, num_rounds, dataset, dxgb.DaskDeviceQuantileDMatrix, client
            )

    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_dask_cuda())
    @pytest.mark.mgpu
    def test_dask_array(self, local_cuda_cluster: LocalCUDACluster) -> None:
        with Client(local_cuda_cluster) as client:
            run_with_dask_array(dxgb.DaskDMatrix, client)
            run_with_dask_array(dxgb.DaskDeviceQuantileDMatrix, client)

    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_dask_cuda())
    def test_early_stopping(self, local_cuda_cluster: LocalCUDACluster) -> None:
        from sklearn.datasets import load_breast_cancer
        with Client(local_cuda_cluster) as client:
            X, y = load_breast_cancer(return_X_y=True)
            X, y = da.from_array(X), da.from_array(y)

            m = dxgb.DaskDMatrix(client, X, y)

            valid = dxgb.DaskDMatrix(client, X, y)
            early_stopping_rounds = 5
            booster = dxgb.train(client, {'objective': 'binary:logistic',
                                          'eval_metric': 'error',
                                          'tree_method': 'gpu_hist'}, m,
                                 evals=[(valid, 'Valid')],
                                 num_boost_round=1000,
                                 early_stopping_rounds=early_stopping_rounds)[
                                     'booster']
            assert hasattr(booster, 'best_score')
            dump = booster.get_dump(dump_format='json')
            assert len(dump) - booster.best_iteration == early_stopping_rounds + 1

            valid_X = X
            valid_y = y
            cls = dxgb.DaskXGBClassifier(objective='binary:logistic',
                                         tree_method='gpu_hist',
                                         n_estimators=100)
            cls.client = client
            cls.fit(X, y, early_stopping_rounds=early_stopping_rounds,
                    eval_set=[(valid_X, valid_y)])
            booster = cls.get_booster()
            dump = booster.get_dump(dump_format='json')
            assert len(dump) - booster.best_iteration == early_stopping_rounds + 1

    @pytest.mark.skipif(**tm.no_cudf())
    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_dask_cuda())
    @pytest.mark.parametrize("model", ["boosting"])
    def test_dask_classifier(
        self, model: str, local_cuda_cluster: LocalCUDACluster
    ) -> None:
        import dask_cudf
        with Client(local_cuda_cluster) as client:
            X_, y_, w_ = generate_array(with_weights=True)
            y_ = (y_ * 10).astype(np.int32)
            X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X_))
            y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y_))
            w = dask_cudf.from_dask_dataframe(dd.from_dask_array(w_))
            run_dask_classifier(X, y, w, model, client, 10)
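
# Note: `run_dask_classifier` is a shared helper; the final argument here is
# assumed to be the number of classes. A hedged sketch (only the boosting
# path selected by `model` is shown):
from xgboost import dask as dxgb

def run_dask_classifier(X, y, w, model, client, n_classes):
    classifier = dxgb.DaskXGBClassifier(n_estimators=2,
                                        tree_method='gpu_hist')
    classifier.client = client
    classifier.fit(X, y, sample_weight=w)
    prediction = classifier.predict(X).compute()
    # Predicted labels should stay within the label range seen in training.
    assert prediction.max() < n_classes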

    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_dask_cuda())
    @pytest.mark.mgpu
    def test_empty_dmatrix(self, local_cuda_cluster: LocalCUDACluster) -> None:
        with Client(local_cuda_cluster) as client:
            parameters = {'tree_method': 'gpu_hist',
                          'debug_synchronize': True}
            run_empty_dmatrix_reg(client, parameters)
            run_empty_dmatrix_cls(client, parameters)

    def test_empty_dmatrix_auc(self, local_cuda_cluster: LocalCUDACluster) -> None:
        with Client(local_cuda_cluster) as client:
            n_workers = len(_get_client_workers(client))
            run_empty_dmatrix_auc(client, "gpu_hist", n_workers)

    def test_auc(self, local_cuda_cluster: LocalCUDACluster) -> None:
        with Client(local_cuda_cluster) as client:
            run_auc(client, "gpu_hist")

    def test_data_initialization(self, local_cuda_cluster: LocalCUDACluster) -> None:
        with Client(local_cuda_cluster) as client:
            X, y, _ = generate_array()
            fw = da.random.random((random_cols, ))
            fw = fw - fw.min()
            m = dxgb.DaskDMatrix(client, X, y, feature_weights=fw)

            workers = _get_client_workers(client)
            rabit_args = client.sync(dxgb._get_rabit_args, len(workers), client)

            def worker_fn(worker_addr: str, data_ref: Dict) -> None:
                with dxgb.RabitContext(rabit_args):
                    local_dtrain = dxgb._dmatrix_from_list_of_parts(**data_ref)
                    fw_rows = local_dtrain.get_float_info("feature_weights").shape[0]
                    assert fw_rows == local_dtrain.num_col()

            futures = []
            for i in range(len(workers)):
                futures.append(
                    client.submit(
                        worker_fn,
                        workers[i],
                        m._create_fn_args(workers[i]),
                        pure=False,
                        workers=[workers[i]]
                    )
                )
            client.gather(futures)

    def test_interface_consistency(self) -> None:
        sig = OrderedDict(signature(dxgb.DaskDMatrix).parameters)
        del sig["client"]
        ddm_names = list(sig.keys())
        sig = OrderedDict(signature(dxgb.DaskDeviceQuantileDMatrix).parameters)
        del sig["client"]
        del sig["max_bin"]
        ddqdm_names = list(sig.keys())
        assert len(ddm_names) == len(ddqdm_names)

        # between dask
        for i in range(len(ddm_names)):
            assert ddm_names[i] == ddqdm_names[i]

        sig = OrderedDict(signature(xgb.DMatrix).parameters)
        del sig["nthread"]      # no nthread in dask
        dm_names = list(sig.keys())
        sig = OrderedDict(signature(xgb.DeviceQuantileDMatrix).parameters)
        del sig["nthread"]
        del sig["max_bin"]
        dqdm_names = list(sig.keys())

        # between single node
        assert len(dm_names) == len(dqdm_names)
        for i in range(len(dm_names)):
            assert dm_names[i] == dqdm_names[i]

        # ddm <-> dm
        for i in range(len(ddm_names)):
            assert ddm_names[i] == dm_names[i]

        # dqdm <-> ddqdm
        for i in range(len(ddqdm_names)):
            assert ddqdm_names[i] == dqdm_names[i]

        sig = OrderedDict(signature(xgb.XGBRanker.fit).parameters)
        ranker_names = list(sig.keys())
        sig = OrderedDict(signature(xgb.dask.DaskXGBRanker.fit).parameters)
        dranker_names = list(sig.keys())

        for rn, drn in zip(ranker_names, dranker_names):
            assert rn == drn

    def run_quantile(self, name: str, local_cuda_cluster: LocalCUDACluster) -> None:
        if sys.platform.startswith("win"):
            pytest.skip("Skipping dask tests on Windows")

        exe = None
        for possible_path in {'./testxgboost', './build/testxgboost',
                              '../build/testxgboost', '../gpu-build/testxgboost'}:
            if os.path.exists(possible_path):
                exe = possible_path
        assert exe, 'No testxgboost executable found.'
        test = "--gtest_filter=GPUQuantile." + name

        def runit(
            worker_addr: str, rabit_args: List[bytes]
        ) -> subprocess.CompletedProcess:
            port_env = ''
            # Set up the environment for running the C++ part.
            for arg in rabit_args:
                if arg.decode('utf-8').startswith('DMLC_TRACKER_PORT'):
                    port_env = arg.decode('utf-8')
            port = port_env.split('=')
            env = os.environ.copy()
            env[port[0]] = port[1]
            return subprocess.run([str(exe), test], env=env, stdout=subprocess.PIPE)

        with Client(local_cuda_cluster) as client:
            workers = _get_client_workers(client)
            rabit_args = client.sync(dxgb._get_rabit_args, workers, client)
            futures = client.map(runit,
                                 workers,
                                 pure=False,
                                 workers=workers,
                                 rabit_args=rabit_args)
            results = client.gather(futures)
            for ret in results:
                msg = ret.stdout.decode('utf-8')
                assert msg.find('1 test from GPUQuantile') != -1, msg
                assert ret.returncode == 0, msg

    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_dask_cuda())
    @pytest.mark.mgpu
    @pytest.mark.gtest
    def test_quantile_basic(self, local_cuda_cluster: LocalCUDACluster) -> None:
        self.run_quantile('AllReduceBasic', local_cuda_cluster)

    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_dask_cuda())
    @pytest.mark.mgpu
    @pytest.mark.gtest
    def test_quantile_same_on_all_workers(
        self, local_cuda_cluster: LocalCUDACluster
    ) -> None:
        self.run_quantile('SameOnAllWorkers', local_cuda_cluster)
Example #10
class TestGPUUpdaters:
    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_gpu_hist(self, param, num_rounds, dataset):
        param['tree_method'] = 'gpu_hist'
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result['train'][dataset.metric])

    @pytest.mark.skipif(**tm.no_cupy())
    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_gpu_hist_device_dmatrix(self, param, num_rounds, dataset):
        # We cannot handle an empty dataset yet
        assume(len(dataset.y) > 0)
        param['tree_method'] = 'gpu_hist'
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_device_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result['train'][dataset.metric])

    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_external_memory(self, param, num_rounds, dataset):
        # We cannot handle an empty dataset yet
        assume(len(dataset.y) > 0)
        param['tree_method'] = 'gpu_hist'
        param = dataset.set_params(param)
        external_result = train_result(param, dataset.get_external_dmat(),
                                       num_rounds)
        assert tm.non_increasing(external_result['train'][dataset.metric])

    def test_empty_dmatrix_prediction(self):
        # FIXME(trivialfis): This should be done with all updaters
        kRows = 0
        kCols = 100

        X = np.empty((kRows, kCols))
        y = np.empty((kRows))

        dtrain = xgb.DMatrix(X, y)

        bst = xgb.train(
            {
                'verbosity': 2,
                'tree_method': 'gpu_hist',
                'gpu_id': 0
            },
            dtrain,
            verbose_eval=True,
            num_boost_round=6,
            evals=[(dtrain, 'Train')])

        kRows = 100
        X = np.random.randn(kRows, kCols)

        dtest = xgb.DMatrix(X)
        predictions = bst.predict(dtest)
        np.testing.assert_allclose(predictions, 0.5, 1e-6)

    @pytest.mark.mgpu
    @given(tm.dataset_strategy, strategies.integers(0, 10))
    @settings(deadline=None, max_examples=10)
    def test_specified_gpu_id_gpu_update(self, dataset, gpu_id):
        param = {'tree_method': 'gpu_hist', 'gpu_id': gpu_id}
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), 10)
        assert tm.non_increasing(result['train'][dataset.metric])
Example #11
class TestFromCupy:
    '''Tests for constructing DMatrix from CuPy arrays (data structures
    implementing the CUDA array interface).'''
    @pytest.mark.skipif(**tm.no_cupy())
    def test_simple_dmat_from_cupy(self):
        _test_from_cupy(xgb.DMatrix)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_device_dmat_from_cupy(self):
        _test_from_cupy(xgb.DeviceQuantileDMatrix)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_cupy_training_device_dmat(self):
        _test_cupy_training(xgb.DeviceQuantileDMatrix)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_cupy_training_simple_dmat(self):
        _test_cupy_training(xgb.DMatrix)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_cupy_metainfo_simple_dmat(self):
        _test_cupy_metainfo(xgb.DMatrix)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_cupy_metainfo_device_dmat(self):
        _test_cupy_metainfo(xgb.DeviceQuantileDMatrix)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_dlpack_simple_dmat(self):
        import cupy as cp
        n = 100
        X = cp.random.random((n, 2))
        xgb.DMatrix(X.toDlpack())

    @pytest.mark.skipif(**tm.no_cupy())
    def test_dlpack_device_dmat(self):
        import cupy as cp
        n = 100
        X = cp.random.random((n, 2))
        m = xgb.DeviceQuantileDMatrix(X.toDlpack())
        with pytest.raises(xgb.core.XGBoostError):
            m.slice(rindex=[0, 1, 2])

    @pytest.mark.skipif(**tm.no_cupy())
    def test_qid(self):
        import cupy as cp
        rng = cp.random.RandomState(1994)
        rows = 100
        cols = 10
        X, y = rng.randn(rows, cols), rng.randn(rows)
        qid = rng.randint(low=0, high=10, size=rows, dtype=np.uint32)
        qid = cp.sort(qid)

        Xy = xgb.DMatrix(X, y)
        Xy.set_info(qid=qid)
        group_ptr = Xy.get_uint_info('group_ptr')
        assert group_ptr[0] == 0
        assert group_ptr[-1] == rows

    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.mgpu
    def test_specified_device(self):
        import cupy as cp
        cp.cuda.runtime.setDevice(0)
        dtrain = dmatrix_from_cupy(np.float32, xgb.DeviceQuantileDMatrix,
                                   np.nan)
        with pytest.raises(xgb.core.XGBoostError):
            xgb.train({
                'tree_method': 'gpu_hist',
                'gpu_id': 1
            },
                      dtrain,
                      num_boost_round=10)
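
# Note: `dmatrix_from_cupy` is defined elsewhere. A hypothetical sketch,
# assuming it builds a matrix of the requested dtype with a few entries set
# to `missing` and constructs the requested DMatrix type from it:
import cupy as cp
import numpy as np

def dmatrix_from_cupy(input_type, DMatrixT, missing=np.nan):
    rng = cp.random.RandomState(1994)
    X = rng.random_sample((80, 3)).astype(input_type)
    X[5, 0] = missing
    y = rng.random_sample(80).astype(input_type)
    return DMatrixT(X, y, missing=missing)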
Example #12
class TestDistributedGPU:
    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_cudf())
    @pytest.mark.skipif(**tm.no_dask_cudf())
    @pytest.mark.skipif(**tm.no_dask_cuda())
    @pytest.mark.mgpu
    def test_dask_dataframe(self,
                            local_cuda_cluster: LocalCUDACluster) -> None:
        with Client(local_cuda_cluster) as client:
            run_with_dask_dataframe(dxgb.DaskDMatrix, client)
            run_with_dask_dataframe(dxgb.DaskDeviceQuantileDMatrix, client)

    @given(params=parameter_strategy,
           num_rounds=strategies.integers(1, 20),
           dataset=tm.dataset_strategy)
    @settings(deadline=duration(seconds=120))
    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_dask_cuda())
    @pytest.mark.parametrize('local_cuda_cluster', [{
        'n_workers': 2
    }],
                             indirect=['local_cuda_cluster'])
    @pytest.mark.mgpu
    def test_gpu_hist(self, params: Dict, num_rounds: int,
                      dataset: tm.TestDataset,
                      local_cuda_cluster: LocalCUDACluster) -> None:
        with Client(local_cuda_cluster) as client:
            run_gpu_hist(params, num_rounds, dataset, dxgb.DaskDMatrix, client)
            run_gpu_hist(params, num_rounds, dataset,
                         dxgb.DaskDeviceQuantileDMatrix, client)

    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_dask_cuda())
    @pytest.mark.mgpu
    def test_dask_array(self, local_cuda_cluster: LocalCUDACluster) -> None:
        with Client(local_cuda_cluster) as client:
            run_with_dask_array(dxgb.DaskDMatrix, client)
            run_with_dask_array(dxgb.DaskDeviceQuantileDMatrix, client)

    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_dask_cuda())
    def test_early_stopping(self,
                            local_cuda_cluster: LocalCUDACluster) -> None:
        from sklearn.datasets import load_breast_cancer
        with Client(local_cuda_cluster) as client:
            X, y = load_breast_cancer(return_X_y=True)
            X, y = da.from_array(X), da.from_array(y)

            m = dxgb.DaskDMatrix(client, X, y)

            valid = dxgb.DaskDMatrix(client, X, y)
            early_stopping_rounds = 5
            booster = dxgb.train(
                client, {
                    'objective': 'binary:logistic',
                    'eval_metric': 'error',
                    'tree_method': 'gpu_hist'
                },
                m,
                evals=[(valid, 'Valid')],
                num_boost_round=1000,
                early_stopping_rounds=early_stopping_rounds)['booster']
            assert hasattr(booster, 'best_score')
            dump = booster.get_dump(dump_format='json')
            assert len(
                dump) - booster.best_iteration == early_stopping_rounds + 1

            valid_X = X
            valid_y = y
            cls = dxgb.DaskXGBClassifier(objective='binary:logistic',
                                         tree_method='gpu_hist',
                                         n_estimators=100)
            cls.client = client
            cls.fit(X,
                    y,
                    early_stopping_rounds=early_stopping_rounds,
                    eval_set=[(valid_X, valid_y)])
            booster = cls.get_booster()
            dump = booster.get_dump(dump_format='json')
            assert len(
                dump) - booster.best_iteration == early_stopping_rounds + 1

    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_dask_cuda())
    @pytest.mark.mgpu
    def test_empty_dmatrix(self, local_cuda_cluster: LocalCUDACluster) -> None:
        with Client(local_cuda_cluster) as client:
            parameters = {'tree_method': 'gpu_hist', 'debug_synchronize': True}
            run_empty_dmatrix_reg(client, parameters)
            run_empty_dmatrix_cls(client, parameters)

    def run_quantile(self, name: str,
                     local_cuda_cluster: LocalCUDACluster) -> None:
        if sys.platform.startswith("win"):
            pytest.skip("Skipping dask tests on Windows")

        exe = None
        for possible_path in {
                './testxgboost', './build/testxgboost', '../build/testxgboost',
                '../gpu-build/testxgboost'
        }:
            if os.path.exists(possible_path):
                exe = possible_path
        assert exe, 'No testxgboost executable found.'
        test = "--gtest_filter=GPUQuantile." + name

        def runit(worker_addr: str,
                  rabit_args: List[bytes]) -> subprocess.CompletedProcess:
            port_env = ''
            # Set up the environment for running the C++ part.
            for arg in rabit_args:
                if arg.decode('utf-8').startswith('DMLC_TRACKER_PORT'):
                    port_env = arg.decode('utf-8')
            port = port_env.split('=')
            env = os.environ.copy()
            env[port[0]] = port[1]
            return subprocess.run([str(exe), test],
                                  env=env,
                                  stdout=subprocess.PIPE)

        with Client(local_cuda_cluster) as client:
            workers = list(_get_client_workers(client).keys())
            rabit_args = client.sync(dxgb._get_rabit_args, workers, client)
            futures = client.map(runit,
                                 workers,
                                 pure=False,
                                 workers=workers,
                                 rabit_args=rabit_args)
            results = client.gather(futures)
            for ret in results:
                msg = ret.stdout.decode('utf-8')
                assert msg.find('1 test from GPUQuantile') != -1, msg
                assert ret.returncode == 0, msg

    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_dask_cuda())
    @pytest.mark.mgpu
    @pytest.mark.gtest
    def test_quantile_basic(self,
                            local_cuda_cluster: LocalCUDACluster) -> None:
        self.run_quantile('AllReduceBasic', local_cuda_cluster)

    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_dask_cuda())
    @pytest.mark.mgpu
    @pytest.mark.gtest
    def test_quantile_same_on_all_workers(
            self, local_cuda_cluster: LocalCUDACluster) -> None:
        self.run_quantile('SameOnAllWorkers', local_cuda_cluster)
Example #13
class TestGPUPredict:
    def test_predict(self):
        iterations = 10
        np.random.seed(1)
        test_num_rows = [10, 1000, 5000]
        test_num_cols = [10, 50, 500]
        # This test passes for tree_method=gpu_hist and tree_method=exact, but
        # for `hist` and `approx` the floating point error accumulates faster
        # and the test fails even when tol is set to 1e-4.  For `hist`, the
        # mismatch rate with 5000 rows is 0.04.
        for num_rows in test_num_rows:
            for num_cols in test_num_cols:
                dtrain = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                     label=[0, 1] * int(num_rows / 2))
                dval = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                   label=[0, 1] * int(num_rows / 2))
                dtest = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                    label=[0, 1] * int(num_rows / 2))
                watchlist = [(dtrain, 'train'), (dval, 'validation')]
                res = {}
                param = {
                    "objective": "binary:logistic",
                    "predictor": "gpu_predictor",
                    'eval_metric': 'logloss',
                    'tree_method': 'gpu_hist',
                    'max_depth': 1
                }
                bst = xgb.train(param,
                                dtrain,
                                iterations,
                                evals=watchlist,
                                evals_result=res)
                assert self.non_increasing(res["train"]["logloss"])
                gpu_pred_train = bst.predict(dtrain, output_margin=True)
                gpu_pred_test = bst.predict(dtest, output_margin=True)
                gpu_pred_val = bst.predict(dval, output_margin=True)

                param["predictor"] = "cpu_predictor"
                bst_cpu = xgb.train(param, dtrain, iterations, evals=watchlist)
                cpu_pred_train = bst_cpu.predict(dtrain, output_margin=True)
                cpu_pred_test = bst_cpu.predict(dtest, output_margin=True)
                cpu_pred_val = bst_cpu.predict(dval, output_margin=True)

                np.testing.assert_allclose(cpu_pred_train,
                                           gpu_pred_train,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_val,
                                           gpu_pred_val,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_test,
                                           gpu_pred_test,
                                           rtol=1e-6)

    def non_increasing(self, L):
        return all((y - x) < 0.001 for x, y in zip(L, L[1:]))

    # Test case for a bug where multiple batch predictions made on a
    # test set produce incorrect results
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_multi_predict(self):
        from sklearn.datasets import make_regression
        from sklearn.model_selection import train_test_split

        n = 1000
        X, y = make_regression(n, random_state=rng)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            random_state=123)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test)

        params = {}
        params["tree_method"] = "gpu_hist"

        params['predictor'] = "gpu_predictor"
        bst_gpu_predict = xgb.train(params, dtrain)

        params['predictor'] = "cpu_predictor"
        bst_cpu_predict = xgb.train(params, dtrain)

        predict0 = bst_gpu_predict.predict(dtest)
        predict1 = bst_gpu_predict.predict(dtest)
        cpu_predict = bst_cpu_predict.predict(dtest)

        assert np.allclose(predict0, predict1)
        assert np.allclose(predict0, cpu_predict)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_sklearn(self):
        m, n = 15000, 14
        tr_size = 2500
        X = np.random.rand(m, n)
        y = 200 * np.matmul(X, np.arange(-3, -3 + n))
        X_train, y_train = X[:tr_size, :], y[:tr_size]
        X_test, y_test = X[tr_size:, :], y[tr_size:]

        # First with cpu_predictor
        params = {
            'tree_method': 'gpu_hist',
            'predictor': 'cpu_predictor',
            'n_jobs': -1,
            'seed': 123
        }
        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        cpu_train_score = m.score(X_train, y_train)
        cpu_test_score = m.score(X_test, y_test)

        # Now with gpu_predictor
        params['predictor'] = 'gpu_predictor'

        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        gpu_train_score = m.score(X_train, y_train)
        gpu_test_score = m.score(X_test, y_test)

        assert np.allclose(cpu_train_score, gpu_train_score)
        assert np.allclose(cpu_test_score, gpu_test_score)

    def run_inplace_base_margin(self, booster, dtrain, X, base_margin):
        import cupy as cp
        dtrain.set_info(base_margin=base_margin)
        from_inplace = booster.inplace_predict(data=X, base_margin=base_margin)
        from_dmatrix = booster.predict(dtrain)
        cp.testing.assert_allclose(from_inplace, from_dmatrix)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_inplace_predict_cupy(self):
        import cupy as cp
        cp.cuda.runtime.setDevice(0)
        rows = 1000
        cols = 10
        missing = 11  # set to integer for testing

        cp_rng = cp.random.RandomState(1994)
        cp.random.set_random_state(cp_rng)

        X = cp.random.randn(rows, cols)
        missing_idx = [i for i in range(0, cols, 4)]
        X[:, missing_idx] = missing  # set to be missing
        y = cp.random.randn(rows)

        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'},
                            dtrain,
                            num_boost_round=10)

        test = xgb.DMatrix(X[:10, ...], missing=missing)
        predt_from_array = booster.inplace_predict(X[:10, ...],
                                                   missing=missing)
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_dense(x):
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        # Don't do this on Windows, see issue #5793
        if sys.platform.startswith("win"):
            pytest.skip(
                'Multi-threaded in-place prediction with cuPy is not working on Windows'
            )
        for i in range(10):
            run_threaded_predict(X, rows, predict_dense)

        base_margin = cp_rng.randn(rows)
        self.run_inplace_base_margin(booster, dtrain, X, base_margin)

        # Create a wide dataset
        X = cp_rng.randn(100, 10000)
        y = cp_rng.randn(100)

        missing_idx = [i for i in range(0, X.shape[1], 16)]
        X[:, missing_idx] = missing
        reg = xgb.XGBRegressor(tree_method="gpu_hist",
                               n_estimators=8,
                               missing=missing)
        reg.fit(X, y)

        gpu_predt = reg.predict(X)
        reg.set_params(predictor="cpu_predictor")
        cpu_predt = reg.predict(X)
        np.testing.assert_allclose(gpu_predt, cpu_predt, atol=1e-6)

    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.skipif(**tm.no_cudf())
    def test_inplace_predict_cudf(self):
        import cupy as cp
        import cudf
        import pandas as pd
        rows = 1000
        cols = 10
        rng = np.random.RandomState(1994)
        cp.cuda.runtime.setDevice(0)
        X = rng.randn(rows, cols)
        X = pd.DataFrame(X)
        y = rng.randn(rows)
        X = cudf.from_pandas(X)

        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'},
                            dtrain,
                            num_boost_round=10)
        test = xgb.DMatrix(X)
        predt_from_array = booster.inplace_predict(X)
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_df(x):
            # column major array
            inplace_predt = booster.inplace_predict(x.values)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            assert cp.all(copied_predt == inplace_predt)

            inplace_predt = booster.inplace_predict(x)
            return cp.all(copied_predt == inplace_predt)

        for i in range(10):
            run_threaded_predict(X, rows, predict_df)

        base_margin = cudf.Series(rng.randn(rows))
        self.run_inplace_base_margin(booster, dtrain, X, base_margin)

    @given(strategies.integers(1, 10), tm.dataset_strategy,
           shap_parameter_strategy)
    @settings(deadline=None, print_blob=True)
    def test_shap(self, num_rounds, dataset, param):
        if dataset.name.endswith(
                "-l1"):  # not supported by the exact tree method
            return
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w,
                                dataset.margin)
        shap = bst.predict(test_dmat, pred_contribs=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assume(len(dataset.y) > 0)
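        # Per-feature contributions (plus the bias column) should sum to the
        # raw margin prediction.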
        assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1), margin,
                           1e-3, 1e-3)

    @given(strategies.integers(1, 10), tm.dataset_strategy,
           shap_parameter_strategy)
    @settings(deadline=None, max_examples=20, print_blob=True)
    def test_shap_interactions(self, num_rounds, dataset, param):
        if dataset.name.endswith(
                "-l1"):  # not supported by the exact tree method
            return
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w,
                                dataset.margin)
        shap = bst.predict(test_dmat, pred_interactions=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assume(len(dataset.y) > 0)
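        # Summing the interaction contributions over the last two axes should
        # likewise recover the raw margin.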
        assert np.allclose(
            np.sum(shap, axis=(len(shap.shape) - 1, len(shap.shape) - 2)),
            margin, 1e-3, 1e-3)

    def test_shap_categorical(self):
        X, y = tm.make_categorical(100, 20, 7, False)
        Xy = xgb.DMatrix(X, y, enable_categorical=True)
        booster = xgb.train({"tree_method": "gpu_hist"},
                            Xy,
                            num_boost_round=10)

        booster.set_param({"predictor": "gpu_predictor"})
        shap = booster.predict(Xy, pred_contribs=True)
        margin = booster.predict(Xy, output_margin=True)
        np.testing.assert_allclose(np.sum(shap, axis=len(shap.shape) - 1),
                                   margin,
                                   rtol=1e-3)

        booster.set_param({"predictor": "cpu_predictor"})
        shap = booster.predict(Xy, pred_contribs=True)
        margin = booster.predict(Xy, output_margin=True)
        np.testing.assert_allclose(np.sum(shap, axis=len(shap.shape) - 1),
                                   margin,
                                   rtol=1e-3)

    def test_predict_leaf_basic(self):
        gpu_leaf = run_predict_leaf('gpu_predictor')
        cpu_leaf = run_predict_leaf('cpu_predictor')
        np.testing.assert_equal(gpu_leaf, cpu_leaf)

    def run_predict_leaf_booster(self, param, num_rounds, dataset):
        param = dataset.set_params(param)
        m = dataset.get_dmat()
        booster = xgb.train(param,
                            dtrain=dataset.get_dmat(),
                            num_boost_round=num_rounds)
        booster.set_param({'predictor': 'cpu_predictor'})
        cpu_leaf = booster.predict(m, pred_leaf=True)

        booster.set_param({'predictor': 'gpu_predictor'})
        gpu_leaf = booster.predict(m, pred_leaf=True)

        np.testing.assert_equal(cpu_leaf, gpu_leaf)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None, print_blob=True)
    def test_predict_leaf_gbtree(self, param, dataset):
        param['booster'] = 'gbtree'
        param['tree_method'] = 'gpu_hist'
        self.run_predict_leaf_booster(param, 10, dataset)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None, print_blob=True)
    def test_predict_leaf_dart(self, param, dataset):
        param['booster'] = 'dart'
        param['tree_method'] = 'gpu_hist'
        self.run_predict_leaf_booster(param, 10, dataset)

    @pytest.mark.skipif(**tm.no_sklearn())
    @pytest.mark.skipif(**tm.no_pandas())
    @given(df=data_frames([
        column('x0', elements=strategies.integers(min_value=0, max_value=3)),
        column('x1', elements=strategies.integers(min_value=0, max_value=5))
    ],
                          index=range_indexes(min_size=20, max_size=50)))
    @settings(deadline=None, print_blob=True)
    def test_predict_categorical_split(self, df):
        from sklearn.metrics import mean_squared_error

        df = df.astype('category')
        x0, x1 = df['x0'].to_numpy(), df['x1'].to_numpy()
        y = (x0 * 10 - 20) + (x1 - 2)
        dtrain = xgb.DMatrix(df, label=y, enable_categorical=True)

        params = {
            'tree_method': 'gpu_hist',
            'predictor': 'gpu_predictor',
            'max_depth': 3,
            'learning_rate': 1.0,
            'base_score': 0.0,
            'eval_metric': 'rmse'
        }

        eval_history = {}
        bst = xgb.train(params,
                        dtrain,
                        num_boost_round=5,
                        evals=[(dtrain, 'train')],
                        verbose_eval=False,
                        evals_result=eval_history)

        pred = bst.predict(dtrain)
        rmse = mean_squared_error(y_true=y, y_pred=pred, squared=False)
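        # The RMSE recomputed from predict() should match the last training
        # RMSE recorded by evals_result.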
        np.testing.assert_almost_equal(rmse,
                                       eval_history['train']['rmse'][-1],
                                       decimal=5)

    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.parametrize("n_classes", [2, 3])
    def test_predict_dart(self, n_classes):
        from sklearn.datasets import make_classification
        import cupy as cp
        n_samples = 1000
        X_, y_ = make_classification(n_samples=n_samples,
                                     n_informative=5,
                                     n_classes=n_classes)
        X, y = cp.array(X_), cp.array(y_)

        Xy = xgb.DMatrix(X, y)
        if n_classes == 2:
            params = {
                "tree_method": "gpu_hist",
                "booster": "dart",
                "rate_drop": 0.5,
                "objective": "binary:logistic"
            }
        else:
            params = {
                "tree_method": "gpu_hist",
                "booster": "dart",
                "rate_drop": 0.5,
                "objective": "multi:softprob",
                "num_class": n_classes
            }

        booster = xgb.train(params, Xy, num_boost_round=32)
        # predictor=auto
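        # DART applies no dropout at prediction time, so CPU, GPU and
        # in-place predictions should all agree.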
        inplace = booster.inplace_predict(X)
        copied = booster.predict(Xy)
        cpu_inplace = booster.inplace_predict(X_)
        booster.set_param({"predictor": "cpu_predictor"})
        cpu_copied = booster.predict(Xy)

        copied = cp.array(copied)
        cp.testing.assert_allclose(cpu_inplace, copied, atol=1e-6)
        cp.testing.assert_allclose(cpu_copied, copied, atol=1e-6)
        cp.testing.assert_allclose(inplace, copied, atol=1e-6)

        booster.set_param({"predictor": "gpu_predictor"})
        inplace = booster.inplace_predict(X)
        copied = booster.predict(Xy)

        copied = cp.array(copied)
        cp.testing.assert_allclose(inplace, copied, atol=1e-6)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_dtypes(self):
        import cupy as cp
        rows = 1000
        cols = 10
        rng = cp.random.RandomState(1994)
        orig = rng.randint(low=0, high=127,
                           size=rows * cols).reshape(rows, cols)
        y = rng.randint(low=0, high=127, size=rows)
        dtrain = xgb.DMatrix(orig, label=y)
        booster = xgb.train({"tree_method": "gpu_hist"}, dtrain)

        predt_orig = booster.inplace_predict(orig)
        # all numeric primitive types (cupy mirrors numpy's type hierarchy)
        for dtype in [
                cp.signedinteger,
                cp.byte,
                cp.short,
                cp.intc,
                cp.int_,
                cp.longlong,
                cp.unsignedinteger,
                cp.ubyte,
                cp.ushort,
                cp.uintc,
                cp.uint,
                cp.ulonglong,
                cp.floating,
                cp.half,
                cp.single,
                cp.double,
        ]:
            X = cp.array(orig, dtype=dtype)
            predt = booster.inplace_predict(X)
            cp.testing.assert_allclose(predt, predt_orig)

        # boolean
        orig = cp.random.binomial(1, 0.5, size=rows * cols).reshape(rows, cols)
        predt_orig = booster.inplace_predict(orig)
        for dtype in [cp.bool8, cp.bool_]:
            X = cp.array(orig, dtype=dtype)
            predt = booster.inplace_predict(X)
            cp.testing.assert_allclose(predt, predt_orig)

        # unsupported types
        for dtype in [
                cp.complex64,
                cp.complex128,
        ]:
            X = cp.array(orig, dtype=dtype)
            with pytest.raises(ValueError):
                booster.inplace_predict(X)
Exemple #14
0
class TestGPUPredict(unittest.TestCase):
    def test_predict(self):
        iterations = 10
        np.random.seed(1)
        test_num_rows = [10, 1000, 5000]
        test_num_cols = [10, 50, 500]
        # This test passes for tree_method=gpu_hist and tree_method=exact, but
        # for `hist` and `approx` the floating-point error accumulates faster
        # and the test fails even when the tolerance is set to 1e-4.  For
        # `hist`, the mismatch rate with 5000 rows is 0.04.
        for num_rows in test_num_rows:
            for num_cols in test_num_cols:
                dtrain = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                     label=[0, 1] * int(num_rows / 2))
                dval = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                   label=[0, 1] * int(num_rows / 2))
                dtest = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                    label=[0, 1] * int(num_rows / 2))
                watchlist = [(dtrain, 'train'), (dval, 'validation')]
                res = {}
                param = {
                    "objective": "binary:logistic",
                    "predictor": "gpu_predictor",
                    'eval_metric': 'logloss',
                    'tree_method': 'gpu_hist',
                    'max_depth': 1
                }
                bst = xgb.train(param,
                                dtrain,
                                iterations,
                                evals=watchlist,
                                evals_result=res)
                assert self.non_increasing(res["train"]["logloss"])
                gpu_pred_train = bst.predict(dtrain, output_margin=True)
                gpu_pred_test = bst.predict(dtest, output_margin=True)
                gpu_pred_val = bst.predict(dval, output_margin=True)

                param["predictor"] = "cpu_predictor"
                bst_cpu = xgb.train(param, dtrain, iterations, evals=watchlist)
                cpu_pred_train = bst_cpu.predict(dtrain, output_margin=True)
                cpu_pred_test = bst_cpu.predict(dtest, output_margin=True)
                cpu_pred_val = bst_cpu.predict(dval, output_margin=True)

                np.testing.assert_allclose(cpu_pred_train,
                                           gpu_pred_train,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_val,
                                           gpu_pred_val,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_test,
                                           gpu_pred_test,
                                           rtol=1e-6)

    def non_increasing(self, L):
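        # Allow the metric to rise by at most 1e-3 between consecutive
        # iterations; anything larger counts as an increase.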
        return all((y - x) < 0.001 for x, y in zip(L, L[1:]))

    # Test case for a bug where multiple batch predictions made on the same
    # test set produced incorrect results
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_multi_predict(self):
        from sklearn.datasets import make_regression
        from sklearn.model_selection import train_test_split

        n = 1000
        X, y = make_regression(n, random_state=rng)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            random_state=123)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test)

        params = {}
        params["tree_method"] = "gpu_hist"

        params['predictor'] = "gpu_predictor"
        bst_gpu_predict = xgb.train(params, dtrain)

        params['predictor'] = "cpu_predictor"
        bst_cpu_predict = xgb.train(params, dtrain)

        predict0 = bst_gpu_predict.predict(dtest)
        predict1 = bst_gpu_predict.predict(dtest)
        cpu_predict = bst_cpu_predict.predict(dtest)

        assert np.allclose(predict0, predict1)
        assert np.allclose(predict0, cpu_predict)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_sklearn(self):
        m, n = 15000, 14
        tr_size = 2500
        X = np.random.rand(m, n)
        y = 200 * np.matmul(X, np.arange(-3, -3 + n))
        X_train, y_train = X[:tr_size, :], y[:tr_size]
        X_test, y_test = X[tr_size:, :], y[tr_size:]

        # First with cpu_predictor
        params = {
            'tree_method': 'gpu_hist',
            'predictor': 'cpu_predictor',
            'n_jobs': -1,
            'seed': 123
        }
        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        cpu_train_score = m.score(X_train, y_train)
        cpu_test_score = m.score(X_test, y_test)

        # Now with gpu_predictor
        params['predictor'] = 'gpu_predictor'

        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        gpu_train_score = m.score(X_train, y_train)
        gpu_test_score = m.score(X_test, y_test)

        assert np.allclose(cpu_train_score, gpu_train_score)
        assert np.allclose(cpu_test_score, gpu_test_score)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_inplace_predict_cupy(self):
        import cupy as cp
        cp.cuda.runtime.setDevice(0)
        rows = 1000
        cols = 10
        cp_rng = cp.random.RandomState(1994)
        cp.random.set_random_state(cp_rng)
        X = cp.random.randn(rows, cols)
        y = cp.random.randn(rows)

        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'},
                            dtrain,
                            num_boost_round=10)
        test = xgb.DMatrix(X[:10, ...])
        predt_from_array = booster.inplace_predict(X[:10, ...])
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_dense(x):
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        # Don't do this on Windows, see issue #5793
        if sys.platform.startswith("win"):
            pytest.skip(
                'Multi-threaded in-place prediction with cuPy is not working on Windows'
            )
        for i in range(10):
            run_threaded_predict(X, rows, predict_dense)

    @pytest.mark.skipif(**tm.no_cudf())
    def test_inplace_predict_cudf(self):
        import cupy as cp
        import cudf
        import pandas as pd
        rows = 1000
        cols = 10
        rng = np.random.RandomState(1994)
        cp.cuda.runtime.setDevice(0)
        X = rng.randn(rows, cols)
        X = pd.DataFrame(X)
        y = rng.randn(rows)
        X = cudf.from_pandas(X)

        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'},
                            dtrain,
                            num_boost_round=10)
        test = xgb.DMatrix(X)
        predt_from_array = booster.inplace_predict(X)
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_df(x):
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        for i in range(10):
            run_threaded_predict(X, rows, predict_df)

    @given(strategies.integers(1, 200), tm.dataset_strategy,
           shap_parameter_strategy, strategies.booleans())
    @settings(deadline=None)
    def test_shap(self, num_rounds, dataset, param, all_rows):
        if param['max_depth'] == 0 and param['max_leaves'] == 0:
            return

        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        if all_rows:
            test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w,
                                    dataset.margin)
        else:
            test_dmat = xgb.DMatrix(dataset.X[0:1, :])
        shap = bst.predict(test_dmat, pred_contribs=True)
        bst.set_param({"predictor": "cpu_predictor"})
        cpu_shap = bst.predict(test_dmat, pred_contribs=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assert np.allclose(shap, cpu_shap, 1e-3, 1e-3)
        # feature contributions should add up to predictions
        assume(len(dataset.y) > 0)
        assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1), margin,
                           1e-3, 1e-3)
Exemple #15
0
class TestDistributedGPU(unittest.TestCase):
    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_cudf())
    @pytest.mark.skipif(**tm.no_dask_cudf())
    @pytest.mark.skipif(**tm.no_dask_cuda())
    @pytest.mark.mgpu
    def test_dask_dataframe(self):
        with LocalCUDACluster() as cluster:
            with Client(cluster) as client:
                import cupy as cp
                cp.cuda.runtime.setDevice(0)
                X, y = generate_array()

                X = dd.from_dask_array(X)
                y = dd.from_dask_array(y)

                X = X.map_partitions(cudf.from_pandas)
                y = y.map_partitions(cudf.from_pandas)

                dtrain = dxgb.DaskDMatrix(client, X, y)
                out = dxgb.train(client, {'tree_method': 'gpu_hist'},
                                 dtrain=dtrain,
                                 evals=[(dtrain, 'X')],
                                 num_boost_round=4)

                assert isinstance(out['booster'], dxgb.Booster)
                assert len(out['history']['X']['rmse']) == 4

                predictions = dxgb.predict(client, out, dtrain).compute()
                assert isinstance(predictions, np.ndarray)

                series_predictions = dxgb.inplace_predict(client, out, X)
                assert isinstance(series_predictions, dd.Series)
                series_predictions = series_predictions.compute()

                single_node = out['booster'].predict(
                    xgboost.DMatrix(X.compute()))

                cp.testing.assert_allclose(single_node, predictions)
                np.testing.assert_allclose(single_node,
                                           series_predictions.to_array())

                predt = dxgb.predict(client, out, X)
                assert isinstance(predt, dd.Series)

                def is_df(part):
                    assert isinstance(part, cudf.DataFrame), part
                    return part

                predt.map_partitions(is_df,
                                     meta=dd.utils.make_meta(
                                         {'prediction': 'f4'}))

                cp.testing.assert_allclose(predt.values.compute(), single_node)

    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.mgpu
    def test_dask_array(self):
        with LocalCUDACluster() as cluster:
            with Client(cluster) as client:
                import cupy as cp
                cp.cuda.runtime.setDevice(0)
                X, y = generate_array()

                X = X.map_blocks(cp.asarray)
                y = y.map_blocks(cp.asarray)
                dtrain = dxgb.DaskDMatrix(client, X, y)
                out = dxgb.train(client, {'tree_method': 'gpu_hist'},
                                 dtrain=dtrain,
                                 evals=[(dtrain, 'X')],
                                 num_boost_round=2)
                from_dmatrix = dxgb.predict(client, out, dtrain).compute()
                inplace_predictions = dxgb.inplace_predict(client, out,
                                                           X).compute()
                single_node = out['booster'].predict(
                    xgboost.DMatrix(X.compute()))
                np.testing.assert_allclose(single_node, from_dmatrix)
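                # In-place predictions on cupy-backed dask arrays come back as
                # cupy data; check they live on the currently selected device.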
                device = cp.cuda.runtime.getDevice()
                assert device == inplace_predictions.device.id
                single_node = cp.array(single_node)
                assert device == single_node.device.id
                cp.testing.assert_allclose(single_node, inplace_predictions)

    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_dask_cuda())
    @pytest.mark.mgpu
    def test_empty_dmatrix(self):
        with LocalCUDACluster() as cluster:
            with Client(cluster) as client:
                parameters = {'tree_method': 'gpu_hist'}
                run_empty_dmatrix(client, parameters)
Exemple #16
0
class TestDistributedGPU(unittest.TestCase):
    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_cudf())
    @pytest.mark.skipif(**tm.no_dask_cudf())
    @pytest.mark.skipif(**tm.no_dask_cuda())
    @pytest.mark.mgpu
    def test_dask_dataframe(self):
        with LocalCUDACluster() as cluster:
            with Client(cluster) as client:
                import cupy as cp
                cp.cuda.runtime.setDevice(0)
                X, y = generate_array()

                X = dd.from_dask_array(X)
                y = dd.from_dask_array(y)

                X = X.map_partitions(cudf.from_pandas)
                y = y.map_partitions(cudf.from_pandas)

                dtrain = dxgb.DaskDMatrix(client, X, y)
                out = dxgb.train(client, {
                    'tree_method': 'gpu_hist',
                    'debug_synchronize': True
                },
                                 dtrain=dtrain,
                                 evals=[(dtrain, 'X')],
                                 num_boost_round=4)

                assert isinstance(out['booster'], dxgb.Booster)
                assert len(out['history']['X']['rmse']) == 4

                predictions = dxgb.predict(client, out, dtrain).compute()
                assert isinstance(predictions, np.ndarray)

                series_predictions = dxgb.inplace_predict(client, out, X)
                assert isinstance(series_predictions, dd.Series)
                series_predictions = series_predictions.compute()

                single_node = out['booster'].predict(
                    xgboost.DMatrix(X.compute()))

                cp.testing.assert_allclose(single_node, predictions)
                np.testing.assert_allclose(single_node,
                                           series_predictions.to_array())

                predt = dxgb.predict(client, out, X)
                assert isinstance(predt, dd.Series)

                def is_df(part):
                    assert isinstance(part, cudf.DataFrame), part
                    return part

                predt.map_partitions(is_df,
                                     meta=dd.utils.make_meta(
                                         {'prediction': 'f4'}))

                cp.testing.assert_allclose(predt.values.compute(), single_node)

    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    @pytest.mark.mgpu
    def test_gpu_hist(self, params, num_rounds, dataset):
        with LocalCUDACluster(n_workers=2) as cluster:
            with Client(cluster) as client:
                params['tree_method'] = 'gpu_hist'
                params = dataset.set_params(params)
                # Multi-class objectives don't handle empty partitions well
                # (here "empty" means some worker receives no data even though
                # at least one worker still has data).
                if params['objective'] == "multi:softmax":
                    return
                # It doesn't make sense to distribute a completely
                # empty dataset.
                if dataset.X.shape[0] == 0:
                    return

                chunk = 128
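                # 128-row chunks ensure that both workers receive a share of
                # the rows.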
                X = da.from_array(dataset.X,
                                  chunks=(chunk, dataset.X.shape[1]))
                y = da.from_array(dataset.y, chunks=(chunk, ))
                if dataset.w is not None:
                    w = da.from_array(dataset.w, chunks=(chunk, ))
                else:
                    w = None

                m = dxgb.DaskDMatrix(client, data=X, label=y, weight=w)
                history = dxgb.train(client,
                                     params=params,
                                     dtrain=m,
                                     num_boost_round=num_rounds,
                                     evals=[(m, 'train')])['history']
                note(history)
                assert tm.non_increasing(history['train'][dataset.metric])

    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.mgpu
    def test_dask_array(self):
        with LocalCUDACluster() as cluster:
            with Client(cluster) as client:
                import cupy as cp
                cp.cuda.runtime.setDevice(0)
                X, y = generate_array()

                X = X.map_blocks(cp.asarray)
                y = y.map_blocks(cp.asarray)
                dtrain = dxgb.DaskDMatrix(client, X, y)
                out = dxgb.train(client, {
                    'tree_method': 'gpu_hist',
                    'debug_synchronize': True
                },
                                 dtrain=dtrain,
                                 evals=[(dtrain, 'X')],
                                 num_boost_round=2)
                from_dmatrix = dxgb.predict(client, out, dtrain).compute()
                inplace_predictions = dxgb.inplace_predict(client, out,
                                                           X).compute()
                single_node = out['booster'].predict(
                    xgboost.DMatrix(X.compute()))
                np.testing.assert_allclose(single_node, from_dmatrix)
                device = cp.cuda.runtime.getDevice()
                assert device == inplace_predictions.device.id
                single_node = cp.array(single_node)
                assert device == single_node.device.id
                cp.testing.assert_allclose(single_node, inplace_predictions)

    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_dask_cuda())
    @pytest.mark.mgpu
    def test_empty_dmatrix(self):
        with LocalCUDACluster() as cluster:
            with Client(cluster) as client:
                parameters = {
                    'tree_method': 'gpu_hist',
                    'debug_synchronize': True
                }
                run_empty_dmatrix(client, parameters)

    def run_quantile(self, name):
        if sys.platform.startswith("win"):
            pytest.skip("Skipping dask tests on Windows")

        exe = None
        for possible_path in {
                './testxgboost', './build/testxgboost', '../build/testxgboost',
                '../gpu-build/testxgboost'
        }:
            if os.path.exists(possible_path):
                exe = possible_path
        assert exe, 'No testxgboost executable found.'
        test = "--gtest_filter=GPUQuantile." + name

        def runit(worker_addr, rabit_args):
            port = None
            # Set up the environment for running the C++ part: the tracker
            # port arrives as a 'DMLC_TRACKER_PORT=<port>' argument and is
            # forwarded to the gtest binary through an environment variable.
            for arg in rabit_args:
                if arg.decode('utf-8').startswith('DMLC_TRACKER_PORT'):
                    port = arg.decode('utf-8')
            port = port.split('=')
            env = os.environ.copy()
            env[port[0]] = port[1]
            return subprocess.run([exe, test], env=env, stdout=subprocess.PIPE)

        with LocalCUDACluster() as cluster:
            with Client(cluster) as client:
                workers = list(dxgb._get_client_workers(client).keys())
                rabit_args = dxgb._get_rabit_args(workers, client)
                futures = client.map(runit,
                                     workers,
                                     pure=False,
                                     workers=workers,
                                     rabit_args=rabit_args)
                results = client.gather(futures)
                for ret in results:
                    msg = ret.stdout.decode('utf-8')
                    assert msg.find('1 test from GPUQuantile') != -1, msg
                    assert ret.returncode == 0, msg

    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.mgpu
    @pytest.mark.gtest
    def test_quantile_basic(self):
        self.run_quantile('AllReduceBasic')

    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.mgpu
    @pytest.mark.gtest
    def test_quantile_same_on_all_workers(self):
        self.run_quantile('SameOnAllWorkers')
Exemple #17
0
class TestGPUUpdaters:
    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_gpu_hist(self, param, num_rounds, dataset):
        param['tree_method'] = 'gpu_hist'
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result['train'][dataset.metric])

    def run_categorical_basic(self, rows, cols, rounds, cats):
        import pandas as pd
        rng = np.random.RandomState(1994)

        pd_dict = {}
        for i in range(cols):
            c = rng.randint(low=0, high=cats + 1, size=rows)
            pd_dict[str(i)] = pd.Series(c, dtype=np.int64)

        df = pd.DataFrame(pd_dict)
        label = df.iloc[:, 0]
        for i in range(0, cols - 1):
            label += df.iloc[:, i]
        label += 1
        df = df.astype('category')
        onehot = pd.get_dummies(df)
        cat = df
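        # Train once on the one-hot encoded frame and once on the native
        # categorical frame; both runs should yield nearly identical training
        # RMSE.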

        by_etl_results = {}
        by_builtin_results = {}

        parameters = {'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor'}

        m = xgb.DMatrix(onehot, label, enable_categorical=True)
        xgb.train(parameters,
                  m,
                  num_boost_round=rounds,
                  evals=[(m, 'Train')],
                  evals_result=by_etl_results)

        m = xgb.DMatrix(cat, label, enable_categorical=True)
        xgb.train(parameters,
                  m,
                  num_boost_round=rounds,
                  evals=[(m, 'Train')],
                  evals_result=by_builtin_results)
        np.testing.assert_allclose(np.array(by_etl_results['Train']['rmse']),
                                   np.array(
                                       by_builtin_results['Train']['rmse']),
                                   rtol=1e-3)
        assert tm.non_increasing(by_builtin_results['Train']['rmse'])

    @given(strategies.integers(10, 400), strategies.integers(3, 8),
           strategies.integers(1, 5), strategies.integers(4, 7))
    @settings(deadline=None)
    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical(self, rows, cols, rounds, cats):
        pytest.xfail(reason='TestGPUUpdaters::test_categorical is flaky')
        self.run_categorical_basic(rows, cols, rounds, cats)

    def test_categorical_32_cat(self):
        '''32 categories hits the bound of the integer bitset, so it gets a dedicated test'''
        rows = 1000
        cols = 10
        cats = 32
        rounds = 4
        self.run_categorical_basic(rows, cols, rounds, cats)

    @pytest.mark.skipif(**tm.no_cupy())
    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_gpu_hist_device_dmatrix(self, param, num_rounds, dataset):
        # We cannot handle empty datasets yet
        assume(len(dataset.y) > 0)
        param['tree_method'] = 'gpu_hist'
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_device_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result['train'][dataset.metric])

    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_external_memory(self, param, num_rounds, dataset):
        pytest.xfail(reason='TestGPUUpdaters::test_external_memory is flaky')
        # We cannot handle empty datasets yet
        assume(len(dataset.y) > 0)
        param['tree_method'] = 'gpu_hist'
        param = dataset.set_params(param)
        m = dataset.get_external_dmat()
        external_result = train_result(param, m, num_rounds)
        del m
        gc.collect()
        assert tm.non_increasing(external_result['train'][dataset.metric])

    def test_empty_dmatrix_prediction(self):
        # FIXME(trivialfis): This should be done with all updaters
        kRows = 0
        kCols = 100

        X = np.empty((kRows, kCols))
        y = np.empty((kRows))

        dtrain = xgb.DMatrix(X, y)

        bst = xgb.train(
            {
                'verbosity': 2,
                'tree_method': 'gpu_hist',
                'gpu_id': 0
            },
            dtrain,
            verbose_eval=True,
            num_boost_round=6,
            evals=[(dtrain, 'Train')])

        kRows = 100
        X = np.random.randn(kRows, kCols)

        dtest = xgb.DMatrix(X)
        predictions = bst.predict(dtest)
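        # With zero training rows the trees are empty, so every prediction
        # falls back to the default base_score of 0.5.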
        np.testing.assert_allclose(predictions, 0.5, 1e-6)

    @pytest.mark.mgpu
    @given(tm.dataset_strategy, strategies.integers(0, 10))
    @settings(deadline=None, max_examples=10)
    def test_specified_gpu_id_gpu_update(self, dataset, gpu_id):
        param = {'tree_method': 'gpu_hist', 'gpu_id': gpu_id}
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), 10)
        assert tm.non_increasing(result['train'][dataset.metric])
Exemple #18
0
class TestFromArrayInterface:
    '''Tests for constructing DMatrix from data structures conforming to the
Apache Arrow specification.'''
    @pytest.mark.skipif(**tm.no_cupy())
    def test_from_cupy(self):
        '''Test constructing DMatrix from cupy'''
        import cupy as cp
        dmatrix_from_cupy(np.float32, np.NAN)
        dmatrix_from_cupy(np.float64, np.NAN)

        dmatrix_from_cupy(np.uint8, 2)
        dmatrix_from_cupy(np.uint32, 3)
        dmatrix_from_cupy(np.uint64, 4)

        dmatrix_from_cupy(np.int8, 2)
        dmatrix_from_cupy(np.int32, -2)
        dmatrix_from_cupy(np.int64, -3)

        with pytest.raises(Exception):
            X = cp.random.randn(2, 2, dtype="float32")
            dtrain = xgb.DMatrix(X, label=X)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_cupy_training(self):
        import cupy as cp
        np.random.seed(1)
        cp.random.seed(1)
        X = cp.random.randn(50, 10, dtype="float32")
        y = cp.random.randn(50, dtype="float32")
        weights = np.random.random(50) + 1
        cupy_weights = cp.array(weights)
        base_margin = np.random.random(50)
        cupy_base_margin = cp.array(base_margin)

        evals_result_cupy = {}
        dtrain_cp = xgb.DMatrix(X,
                                y,
                                weight=cupy_weights,
                                base_margin=cupy_base_margin)
        params = {'gpu_id': 0, 'nthread': 1}
        xgb.train(params,
                  dtrain_cp,
                  evals=[(dtrain_cp, "train")],
                  evals_result=evals_result_cupy)
        evals_result_np = {}
        dtrain_np = xgb.DMatrix(cp.asnumpy(X),
                                cp.asnumpy(y),
                                weight=weights,
                                base_margin=base_margin)
        xgb.train(params,
                  dtrain_np,
                  evals=[(dtrain_np, "train")],
                  evals_result=evals_result_np)
        assert np.array_equal(evals_result_cupy["train"]["rmse"],
                              evals_result_np["train"]["rmse"])

    @pytest.mark.skipif(**tm.no_cupy())
    def test_cupy_metainfo(self):
        import cupy as cp
        n = 100
        X = np.random.random((n, 2))
        dmat_cupy = xgb.DMatrix(X)
        dmat = xgb.DMatrix(X)
        floats = np.random.random(n)
        uints = np.array([4, 2, 8]).astype("uint32")
        cupy_floats = cp.array(floats)
        cupy_uints = cp.array(uints)
        dmat.set_float_info('weight', floats)
        dmat.set_float_info('label', floats)
        dmat.set_float_info('base_margin', floats)
        dmat.set_uint_info('group', uints)
        dmat_cupy.set_interface_info('weight', cupy_floats)
        dmat_cupy.set_interface_info('label', cupy_floats)
        dmat_cupy.set_interface_info('base_margin', cupy_floats)
        dmat_cupy.set_interface_info('group', cupy_uints)

        # Test setting info with cupy
        assert np.array_equal(dmat.get_float_info('weight'),
                              dmat_cupy.get_float_info('weight'))
        assert np.array_equal(dmat.get_float_info('label'),
                              dmat_cupy.get_float_info('label'))
        assert np.array_equal(dmat.get_float_info('base_margin'),
                              dmat_cupy.get_float_info('base_margin'))
        assert np.array_equal(dmat.get_uint_info('group_ptr'),
                              dmat_cupy.get_uint_info('group_ptr'))
Exemple #19
0
class TestGPUPredict(unittest.TestCase):
    def test_predict(self):
        iterations = 10
        np.random.seed(1)
        test_num_rows = [10, 1000, 5000]
        test_num_cols = [10, 50, 500]
        # This test passes for tree_method=gpu_hist and tree_method=exact, but
        # for `hist` and `approx` the floating-point error accumulates faster
        # and the test fails even when the tolerance is set to 1e-4.  For
        # `hist`, the mismatch rate with 5000 rows is 0.04.
        for num_rows in test_num_rows:
            for num_cols in test_num_cols:
                dtrain = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                     label=[0, 1] * int(num_rows / 2))
                dval = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                   label=[0, 1] * int(num_rows / 2))
                dtest = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                    label=[0, 1] * int(num_rows / 2))
                watchlist = [(dtrain, 'train'), (dval, 'validation')]
                res = {}
                param = {
                    "objective": "binary:logistic",
                    "predictor": "gpu_predictor",
                    'eval_metric': 'logloss',
                    'tree_method': 'gpu_hist',
                    'max_depth': 1
                }
                bst = xgb.train(param, dtrain, iterations, evals=watchlist,
                                evals_result=res)
                assert self.non_increasing(res["train"]["logloss"])
                gpu_pred_train = bst.predict(dtrain, output_margin=True)
                gpu_pred_test = bst.predict(dtest, output_margin=True)
                gpu_pred_val = bst.predict(dval, output_margin=True)

                param["predictor"] = "cpu_predictor"
                bst_cpu = xgb.train(param, dtrain, iterations, evals=watchlist)
                cpu_pred_train = bst_cpu.predict(dtrain, output_margin=True)
                cpu_pred_test = bst_cpu.predict(dtest, output_margin=True)
                cpu_pred_val = bst_cpu.predict(dval, output_margin=True)

                np.testing.assert_allclose(cpu_pred_train, gpu_pred_train,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_val, gpu_pred_val,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_test, gpu_pred_test,
                                           rtol=1e-6)

    def non_increasing(self, L):
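        # Tolerate increases of up to 1e-3 between consecutive iterations.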
        return all((y - x) < 0.001 for x, y in zip(L, L[1:]))

    # Test case for a bug where multiple batch predictions made on the same
    # test set produced incorrect results
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_multi_predict(self):
        from sklearn.datasets import make_regression
        from sklearn.model_selection import train_test_split

        n = 1000
        X, y = make_regression(n, random_state=rng)
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            random_state=123)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test)

        params = {}
        params["tree_method"] = "gpu_hist"

        params['predictor'] = "gpu_predictor"
        bst_gpu_predict = xgb.train(params, dtrain)

        params['predictor'] = "cpu_predictor"
        bst_cpu_predict = xgb.train(params, dtrain)

        predict0 = bst_gpu_predict.predict(dtest)
        predict1 = bst_gpu_predict.predict(dtest)
        cpu_predict = bst_cpu_predict.predict(dtest)

        assert np.allclose(predict0, predict1)
        assert np.allclose(predict0, cpu_predict)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_sklearn(self):
        m, n = 15000, 14
        tr_size = 2500
        X = np.random.rand(m, n)
        y = 200 * np.matmul(X, np.arange(-3, -3 + n))
        X_train, y_train = X[:tr_size, :], y[:tr_size]
        X_test, y_test = X[tr_size:, :], y[tr_size:]

        # First with cpu_predictor
        params = {'tree_method': 'gpu_hist',
                  'predictor': 'cpu_predictor',
                  'n_jobs': -1,
                  'seed': 123}
        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        cpu_train_score = m.score(X_train, y_train)
        cpu_test_score = m.score(X_test, y_test)

        # Now with gpu_predictor
        params['predictor'] = 'gpu_predictor'

        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        gpu_train_score = m.score(X_train, y_train)
        gpu_test_score = m.score(X_test, y_test)

        assert np.allclose(cpu_train_score, gpu_train_score)
        assert np.allclose(cpu_test_score, gpu_test_score)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_inplace_predict_cupy(self):
        import cupy as cp
        cp.cuda.runtime.setDevice(0)
        rows = 1000
        cols = 10
        cp_rng = cp.random.RandomState(1994)
        cp.random.set_random_state(cp_rng)
        X = cp.random.randn(rows, cols)
        y = cp.random.randn(rows)

        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'},
                            dtrain, num_boost_round=10)
        test = xgb.DMatrix(X[:10, ...])
        predt_from_array = booster.inplace_predict(X[:10, ...])
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_dense(x):
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        for i in range(10):
            run_threaded_predict(X, rows, predict_dense)

    @pytest.mark.skipif(**tm.no_cudf())
    def test_inplace_predict_cudf(self):
        import cupy as cp
        import cudf
        import pandas as pd
        rows = 1000
        cols = 10
        rng = np.random.RandomState(1994)
        X = rng.randn(rows, cols)
        X = pd.DataFrame(X)
        y = rng.randn(rows)

        X = cudf.from_pandas(X)

        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'},
                            dtrain, num_boost_round=10)
        test = xgb.DMatrix(X)
        predt_from_array = booster.inplace_predict(X)
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_df(x):
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        for i in range(10):
            run_threaded_predict(X, rows, predict_df)
Exemple #20
0
class TestGPUUpdaters:
    cputest = test_up.TestTreeMethod()

    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_gpu_hist(self, param, num_rounds, dataset):
        param["tree_method"] = "gpu_hist"
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result["train"][dataset.metric])

    @given(strategies.integers(10, 400), strategies.integers(3, 8),
           strategies.integers(1, 2), strategies.integers(4, 7))
    @settings(deadline=None)
    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical(self, rows, cols, rounds, cats):
        self.cputest.run_categorical_basic(rows, cols, rounds, cats,
                                           "gpu_hist")

    def test_categorical_32_cat(self):
        '''32 categories hits the bound of the integer bitset, so it gets a dedicated test'''
        rows = 1000
        cols = 10
        cats = 32
        rounds = 4
        self.cputest.run_categorical_basic(rows, cols, rounds, cats,
                                           "gpu_hist")

    @pytest.mark.skipif(**tm.no_cupy())
    def test_invalid_categorical(self):
        self.cputest.run_invalid_category("gpu_hist")

    @pytest.mark.skipif(**tm.no_cupy())
    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_gpu_hist_device_dmatrix(self, param, num_rounds, dataset):
        # We cannot handle empty datasets yet
        assume(len(dataset.y) > 0)
        param['tree_method'] = 'gpu_hist'
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_device_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result['train'][dataset.metric])

    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_external_memory(self, param, num_rounds, dataset):
        # We cannot handle empty datasets yet
        assume(len(dataset.y) > 0)
        param['tree_method'] = 'gpu_hist'
        param = dataset.set_params(param)
        m = dataset.get_external_dmat()
        external_result = train_result(param, m, num_rounds)
        del m
        gc.collect()
        assert tm.non_increasing(external_result['train'][dataset.metric])

    def test_empty_dmatrix_prediction(self):
        # FIXME(trivialfis): This should be done with all updaters
        kRows = 0
        kCols = 100

        X = np.empty((kRows, kCols))
        y = np.empty((kRows))

        dtrain = xgb.DMatrix(X, y)

        bst = xgb.train(
            {
                'verbosity': 2,
                'tree_method': 'gpu_hist',
                'gpu_id': 0
            },
            dtrain,
            verbose_eval=True,
            num_boost_round=6,
            evals=[(dtrain, 'Train')])

        kRows = 100
        X = np.random.randn(kRows, kCols)

        dtest = xgb.DMatrix(X)
        predictions = bst.predict(dtest)
        np.testing.assert_allclose(predictions, 0.5, 1e-6)

    @pytest.mark.mgpu
    @given(tm.dataset_strategy, strategies.integers(0, 10))
    @settings(deadline=None, max_examples=10)
    def test_specified_gpu_id_gpu_update(self, dataset, gpu_id):
        param = {'tree_method': 'gpu_hist', 'gpu_id': gpu_id}
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), 10)
        assert tm.non_increasing(result['train'][dataset.metric])