class TestPlotting:
    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical(self):
        X, y = tm.make_categorical(1000, 31, 19, onehot=False)
        reg = xgb.XGBRegressor(enable_categorical=True,
                               n_estimators=10,
                               tree_method="gpu_hist")
        reg.fit(X, y)
        trees = reg.get_booster().get_dump(dump_format="json")
        for tree in trees:
            j_tree = json.loads(tree)
            assert "leaf" in j_tree.keys() or isinstance(
                j_tree["split_condition"], list)

        # num_trees is the index of the single tree to render; use the last tree
        graph = xgb.to_graphviz(reg, num_trees=len(trees) - 1)
        assert isinstance(graph, Source)
        ax = xgb.plot_tree(reg, num_trees=len(trees) - 1)
        assert isinstance(ax, Axes)
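
# Note on the plotting assertions above: xgb.to_graphviz requires the graphviz
# package (Source is graphviz.Source) and xgb.plot_tree additionally requires
# matplotlib (Axes is matplotlib.axes.Axes); both are assumed importable here.
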
class TestGPUUpdaters:
    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_gpu_hist(self, param, num_rounds, dataset):
        param["tree_method"] = "gpu_hist"
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result["train"][dataset.metric])

    def run_categorical_basic(self, rows, cols, rounds, cats):
        onehot, label = tm.make_categorical(rows, cols, cats, True)
        cat, _ = tm.make_categorical(rows, cols, cats, False)

        by_etl_results = {}
        by_builtin_results = {}

        parameters = {"tree_method": "gpu_hist", "predictor": "gpu_predictor"}

        m = xgb.DMatrix(onehot, label, enable_categorical=False)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=by_etl_results,
        )

        m = xgb.DMatrix(cat, label, enable_categorical=True)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=by_builtin_results,
        )

        # There are guidelines on how to specify tolerance by treating the output as
        # a random variable, but tree construction here is extremely sensitive to
        # floating point error: a 1e-5 difference in a histogram bin can lead to an
        # entirely different tree. So even though the test is quite lenient,
        # hypothesis can still pick up falsifying examples from time to time.
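        # For scale (illustrative only): with rtol=1e-3, two RMSE values near
        # 1.0 may differ by about 1e-3 before the comparison below fails.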
        np.testing.assert_allclose(
            np.array(by_etl_results["Train"]["rmse"]),
            np.array(by_builtin_results["Train"]["rmse"]),
            rtol=1e-3,
        )
        assert tm.non_increasing(by_builtin_results["Train"]["rmse"])

    @given(strategies.integers(10, 400), strategies.integers(3, 8),
           strategies.integers(1, 2), strategies.integers(4, 7))
    @settings(deadline=None)
    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical(self, rows, cols, rounds, cats):
        self.run_categorical_basic(rows, cols, rounds, cats)

    def test_categorical_32_cat(self):
        '''32 hits the bound of the integer bitset, so it gets a special test.'''
        rows = 1000
        cols = 10
        cats = 32
        rounds = 4
        self.run_categorical_basic(rows, cols, rounds, cats)
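
    # A minimal sketch of why 32 is special (illustrative, not part of the
    # test suite): categorical splits are stored in 32-bit bitset words, so
    # category 31 is the last index that fits in one word and category 32
    # spills into a second. This hypothetical helper shows the decomposition:
    @staticmethod
    def _category_bitset_position(category):
        # word index and bit offset within that word
        return category // 32, category % 32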

    def test_invalid_categorical(self):
        import cupy as cp
        rng = np.random.default_rng()
        X = rng.normal(loc=0, scale=1, size=1000).reshape(100, 10)
        y = rng.normal(loc=0, scale=1, size=100)

        # The check is performed during sketching.
        Xy = xgb.DMatrix(X, y, feature_types=["c"] * 10)
        with pytest.raises(ValueError):
            xgb.train({"tree_method": "gpu_hist"}, Xy)

        X, y = cp.array(X), cp.array(y)
        with pytest.raises(ValueError):
            Xy = xgb.DeviceQuantileDMatrix(X, y, feature_types=["c"] * 10)

    @pytest.mark.skipif(**tm.no_cupy())
    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_gpu_hist_device_dmatrix(self, param, num_rounds, dataset):
        # We cannot handle an empty dataset yet
        assume(len(dataset.y) > 0)
        param['tree_method'] = 'gpu_hist'
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_device_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result['train'][dataset.metric])

    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_external_memory(self, param, num_rounds, dataset):
        # We cannot handle an empty dataset yet
        assume(len(dataset.y) > 0)
        param['tree_method'] = 'gpu_hist'
        param = dataset.set_params(param)
        m = dataset.get_external_dmat()
        external_result = train_result(param, m, num_rounds)
        del m
        gc.collect()
        assert tm.non_increasing(external_result['train'][dataset.metric])

    def test_empty_dmatrix_prediction(self):
        # FIXME(trivialfis): This should be done with all updaters
        kRows = 0
        kCols = 100

        X = np.empty((kRows, kCols))
        y = np.empty((kRows))

        dtrain = xgb.DMatrix(X, y)

        bst = xgb.train(
            {
                'verbosity': 2,
                'tree_method': 'gpu_hist',
                'gpu_id': 0
            },
            dtrain,
            verbose_eval=True,
            num_boost_round=6,
            evals=[(dtrain, 'Train')])

        kRows = 100
        X = np.random.randn(kRows, kCols)

        dtest = xgb.DMatrix(X)
        predictions = bst.predict(dtest)
        # With no training rows the model learns no trees, so every prediction
        # falls back to the default base_score of 0.5.
        np.testing.assert_allclose(predictions, 0.5, 1e-6)

    @pytest.mark.mgpu
    @given(tm.dataset_strategy, strategies.integers(0, 10))
    @settings(deadline=None, max_examples=10)
    def test_specified_gpu_id_gpu_update(self, dataset, gpu_id):
        param = {'tree_method': 'gpu_hist', 'gpu_id': gpu_id}
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), 10)
        assert tm.non_increasing(result['train'][dataset.metric])
Example #3
class TestDMatrix:
    def test_warn_missing(self):
        from xgboost import data
        with pytest.warns(UserWarning):
            data._warn_unused_missing('uri', 4)

        with pytest.warns(None) as record:
            data._warn_unused_missing('uri', None)
            data._warn_unused_missing('uri', np.nan)

            assert len(record) == 0

        with pytest.warns(None) as record:
            x = rng.randn(10, 10)
            y = rng.randn(10)

            xgb.DMatrix(x, y, missing=4)

            assert len(record) == 0

        with pytest.warns(UserWarning):
            csr = csr_matrix(x)
            xgb.DMatrix(csr.tocsc(), y, missing=4)

    def test_dmatrix_numpy_init(self):
        data = np.random.randn(5, 5)
        dm = xgb.DMatrix(data)
        assert dm.num_row() == 5
        assert dm.num_col() == 5

        data = np.array([[1, 2], [3, 4]])
        dm = xgb.DMatrix(data)
        assert dm.num_row() == 2
        assert dm.num_col() == 2

        # 0d array
        with pytest.raises(ValueError):
            xgb.DMatrix(np.array(1))
        # 1d array
        with pytest.raises(ValueError):
            xgb.DMatrix(np.array([1, 2, 3]))
        # 3d array
        data = np.random.randn(5, 5, 5)
        with pytest.raises(ValueError):
            xgb.DMatrix(data)
        # object dtype
        data = np.array([['a', 'b'], ['c', 'd']])
        with pytest.raises(ValueError):
            xgb.DMatrix(data)

    def test_csr(self):
        indptr = np.array([0, 2, 3, 6])
        indices = np.array([0, 2, 2, 0, 1, 2])
        data = np.array([1, 2, 3, 4, 5, 6])
        X = scipy.sparse.csr_matrix((data, indices, indptr), shape=(3, 3))
        dtrain = xgb.DMatrix(X)
        assert dtrain.num_row() == 3
        assert dtrain.num_col() == 3

    def test_csc(self):
        row = np.array([0, 2, 2, 0, 1, 2])
        col = np.array([0, 0, 1, 2, 2, 2])
        data = np.array([1, 2, 3, 4, 5, 6])
        X = scipy.sparse.csc_matrix((data, (row, col)), shape=(3, 3))
        dtrain = xgb.DMatrix(X)
        assert dtrain.num_row() == 3
        assert dtrain.num_col() == 3

    def test_coo(self):
        row = np.array([0, 2, 2, 0, 1, 2])
        col = np.array([0, 0, 1, 2, 2, 2])
        data = np.array([1, 2, 3, 4, 5, 6])
        X = scipy.sparse.coo_matrix((data, (row, col)), shape=(3, 3))
        dtrain = xgb.DMatrix(X)
        assert dtrain.num_row() == 3
        assert dtrain.num_col() == 3

    def test_np_view(self):
        # Sliced Float32 array: a non-contiguous strided view into the buffer
        y = np.array([12, 34, 56], np.float32)[::2]
        from_view = xgb.DMatrix(np.array([[]]), label=y).get_label()
        from_array = xgb.DMatrix(np.array([[]]), label=y + 0).get_label()
        assert (from_view.shape == from_array.shape)
        assert (from_view == from_array).all()

        # Sliced UInt array
        z = np.array([12, 34, 56], np.uint32)[::2]
        dmat = xgb.DMatrix(np.array([[]]))
        dmat.set_uint_info('group', z)
        # setting group sizes yields cumulative boundaries in 'group_ptr'
        from_view = dmat.get_uint_info('group_ptr')
        dmat = xgb.DMatrix(np.array([[]]))
        dmat.set_uint_info('group', z + 0)
        from_array = dmat.get_uint_info('group_ptr')
        assert (from_view.shape == from_array.shape)
        assert (from_view == from_array).all()

    def test_slice(self):
        X = rng.randn(100, 100)
        y = rng.randint(low=0, high=3, size=100).astype(np.float32)
        d = xgb.DMatrix(X, y)
        np.testing.assert_equal(d.get_label(), y)

        fw = rng.uniform(size=100).astype(np.float32)
        d.set_info(feature_weights=fw)

        # base margin is per-class in multi-class classifier
        base_margin = rng.randn(100, 3).astype(np.float32)
        d.set_base_margin(base_margin.flatten())

        ridxs = [1, 2, 3, 4, 5, 6]
        sliced = d.slice(ridxs)

        # Per-row meta info (label, base_margin) is sliced along with the rows;
        # per-column info such as feature_weights is carried over unchanged
        np.testing.assert_equal(sliced.get_label(), y[1:7])
        np.testing.assert_equal(sliced.get_float_info('feature_weights'), fw)
        np.testing.assert_equal(sliced.get_base_margin(),
                                base_margin[1:7, :].flatten())
        np.testing.assert_equal(sliced.get_base_margin(),
                                sliced.get_float_info('base_margin'))

        # Slicing a DMatrix results into a DMatrix that's equivalent to a DMatrix that's
        # constructed from the corresponding NumPy slice
        d2 = xgb.DMatrix(X[1:7, :], y[1:7])
        d2.set_base_margin(base_margin[1:7, :].flatten())
        eval_res = {}
        _ = xgb.train(
            {
                'num_class': 3,
                'objective': 'multi:softprob',
                'eval_metric': 'mlogloss'
            },
            d,
            num_boost_round=2,
            evals=[(d2, 'd2'), (sliced, 'sliced')],
            evals_result=eval_res)
        np.testing.assert_equal(eval_res['d2']['mlogloss'],
                                eval_res['sliced']['mlogloss'])

        ridxs_arr = np.array(ridxs)[1:]  # numpy index arrays are handled too
        sliced = d.slice(ridxs_arr)
        np.testing.assert_equal(sliced.get_label(), y[2:7])

    def test_feature_names_slice(self):
        data = np.random.randn(5, 5)

        # different length
        with pytest.raises(ValueError):
            xgb.DMatrix(data, feature_names=list('abcdef'))
        # contains duplicates
        with pytest.raises(ValueError):
            xgb.DMatrix(data, feature_names=['a', 'b', 'c', 'd', 'd'])
        # contains symbol
        with pytest.raises(ValueError):
            xgb.DMatrix(data, feature_names=['a', 'b', 'c', 'd', 'e<1'])

        dm = xgb.DMatrix(data)
        dm.feature_names = list('abcde')
        assert dm.feature_names == list('abcde')

        assert dm.slice([0, 1]).num_col() == dm.num_col()
        assert dm.slice([0, 1]).feature_names == dm.feature_names

        dm.feature_types = 'q'
        assert dm.feature_types == list('qqqqq')

        dm.feature_types = list('qiqiq')
        assert dm.feature_types == list('qiqiq')

        with pytest.raises(ValueError):
            dm.feature_types = list('abcde')

        # reset
        dm.feature_names = None
        assert dm.feature_names == ['f0', 'f1', 'f2', 'f3', 'f4']
        assert dm.feature_types is None

    def test_feature_names(self):
        data = np.random.randn(100, 5)
        target = np.array([0, 1] * 50)

        cases = [['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5'],
                 [u'要因1', u'要因2', u'要因3', u'要因4', u'要因5']]

        for features in cases:
            dm = xgb.DMatrix(data, label=target, feature_names=features)
            assert dm.feature_names == features
            assert dm.num_row() == 100
            assert dm.num_col() == 5

            params = {
                'objective': 'multi:softprob',
                'eval_metric': 'mlogloss',
                'eta': 0.3,
                'num_class': 3
            }

            bst = xgb.train(params, dm, num_boost_round=10)
            scores = bst.get_fscore()
            assert list(sorted(k for k in scores)) == features

            dummy = np.random.randn(5, 5)
            dm = xgb.DMatrix(dummy, feature_names=features)
            bst.predict(dm)

            # different feature names must raise an error
            dm = xgb.DMatrix(dummy, feature_names=list('abcde'))
            with pytest.raises(ValueError):
                bst.predict(dm)

    @pytest.mark.skipif(**tm.no_pandas())
    def test_save_binary(self):
        import pandas as pd
        with tempfile.TemporaryDirectory() as tmpdir:
            path = os.path.join(tmpdir, 'm.dmatrix')
            data = pd.DataFrame({"a": [0, 1], "b": [2, 3], "c": [4, 5]})
            m0 = xgb.DMatrix(data.loc[:, ["a", "b"]], data["c"])
            assert m0.feature_names == ['a', 'b']
            m0.save_binary(path)
            m1 = xgb.DMatrix(path)
            assert m0.feature_names == m1.feature_names
            assert m0.feature_types == m1.feature_types

    def test_get_info(self):
        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        dtrain.get_float_info('label')
        dtrain.get_float_info('weight')
        dtrain.get_float_info('base_margin')
        dtrain.get_uint_info('group_ptr')

    def test_qid(self):
        rows = 100
        cols = 10
        X, y = rng.randn(rows, cols), rng.randn(rows)
        qid = rng.randint(low=0, high=10, size=rows, dtype=np.uint32)
        qid = np.sort(qid)

        Xy = xgb.DMatrix(X, y)
        Xy.set_info(qid=qid)
        group_ptr = Xy.get_uint_info('group_ptr')
        assert group_ptr[0] == 0
        assert group_ptr[-1] == rows
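
        # A hedged check (assuming XGBoost forms one group per distinct value
        # of the sorted qid): group_ptr is the cumulative group sizes.
        _, counts = np.unique(qid, return_counts=True)
        np.testing.assert_equal(group_ptr, np.concatenate([[0], np.cumsum(counts)]))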

    def test_feature_weights(self):
        kRows = 10
        kCols = 50
        rng = np.random.RandomState(1994)
        fw = rng.uniform(size=kCols)
        X = rng.randn(kRows, kCols)
        m = xgb.DMatrix(X)
        m.set_info(feature_weights=fw)
        np.testing.assert_allclose(fw, m.get_float_info('feature_weights'))
        # Handle empty
        m.set_info(feature_weights=np.empty((0, 0)))

        assert m.get_float_info('feature_weights').shape[0] == 0

        fw -= 1  # negative feature weights are invalid

        with pytest.raises(ValueError):
            m.set_info(feature_weights=fw)

    def test_sparse_dmatrix_csr(self):
        nrow = 100
        ncol = 1000
        x = rand(nrow, ncol, density=0.0005, format='csr', random_state=rng)
        assert x.indices.max() < ncol - 1
        x.data[:] = 1
        dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow))
        assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)
        watchlist = [(dtrain, 'train')]
        param = {
            'max_depth': 3,
            'objective': 'binary:logistic',
            'verbosity': 0
        }
        bst = xgb.train(param, dtrain, 5, watchlist)
        bst.predict(dtrain)

        i32 = csr_matrix((x.data.astype(np.int32), x.indices, x.indptr),
                         shape=x.shape)
        f32 = csr_matrix((i32.data.astype(np.float32), x.indices, x.indptr),
                         shape=x.shape)
        di32 = xgb.DMatrix(i32)
        df32 = xgb.DMatrix(f32)
        dense = xgb.DMatrix(f32.toarray(), missing=0)
        with tempfile.TemporaryDirectory() as tmpdir:
            path = os.path.join(tmpdir, "f32.dmatrix")
            df32.save_binary(path)
            with open(path, "rb") as fd:
                df32_buffer = np.array(fd.read())
            path = os.path.join(tmpdir, "i32.dmatrix")
            di32.save_binary(path)
            with open(path, "rb") as fd:
                di32_buffer = np.array(fd.read())

            path = os.path.join(tmpdir, "dense.dmatrix")
            dense.save_binary(path)
            with open(path, "rb") as fd:
                dense_buffer = np.array(fd.read())

            np.testing.assert_equal(df32_buffer, di32_buffer)
            np.testing.assert_equal(df32_buffer, dense_buffer)
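            # The equality holds because integer input is widened to float32
            # internally, and densifying with missing=0 reproduces the same
            # sparsity pattern, so all three saved binaries are identical.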

    def test_sparse_dmatrix_csc(self):
        nrow = 1000
        ncol = 100
        x = rand(nrow, ncol, density=0.0005, format='csc', random_state=rng)
        assert x.indices.max() < nrow - 1
        x.data[:] = 1
        dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow))
        assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)
        watchlist = [(dtrain, 'train')]
        param = {
            'max_depth': 3,
            'objective': 'binary:logistic',
            'verbosity': 0
        }
        bst = xgb.train(param, dtrain, 5, watchlist)
        bst.predict(dtrain)

    def test_unknown_data(self):
        class Data:
            pass

        with pytest.raises(TypeError):
            with pytest.warns(UserWarning):
                d = Data()
                xgb.DMatrix(d)
Example #4
class TestInplacePredict:
    '''Tests for running inplace prediction'''
    @classmethod
    def setup_class(cls):
        cls.rows = 1000
        cls.cols = 10

        cls.missing = 11            # set to integer for testing

        cls.rng = np.random.RandomState(1994)

        cls.X = cls.rng.randn(cls.rows, cls.cols)
        missing_idx = [i for i in range(0, cls.cols, 4)]
        cls.X[:, missing_idx] = cls.missing  # set to be missing

        cls.y = cls.rng.randn(cls.rows)

        dtrain = xgb.DMatrix(cls.X, cls.y)
        cls.test = xgb.DMatrix(cls.X[:10, ...], missing=cls.missing)

        cls.num_boost_round = 10
        cls.booster = xgb.train({'tree_method': 'hist'}, dtrain, num_boost_round=10)

    def test_predict(self):
        booster = self.booster
        X = self.X
        test = self.test

        predt_from_array = booster.inplace_predict(X[:10, ...], missing=self.missing)
        predt_from_dmatrix = booster.predict(test)

        X_obj = X.copy().astype(object)

        assert X_obj.dtype.hasobject is True
        assert X.dtype.hasobject is False
        np.testing.assert_allclose(
            booster.inplace_predict(X_obj), booster.inplace_predict(X)
        )

        np.testing.assert_allclose(predt_from_dmatrix, predt_from_array)

        predt_from_array = booster.inplace_predict(
            X[:10, ...], iteration_range=(0, 4), missing=self.missing
        )
        predt_from_dmatrix = booster.predict(test, ntree_limit=4)

        np.testing.assert_allclose(predt_from_dmatrix, predt_from_array)

        with pytest.raises(ValueError):
            booster.predict(test, ntree_limit=booster.best_ntree_limit + 1)
        with pytest.raises(ValueError):
            booster.predict(test, iteration_range=(0, booster.best_iteration + 2))

        default = booster.predict(test)

        range_full = booster.predict(test, iteration_range=(0, self.num_boost_round))
        ntree_full = booster.predict(test, ntree_limit=self.num_boost_round)
        np.testing.assert_allclose(range_full, default)
        np.testing.assert_allclose(ntree_full, default)

        range_full = booster.predict(
            test, iteration_range=(0, booster.best_iteration + 1)
        )
        ntree_full = booster.predict(test, ntree_limit=booster.best_ntree_limit)
        np.testing.assert_allclose(range_full, default)
        np.testing.assert_allclose(ntree_full, default)
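
        # Note: ntree_limit=n and iteration_range=(0, n) select the same trees;
        # iteration_range is the newer interface that supersedes ntree_limit.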

        def predict_dense(x):
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = booster.predict(d)
            return np.all(copied_predt == inplace_predt)

        for i in range(10):
            run_threaded_predict(X, self.rows, predict_dense)

        def predict_csr(x):
            inplace_predt = booster.inplace_predict(sparse.csr_matrix(x))
            d = xgb.DMatrix(x)
            copied_predt = booster.predict(d)
            return np.all(copied_predt == inplace_predt)

        for i in range(10):
            run_threaded_predict(X, self.rows, predict_csr)

    @pytest.mark.skipif(**tm.no_pandas())
    def test_predict_pd(self):
        X = self.X
        # construct it in column major style
        df = pd.DataFrame({str(i): X[:, i] for i in range(X.shape[1])})
        booster = self.booster
        df_predt = booster.inplace_predict(df)
        arr_predt = booster.inplace_predict(X)
        dmat_predt = booster.predict(xgb.DMatrix(X))

        X = df.values
        X = np.asfortranarray(X)
        fort_predt = booster.inplace_predict(X)

        np.testing.assert_allclose(dmat_predt, arr_predt)
        np.testing.assert_allclose(df_predt, arr_predt)
        np.testing.assert_allclose(fort_predt, arr_predt)

    def test_base_margin(self):
        booster = self.booster

        base_margin = self.rng.randn(self.rows)
        from_inplace = booster.inplace_predict(data=self.X, base_margin=base_margin)

        dtrain = xgb.DMatrix(self.X, self.y, base_margin=base_margin)
        from_dmatrix = booster.predict(dtrain)
        np.testing.assert_allclose(from_dmatrix, from_inplace)

    def test_dtypes(self):
        orig = self.rng.randint(low=0, high=127, size=self.rows * self.cols).reshape(
            self.rows, self.cols
        )
        predt_orig = self.booster.inplace_predict(orig)
        # all primitive types in numpy
        for dtype in [
            np.signedinteger,
            np.byte,
            np.short,
            np.intc,
            np.int_,
            np.longlong,
            np.unsignedinteger,
            np.ubyte,
            np.ushort,
            np.uintc,
            np.uint,
            np.ulonglong,
            np.floating,
            np.half,
            np.single,
            np.double,
        ]:
            X = np.array(orig, dtype=dtype)
            predt = self.booster.inplace_predict(X)
            np.testing.assert_allclose(predt, predt_orig)

        # boolean
        orig = self.rng.binomial(1, 0.5, size=self.rows * self.cols).reshape(
            self.rows, self.cols
        )
        predt_orig = self.booster.inplace_predict(orig)
        for dtype in [np.bool8, np.bool_]:
            X = np.array(orig, dtype=dtype)
            predt = self.booster.inplace_predict(X)
            np.testing.assert_allclose(predt, predt_orig)

        # unsupported types
        for dtype in [
            np.string_,
            np.complex64,
            np.complex128,
        ]:
            X = np.array(orig, dtype=dtype)
            with pytest.raises(ValueError):
                self.booster.inplace_predict(X)
class TestGPUPredict:
    def test_predict(self):
        iterations = 10
        np.random.seed(1)
        test_num_rows = [10, 1000, 5000]
        test_num_cols = [10, 50, 500]
        # This test passes for tree_method=gpu_hist and tree_method=exact, but
        # for `hist` and `approx` the floating point error accumulates faster
        # and the test fails even with tol set to 1e-4.  For `hist`, the
        # mismatch rate with 5000 rows is 0.04.
        for num_rows in test_num_rows:
            for num_cols in test_num_cols:
                dtrain = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                     label=[0, 1] * int(num_rows / 2))
                dval = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                   label=[0, 1] * int(num_rows / 2))
                dtest = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                    label=[0, 1] * int(num_rows / 2))
                watchlist = [(dtrain, 'train'), (dval, 'validation')]
                res = {}
                param = {
                    "objective": "binary:logistic",
                    "predictor": "gpu_predictor",
                    'eval_metric': 'logloss',
                    'tree_method': 'gpu_hist',
                    'max_depth': 1
                }
                bst = xgb.train(param,
                                dtrain,
                                iterations,
                                evals=watchlist,
                                evals_result=res)
                assert self.non_increasing(res["train"]["logloss"])
                gpu_pred_train = bst.predict(dtrain, output_margin=True)
                gpu_pred_test = bst.predict(dtest, output_margin=True)
                gpu_pred_val = bst.predict(dval, output_margin=True)

                param["predictor"] = "cpu_predictor"
                bst_cpu = xgb.train(param, dtrain, iterations, evals=watchlist)
                cpu_pred_train = bst_cpu.predict(dtrain, output_margin=True)
                cpu_pred_test = bst_cpu.predict(dtest, output_margin=True)
                cpu_pred_val = bst_cpu.predict(dval, output_margin=True)

                np.testing.assert_allclose(cpu_pred_train,
                                           gpu_pred_train,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_val,
                                           gpu_pred_val,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_test,
                                           gpu_pred_test,
                                           rtol=1e-6)

    def non_increasing(self, L):
        # allow a small tolerance for floating point noise between rounds
        return all((y - x) < 0.001 for x, y in zip(L, L[1:]))

    # Test case for a bug where multiple batch predictions made on a
    # test set produce incorrect results
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_multi_predict(self):
        from sklearn.datasets import make_regression
        from sklearn.model_selection import train_test_split

        n = 1000
        X, y = make_regression(n, random_state=rng)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            random_state=123)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test)

        params = {}
        params["tree_method"] = "gpu_hist"

        params['predictor'] = "gpu_predictor"
        bst_gpu_predict = xgb.train(params, dtrain)

        params['predictor'] = "cpu_predictor"
        bst_cpu_predict = xgb.train(params, dtrain)

        predict0 = bst_gpu_predict.predict(dtest)
        predict1 = bst_gpu_predict.predict(dtest)
        cpu_predict = bst_cpu_predict.predict(dtest)

        assert np.allclose(predict0, predict1)
        assert np.allclose(predict0, cpu_predict)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_sklearn(self):
        m, n = 15000, 14
        tr_size = 2500
        X = np.random.rand(m, n)
        y = 200 * np.matmul(X, np.arange(-3, -3 + n))
        X_train, y_train = X[:tr_size, :], y[:tr_size]
        X_test, y_test = X[tr_size:, :], y[tr_size:]

        # First with cpu_predictor
        params = {
            'tree_method': 'gpu_hist',
            'predictor': 'cpu_predictor',
            'n_jobs': -1,
            'seed': 123
        }
        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        cpu_train_score = m.score(X_train, y_train)
        cpu_test_score = m.score(X_test, y_test)

        # Now with gpu_predictor
        params['predictor'] = 'gpu_predictor'

        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        gpu_train_score = m.score(X_train, y_train)
        gpu_test_score = m.score(X_test, y_test)

        assert np.allclose(cpu_train_score, gpu_train_score)
        assert np.allclose(cpu_test_score, gpu_test_score)

    def run_inplace_base_margin(self, booster, dtrain, X, base_margin):
        import cupy as cp
        dtrain.set_info(base_margin=base_margin)
        from_inplace = booster.inplace_predict(data=X, base_margin=base_margin)
        from_dmatrix = booster.predict(dtrain)
        cp.testing.assert_allclose(from_inplace, from_dmatrix)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_inplace_predict_cupy(self):
        import cupy as cp
        cp.cuda.runtime.setDevice(0)
        rows = 1000
        cols = 10
        missing = 11  # set to integer for testing

        cp_rng = cp.random.RandomState(1994)
        cp.random.set_random_state(cp_rng)

        X = cp.random.randn(rows, cols)
        missing_idx = [i for i in range(0, cols, 4)]
        X[:, missing_idx] = missing  # set to be missing
        y = cp.random.randn(rows)

        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'},
                            dtrain,
                            num_boost_round=10)

        test = xgb.DMatrix(X[:10, ...], missing=missing)
        predt_from_array = booster.inplace_predict(X[:10, ...],
                                                   missing=missing)
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_dense(x):
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        # Don't do this on Windows, see issue #5793
        if sys.platform.startswith("win"):
            pytest.skip(
                'Multi-threaded in-place prediction with cuPy is not working on Windows'
            )
        for i in range(10):
            run_threaded_predict(X, rows, predict_dense)

        base_margin = cp_rng.randn(rows)
        self.run_inplace_base_margin(booster, dtrain, X, base_margin)

        # Create a wide dataset
        X = cp_rng.randn(100, 10000)
        y = cp_rng.randn(100)

        missing_idx = [i for i in range(0, X.shape[1], 16)]
        X[:, missing_idx] = missing
        reg = xgb.XGBRegressor(tree_method="gpu_hist",
                               n_estimators=8,
                               missing=missing)
        reg.fit(X, y)

        gpu_predt = reg.predict(X)
        reg.set_params(predictor="cpu_predictor")
        cpu_predt = reg.predict(X)
        np.testing.assert_allclose(gpu_predt, cpu_predt, atol=1e-6)

    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.skipif(**tm.no_cudf())
    def test_inplace_predict_cudf(self):
        import cupy as cp
        import cudf
        import pandas as pd
        rows = 1000
        cols = 10
        rng = np.random.RandomState(1994)
        cp.cuda.runtime.setDevice(0)
        X = rng.randn(rows, cols)
        X = pd.DataFrame(X)
        y = rng.randn(rows)
        X = cudf.from_pandas(X)

        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'},
                            dtrain,
                            num_boost_round=10)
        test = xgb.DMatrix(X)
        predt_from_array = booster.inplace_predict(X)
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_df(x):
            # column major array
            inplace_predt = booster.inplace_predict(x.values)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            assert cp.all(copied_predt == inplace_predt)

            inplace_predt = booster.inplace_predict(x)
            return cp.all(copied_predt == inplace_predt)

        for i in range(10):
            run_threaded_predict(X, rows, predict_df)

        base_margin = cudf.Series(rng.randn(rows))
        self.run_inplace_base_margin(booster, dtrain, X, base_margin)

    @given(strategies.integers(1, 10), tm.dataset_strategy,
           shap_parameter_strategy)
    @settings(deadline=None, print_blob=True)
    def test_shap(self, num_rounds, dataset, param):
        if dataset.name.endswith("-l1"):  # not supported by the exact tree method
            return
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w,
                                dataset.margin)
        shap = bst.predict(test_dmat, pred_contribs=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assume(len(dataset.y) > 0)
        assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1), margin,
                           1e-3, 1e-3)
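
    # The assertion above exercises SHAP's local accuracy: for each row, the
    # pred_contribs columns (including the bias term in the last column) sum
    # to the raw margin prediction.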

    @given(strategies.integers(1, 10), tm.dataset_strategy,
           shap_parameter_strategy)
    @settings(deadline=None, max_examples=20, print_blob=True)
    def test_shap_interactions(self, num_rounds, dataset, param):
        if dataset.name.endswith("-l1"):  # not supported by the exact tree method
            return
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w,
                                dataset.margin)
        shap = bst.predict(test_dmat, pred_interactions=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assume(len(dataset.y) > 0)
        assert np.allclose(
            np.sum(shap, axis=(len(shap.shape) - 1, len(shap.shape) - 2)),
            margin, 1e-3, 1e-3)

    def test_shap_categorical(self):
        X, y = tm.make_categorical(100, 20, 7, False)
        Xy = xgb.DMatrix(X, y, enable_categorical=True)
        booster = xgb.train({"tree_method": "gpu_hist"},
                            Xy,
                            num_boost_round=10)

        booster.set_param({"predictor": "gpu_predictor"})
        shap = booster.predict(Xy, pred_contribs=True)
        margin = booster.predict(Xy, output_margin=True)
        np.testing.assert_allclose(np.sum(shap, axis=len(shap.shape) - 1),
                                   margin,
                                   rtol=1e-3)

        booster.set_param({"predictor": "cpu_predictor"})
        shap = booster.predict(Xy, pred_contribs=True)
        margin = booster.predict(Xy, output_margin=True)
        np.testing.assert_allclose(np.sum(shap, axis=len(shap.shape) - 1),
                                   margin,
                                   rtol=1e-3)

    def test_predict_leaf_basic(self):
        gpu_leaf = run_predict_leaf('gpu_predictor')
        cpu_leaf = run_predict_leaf('cpu_predictor')
        np.testing.assert_equal(gpu_leaf, cpu_leaf)

    def run_predict_leaf_booster(self, param, num_rounds, dataset):
        param = dataset.set_params(param)
        m = dataset.get_dmat()
        booster = xgb.train(param,
                            dtrain=dataset.get_dmat(),
                            num_boost_round=num_rounds)
        booster.set_param({'predictor': 'cpu_predictor'})
        cpu_leaf = booster.predict(m, pred_leaf=True)

        booster.set_param({'predictor': 'gpu_predictor'})
        gpu_leaf = booster.predict(m, pred_leaf=True)

        np.testing.assert_equal(cpu_leaf, gpu_leaf)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None, print_blob=True)
    def test_predict_leaf_gbtree(self, param, dataset):
        param['booster'] = 'gbtree'
        param['tree_method'] = 'gpu_hist'
        self.run_predict_leaf_booster(param, 10, dataset)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None, print_blob=True)
    def test_predict_leaf_dart(self, param, dataset):
        param['booster'] = 'dart'
        param['tree_method'] = 'gpu_hist'
        self.run_predict_leaf_booster(param, 10, dataset)

    @pytest.mark.skipif(**tm.no_sklearn())
    @pytest.mark.skipif(**tm.no_pandas())
    @given(df=data_frames([
        column('x0', elements=strategies.integers(min_value=0, max_value=3)),
        column('x1', elements=strategies.integers(min_value=0, max_value=5))
    ],
                          index=range_indexes(min_size=20, max_size=50)))
    @settings(deadline=None, print_blob=True)
    def test_predict_categorical_split(self, df):
        from sklearn.metrics import mean_squared_error

        df = df.astype('category')
        x0, x1 = df['x0'].to_numpy(), df['x1'].to_numpy()
        y = (x0 * 10 - 20) + (x1 - 2)
        dtrain = xgb.DMatrix(df, label=y, enable_categorical=True)

        params = {
            'tree_method': 'gpu_hist',
            'predictor': 'gpu_predictor',
            'max_depth': 3,
            'learning_rate': 1.0,
            'base_score': 0.0,
            'eval_metric': 'rmse'
        }

        eval_history = {}
        bst = xgb.train(params,
                        dtrain,
                        num_boost_round=5,
                        evals=[(dtrain, 'train')],
                        verbose_eval=False,
                        evals_result=eval_history)

        pred = bst.predict(dtrain)
        rmse = mean_squared_error(y_true=y, y_pred=pred, squared=False)
        np.testing.assert_almost_equal(rmse,
                                       eval_history['train']['rmse'][-1],
                                       decimal=5)

    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.parametrize("n_classes", [2, 3])
    def test_predict_dart(self, n_classes):
        from sklearn.datasets import make_classification
        import cupy as cp
        n_samples = 1000
        X_, y_ = make_classification(n_samples=n_samples,
                                     n_informative=5,
                                     n_classes=n_classes)
        X, y = cp.array(X_), cp.array(y_)

        Xy = xgb.DMatrix(X, y)
        if n_classes == 2:
            params = {
                "tree_method": "gpu_hist",
                "booster": "dart",
                "rate_drop": 0.5,
                "objective": "binary:logistic"
            }
        else:
            params = {
                "tree_method": "gpu_hist",
                "booster": "dart",
                "rate_drop": 0.5,
                "objective": "multi:softprob",
                "num_class": n_classes
            }

        booster = xgb.train(params, Xy, num_boost_round=32)
        # predictor=auto
        inplace = booster.inplace_predict(X)
        copied = booster.predict(Xy)
        cpu_inplace = booster.inplace_predict(X_)
        booster.set_param({"predictor": "cpu_predictor"})
        cpu_copied = booster.predict(Xy)

        copied = cp.array(copied)
        cp.testing.assert_allclose(cpu_inplace, copied, atol=1e-6)
        cp.testing.assert_allclose(cpu_copied, copied, atol=1e-6)
        cp.testing.assert_allclose(inplace, copied, atol=1e-6)

        booster.set_param({"predictor": "gpu_predictor"})
        inplace = booster.inplace_predict(X)
        copied = booster.predict(Xy)

        copied = cp.array(copied)
        cp.testing.assert_allclose(inplace, copied, atol=1e-6)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_dtypes(self):
        import cupy as cp
        rows = 1000
        cols = 10
        rng = cp.random.RandomState(1994)
        orig = rng.randint(low=0, high=127,
                           size=rows * cols).reshape(rows, cols)
        y = rng.randint(low=0, high=127, size=rows)
        dtrain = xgb.DMatrix(orig, label=y)
        booster = xgb.train({"tree_method": "gpu_hist"}, dtrain)

        predt_orig = booster.inplace_predict(orig)
        # all primitive types in numpy
        for dtype in [
                cp.signedinteger,
                cp.byte,
                cp.short,
                cp.intc,
                cp.int_,
                cp.longlong,
                cp.unsignedinteger,
                cp.ubyte,
                cp.ushort,
                cp.uintc,
                cp.uint,
                cp.ulonglong,
                cp.floating,
                cp.half,
                cp.single,
                cp.double,
        ]:
            X = cp.array(orig, dtype=dtype)
            predt = booster.inplace_predict(X)
            cp.testing.assert_allclose(predt, predt_orig)

        # boolean
        orig = cp.random.binomial(1, 0.5, size=rows * cols).reshape(rows, cols)
        predt_orig = booster.inplace_predict(orig)
        for dtype in [cp.bool8, cp.bool_]:
            X = cp.array(orig, dtype=dtype)
            predt = booster.inplace_predict(X)
            cp.testing.assert_allclose(predt, predt_orig)

        # unsupported types
        for dtype in [
                cp.complex64,
                cp.complex128,
        ]:
            X = cp.array(orig, dtype=dtype)
            with pytest.raises(ValueError):
                booster.inplace_predict(X)
Example #6
class TestTreeMethod:
    @given(exact_parameter_strategy, strategies.integers(1, 20),
           tm.dataset_strategy)
    @settings(deadline=None)
    def test_exact(self, param, num_rounds, dataset):
        param['tree_method'] = 'exact'
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        assert tm.non_increasing(result['train'][dataset.metric])

    @given(
        exact_parameter_strategy,
        hist_parameter_strategy,
        strategies.integers(1, 20),
        tm.dataset_strategy,
    )
    @settings(deadline=None)
    def test_approx(self, param, hist_param, num_rounds, dataset):
        param["tree_method"] = "approx"
        param = dataset.set_params(param)
        param.update(hist_param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result["train"][dataset.metric])

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_pruner(self):
        import sklearn
        params = {'tree_method': 'exact'}
        cancer = sklearn.datasets.load_breast_cancer()
        X = cancer['data']
        y = cancer["target"]

        dtrain = xgb.DMatrix(X, y)
        booster = xgb.train(params, dtrain=dtrain, num_boost_round=10)
        grown = str(booster.get_dump())

        params = {'updater': 'prune', 'process_type': 'update', 'gamma': '0.2'}
        booster = xgb.train(params,
                            dtrain=dtrain,
                            num_boost_round=10,
                            xgb_model=booster)
        after_prune = str(booster.get_dump())
        assert grown != after_prune

        booster = xgb.train(params,
                            dtrain=dtrain,
                            num_boost_round=10,
                            xgb_model=booster)
        second_prune = str(booster.get_dump())
        # Second prune should not change the tree
        assert after_prune == second_prune
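
    # For intuition (a hedged note, not part of the test): with
    # process_type='update' and updater='prune', existing trees are re-walked
    # and any split whose loss reduction falls below gamma is collapsed into a
    # leaf; a second pass then finds nothing left to prune.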

    @given(exact_parameter_strategy, hist_parameter_strategy,
           strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_hist(self, param, hist_param, num_rounds, dataset):
        param['tree_method'] = 'hist'
        param = dataset.set_params(param)
        param.update(hist_param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result['train'][dataset.metric])

    def test_hist_categorical(self):
        # hist must be the same as exact on all-categorical data
        dpath = 'demo/data/'
        ag_dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        ag_dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
        ag_param = {
            'max_depth': 2,
            'tree_method': 'hist',
            'eta': 1,
            'verbosity': 0,
            'objective': 'binary:logistic',
            'eval_metric': 'auc'
        }
        hist_res = {}
        exact_res = {}

        xgb.train(ag_param,
                  ag_dtrain,
                  10, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
                  evals_result=hist_res)
        ag_param["tree_method"] = "exact"
        xgb.train(ag_param,
                  ag_dtrain,
                  10, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
                  evals_result=exact_res)
        assert hist_res['train']['auc'] == exact_res['train']['auc']
        assert hist_res['test']['auc'] == exact_res['test']['auc']

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_hist_degenerate_case(self):
        # Test a degenerate case where the quantile sketcher won't return any
        # quantile points for a particular feature (the second feature in
        # this example). Source: https://github.com/dmlc/xgboost/issues/2943
        nan = np.nan
        param = {'missing': nan, 'tree_method': 'hist'}
        model = xgb.XGBRegressor(**param)
        X = np.array([[6.18827160e+05, 1.73000000e+02], [6.37345679e+05, nan],
                      [6.38888889e+05, nan], [6.28086420e+05, nan]])
        y = [1000000., 0., 0., 500000.]
        w = [0, 0, 1, 0]
        model.fit(X, y, sample_weight=w)
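        # Only the third row carries a nonzero weight, and its second feature
        # is nan, so the weighted sketch sees no values for that feature.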

    def run_invalid_category(self, tree_method: str) -> None:
        rng = np.random.default_rng()
        # too large
        X = rng.integers(low=0, high=4, size=1000).reshape(100, 10)
        y = rng.normal(loc=0, scale=1, size=100)
        X[13, 7] = np.iinfo(np.int32).max + 1

        # Check is performed during sketching.
        Xy = xgb.DMatrix(X, y, feature_types=["c"] * 10)
        with pytest.raises(ValueError):
            xgb.train({"tree_method": tree_method}, Xy)

        # 16777216 == 2**24; beyond this, consecutive integers are no longer
        # all exactly representable in float32
        X[13, 7] = 16777216
        Xy = xgb.DMatrix(X, y, feature_types=["c"] * 10)
        with pytest.raises(ValueError):
            xgb.train({"tree_method": tree_method}, Xy)

        # mixed positive and negative values
        X = rng.normal(loc=0, scale=1, size=1000).reshape(100, 10)
        y = rng.normal(loc=0, scale=1, size=100)

        Xy = xgb.DMatrix(X, y, feature_types=["c"] * 10)
        with pytest.raises(ValueError):
            xgb.train({"tree_method": tree_method}, Xy)

        if tree_method == "gpu_hist":
            import cupy as cp

            X, y = cp.array(X), cp.array(y)
            with pytest.raises(ValueError):
                Xy = xgb.DeviceQuantileDMatrix(X, y, feature_types=["c"] * 10)

    def test_invalid_category(self) -> None:
        self.run_invalid_category("approx")

    def run_categorical_basic(self, rows, cols, rounds, cats, tree_method):
        onehot, label = tm.make_categorical(rows, cols, cats, True)
        cat, _ = tm.make_categorical(rows, cols, cats, False)

        by_etl_results = {}
        by_builtin_results = {}

        predictor = "gpu_predictor" if tree_method == "gpu_hist" else None
        # Use one-hot exclusively
        parameters = {
            "tree_method": tree_method,
            "predictor": predictor,
            "max_cat_to_onehot": 9999
        }

        m = xgb.DMatrix(onehot, label, enable_categorical=False)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=by_etl_results,
        )

        m = xgb.DMatrix(cat, label, enable_categorical=True)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=by_builtin_results,
        )

        # There are guidelines on how to specify tolerance by treating the output as
        # a random variable, but tree construction here is extremely sensitive to
        # floating point error: a 1e-5 difference in a histogram bin can lead to an
        # entirely different tree. So even though the test is quite lenient,
        # hypothesis can still pick up falsifying examples from time to time.
        np.testing.assert_allclose(
            np.array(by_etl_results["Train"]["rmse"]),
            np.array(by_builtin_results["Train"]["rmse"]),
            rtol=1e-3,
        )
        assert tm.non_increasing(by_builtin_results["Train"]["rmse"])

        by_grouping: xgb.callback.TrainingCallback.EvalsLog = {}
        parameters["max_cat_to_onehot"] = 1
        parameters["reg_lambda"] = 0
        m = xgb.DMatrix(cat, label, enable_categorical=True)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=by_grouping,
        )
        rmse_oh = by_builtin_results["Train"]["rmse"]
        rmse_group = by_grouping["Train"]["rmse"]
        # Partition-based grouping can express any one-hot split, so without
        # regularization it is always better than or equal to one-hot.
        for a, b in zip(rmse_oh, rmse_group):
            assert a >= b

        parameters["reg_lambda"] = 1.0
        by_grouping = {}
        xgb.train(
            parameters,
            m,
            num_boost_round=32,
            evals=[(m, "Train")],
            evals_result=by_grouping,
        )
        assert tm.non_increasing(by_grouping["Train"]["rmse"]), by_grouping

    @given(strategies.integers(10, 400), strategies.integers(3, 8),
           strategies.integers(1, 2), strategies.integers(4, 7))
    @settings(deadline=None)
    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical(self, rows, cols, rounds, cats):
        self.run_categorical_basic(rows, cols, rounds, cats, "approx")
        self.run_categorical_basic(rows, cols, rounds, cats, "hist")
Example #7
# -*- coding: utf-8 -*-
import numpy as np
import xgboost as xgb
import testing as tm
import unittest
import pytest

try:
    import pandas as pd
except ImportError:
    pass


pytestmark = pytest.mark.skipif(**tm.no_pandas())


dpath = 'demo/data/'
rng = np.random.RandomState(1994)


class TestPandas(unittest.TestCase):

    def test_pandas(self):

        df = pd.DataFrame([[1, 2., True], [2, 3., False]],
                          columns=['a', 'b', 'c'])
        dm = xgb.DMatrix(df, label=pd.Series([1, 2]))
        assert dm.feature_names == ['a', 'b', 'c']
        assert dm.feature_types == ['int', 'float', 'i']
        assert dm.num_row() == 2
        assert dm.num_col() == 3
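
    # A quick illustrative check of the same dtype mapping with another frame
    # (the 'int'/'float'/'i' type names come from the assertions above):
    def test_pandas_dtype_mapping(self):
        df = pd.DataFrame({'x': pd.Series([1, 2], dtype='int64'),
                           'y': pd.Series([0.5, 1.5], dtype='float64')})
        dm = xgb.DMatrix(df)
        assert dm.feature_types == ['int', 'float']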
Example #8
class TestGPUUpdaters:
    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_gpu_hist(self, param, num_rounds, dataset):
        param['tree_method'] = 'gpu_hist'
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result['train'][dataset.metric])

    def run_categorical_basic(self, rows, cols, rounds, cats):
        import pandas as pd
        rng = np.random.RandomState(1994)

        pd_dict = {}
        for i in range(cols):
            c = rng.randint(low=0, high=cats + 1, size=rows)
            pd_dict[str(i)] = pd.Series(c, dtype=np.int64)

        df = pd.DataFrame(pd_dict)
        label = df.iloc[:, 0]
        for i in range(0, cols - 1):
            label += df.iloc[:, i]
        label += 1
        df = df.astype('category')
        onehot = pd.get_dummies(df)
        cat = df

        by_etl_results = {}
        by_builtin_results = {}

        parameters = {'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor'}

        m = xgb.DMatrix(onehot, label, enable_categorical=False)
        xgb.train(parameters,
                  m,
                  num_boost_round=rounds,
                  evals=[(m, 'Train')],
                  evals_result=by_etl_results)

        m = xgb.DMatrix(cat, label, enable_categorical=True)
        xgb.train(parameters,
                  m,
                  num_boost_round=rounds,
                  evals=[(m, 'Train')],
                  evals_result=by_builtin_results)
        np.testing.assert_allclose(np.array(by_etl_results['Train']['rmse']),
                                   np.array(
                                       by_builtin_results['Train']['rmse']),
                                   rtol=1e-3)
        assert tm.non_increasing(by_builtin_results['Train']['rmse'])

    @given(strategies.integers(10, 400), strategies.integers(3, 8),
           strategies.integers(1, 5), strategies.integers(4, 7))
    @settings(deadline=None)
    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical(self, rows, cols, rounds, cats):
        pytest.xfail(reason='TestGPUUpdaters::test_categorical is flaky')
        self.run_categorical_basic(rows, cols, rounds, cats)

    def test_categorical_32_cat(self):
        '''32 hits the bound of the integer bitset, so it gets a special test.'''
        rows = 1000
        cols = 10
        cats = 32
        rounds = 4
        self.run_categorical_basic(rows, cols, rounds, cats)

    @pytest.mark.skipif(**tm.no_cupy())
    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_gpu_hist_device_dmatrix(self, param, num_rounds, dataset):
        # We cannot handle an empty dataset yet
        assume(len(dataset.y) > 0)
        param['tree_method'] = 'gpu_hist'
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_device_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result['train'][dataset.metric])

    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_external_memory(self, param, num_rounds, dataset):
        pytest.xfail(reason='TestGPUUpdaters::test_external_memory is flaky')
        # We cannot handle an empty dataset yet
        assume(len(dataset.y) > 0)
        param['tree_method'] = 'gpu_hist'
        param = dataset.set_params(param)
        m = dataset.get_external_dmat()
        external_result = train_result(param, m, num_rounds)
        del m
        gc.collect()
        assert tm.non_increasing(external_result['train'][dataset.metric])

    def test_empty_dmatrix_prediction(self):
        # FIXME(trivialfis): This should be done with all updaters
        kRows = 0
        kCols = 100

        X = np.empty((kRows, kCols))
        y = np.empty((kRows))

        dtrain = xgb.DMatrix(X, y)

        bst = xgb.train(
            {
                'verbosity': 2,
                'tree_method': 'gpu_hist',
                'gpu_id': 0
            },
            dtrain,
            verbose_eval=True,
            num_boost_round=6,
            evals=[(dtrain, 'Train')])

        kRows = 100
        X = np.random.randn(kRows, kCols)

        dtest = xgb.DMatrix(X)
        predictions = bst.predict(dtest)
        # With no training rows the model learns no trees, so every prediction
        # falls back to the default base_score of 0.5.
        np.testing.assert_allclose(predictions, 0.5, 1e-6)

    @pytest.mark.mgpu
    @given(tm.dataset_strategy, strategies.integers(0, 10))
    @settings(deadline=None, max_examples=10)
    def test_specified_gpu_id_gpu_update(self, dataset, gpu_id):
        param = {'tree_method': 'gpu_hist', 'gpu_id': gpu_id}
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), 10)
        assert tm.non_increasing(result['train'][dataset.metric])
Example #9
class TestTreeMethod:
    USE_ONEHOT = np.iinfo(np.int32).max
    USE_PART = 1

    @given(exact_parameter_strategy, strategies.integers(1, 20),
           tm.dataset_strategy)
    @settings(deadline=None, print_blob=True)
    def test_exact(self, param, num_rounds, dataset):
        if dataset.name.endswith("-l1"):
            return
        param['tree_method'] = 'exact'
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        assert tm.non_increasing(result['train'][dataset.metric])

    @given(
        exact_parameter_strategy,
        hist_parameter_strategy,
        strategies.integers(1, 20),
        tm.dataset_strategy,
    )
    @settings(deadline=None, print_blob=True)
    def test_approx(self, param, hist_param, num_rounds, dataset):
        param["tree_method"] = "approx"
        param = dataset.set_params(param)
        param.update(hist_param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result["train"][dataset.metric])

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_pruner(self):
        import sklearn
        params = {'tree_method': 'exact'}
        cancer = sklearn.datasets.load_breast_cancer()
        X = cancer['data']
        y = cancer["target"]

        dtrain = xgb.DMatrix(X, y)
        booster = xgb.train(params, dtrain=dtrain, num_boost_round=10)
        grown = str(booster.get_dump())

        params = {'updater': 'prune', 'process_type': 'update', 'gamma': '0.2'}
        booster = xgb.train(params,
                            dtrain=dtrain,
                            num_boost_round=10,
                            xgb_model=booster)
        after_prune = str(booster.get_dump())
        assert grown != after_prune

        booster = xgb.train(params,
                            dtrain=dtrain,
                            num_boost_round=10,
                            xgb_model=booster)
        second_prune = str(booster.get_dump())
        # Second prune should not change the tree
        assert after_prune == second_prune

    @given(exact_parameter_strategy, hist_parameter_strategy,
           strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None, print_blob=True)
    def test_hist(self, param, hist_param, num_rounds, dataset):
        param['tree_method'] = 'hist'
        param = dataset.set_params(param)
        param.update(hist_param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result['train'][dataset.metric])

    @given(tm.sparse_datasets_strategy)
    @settings(deadline=None, print_blob=True)
    def test_sparse(self, dataset):
        param = {"tree_method": "hist", "max_bin": 64}
        hist_result = train_result(param, dataset.get_dmat(), 16)
        note(hist_result)
        assert tm.non_increasing(hist_result['train'][dataset.metric])

        param = {"tree_method": "approx", "max_bin": 64}
        approx_result = train_result(param, dataset.get_dmat(), 16)
        note(approx_result)
        assert tm.non_increasing(approx_result['train'][dataset.metric])

        np.testing.assert_allclose(hist_result["train"]["rmse"],
                                   approx_result["train"]["rmse"])

    def test_hist_categorical(self):
        # hist must give the same result as exact on all-categorical data
        dpath = 'demo/data/'
        ag_dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        ag_dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
        ag_param = {
            'max_depth': 2,
            'tree_method': 'hist',
            'eta': 1,
            'verbosity': 0,
            'objective': 'binary:logistic',
            'eval_metric': 'auc'
        }
        hist_res = {}
        exact_res = {}

        xgb.train(ag_param,
                  ag_dtrain,
                  10, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
                  evals_result=hist_res)
        ag_param["tree_method"] = "exact"
        xgb.train(ag_param,
                  ag_dtrain,
                  10, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
                  evals_result=exact_res)
        assert hist_res['train']['auc'] == exact_res['train']['auc']
        assert hist_res['test']['auc'] == exact_res['test']['auc']

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_hist_degenerate_case(self):
        # Test a degenerate case where the quantile sketcher won't return any
        # quantile points for a particular feature (the second feature in
        # this example). Source: https://github.com/dmlc/xgboost/issues/2943
        nan = np.nan
        param = {'missing': nan, 'tree_method': 'hist'}
        model = xgb.XGBRegressor(**param)
        X = np.array([[6.18827160e+05, 1.73000000e+02], [6.37345679e+05, nan],
                      [6.38888889e+05, nan], [6.28086420e+05, nan]])
        y = [1000000., 0., 0., 500000.]
        w = [0, 0, 1, 0]
        model.fit(X, y, sample_weight=w)

    def run_invalid_category(self, tree_method: str) -> None:
        rng = np.random.default_rng()
        # too large
        X = rng.integers(low=0, high=4, size=1000).reshape(100, 10)
        y = rng.normal(loc=0, scale=1, size=100)
        X[13, 7] = np.iinfo(np.int32).max + 1

        # Check is performed during sketching.
        Xy = xgb.DMatrix(X, y, feature_types=["c"] * 10)
        with pytest.raises(ValueError):
            xgb.train({"tree_method": tree_method}, Xy)

        X[13, 7] = 16777216
        Xy = xgb.DMatrix(X, y, feature_types=["c"] * 10)
        with pytest.raises(ValueError):
            xgb.train({"tree_method": tree_method}, Xy)

        # mixed positive and negative values
        X = rng.normal(loc=0, scale=1, size=1000).reshape(100, 10)
        y = rng.normal(loc=0, scale=1, size=100)

        Xy = xgb.DMatrix(X, y, feature_types=["c"] * 10)
        with pytest.raises(ValueError):
            xgb.train({"tree_method": tree_method}, Xy)

        if tree_method == "gpu_hist":
            import cupy as cp

            X, y = cp.array(X), cp.array(y)
            with pytest.raises(ValueError):
                Xy = xgb.DeviceQuantileDMatrix(X, y, feature_types=["c"] * 10)

    def test_invalid_category(self) -> None:
        self.run_invalid_category("approx")
        self.run_invalid_category("hist")

    def run_max_cat(self, tree_method: str) -> None:
        """Test data with size smaller than number of categories."""
        import pandas as pd

        rng = np.random.default_rng(0)
        n_cat = 100
        n = 5

        X = pd.Series(
            [
                "".join(rng.choice(list(ascii_lowercase), size=3))
                for i in range(n_cat)
            ],
            dtype="category",
        )[:n].to_frame()

        reg = xgb.XGBRegressor(
            enable_categorical=True,
            tree_method=tree_method,
            n_estimators=10,
        )
        y = pd.Series(range(n))
        reg.fit(X=X, y=y, eval_set=[(X, y)])
        assert tm.non_increasing(reg.evals_result()["validation_0"]["rmse"])

    @pytest.mark.parametrize("tree_method", ["hist", "approx"])
    @pytest.mark.skipif(**tm.no_pandas())
    def test_max_cat(self, tree_method) -> None:
        self.run_max_cat(tree_method)

    def run_categorical_missing(self, rows: int, cols: int, cats: int,
                                tree_method: str) -> None:
        parameters: Dict[str, Any] = {"tree_method": tree_method}
        cat, label = tm.make_categorical(n_samples=rows,
                                         n_features=cols,
                                         n_categories=cats,
                                         onehot=False,
                                         sparsity=0.5)
        Xy = xgb.DMatrix(cat, label, enable_categorical=True)

        def run(max_cat_to_onehot: int):
            # Train with the given one-hot threshold.
            parameters["max_cat_to_onehot"] = max_cat_to_onehot

            evals_result: Dict[str, Dict] = {}
            booster = xgb.train(parameters,
                                Xy,
                                num_boost_round=16,
                                evals=[(Xy, "Train")],
                                evals_result=evals_result)
            assert tm.non_increasing(evals_result["Train"]["rmse"])
            y_predt = booster.predict(Xy)

            rmse = tm.root_mean_square(label, y_predt)
            np.testing.assert_allclose(rmse, evals_result["Train"]["rmse"][-1])

        # Test with OHE split
        run(self.USE_ONEHOT)

        if tree_method == "gpu_hist":  # fixme: Test with GPU.
            return

        # Test with partition-based split
        run(self.USE_PART)

    def run_categorical_ohe(self, rows, cols, rounds, cats, tree_method):
        onehot, label = tm.make_categorical(rows, cols, cats, True)
        cat, _ = tm.make_categorical(rows, cols, cats, False)

        by_etl_results = {}
        by_builtin_results = {}

        predictor = "gpu_predictor" if tree_method == "gpu_hist" else None
        parameters = {"tree_method": tree_method, "predictor": predictor}
        # Use one-hot exclusively
        parameters["max_cat_to_onehot"] = self.USE_ONEHOT

        m = xgb.DMatrix(onehot, label, enable_categorical=False)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=by_etl_results,
        )

        m = xgb.DMatrix(cat, label, enable_categorical=True)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=by_builtin_results,
        )

        # There are guidelines on how to specify tolerance when the outputs
        # are treated as random variables.  But here tree construction is
        # extremely sensitive to floating point error: a 1e-5 difference in a
        # histogram bin can lead to an entirely different tree.  So even
        # though the test is quite lenient, hypothesis can still pick up
        # falsifying examples from time to time.
        np.testing.assert_allclose(
            np.array(by_etl_results["Train"]["rmse"]),
            np.array(by_builtin_results["Train"]["rmse"]),
            rtol=1e-3,
        )
        assert tm.non_increasing(by_builtin_results["Train"]["rmse"])

        by_grouping: xgb.callback.TrainingCallback.EvalsLog = {}
        # switch to partition-based splits
        parameters["max_cat_to_onehot"] = self.USE_PART
        parameters["reg_lambda"] = 0
        m = xgb.DMatrix(cat, label, enable_categorical=True)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=by_grouping,
        )
        rmse_oh = by_builtin_results["Train"]["rmse"]
        rmse_group = by_grouping["Train"]["rmse"]
        # With no regularization, partition-based splits are always at least
        # as good as one-hot splits.
        for a, b in zip(rmse_oh, rmse_group):
            assert a >= b

        parameters["reg_lambda"] = 1.0
        by_grouping = {}
        xgb.train(
            parameters,
            m,
            num_boost_round=32,
            evals=[(m, "Train")],
            evals_result=by_grouping,
        )
        assert tm.non_increasing(by_grouping["Train"]["rmse"]), by_grouping

    @given(strategies.integers(10, 400), strategies.integers(3, 8),
           strategies.integers(1, 2), strategies.integers(4, 7))
    @settings(deadline=None, print_blob=True)
    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical_ohe(self, rows, cols, rounds, cats):
        self.run_categorical_ohe(rows, cols, rounds, cats, "approx")
        self.run_categorical_ohe(rows, cols, rounds, cats, "hist")

    @given(
        tm.categorical_dataset_strategy,
        exact_parameter_strategy,
        hist_parameter_strategy,
        cat_parameter_strategy,
        strategies.integers(4, 32),
        strategies.sampled_from(["hist", "approx"]),
    )
    @settings(deadline=None, print_blob=True)
    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical(
        self,
        dataset: tm.TestDataset,
        exact_parameters: Dict[str, Any],
        hist_parameters: Dict[str, Any],
        cat_parameters: Dict[str, Any],
        n_rounds: int,
        tree_method: str,
    ) -> None:
        cat_parameters.update(exact_parameters)
        cat_parameters.update(hist_parameters)
        cat_parameters["tree_method"] = tree_method

        results = train_result(cat_parameters, dataset.get_dmat(), n_rounds)
        assert tm.non_increasing(results["train"]["rmse"])

    @given(
        hist_parameter_strategy,
        cat_parameter_strategy,
        strategies.sampled_from(["hist", "approx"]),
    )
    @settings(deadline=None, print_blob=True)
    def test_categorical_ames_housing(
        self,
        hist_parameters: Dict[str, Any],
        cat_parameters: Dict[str, Any],
        tree_method: str,
    ) -> None:
        cat_parameters.update(hist_parameters)
        dataset = tm.TestDataset("ames_housing", tm.get_ames_housing,
                                 "reg:squarederror", "rmse")
        cat_parameters["tree_method"] = tree_method
        results = train_result(cat_parameters, dataset.get_dmat(), 16)
        assert tm.non_increasing(results["train"]["rmse"])

    @given(strategies.integers(10, 400), strategies.integers(3, 8),
           strategies.integers(4, 7))
    @settings(deadline=None, print_blob=True)
    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical_missing(self, rows, cols, cats):
        self.run_categorical_missing(rows, cols, cats, "approx")
        self.run_categorical_missing(rows, cols, cats, "hist")
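
For reference, the one-hot/partition switch these tests exercise can be reproduced with the public API alone. A small self-contained sketch (the toy data and parameter values are illustrative, not taken from the suite):

import numpy as np
import pandas as pd
import xgboost as xgb

rng = np.random.default_rng(0)
# One categorical feature with 8 levels and a numeric target.
X = pd.DataFrame({"cat": pd.Categorical(rng.integers(0, 8, size=256))})
y = rng.normal(size=256)
Xy = xgb.DMatrix(X, y, enable_categorical=True)

# max_cat_to_onehot=1 forces partition-based splits; a huge value
# forces one-hot splits (cf. USE_PART / USE_ONEHOT above).
for threshold in (1, np.iinfo(np.int32).max):
    xgb.train(
        {"tree_method": "hist", "max_cat_to_onehot": int(threshold)},
        Xy,
        num_boost_round=4,
    )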
Example #10
# -*- coding: utf-8 -*-
import unittest
import pytest

import testing as tm
import xgboost as xgb

try:
    import datatable as dt
    import pandas as pd
except ImportError:
    pass

pytestmark = pytest.mark.skipif(
    tm.no_dt()['condition'] or tm.no_pandas()['condition'],
    reason=tm.no_dt()['reason'] + ' or ' + tm.no_pandas()['reason'])
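
# The module-level `pytestmark` above skips every test in this file when
# either datatable or pandas is unavailable; tm.no_dt() and tm.no_pandas()
# return dicts with 'condition' and 'reason' keys, the same shape that
# @pytest.mark.skipif(**...) consumes elsewhere in the suite.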


class TestDataTable(unittest.TestCase):
    def test_dt(self):
        df = pd.DataFrame([[1, 2., True], [2, 3., False]],
                          columns=['a', 'b', 'c'])
        dtable = dt.Frame(df)
        labels = dt.Frame([1, 2])
        dm = xgb.DMatrix(dtable, label=labels)
        assert dm.feature_names == ['a', 'b', 'c']
        assert dm.feature_types == ['int', 'float', 'i']
        assert dm.num_row() == 2
        assert dm.num_col() == 3

        # overwrite feature_names
Example #11
class TestGPUUpdaters:
    cputest = test_up.TestTreeMethod()

    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_gpu_hist(self, param, num_rounds, dataset):
        param["tree_method"] = "gpu_hist"
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result["train"][dataset.metric])

    @given(strategies.integers(10, 400), strategies.integers(3, 8),
           strategies.integers(1, 2), strategies.integers(4, 7))
    @settings(deadline=None)
    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical(self, rows, cols, rounds, cats):
        self.cputest.run_categorical_basic(rows, cols, rounds, cats,
                                           "gpu_hist")

    def test_categorical_32_cat(self):
        '''32 hits the bound of the integer bitset, so it gets a dedicated test.'''
        rows = 1000
        cols = 10
        cats = 32
        rounds = 4
        self.cputest.run_categorical_basic(rows, cols, rounds, cats,
                                           "gpu_hist")

    @pytest.mark.skipif(**tm.no_cupy())
    def test_invalid_categorical(self):
        self.cputest.run_invalid_category("gpu_hist")

    @pytest.mark.skipif(**tm.no_cupy())
    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_gpu_hist_device_dmatrix(self, param, num_rounds, dataset):
        # We cannot handle an empty dataset yet
        assume(len(dataset.y) > 0)
        param['tree_method'] = 'gpu_hist'
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_device_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result['train'][dataset.metric])

    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_external_memory(self, param, num_rounds, dataset):
        # We cannot handle an empty dataset yet
        assume(len(dataset.y) > 0)
        param['tree_method'] = 'gpu_hist'
        param = dataset.set_params(param)
        m = dataset.get_external_dmat()
        external_result = train_result(param, m, num_rounds)
        del m
        gc.collect()
        assert tm.non_increasing(external_result['train'][dataset.metric])

    def test_empty_dmatrix_prediction(self):
        # FIXME(trivialfis): This should be done with all updaters
        kRows = 0
        kCols = 100

        X = np.empty((kRows, kCols))
        y = np.empty((kRows,))

        dtrain = xgb.DMatrix(X, y)

        bst = xgb.train(
            {
                'verbosity': 2,
                'tree_method': 'gpu_hist',
                'gpu_id': 0
            },
            dtrain,
            verbose_eval=True,
            num_boost_round=6,
            evals=[(dtrain, 'Train')])

        kRows = 100
        X = np.random.randn(kRows, kCols)

        dtest = xgb.DMatrix(X)
        predictions = bst.predict(dtest)
        np.testing.assert_allclose(predictions, 0.5, rtol=1e-6)

    @pytest.mark.mgpu
    @given(tm.dataset_strategy, strategies.integers(0, 10))
    @settings(deadline=None, max_examples=10)
    def test_specified_gpu_id_gpu_update(self, dataset, gpu_id):
        param = {'tree_method': 'gpu_hist', 'gpu_id': gpu_id}
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), 10)
        assert tm.non_increasing(result['train'][dataset.metric])
Example #12
class TestInplacePredict:
    '''Tests for running inplace prediction'''
    @classmethod
    def setup_class(cls):
        cls.rows = 100
        cls.cols = 10

        cls.rng = np.random.RandomState(1994)

        cls.X = cls.rng.randn(cls.rows, cls.cols)
        cls.y = cls.rng.randn(cls.rows)

        dtrain = xgb.DMatrix(cls.X, cls.y)

        cls.booster = xgb.train({'tree_method': 'hist'},
                                dtrain,
                                num_boost_round=10)

        cls.test = xgb.DMatrix(cls.X[:10, ...])

    def test_predict(self):
        booster = self.booster
        X = self.X
        test = self.test

        predt_from_array = booster.inplace_predict(X[:10, ...])
        predt_from_dmatrix = booster.predict(test)

        np.testing.assert_allclose(predt_from_dmatrix, predt_from_array)

        predt_from_array = booster.inplace_predict(X[:10, ...],
                                                   iteration_range=(0, 4))
        predt_from_dmatrix = booster.predict(test, ntree_limit=4)

        np.testing.assert_allclose(predt_from_dmatrix, predt_from_array)

        def predict_dense(x):
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = booster.predict(d)
            return np.all(copied_predt == inplace_predt)

        for i in range(10):
            run_threaded_predict(X, self.rows, predict_dense)

        def predict_csr(x):
            inplace_predt = booster.inplace_predict(sparse.csr_matrix(x))
            d = xgb.DMatrix(x)
            copied_predt = booster.predict(d)
            return np.all(copied_predt == inplace_predt)

        for i in range(10):
            run_threaded_predict(X, self.rows, predict_csr)

    @pytest.mark.skipif(**tm.no_pandas())
    def test_predict_pd(self):
        X = self.X
        # construct it in column-major style
        df = pd.DataFrame({str(i): X[:, i] for i in range(X.shape[1])})
        booster = self.booster
        df_predt = booster.inplace_predict(df)
        arr_predt = booster.inplace_predict(X)
        dmat_predt = booster.predict(xgb.DMatrix(X))

        np.testing.assert_allclose(dmat_predt, arr_predt)
        np.testing.assert_allclose(df_predt, arr_predt)

    def test_base_margin(self):
        booster = self.booster

        base_margin = self.rng.randn(self.rows)
        from_inplace = booster.inplace_predict(data=self.X,
                                               base_margin=base_margin)

        dtrain = xgb.DMatrix(self.X, self.y, base_margin=base_margin)
        from_dmatrix = booster.predict(dtrain)
        np.testing.assert_allclose(from_dmatrix, from_inplace)
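
This example and the next both depend on a `run_threaded_predict` helper that the listing omits. A plausible sketch, inferred from the call sites (run the given predicate on row chunks of X from several threads and assert every call returns True); the real helper may differ:

def run_threaded_predict(X, rows, predict_func):
    # Hypothetical reconstruction of the omitted helper.
    from concurrent.futures import ThreadPoolExecutor

    chunk = max(rows // 10, 1)
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [
            executor.submit(predict_func, X[i:i + chunk])
            for i in range(0, rows, chunk)
        ]
        for f in futures:
            assert f.result()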
Example #13
class TestInplacePredict:
    '''Tests for running inplace prediction'''
    @classmethod
    def setup_class(cls):
        cls.rows = 1000
        cls.cols = 10

        cls.missing = 11  # set to integer for testing

        cls.rng = np.random.RandomState(1994)

        cls.X = cls.rng.randn(cls.rows, cls.cols)
        missing_idx = [i for i in range(0, cls.cols, 4)]
        cls.X[:, missing_idx] = cls.missing  # set to be missing

        cls.y = cls.rng.randn(cls.rows)

        dtrain = xgb.DMatrix(cls.X, cls.y)
        cls.test = xgb.DMatrix(cls.X[:10, ...], missing=cls.missing)

        cls.num_boost_round = 10
        cls.booster = xgb.train({'tree_method': 'hist'},
                                dtrain,
                                num_boost_round=cls.num_boost_round)

    def test_predict(self):
        booster = self.booster
        X = self.X
        test = self.test

        predt_from_array = booster.inplace_predict(X[:10, ...],
                                                   missing=self.missing)
        predt_from_dmatrix = booster.predict(test)

        X_obj = X.copy().astype(object)

        assert X_obj.dtype.hasobject is True
        assert X.dtype.hasobject is False
        np.testing.assert_allclose(booster.inplace_predict(X_obj),
                                   booster.inplace_predict(X))

        np.testing.assert_allclose(predt_from_dmatrix, predt_from_array)

        predt_from_array = booster.inplace_predict(X[:10, ...],
                                                   iteration_range=(0, 4),
                                                   missing=self.missing)
        predt_from_dmatrix = booster.predict(test, ntree_limit=4)

        np.testing.assert_allclose(predt_from_dmatrix, predt_from_array)

        with pytest.raises(ValueError):
            booster.predict(test, ntree_limit=booster.best_ntree_limit + 1)
        with pytest.raises(ValueError):
            booster.predict(test,
                            iteration_range=(0, booster.best_iteration + 2))

        default = booster.predict(test)

        range_full = booster.predict(test,
                                     iteration_range=(0, self.num_boost_round))
        ntree_full = booster.predict(test, ntree_limit=self.num_boost_round)
        np.testing.assert_allclose(range_full, default)
        np.testing.assert_allclose(ntree_full, default)

        range_full = booster.predict(
            test, iteration_range=(0, booster.best_iteration + 1))
        ntree_full = booster.predict(test,
                                     ntree_limit=booster.best_ntree_limit)
        np.testing.assert_allclose(range_full, default)
        np.testing.assert_allclose(ntree_full, default)

        def predict_dense(x):
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = booster.predict(d)
            return np.all(copied_predt == inplace_predt)

        for i in range(10):
            run_threaded_predict(X, self.rows, predict_dense)

        def predict_csr(x):
            inplace_predt = booster.inplace_predict(sparse.csr_matrix(x))
            d = xgb.DMatrix(x)
            copied_predt = booster.predict(d)
            return np.all(copied_predt == inplace_predt)

        for i in range(10):
            run_threaded_predict(X, self.rows, predict_csr)

    @pytest.mark.skipif(**tm.no_pandas())
    def test_predict_pd(self):
        X = self.X
        # construct it in column-major style
        df = pd.DataFrame({str(i): X[:, i] for i in range(X.shape[1])})
        booster = self.booster
        df_predt = booster.inplace_predict(df)
        arr_predt = booster.inplace_predict(X)
        dmat_predt = booster.predict(xgb.DMatrix(X))

        X = df.values
        X = np.asfortranarray(X)
        fort_predt = booster.inplace_predict(X)

        np.testing.assert_allclose(dmat_predt, arr_predt)
        np.testing.assert_allclose(df_predt, arr_predt)
        np.testing.assert_allclose(fort_predt, arr_predt)

    def test_base_margin(self):
        booster = self.booster

        base_margin = self.rng.randn(self.rows)
        from_inplace = booster.inplace_predict(data=self.X,
                                               base_margin=base_margin)

        dtrain = xgb.DMatrix(self.X, self.y, base_margin=base_margin)
        from_dmatrix = booster.predict(dtrain)
        np.testing.assert_allclose(from_dmatrix, from_inplace)
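
The mixed use of `ntree_limit` and `iteration_range` above reflects an API transition: `ntree_limit` is the legacy argument and `iteration_range` its replacement. For a model with one tree per boosting round the two coincide up to an offset, as this short sketch (reusing `booster` and `test` from the example) illustrates:

legacy = booster.predict(test, ntree_limit=4)
current = booster.predict(test, iteration_range=(0, 4))
np.testing.assert_allclose(legacy, current)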
Example #14
import unittest
import pytest
import numpy as np
import testing as tm
import xgboost as xgb
import os

try:
    import pyarrow as pa
    import pyarrow.csv as pc
    import pandas as pd
except ImportError:
    pass

pytestmark = pytest.mark.skipif(
    tm.no_arrow()["condition"] or tm.no_pandas()["condition"],
    reason=tm.no_arrow()["reason"] + " or " + tm.no_pandas()["reason"],
)

dpath = "demo/data/"


class TestArrowTable(unittest.TestCase):
    def test_arrow_table(self):
        df = pd.DataFrame(
            [[0, 1, 2.0, 3.0], [1, 2, 3.0, 4.0]], columns=["a", "b", "c", "d"]
        )
        table = pa.Table.from_pandas(df)
        dm = xgb.DMatrix(table)
        assert dm.num_row() == 2
        assert dm.num_col() == 4
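
A label can be attached in the usual way; a hypothetical continuation of the same example, treating column 'd' as the target:

dm = xgb.DMatrix(table, label=df["d"].values)
assert dm.num_row() == 2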
Example #15
class TestEarlyStopping:
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_early_stopping_nonparallel(self):
        from sklearn.datasets import load_digits
        try:
            from sklearn.model_selection import train_test_split
        except ImportError:
            from sklearn.cross_validation import train_test_split

        digits = load_digits(n_class=2)
        X = digits['data']
        y = digits['target']
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            random_state=0)
        clf1 = xgb.XGBClassifier(learning_rate=0.1)
        clf1.fit(X_train,
                 y_train,
                 early_stopping_rounds=5,
                 eval_metric="auc",
                 eval_set=[(X_test, y_test)])
        clf2 = xgb.XGBClassifier(learning_rate=0.1)
        clf2.fit(X_train,
                 y_train,
                 early_stopping_rounds=4,
                 eval_metric="auc",
                 eval_set=[(X_test, y_test)])
        # should be the same
        assert clf1.best_score == clf2.best_score
        assert clf1.best_score != 1
        # check overfit
        clf3 = xgb.XGBClassifier(learning_rate=0.1)
        clf3.fit(X_train,
                 y_train,
                 early_stopping_rounds=10,
                 eval_metric="auc",
                 eval_set=[(X_test, y_test)])
        assert clf3.best_score == 1

    def evalerror(self, preds, dtrain):
        from sklearn.metrics import mean_squared_error

        labels = dtrain.get_label()
        preds = 1.0 / (1.0 + np.exp(-preds))
        # Note: mean_squared_error returns the MSE; 'rmse' here is only
        # the name used for the metric column in the CV results.
        return 'rmse', mean_squared_error(labels, preds)

    @staticmethod
    def assert_metrics_length(cv, expected_length):
        for key, value in cv.items():
            assert len(value) == expected_length

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_cv_early_stopping(self):
        from sklearn.datasets import load_digits

        digits = load_digits(n_class=2)
        X = digits['data']
        y = digits['target']
        dm = xgb.DMatrix(X, label=y)
        params = {
            'max_depth': 2,
            'eta': 1,
            'verbosity': 0,
            'objective': 'binary:logistic',
            'eval_metric': 'error'
        }

        cv = xgb.cv(params,
                    dm,
                    num_boost_round=10,
                    nfold=10,
                    early_stopping_rounds=10)
        self.assert_metrics_length(cv, 10)
        cv = xgb.cv(params,
                    dm,
                    num_boost_round=10,
                    nfold=10,
                    early_stopping_rounds=5)
        self.assert_metrics_length(cv, 3)
        cv = xgb.cv(params,
                    dm,
                    num_boost_round=10,
                    nfold=10,
                    early_stopping_rounds=1)
        self.assert_metrics_length(cv, 1)

        cv = xgb.cv(params,
                    dm,
                    num_boost_round=10,
                    nfold=10,
                    feval=self.evalerror,
                    early_stopping_rounds=10)
        self.assert_metrics_length(cv, 10)
        cv = xgb.cv(params,
                    dm,
                    num_boost_round=10,
                    nfold=10,
                    feval=self.evalerror,
                    early_stopping_rounds=1)
        self.assert_metrics_length(cv, 5)
        cv = xgb.cv(params,
                    dm,
                    num_boost_round=10,
                    nfold=10,
                    feval=self.evalerror,
                    maximize=True,
                    early_stopping_rounds=1)
        self.assert_metrics_length(cv, 1)

    @pytest.mark.skipif(**tm.no_sklearn())
    @pytest.mark.skipif(**tm.no_pandas())
    def test_cv_early_stopping_with_multiple_eval_sets_and_metrics(self):
        from sklearn.datasets import load_breast_cancer

        X, y = load_breast_cancer(return_X_y=True)
        dm = xgb.DMatrix(X, label=y)
        params = {'objective': 'binary:logistic'}

        metrics = [['auc'], ['error'], ['logloss'], ['logloss', 'auc'],
                   ['logloss', 'error'], ['error', 'logloss']]

        num_iteration_history = []

        # If more than one metric is given, early stopping should use the last one
        for i, m in enumerate(metrics):
            result = xgb.cv(params,
                            dm,
                            num_boost_round=1000,
                            nfold=5,
                            stratified=True,
                            metrics=m,
                            early_stopping_rounds=20,
                            seed=42)
            num_iteration_history.append(len(result))
            df = result['test-{}-mean'.format(m[-1])]
            # When early stopping is invoked, the last metric should be at its best value.
            if m[-1] == 'auc':
                assert np.all(df <= df.iloc[-1])
            else:
                assert np.all(df >= df.iloc[-1])
        assert num_iteration_history[:3] == num_iteration_history[3:]
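
The same last-metric rule applies to xgb.train; a minimal sketch (reusing `params` and `dm` from the test above) showing where the best round is recorded when early stopping fires:

booster = xgb.train(
    params,
    dm,
    num_boost_round=1000,
    evals=[(dm, "train")],
    early_stopping_rounds=20,
    verbose_eval=False,
)
# With early stopping enabled, the booster records the best round.
print(booster.best_iteration, booster.best_score)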
Example #16
class TestModels:
    def test_glm(self):
        param = {
            'verbosity': 0,
            'objective': 'binary:logistic',
            'booster': 'gblinear',
            'alpha': 0.0001,
            'lambda': 1,
            'nthread': 1
        }
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
        num_round = 4
        bst = xgb.train(param, dtrain, num_round, watchlist)
        assert isinstance(bst, xgb.core.Booster)
        preds = bst.predict(dtest)
        labels = dtest.get_label()
        err = sum(1 for i in range(len(preds))
                  if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
        assert err < 0.2

    def test_dart(self):
        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
        param = {
            'max_depth': 5,
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'booster': 'dart',
            'verbosity': 1
        }
        # specify validation sets to watch performance
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
        num_round = 2
        bst = xgb.train(param, dtrain, num_round, watchlist)
        # run prediction on the test set
        preds = bst.predict(dtest, ntree_limit=num_round)
        labels = dtest.get_label()
        err = sum(1 for i in range(len(preds))
                  if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
        # error must be smaller than 10%
        assert err < 0.1

        with tempfile.TemporaryDirectory() as tmpdir:
            dtest_path = os.path.join(tmpdir, 'dtest.dmatrix')
            model_path = os.path.join(tmpdir, 'xgboost.model.dart')
            # save dmatrix into binary buffer
            dtest.save_binary(dtest_path)
            # save model
            bst.save_model(model_path)
            # load model and data in
            bst2 = xgb.Booster(params=param, model_file=model_path)
            dtest2 = xgb.DMatrix(dtest_path)

        preds2 = bst2.predict(dtest2, ntree_limit=num_round)

        # assert they are the same
        assert np.sum(np.abs(preds2 - preds)) == 0

        def my_logloss(preds, dtrain):
            labels = dtrain.get_label()
            return 'logloss', np.sum(np.log(np.where(labels, preds,
                                                     1 - preds)))

        # check whether custom evaluation metrics work
        bst = xgb.train(param, dtrain, num_round, watchlist, feval=my_logloss)
        preds3 = bst.predict(dtest, ntree_limit=num_round)
        assert all(preds3 == preds)

        # check whether sample_type and normalize_type work
        num_round = 50
        param['verbosity'] = 0
        param['learning_rate'] = 0.1
        param['rate_drop'] = 0.1
        preds_list = []
        for p in [[p0, p1] for p0 in ['uniform', 'weighted']
                  for p1 in ['tree', 'forest']]:
            param['sample_type'] = p[0]
            param['normalize_type'] = p[1]
            bst = xgb.train(param, dtrain, num_round, watchlist)
            preds = bst.predict(dtest, ntree_limit=num_round)
            err = sum(1 for i in range(len(preds))
                      if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
            assert err < 0.1
            preds_list.append(preds)

        for ii in range(len(preds_list)):
            for jj in range(ii + 1, len(preds_list)):
                assert np.sum(np.abs(preds_list[ii] - preds_list[jj])) > 0

    def test_boost_from_prediction(self):
        # Re-construct dtrain here to avoid modification
        margined = xgb.DMatrix(dpath + 'agaricus.txt.train')
        bst = xgb.train({'tree_method': 'hist'}, margined, 1)
        predt_0 = bst.predict(margined, output_margin=True)
        margined.set_base_margin(predt_0)
        bst = xgb.train({'tree_method': 'hist'}, margined, 1)
        predt_1 = bst.predict(margined)

        assert np.any(np.abs(predt_1 - predt_0) > 1e-6)

        bst = xgb.train({'tree_method': 'hist'}, dtrain, 2)
        predt_2 = bst.predict(dtrain)
        assert np.all(np.abs(predt_2 - predt_1) < 1e-6)

    def test_boost_from_existing_model(self):
        X = xgb.DMatrix(dpath + 'agaricus.txt.train')
        booster = xgb.train({'tree_method': 'hist'}, X, num_boost_round=4)
        assert booster.num_boosted_rounds() == 4
        booster = xgb.train({'tree_method': 'hist'},
                            X,
                            num_boost_round=4,
                            xgb_model=booster)
        assert booster.num_boosted_rounds() == 8
        booster = xgb.train({
            'updater': 'prune',
            'process_type': 'update'
        },
                            X,
                            num_boost_round=4,
                            xgb_model=booster)
        # Trees are moved for update, so the number of rounds is reduced.
        # This test is written to be compatible with current code (1.0.0);
        # if the behaviour is considered sub-optimal, feel free to change it.
        assert booster.num_boosted_rounds() == 4

    def run_custom_objective(self, tree_method=None):
        param = {
            'max_depth': 2,
            'eta': 1,
            'objective': 'reg:logistic',
            "tree_method": tree_method
        }
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
        num_round = 10

        def logregobj(preds, dtrain):
            labels = dtrain.get_label()
            preds = 1.0 / (1.0 + np.exp(-preds))
            grad = preds - labels
            hess = preds * (1.0 - preds)
            return grad, hess

        def evalerror(preds, dtrain):
            labels = dtrain.get_label()
            preds = 1.0 / (1.0 + np.exp(-preds))
            return 'error', float(sum(labels != (preds > 0.5))) / len(labels)

        # test custom_objective in training
        bst = xgb.train(param,
                        dtrain,
                        num_round,
                        watchlist,
                        obj=logregobj,
                        feval=evalerror)
        assert isinstance(bst, xgb.core.Booster)
        preds = bst.predict(dtest)
        labels = dtest.get_label()
        err = sum(1 for i in range(len(preds))
                  if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
        assert err < 0.1

        # test custom_objective in cross-validation
        xgb.cv(param,
               dtrain,
               num_round,
               nfold=5,
               seed=0,
               obj=logregobj,
               feval=evalerror)

        # test maximize parameter
        def neg_evalerror(preds, dtrain):
            labels = dtrain.get_label()
            return 'error', float(sum(labels == (preds > 0.0))) / len(labels)

        bst2 = xgb.train(param,
                         dtrain,
                         num_round,
                         watchlist,
                         logregobj,
                         neg_evalerror,
                         maximize=True)
        preds2 = bst2.predict(dtest)
        err2 = sum(1 for i in range(len(preds2))
                   if int(preds2[i] > 0.5) != labels[i]) / float(len(preds2))
        assert err == err2

    def test_custom_objective(self):
        self.run_custom_objective()

    def test_multi_eval_metric(self):
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
        param = {
            'max_depth': 2,
            'eta': 0.2,
            'verbosity': 1,
            'objective': 'binary:logistic'
        }
        param['eval_metric'] = ["auc", "logloss", 'error']
        evals_result = {}
        bst = xgb.train(param, dtrain, 4, watchlist, evals_result=evals_result)
        assert isinstance(bst, xgb.core.Booster)
        assert len(evals_result['eval']) == 3
        assert set(evals_result['eval'].keys()) == {'auc', 'error', 'logloss'}

    def test_fpreproc(self):
        param = {
            'max_depth': 2,
            'eta': 1,
            'verbosity': 0,
            'objective': 'binary:logistic'
        }
        num_round = 2

        def fpreproc(dtrain, dtest, param):
            label = dtrain.get_label()
            ratio = float(np.sum(label == 0)) / np.sum(label == 1)
            param['scale_pos_weight'] = ratio
            return (dtrain, dtest, param)

        xgb.cv(param,
               dtrain,
               num_round,
               nfold=5,
               metrics={'auc'},
               seed=0,
               fpreproc=fpreproc)

    def test_show_stdv(self):
        param = {
            'max_depth': 2,
            'eta': 1,
            'verbosity': 0,
            'objective': 'binary:logistic'
        }
        num_round = 2
        xgb.cv(param,
               dtrain,
               num_round,
               nfold=5,
               metrics={'error'},
               seed=0,
               show_stdv=False)

    def test_feature_names_validation(self):
        X = np.random.random((10, 3))
        y = np.random.randint(2, size=(10, ))

        dm1 = xgb.DMatrix(X, y, feature_names=("a", "b", "c"))
        dm2 = xgb.DMatrix(X, y)

        bst = xgb.train([], dm1)
        bst.predict(dm1)  # success
        with pytest.raises(ValueError):
            bst.predict(dm2)
        bst.predict(dm1)  # success

        bst = xgb.train([], dm2)
        bst.predict(dm2)  # success

    def test_model_binary_io(self):
        model_path = 'test_model_binary_io.bin'
        parameters = {
            'tree_method': 'hist',
            'booster': 'gbtree',
            'scale_pos_weight': '0.5'
        }
        X = np.random.random((10, 3))
        y = np.random.random((10, ))
        dtrain = xgb.DMatrix(X, y)
        bst = xgb.train(parameters, dtrain, num_boost_round=2)
        bst.save_model(model_path)
        bst = xgb.Booster(model_file=model_path)
        os.remove(model_path)
        config = json.loads(bst.save_config())
        assert float(config['learner']['objective']['reg_loss_param']
                     ['scale_pos_weight']) == 0.5

        buf = bst.save_raw()
        from_raw = xgb.Booster()
        from_raw.load_model(buf)

        buf_from_raw = from_raw.save_raw()
        assert buf == buf_from_raw

    def test_model_json_io(self):
        loc = locale.getpreferredencoding(False)
        model_path = 'test_model_json_io.json'
        parameters = {'tree_method': 'hist', 'booster': 'gbtree'}
        j_model = json_model(model_path, parameters)
        assert isinstance(j_model['learner'], dict)

        bst = xgb.Booster(model_file=model_path)

        bst.save_model(fname=model_path)
        with open(model_path, 'r') as fd:
            j_model = json.load(fd)
        assert isinstance(j_model['learner'], dict)

        os.remove(model_path)
        assert locale.getpreferredencoding(False) == loc

    @pytest.mark.skipif(**tm.no_json_schema())
    def test_json_io_schema(self):
        import jsonschema
        model_path = 'test_json_schema.json'
        path = os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        doc = os.path.join(path, 'doc', 'model.schema')
        with open(doc, 'r') as fd:
            schema = json.load(fd)
        parameters = {'tree_method': 'hist', 'booster': 'gbtree'}
        jsonschema.validate(instance=json_model(model_path, parameters),
                            schema=schema)
        os.remove(model_path)

        parameters = {'tree_method': 'hist', 'booster': 'dart'}
        jsonschema.validate(instance=json_model(model_path, parameters),
                            schema=schema)
        os.remove(model_path)

        try:
            xgb.train({'objective': 'foo'}, dtrain, num_boost_round=1)
        except ValueError as e:
            e_str = str(e)
            beg = e_str.find('Objective candidate')
            end = e_str.find('Stack trace')
            e_str = e_str[beg:end]
            e_str = e_str.strip()
            lines = e_str.splitlines()
            objectives = [s.split(': ')[1] for s in lines]
            j_objectives = schema['properties']['learner']['properties'][
                'objective']['oneOf']
            objectives_from_schema = set()
            for j_obj in j_objectives:
                objectives_from_schema.add(
                    j_obj['properties']['name']['const'])
            objectives = set(objectives)
            assert objectives == objectives_from_schema

    @pytest.mark.skipif(**tm.no_json_schema())
    def test_json_dump_schema(self):
        import jsonschema

        def validate_model(parameters):
            X = np.random.random((100, 30))
            y = np.random.randint(0, 4, size=(100, ))

            parameters['num_class'] = 4
            m = xgb.DMatrix(X, y)

            booster = xgb.train(parameters, m)
            dump = booster.get_dump(dump_format='json')

            for i in range(len(dump)):
                jsonschema.validate(instance=json.loads(dump[i]),
                                    schema=schema)

        path = os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        doc = os.path.join(path, 'doc', 'dump.schema')
        with open(doc, 'r') as fd:
            schema = json.load(fd)

        parameters = {
            'tree_method': 'hist',
            'booster': 'gbtree',
            'objective': 'multi:softmax'
        }
        validate_model(parameters)

        parameters = {
            'tree_method': 'hist',
            'booster': 'dart',
            'objective': 'multi:softmax'
        }
        validate_model(parameters)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_attributes(self):
        from sklearn.datasets import load_iris
        X, y = load_iris(return_X_y=True)
        cls = xgb.XGBClassifier(n_estimators=2)
        cls.fit(X, y, early_stopping_rounds=1, eval_set=[(X, y)])
        assert cls.get_booster().best_ntree_limit == 2
        assert cls.best_ntree_limit == cls.get_booster().best_ntree_limit

        with tempfile.TemporaryDirectory() as tmpdir:
            path = os.path.join(tmpdir, "cls.json")
            cls.save_model(path)

            cls = xgb.XGBClassifier(n_estimators=2)
            cls.load_model(path)
            assert cls.get_booster().best_ntree_limit == 2
            assert cls.best_ntree_limit == cls.get_booster().best_ntree_limit

    @pytest.mark.skipif(**tm.no_sklearn())
    @pytest.mark.parametrize('booster', ['gbtree', 'dart'])
    def test_slice(self, booster):
        from sklearn.datasets import make_classification
        num_classes = 3
        X, y = make_classification(n_samples=1000,
                                   n_informative=5,
                                   n_classes=num_classes)
        dtrain = xgb.DMatrix(data=X, label=y)
        num_parallel_tree = 4
        num_boost_round = 16
        total_trees = num_parallel_tree * num_classes * num_boost_round
        booster = xgb.train(
            {
                'num_parallel_tree': 4,
                'subsample': 0.5,
                'num_class': 3,
                'booster': booster,
                'objective': 'multi:softprob'
            },
            num_boost_round=num_boost_round,
            dtrain=dtrain)
        assert len(booster.get_dump()) == total_trees
        beg = 3
        end = 7
        sliced: xgb.Booster = booster[beg:end]

        sliced_trees = (end - beg) * num_parallel_tree * num_classes
        assert sliced_trees == len(sliced.get_dump())

        sliced_trees = sliced_trees // 2
        sliced: xgb.Booster = booster[beg:end:2]
        assert sliced_trees == len(sliced.get_dump())

        sliced: xgb.Booster = booster[beg:...]
        sliced_trees = (num_boost_round -
                        beg) * num_parallel_tree * num_classes
        assert sliced_trees == len(sliced.get_dump())

        sliced: xgb.Booster = booster[beg:]
        sliced_trees = (num_boost_round -
                        beg) * num_parallel_tree * num_classes
        assert sliced_trees == len(sliced.get_dump())

        sliced: xgb.Booster = booster[:end]
        sliced_trees = end * num_parallel_tree * num_classes
        assert sliced_trees == len(sliced.get_dump())

        sliced: xgb.Booster = booster[...:end]
        sliced_trees = end * num_parallel_tree * num_classes
        assert sliced_trees == len(sliced.get_dump())

        with pytest.raises(ValueError, match=r'>= 0'):
            booster[-1:0]

        # we do not accept an empty slice.
        with pytest.raises(ValueError):
            booster[1:1]
        # stop cannot be smaller than begin
        with pytest.raises(ValueError, match=r'Invalid.*'):
            booster[3:0]
        with pytest.raises(ValueError, match=r'Invalid.*'):
            booster[3:-1]
        # negative step is not supported.
        with pytest.raises(ValueError, match=r'.*>= 1.*'):
            booster[0:2:-1]
        # step cannot be 0.
        with pytest.raises(ValueError, match=r'.*>= 1.*'):
            booster[0:2:0]

        trees = list(booster)
        assert len(trees) == num_boost_round

        with pytest.raises(TypeError):
            booster["wrong type"]
        with pytest.raises(IndexError):
            booster[:num_boost_round + 1]
        with pytest.raises(ValueError):
            booster[1, 2]  # too many dims
        # setitem is not implemented, as the model is immutable during slicing.
        with pytest.raises(TypeError):
            booster[...:end] = booster

        sliced_0 = booster[1:3]
        np.testing.assert_allclose(
            booster.predict(dtrain, iteration_range=(1, 3)),
            sliced_0.predict(dtrain))
        sliced_1 = booster[3:7]
        np.testing.assert_allclose(
            booster.predict(dtrain, iteration_range=(3, 7)),
            sliced_1.predict(dtrain))

        predt_0 = sliced_0.predict(dtrain, output_margin=True)
        predt_1 = sliced_1.predict(dtrain, output_margin=True)

        merged = predt_0 + predt_1 - 0.5  # base score.
        single = booster[1:7].predict(dtrain, output_margin=True)
        np.testing.assert_allclose(merged, single, atol=1e-6)

        sliced_0 = booster[1:7:2]  # 1,3,5
        sliced_1 = booster[2:8:2]  # 2,4,6

        predt_0 = sliced_0.predict(dtrain, output_margin=True)
        predt_1 = sliced_1.predict(dtrain, output_margin=True)

        merged = predt_0 + predt_1 - 0.5
        single = booster[1:7].predict(dtrain, output_margin=True)
        np.testing.assert_allclose(merged, single, atol=1e-6)

    @pytest.mark.skipif(**tm.no_pandas())
    def test_feature_info(self):
        import pandas as pd
        rows = 100
        cols = 10
        X = rng.randn(rows, cols)
        y = rng.randn(rows)
        feature_names = ["test_feature_" + str(i) for i in range(cols)]
        X_pd = pd.DataFrame(X, columns=feature_names)
        X_pd.iloc[:, 3] = X_pd.iloc[:, 3].astype(np.int64)

        Xy = xgb.DMatrix(X_pd, y)
        assert Xy.feature_types[3] == "int"
        booster = xgb.train({}, dtrain=Xy, num_boost_round=1)

        assert booster.feature_names == Xy.feature_names
        assert booster.feature_names == feature_names
        assert booster.feature_types == Xy.feature_types

        with tempfile.TemporaryDirectory() as tmpdir:
            path = os.path.join(tmpdir, "model.json")
            booster.save_model(path)
            booster = xgb.Booster()
            booster.load_model(path)

            assert booster.feature_names == Xy.feature_names
            assert booster.feature_types == Xy.feature_types
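
A compact restatement of the additivity property test_slice checks at the end: margins of disjoint slices sum to the margin of the combined slice, up to one duplicated base_score (0.5 by default). Sketch, assuming the trained `booster` and `dtrain` from that test:

part_a = booster[1:4].predict(dtrain, output_margin=True)
part_b = booster[4:7].predict(dtrain, output_margin=True)
whole = booster[1:7].predict(dtrain, output_margin=True)
# Each slice adds base_score once, so subtract the extra copy.
np.testing.assert_allclose(part_a + part_b - 0.5, whole, atol=1e-6)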
Example #17
class TestGPUPredict:
    def test_predict(self):
        iterations = 10
        np.random.seed(1)
        test_num_rows = [10, 1000, 5000]
        test_num_cols = [10, 50, 500]
        # This test passes for tree_method=gpu_hist and tree_method=exact,
        # but for `hist` and `approx` the floating point error accumulates
        # faster and the test fails even when tol is set to 1e-4.  For
        # `hist`, the mismatch rate with 5000 rows is 0.04.
        for num_rows in test_num_rows:
            for num_cols in test_num_cols:
                dtrain = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                     label=[0, 1] * int(num_rows / 2))
                dval = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                   label=[0, 1] * int(num_rows / 2))
                dtest = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                    label=[0, 1] * int(num_rows / 2))
                watchlist = [(dtrain, 'train'), (dval, 'validation')]
                res = {}
                param = {
                    "objective": "binary:logistic",
                    "predictor": "gpu_predictor",
                    'eval_metric': 'logloss',
                    'tree_method': 'gpu_hist',
                    'max_depth': 1
                }
                bst = xgb.train(param,
                                dtrain,
                                iterations,
                                evals=watchlist,
                                evals_result=res)
                assert self.non_increasing(res["train"]["logloss"])
                gpu_pred_train = bst.predict(dtrain, output_margin=True)
                gpu_pred_test = bst.predict(dtest, output_margin=True)
                gpu_pred_val = bst.predict(dval, output_margin=True)

                param["predictor"] = "cpu_predictor"
                bst_cpu = xgb.train(param, dtrain, iterations, evals=watchlist)
                cpu_pred_train = bst_cpu.predict(dtrain, output_margin=True)
                cpu_pred_test = bst_cpu.predict(dtest, output_margin=True)
                cpu_pred_val = bst_cpu.predict(dval, output_margin=True)

                np.testing.assert_allclose(cpu_pred_train,
                                           gpu_pred_train,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_val,
                                           gpu_pred_val,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_test,
                                           gpu_pred_test,
                                           rtol=1e-6)

    def non_increasing(self, L):
        return all((y - x) < 0.001 for x, y in zip(L, L[1:]))

    # Test case for a bug where multiple batch predictions made on a
    # test set produce incorrect results
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_multi_predict(self):
        from sklearn.datasets import make_regression
        from sklearn.model_selection import train_test_split

        n = 1000
        X, y = make_regression(n, random_state=rng)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            random_state=123)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test)

        params = {}
        params["tree_method"] = "gpu_hist"

        params['predictor'] = "gpu_predictor"
        bst_gpu_predict = xgb.train(params, dtrain)

        params['predictor'] = "cpu_predictor"
        bst_cpu_predict = xgb.train(params, dtrain)

        predict0 = bst_gpu_predict.predict(dtest)
        predict1 = bst_gpu_predict.predict(dtest)
        cpu_predict = bst_cpu_predict.predict(dtest)

        assert np.allclose(predict0, predict1)
        assert np.allclose(predict0, cpu_predict)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_sklearn(self):
        m, n = 15000, 14
        tr_size = 2500
        X = np.random.rand(m, n)
        y = 200 * np.matmul(X, np.arange(-3, -3 + n))
        X_train, y_train = X[:tr_size, :], y[:tr_size]
        X_test, y_test = X[tr_size:, :], y[tr_size:]

        # First with cpu_predictor
        params = {
            'tree_method': 'gpu_hist',
            'predictor': 'cpu_predictor',
            'n_jobs': -1,
            'seed': 123
        }
        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        cpu_train_score = m.score(X_train, y_train)
        cpu_test_score = m.score(X_test, y_test)

        # Now with gpu_predictor
        params['predictor'] = 'gpu_predictor'

        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        gpu_train_score = m.score(X_train, y_train)
        gpu_test_score = m.score(X_test, y_test)

        assert np.allclose(cpu_train_score, gpu_train_score)
        assert np.allclose(cpu_test_score, gpu_test_score)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_inplace_predict_cupy(self):
        import cupy as cp
        cp.cuda.runtime.setDevice(0)
        rows = 1000
        cols = 10
        cp_rng = cp.random.RandomState(1994)
        cp.random.set_random_state(cp_rng)
        X = cp.random.randn(rows, cols)
        y = cp.random.randn(rows)

        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'},
                            dtrain,
                            num_boost_round=10)
        test = xgb.DMatrix(X[:10, ...])
        predt_from_array = booster.inplace_predict(X[:10, ...])
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_dense(x):
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        # Don't do this on Windows, see issue #5793
        if sys.platform.startswith("win"):
            pytest.skip(
                'Multi-threaded in-place prediction with cuPy is not working on Windows'
            )
        for i in range(10):
            run_threaded_predict(X, rows, predict_dense)

    @pytest.mark.skipif(**tm.no_cudf())
    def test_inplace_predict_cudf(self):
        import cupy as cp
        import cudf
        import pandas as pd
        rows = 1000
        cols = 10
        rng = np.random.RandomState(1994)
        cp.cuda.runtime.setDevice(0)
        X = rng.randn(rows, cols)
        X = pd.DataFrame(X)
        y = rng.randn(rows)
        X = cudf.from_pandas(X)

        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'},
                            dtrain,
                            num_boost_round=10)
        test = xgb.DMatrix(X)
        predt_from_array = booster.inplace_predict(X)
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_df(x):
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        for _ in range(10):
            run_threaded_predict(X, rows, predict_df)

    @given(strategies.integers(1, 10), tm.dataset_strategy,
           shap_parameter_strategy)
    @settings(deadline=None)
    def test_shap(self, num_rounds, dataset, param):
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w,
                                dataset.margin)
        shap = bst.predict(test_dmat, pred_contribs=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assume(len(dataset.y) > 0)
        assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1), margin,
                           rtol=1e-3, atol=1e-3)

    @given(strategies.integers(1, 10), tm.dataset_strategy,
           shap_parameter_strategy)
    @settings(deadline=None, max_examples=20)
    def test_shap_interactions(self, num_rounds, dataset, param):
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w,
                                dataset.margin)
        shap = bst.predict(test_dmat, pred_interactions=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assume(len(dataset.y) > 0)
        assert np.allclose(
            np.sum(shap, axis=(len(shap.shape) - 1, len(shap.shape) - 2)),
            margin, rtol=1e-3, atol=1e-3)
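
    # Both SHAP tests above rely on the additivity property: summing the
    # contribution matrix over its feature axes (including the bias column)
    # reproduces the raw margin. A minimal standalone sketch of that check
    # (data and tolerances here are made up):
    def _shap_additivity_sketch(self):
        rng = np.random.RandomState(7)
        X = rng.rand(64, 3)
        y = X[:, 0] * 2.0
        m = xgb.DMatrix(X, y)
        bst = xgb.train({"predictor": "gpu_predictor"}, m, num_boost_round=4)
        contribs = bst.predict(m, pred_contribs=True)  # shape (64, 3 + 1)
        margin = bst.predict(m, output_margin=True)
        np.testing.assert_allclose(contribs.sum(axis=-1), margin, rtol=1e-5)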

    def test_predict_leaf_basic(self):
        gpu_leaf = run_predict_leaf('gpu_predictor')
        cpu_leaf = run_predict_leaf('cpu_predictor')
        np.testing.assert_equal(gpu_leaf, cpu_leaf)
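
    # `run_predict_leaf` is defined elsewhere in the test utilities. A sketch
    # of what such a helper could look like (data shapes and parameters here
    # are assumptions): train a small booster, then return the leaf indices
    # produced by the requested predictor.
    @staticmethod
    def _run_predict_leaf_sketch(predictor):
        rng = np.random.RandomState(3)
        X, y = rng.rand(100, 4), rng.rand(100)
        m = xgb.DMatrix(X, y)
        booster = xgb.train({"predictor": predictor}, m, num_boost_round=4)
        # pred_leaf=True returns, per row, the index of the leaf it falls
        # into in each tree.
        return booster.predict(m, pred_leaf=True)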

    def run_predict_leaf_booster(self, param, num_rounds, dataset):
        param = dataset.set_params(param)
        m = dataset.get_dmat()
        # Reuse the same DMatrix for training and prediction instead of
        # constructing it twice.
        booster = xgb.train(param,
                            dtrain=m,
                            num_boost_round=num_rounds)
        booster.set_param({'predictor': 'cpu_predictor'})
        cpu_leaf = booster.predict(m, pred_leaf=True)

        booster.set_param({'predictor': 'gpu_predictor'})
        gpu_leaf = booster.predict(m, pred_leaf=True)

        np.testing.assert_equal(cpu_leaf, gpu_leaf)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None)
    def test_predict_leaf_gbtree(self, param, dataset):
        param['booster'] = 'gbtree'
        param['tree_method'] = 'gpu_hist'
        self.run_predict_leaf_booster(param, 10, dataset)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None)
    def test_predict_leaf_dart(self, param, dataset):
        param['booster'] = 'dart'
        param['tree_method'] = 'gpu_hist'
        self.run_predict_leaf_booster(param, 10, dataset)

    @pytest.mark.skipif(**tm.no_sklearn())
    @pytest.mark.skipif(**tm.no_pandas())
    @given(df=data_frames([
        column('x0', elements=strategies.integers(min_value=0, max_value=3)),
        column('x1', elements=strategies.integers(min_value=0, max_value=5))
    ],
                          index=range_indexes(min_size=20, max_size=50)))
    @settings(deadline=None)
    def test_predict_categorical_split(self, df):
        from sklearn.metrics import mean_squared_error

        df = df.astype('category')
        x0, x1 = df['x0'].to_numpy(), df['x1'].to_numpy()
        y = (x0 * 10 - 20) + (x1 - 2)
        dtrain = xgb.DMatrix(df, label=y, enable_categorical=True)

        params = {
            'tree_method': 'gpu_hist',
            'predictor': 'gpu_predictor',
            'max_depth': 3,
            'learning_rate': 1.0,
            'base_score': 0.0,
            'eval_metric': 'rmse'
        }

        eval_history = {}
        bst = xgb.train(params,
                        dtrain,
                        num_boost_round=5,
                        evals=[(dtrain, 'train')],
                        verbose_eval=False,
                        evals_result=eval_history)

        pred = bst.predict(dtrain)
        rmse = mean_squared_error(y_true=y, y_pred=pred, squared=False)
        np.testing.assert_almost_equal(rmse,
                                       eval_history['train']['rmse'][-1],
                                       decimal=5)
Example #18
class TestPlotting:
    def test_plotting(self):
        m = xgb.DMatrix(dpath)
        booster = xgb.train(
            {
                'max_depth': 2,
                'eta': 1,
                'objective': 'binary:logistic'
            },
            m,
            num_boost_round=2)

        ax = xgb.plot_importance(booster)
        assert isinstance(ax, Axes)
        assert ax.get_title() == 'Feature importance'
        assert ax.get_xlabel() == 'F score'
        assert ax.get_ylabel() == 'Features'
        assert len(ax.patches) == 4

        ax = xgb.plot_importance(booster,
                                 color='r',
                                 title='t',
                                 xlabel='x',
                                 ylabel='y')
        assert isinstance(ax, Axes)
        assert ax.get_title() == 't'
        assert ax.get_xlabel() == 'x'
        assert ax.get_ylabel() == 'y'
        assert len(ax.patches) == 4
        for p in ax.patches:
            assert p.get_facecolor() == (1.0, 0, 0, 1.0)  # red

        ax = xgb.plot_importance(booster,
                                 color=['r', 'r', 'b', 'b'],
                                 title=None,
                                 xlabel=None,
                                 ylabel=None)
        assert isinstance(ax, Axes)
        assert ax.get_title() == ''
        assert ax.get_xlabel() == ''
        assert ax.get_ylabel() == ''
        assert len(ax.patches) == 4
        assert ax.patches[0].get_facecolor() == (1.0, 0, 0, 1.0)  # red
        assert ax.patches[1].get_facecolor() == (1.0, 0, 0, 1.0)  # red
        assert ax.patches[2].get_facecolor() == (0, 0, 1.0, 1.0)  # blue
        assert ax.patches[3].get_facecolor() == (0, 0, 1.0, 1.0)  # blue

        g = xgb.to_graphviz(booster, num_trees=0)
        assert isinstance(g, Source)

        ax = xgb.plot_tree(booster, num_trees=0)
        assert isinstance(ax, Axes)

    def test_importance_plot_lim(self):
        np.random.seed(1)
        dm = xgb.DMatrix(np.random.randn(100, 100), label=[0, 1] * 50)
        bst = xgb.train({}, dm)
        assert len(bst.get_fscore()) == 71
        ax = xgb.plot_importance(bst)
        # The default limits appear to pad slightly beyond the plotted bars
        # (the largest F score is 10, and 71 features occupy y positions
        # 0 through 70).
        assert ax.get_xlim() == (0., 11.)
        assert ax.get_ylim() == (-1., 71.)

        ax = xgb.plot_importance(bst, xlim=(0, 5), ylim=(10, 71))
        assert ax.get_xlim() == (0., 5.)
        assert ax.get_ylim() == (10., 71.)

    def run_categorical(self, tree_method: str) -> None:
        X, y = tm.make_categorical(1000, 31, 19, onehot=False)
        reg = xgb.XGBRegressor(enable_categorical=True,
                               n_estimators=10,
                               tree_method=tree_method)
        reg.fit(X, y)
        trees = reg.get_booster().get_dump(dump_format="json")
        for tree in trees:
            j_tree = json.loads(tree)
            assert "leaf" in j_tree.keys() or isinstance(
                j_tree["split_condition"], list)

        # Plot the last tree. Note: `trees` (the list of per-tree dumps)
        # gives the tree count; `j_tree` is just the last tree's JSON dict,
        # so its length is unrelated to the number of trees.
        graph = xgb.to_graphviz(reg, num_trees=len(trees) - 1)
        assert isinstance(graph, Source)
        ax = xgb.plot_tree(reg, num_trees=len(trees) - 1)
        assert isinstance(ax, Axes)

    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical(self) -> None:
        self.run_categorical("approx")
Example #19

def test_external_memory_demo():
    script = os.path.join(PYTHON_DEMO_DIR, 'external_memory.py')
    cmd = ['python', script]
    subprocess.check_call(cmd)


def test_evals_result_demo():
    script = os.path.join(PYTHON_DEMO_DIR, 'evals_result.py')
    cmd = ['python', script]
    subprocess.check_call(cmd)


@pytest.mark.skipif(**tm.no_sklearn())
@pytest.mark.skipif(**tm.no_pandas())
def test_aft_demo():
    script = os.path.join(DEMO_DIR, 'aft_survival', 'aft_survival_demo.py')
    cmd = ['python', script]
    subprocess.check_call(cmd)
    assert os.path.exists('aft_model.json')
    os.remove('aft_model.json')


def test_callbacks_demo():
    script = os.path.join(PYTHON_DEMO_DIR, 'callbacks.py')
    cmd = ['python', script, '--plot=0']
    subprocess.check_call(cmd)


# gpu_acceleration is not tested because the covertype dataset is too large.
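
# The demo runners above share one subprocess pattern. A sketch of how they
# could be collapsed with parametrization (the argument handling for
# callbacks.py is inferred from the call above; the helper name is made up):
@pytest.mark.parametrize("script, args", [
    ("external_memory.py", []),
    ("evals_result.py", []),
    ("callbacks.py", ["--plot=0"]),
])
def _run_demo_sketch(script, args):
    subprocess.check_call(["python", os.path.join(PYTHON_DEMO_DIR, script)]
                          + args)
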
Example #20
class TestPlotting:
    cputest = tp.TestPlotting()

    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical(self):
        self.cputest.run_categorical("gpu_hist")