Example 1
def test_categorical():
    X, y = tm.make_categorical(n_samples=32, n_features=2, n_categories=3, onehot=False)
    ft = ["c"] * X.shape[1]
    reg = xgb.XGBRegressor(
        tree_method="hist",
        feature_types=ft,
        max_cat_to_onehot=1,
        enable_categorical=True,
    )
    reg.fit(X.values, y, eval_set=[(X.values, y)])
    from_cat = reg.evals_result()["validation_0"]["rmse"]
    predt_cat = reg.predict(X.values)
    assert reg.get_booster().feature_types == ft
    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, "model.json")
        reg.save_model(path)
        reg = xgb.XGBRegressor()
        reg.load_model(path)
        assert reg.feature_types == ft

    onehot, y = tm.make_categorical(
        n_samples=32, n_features=2, n_categories=3, onehot=True
    )
    reg = xgb.XGBRegressor(tree_method="hist")
    reg.fit(onehot, y, eval_set=[(onehot, y)])
    from_enc = reg.evals_result()["validation_0"]["rmse"]
    predt_enc = reg.predict(onehot)

    np.testing.assert_allclose(from_cat, from_enc)
    np.testing.assert_allclose(predt_cat, predt_enc)
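Every example in this collection relies on the tm.make_categorical helper from XGBoost's testing module. Its implementation is not part of this listing; the following is a minimal sketch of what such a generator might look like, assuming pandas categorical columns and a random label (the body is illustrative, not the upstream helper):

import numpy as np
import pandas as pd

def make_categorical_sketch(n_samples, n_features, n_categories,
                            onehot, sparsity=0.0):
    # Random integer codes turned into pandas categorical columns.
    rng = np.random.RandomState(2023)
    df = pd.DataFrame({
        f"f{i}": rng.randint(0, n_categories, size=n_samples)
        for i in range(n_features)
    }).astype("category")
    if sparsity > 0.0:
        # Blank out a fraction of the entries to emulate missing values.
        for col in df.columns:
            mask = rng.binomial(1, sparsity, size=n_samples).astype(bool)
            df.loc[mask, col] = np.nan
    y = rng.randn(n_samples)
    if onehot:
        return pd.get_dummies(df), y
    return df, y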
Example 2
    def __init__(self, categorical):
        '''Generate some random data for demonstration.

        Actual data can be anything that is currently supported by XGBoost.
        '''
        import cudf
        self.rows = self.ROWS_PER_BATCH

        if categorical:
            self._data = []
            self._labels = []
            for i in range(self.BATCHES):
                X, y = tm.make_categorical(self.ROWS_PER_BATCH, 4, 13, False)
                self._data.append(cudf.from_pandas(X))
                self._labels.append(y)
        else:
            rng = np.random.RandomState(1994)
            self._data = [
                cudf.DataFrame({
                    'a': rng.randn(self.ROWS_PER_BATCH),
                    'b': rng.randn(self.ROWS_PER_BATCH)
                })
            ] * self.BATCHES
            self._labels = [rng.randn(self.rows)] * self.BATCHES

        self.it = 0  # set iterator to 0
        super().__init__()
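The __init__ above is only part of the iterator; the xgboost.core.DataIter interface also requires next and reset. A minimal sketch of the two remaining methods, following the pattern of the upstream iterator demos (BATCHES and the underscore attributes come from the snippet above):

    def next(self, input_data):
        # Hand one batch to XGBoost via the input_data callback;
        # returning 0 signals that the iterator is exhausted.
        if self.it == self.BATCHES:
            return 0
        input_data(data=self._data[self.it], label=self._labels[self.it])
        self.it += 1
        return 1

    def reset(self):
        # Rewind so XGBoost can make another pass over the data.
        self.it = 0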
Example 3
    def test_scipy_categorical(self):
        from scipy import sparse
        n_features = 10
        X, y = tm.make_categorical(10,
                                   n_features,
                                   n_categories=4,
                                   onehot=False)
        X = X.values.astype(np.float32)
        feature_types = ['c'] * n_features

        X[1, 3] = np.nan
        X[2, 4] = np.nan
        X = sparse.csr_matrix(X)

        Xy = xgb.DMatrix(X, y, feature_types=feature_types)
        np.testing.assert_equal(np.array(Xy.feature_types),
                                np.array(feature_types))

        X = sparse.csc_matrix(X)

        Xy = xgb.DMatrix(X, y, feature_types=feature_types)
        np.testing.assert_equal(np.array(Xy.feature_types),
                                np.array(feature_types))

        X = sparse.coo_matrix(X)

        Xy = xgb.DMatrix(X, y, feature_types=feature_types)
        np.testing.assert_equal(np.array(Xy.feature_types),
                                np.array(feature_types))
Example 4
    def run_categorical_missing(self, rows: int, cols: int, cats: int,
                                tree_method: str) -> None:
        parameters: Dict[str, Any] = {"tree_method": tree_method}
        cat, label = tm.make_categorical(n_samples=rows,
                                         n_features=cols,
                                         n_categories=cats,
                                         onehot=False,
                                         sparsity=0.5)
        Xy = xgb.DMatrix(cat, label, enable_categorical=True)

        def run(max_cat_to_onehot: int):
            # Train with the given threshold for one-hot splits.
            parameters["max_cat_to_onehot"] = max_cat_to_onehot

            evals_result: Dict[str, Dict] = {}
            booster = xgb.train(parameters,
                                Xy,
                                num_boost_round=16,
                                evals=[(Xy, "Train")],
                                evals_result=evals_result)
            assert tm.non_increasing(evals_result["Train"]["rmse"])
            y_predt = booster.predict(Xy)

            rmse = tm.root_mean_square(label, y_predt)
            np.testing.assert_allclose(rmse, evals_result["Train"]["rmse"][-1])

        # Test with OHE split
        run(self.USE_ONEHOT)

        if tree_method == "gpu_hist":  # fixme: Test with GPU.
            return

        # Test with partition-based split
        run(self.USE_PART)
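This test also leans on a few helpers and class constants defined elsewhere: tm.root_mean_square, tm.non_increasing, USE_ONEHOT and USE_PART. Plausible definitions, sketched here under the assumption that USE_ONEHOT forces one-hot splits for every feature while USE_PART disables them (the upstream versions may add tolerances or differ in detail):

import numpy as np

def root_mean_square(y_true, y_pred):
    # Plain RMSE, matching XGBoost's default "rmse" metric for regression.
    return np.sqrt(np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2))

def non_increasing(seq, tolerance=1e-4):
    # Each metric value should be no worse than its predecessor,
    # up to a small numerical tolerance.
    return all(b <= a + tolerance for a, b in zip(seq, seq[1:]))

# A huge threshold means every feature has fewer categories than the
# limit, so one-hot splits are always used; a threshold of 1 means no
# feature qualifies, so partition-based splits are used instead.
USE_ONEHOT = np.iinfo(np.int32).max
USE_PART = 1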
Example 5
    def run_split_value_histograms(self, tree_method) -> None:
        X, y = tm.make_categorical(1000, 10, 13, False)
        reg = xgb.XGBRegressor(tree_method=tree_method,
                               enable_categorical=True)
        reg.fit(X, y)

        with pytest.raises(ValueError, match="doesn't"):
            reg.get_booster().get_split_value_histogram("3", bins=5)
Example 6
    def run_tree_to_df_categorical(self, tree_method: str) -> None:
        X, y = tm.make_categorical(100, 10, 31, False)
        Xy = xgb.DMatrix(X, y, enable_categorical=True)
        booster = xgb.train({"tree_method": tree_method},
                            Xy,
                            num_boost_round=10)
        df = booster.trees_to_dataframe()
        for _, x in df.iterrows():
            if x["Feature"] != "Leaf":
                assert len(x["Category"]) >= 1
Example 7
    def run_categorical_basic(self, rows, cols, rounds, cats, tree_method):
        onehot, label = tm.make_categorical(rows, cols, cats, True)
        cat, _ = tm.make_categorical(rows, cols, cats, False)

        by_etl_results = {}
        by_builtin_results = {}

        predictor = "gpu_predictor" if tree_method == "gpu_hist" else None
        # Use one-hot exclusively
        parameters = {
            "tree_method": tree_method, "predictor": predictor, "max_cat_to_onehot": 9999
        }

        m = xgb.DMatrix(onehot, label, enable_categorical=False)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=by_etl_results,
        )

        m = xgb.DMatrix(cat, label, enable_categorical=True)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=by_builtin_results,
        )

        # There are guidelines on how to choose tolerances when outputs are treated
        # as random variables, but tree construction here is extremely sensitive to
        # floating point error: a 1e-5 difference in a histogram bin can lead to an
        # entirely different tree. So even though the test is quite lenient,
        # hypothesis can still find falsifying examples from time to time.
        np.testing.assert_allclose(
            np.array(by_etl_results["Train"]["rmse"]),
            np.array(by_builtin_results["Train"]["rmse"]),
            rtol=1e-3,
        )
        assert tm.non_increasing(by_builtin_results["Train"]["rmse"])
Example 8
    def test_np_categorical(self):
        n_features = 10
        X, y = tm.make_categorical(10,
                                   n_features,
                                   n_categories=4,
                                   onehot=False)
        X = X.values.astype(np.float32)
        feature_types = ['c'] * n_features

        assert isinstance(X, np.ndarray)
        Xy = xgb.DMatrix(X, y, feature_types=feature_types)
        np.testing.assert_equal(np.array(Xy.feature_types),
                                np.array(feature_types))
Example 9
    def test_categorical(self):
        import cudf
        _X, _y = tm.make_categorical(100, 30, 17, False)
        X = cudf.from_pandas(_X)
        y = cudf.from_pandas(_y)

        Xy = xgb.DMatrix(X, y, enable_categorical=True)
        assert len(Xy.feature_types) == X.shape[1]
        assert all(t == "categorical" for t in Xy.feature_types)

        Xy = xgb.DeviceQuantileDMatrix(X, y, enable_categorical=True)
        assert len(Xy.feature_types) == X.shape[1]
        assert all(t == "categorical" for t in Xy.feature_types)
Example 10
    def test_cupy_categorical(self):
        import cupy as cp
        n_features = 10
        X, y = tm.make_categorical(10,
                                   n_features,
                                   n_categories=4,
                                   onehot=False)
        X = cp.asarray(X.values.astype(cp.float32))
        y = cp.array(y)
        feature_types = ['c'] * n_features

        assert isinstance(X, cp.ndarray)
        Xy = xgb.DMatrix(X, y, feature_types=feature_types)
        np.testing.assert_equal(np.array(Xy.feature_types),
                                np.array(feature_types))
Example 11
    def run_categorical(self, tree_method: str) -> None:
        X, y = tm.make_categorical(1000, 31, 19, onehot=False)
        reg = xgb.XGBRegressor(enable_categorical=True,
                               n_estimators=10,
                               tree_method=tree_method)
        reg.fit(X, y)
        trees = reg.get_booster().get_dump(dump_format="json")
        for tree in trees:
            j_tree = json.loads(tree)
            assert "leaf" in j_tree.keys() or isinstance(
                j_tree["split_condition"], list)

        graph = xgb.to_graphviz(reg, num_trees=len(trees) - 1)
        assert isinstance(graph, Source)
        ax = xgb.plot_tree(reg, num_trees=len(trees) - 1)
        assert isinstance(ax, Axes)
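The assertion in the loop distinguishes node types by the shape of split_condition: numerical splits store a scalar threshold, while categorical splits store a list of category indices. A hand-written illustration of what one dumped tree might contain (not actual output; the field values are made up):

# One node of a dumped tree with a categorical split.  The list-valued
# "split_condition" holds the matching category indices, which is what
# the isinstance(..., list) check above relies on.
j_tree = {
    "nodeid": 0,
    "split": "f3",
    "split_condition": [0, 2, 5],
    "yes": 1,
    "no": 2,
    "children": [
        {"nodeid": 1, "leaf": -0.04},
        {"nodeid": 2, "leaf": 0.07},
    ],
}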
Example 12
    def test_cudf_categorical(self):
        import cudf
        _X, _y = tm.make_categorical(100, 30, 17, False)
        X = cudf.from_pandas(_X)
        y = cudf.from_pandas(_y)

        Xy = xgb.DMatrix(X, y, enable_categorical=True)
        assert len(Xy.feature_types) == X.shape[1]
        assert all(t == "c" for t in Xy.feature_types)

        Xy = xgb.DeviceQuantileDMatrix(X, y, enable_categorical=True)
        assert len(Xy.feature_types) == X.shape[1]
        assert all(t == "c" for t in Xy.feature_types)

        # test missing value
        X = cudf.DataFrame({"f0": ["a", "b", np.nan]})
        X["f0"] = X["f0"].astype("category")
        df, cat_codes, _, _ = xgb.data._transform_cudf_df(
            X, None, None, enable_categorical=True)
        for col in cat_codes:
            assert col.has_nulls

        y = [0, 1, 2]
        with pytest.raises(ValueError):
            xgb.DMatrix(X, y)
        Xy = xgb.DMatrix(X, y, enable_categorical=True)
        assert Xy.num_row() == 3
        assert Xy.num_col() == 1

        with pytest.raises(ValueError):
            xgb.DeviceQuantileDMatrix(X, y)

        Xy = xgb.DeviceQuantileDMatrix(X, y, enable_categorical=True)
        assert Xy.num_row() == 3
        assert Xy.num_col() == 1

        X = X["f0"]
        with pytest.raises(ValueError):
            xgb.DMatrix(X, y)

        Xy = xgb.DMatrix(X, y, enable_categorical=True)
        assert Xy.num_row() == 3
        assert Xy.num_col() == 1
Example 13
    def test_shap_categorical(self):
        X, y = tm.make_categorical(100, 20, 7, False)
        Xy = xgb.DMatrix(X, y, enable_categorical=True)
        booster = xgb.train({"tree_method": "gpu_hist"},
                            Xy,
                            num_boost_round=10)

        booster.set_param({"predictor": "gpu_predictor"})
        shap = booster.predict(Xy, pred_contribs=True)
        margin = booster.predict(Xy, output_margin=True)
        np.testing.assert_allclose(np.sum(shap, axis=len(shap.shape) - 1),
                                   margin,
                                   rtol=1e-3)

        booster.set_param({"predictor": "cpu_predictor"})
        shap = booster.predict(Xy, pred_contribs=True)
        margin = booster.predict(Xy, output_margin=True)
        np.testing.assert_allclose(np.sum(shap, axis=len(shap.shape) - 1),
                                   margin,
                                   rtol=1e-3)
Example 14
    def test_categorical_model_io(self):
        X, y = tm.make_categorical(256, 16, 71, False)
        Xy = xgb.DMatrix(X, y, enable_categorical=True)
        booster = xgb.train({"tree_method": "approx"}, Xy, num_boost_round=16)
        predt_0 = booster.predict(Xy)

        with tempfile.TemporaryDirectory() as tempdir:
            path = os.path.join(tempdir, "model.binary")
            with pytest.raises(ValueError, match=r".*JSON/UBJSON.*"):
                booster.save_model(path)

            path = os.path.join(tempdir, "model.json")
            booster.save_model(path)
            booster = xgb.Booster(model_file=path)
            predt_1 = booster.predict(Xy)
            np.testing.assert_allclose(predt_0, predt_1)

            path = os.path.join(tempdir, "model.ubj")
            booster.save_model(path)
            booster = xgb.Booster(model_file=path)
            predt_1 = booster.predict(Xy)
            np.testing.assert_allclose(predt_0, predt_1)
Example 15
    def pack(**kwargs: Any) -> dd.DataFrame:
        X, y = tm.make_categorical(**kwargs)
        X["label"] = y
        return X
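As written, pack returns the pandas frame produced by tm.make_categorical with the label appended as a column, even though it is annotated as dd.DataFrame, so the conversion to dask presumably happens around it. A plausible companion (unpack and the npartitions value are illustrative, not from the source) that distributes the packed frame and splits the label back out:

    import dask.dataframe as dd

    def unpack(df: dd.DataFrame):
        # Hypothetical counterpart to pack: separate features and label.
        y = df["label"]
        X = df[[c for c in df.columns if c != "label"]]
        return X, y

    # Distribute the packed frame, then recover X and y lazily.
    ddf = dd.from_pandas(
        pack(n_samples=100, n_features=4, n_categories=8, onehot=False),
        npartitions=2,
    )
    X, y = unpack(ddf)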
Example 16
    def run_categorical_ohe(self, rows, cols, rounds, cats, tree_method):
        onehot, label = tm.make_categorical(rows, cols, cats, True)
        cat, _ = tm.make_categorical(rows, cols, cats, False)

        by_etl_results = {}
        by_builtin_results = {}

        predictor = "gpu_predictor" if tree_method == "gpu_hist" else None
        parameters = {"tree_method": tree_method, "predictor": predictor}
        # Use one-hot exclusively
        parameters["max_cat_to_onehot"] = self.USE_ONEHOT

        m = xgb.DMatrix(onehot, label, enable_categorical=False)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=by_etl_results,
        )

        m = xgb.DMatrix(cat, label, enable_categorical=True)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=by_builtin_results,
        )

        # There are guidelines on how to choose tolerances when outputs are treated
        # as random variables, but tree construction here is extremely sensitive to
        # floating point error: a 1e-5 difference in a histogram bin can lead to an
        # entirely different tree. So even though the test is quite lenient,
        # hypothesis can still find falsifying examples from time to time.
        np.testing.assert_allclose(
            np.array(by_etl_results["Train"]["rmse"]),
            np.array(by_builtin_results["Train"]["rmse"]),
            rtol=1e-3,
        )
        assert tm.non_increasing(by_builtin_results["Train"]["rmse"])

        by_grouping: xgb.callback.TrainingCallback.EvalsLog = {}
        # switch to partition-based splits
        parameters["max_cat_to_onehot"] = self.USE_PART
        parameters["reg_lambda"] = 0
        m = xgb.DMatrix(cat, label, enable_categorical=True)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=by_grouping,
        )
        rmse_oh = by_builtin_results["Train"]["rmse"]
        rmse_group = by_grouping["Train"]["rmse"]
        # Partition-based splits are always at least as good as one-hot
        # when there is no regularization.
        for a, b in zip(rmse_oh, rmse_group):
            assert a >= b

        parameters["reg_lambda"] = 1.0
        by_grouping = {}
        xgb.train(
            parameters,
            m,
            num_boost_round=32,
            evals=[(m, "Train")],
            evals_result=by_grouping,
        )
        assert tm.non_increasing(by_grouping["Train"]["rmse"]), by_grouping