Example #1
    def test_custom_objective(self, client: "Client") -> None:
        # NOTE: load_boston was removed in scikit-learn 1.2; on newer
        # versions substitute another regression dataset.
        from sklearn.datasets import load_boston
        X, y = load_boston(return_X_y=True)
        X, y = da.from_array(X), da.from_array(y)
        rounds = 20

        with tempfile.TemporaryDirectory() as tmpdir:
            path = os.path.join(tmpdir, 'log')

            def sqr(labels: np.ndarray, predts: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
                with open(path, 'a') as fd:
                    print('Running sqr', file=fd)
                grad = predts - labels
                hess = np.ones(shape=labels.shape[0])
                return grad, hess

            reg = xgb.dask.DaskXGBRegressor(n_estimators=rounds, objective=sqr,
                                            tree_method='hist')
            reg.fit(X, y, eval_set=[(X, y)])

            # Check that the objective is run once per boosting round.
            with open(path, 'r') as fd:
                out = fd.readlines()
                assert len(out) == rounds

            results_custom = reg.evals_result()

            reg = xgb.dask.DaskXGBRegressor(n_estimators=rounds, tree_method='hist')
            reg.fit(X, y, eval_set=[(X, y)])
            results_native = reg.evals_result()

            np.testing.assert_allclose(results_custom['validation_0']['rmse'],
                                       results_native['validation_0']['rmse'])
            assert tm.non_increasing(results_native['validation_0']['rmse'])
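
Every example on this page asserts monotone training progress through tm.non_increasing, a helper from the XGBoost test utilities. As a rough sketch of what such a check does, with the signature inferred from how the examples call it, including the optional tolerance seen in Examples #10 and #26 (the real helper may differ):

def non_increasing(values, tolerance=1e-4):
    # True when no entry exceeds its predecessor by more than `tolerance`.
    return all(later - earlier < tolerance
               for earlier, later in zip(values, values[1:]))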
Example #2
 def test_categorical_ames_housing(
     self,
     hist_parameters: Dict[str, Any],
     cat_parameters: Dict[str, Any],
     tree_method: str,
 ) -> None:
     cat_parameters.update(hist_parameters)
     dataset = tm.TestDataset("ames_housing", tm.get_ames_housing,
                              "reg:squarederror", "rmse")
     cat_parameters["tree_method"] = tree_method
     results = train_result(cat_parameters, dataset.get_dmat(), 16)
     assert tm.non_increasing(results["train"]["rmse"])
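
Many of the following examples call train_result rather than xgb.train directly. A plausible minimal implementation, assuming it simply trains against a single DMatrix while evaluating on it and returns the recorded history (the real helper is defined in the test modules and may take different arguments):

def train_result(param, dmat, num_rounds):
    # Train while evaluating on the training matrix itself and return
    # the metric history, accessed as result['train'][metric].
    result = {}
    xgb.train(param, dmat, num_boost_round=num_rounds,
              evals=[(dmat, 'train')], evals_result=result)
    return result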
Example #3
    def test_sparse(self, dataset):
        param = {"tree_method": "hist", "max_bin": 64}
        hist_result = train_result(param, dataset.get_dmat(), 16)
        note(hist_result)
        assert tm.non_increasing(hist_result['train'][dataset.metric])

        param = {"tree_method": "approx", "max_bin": 64}
        approx_result = train_result(param, dataset.get_dmat(), 16)
        note(approx_result)
        assert tm.non_increasing(approx_result['train'][dataset.metric])

        np.testing.assert_allclose(hist_result["train"]["rmse"],
                                   approx_result["train"]["rmse"])
Example #4
    def test_categorical(
        self,
        dataset: tm.TestDataset,
        exact_parameters: Dict[str, Any],
        hist_parameters: Dict[str, Any],
        cat_parameters: Dict[str, Any],
        n_rounds: int,
        tree_method: str,
    ) -> None:
        cat_parameters.update(exact_parameters)
        cat_parameters.update(hist_parameters)
        cat_parameters["tree_method"] = tree_method

        results = train_result(cat_parameters, dataset.get_dmat(), n_rounds)
        assert tm.non_increasing(results["train"]["rmse"])
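
The dataset fixtures used throughout are tm.TestDataset instances. A sketch of the interface these tests rely on, with fields and methods inferred from usage here; the real class also provides get_device_dmat and get_external_dmat, seen in the GPU examples below:

from dataclasses import dataclass
from typing import Dict, Optional

import numpy as np
import xgboost as xgb

@dataclass
class TestDataset:
    name: str
    X: np.ndarray
    y: np.ndarray
    objective: str
    metric: str
    w: Optional[np.ndarray] = None

    def set_params(self, params: Dict) -> Dict:
        # Merge this dataset's objective and eval metric into the
        # caller-supplied training parameters.
        params['objective'] = self.objective
        params['eval_metric'] = self.metric
        return params

    def get_dmat(self) -> xgb.DMatrix:
        return xgb.DMatrix(self.X, label=self.y, weight=self.w)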
Example #5
    def test_changed_parameter(self):
        from sklearn.datasets import load_breast_cancer
        X, y = load_breast_cancer(return_X_y=True)
        clf = xgb.XGBClassifier(n_estimators=2)
        clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss")
        assert tm.non_increasing(clf.evals_result()["validation_0"]["logloss"])

        with tempfile.TemporaryDirectory() as tmpdir:
            clf.save_model(os.path.join(tmpdir, "clf.json"))
            loaded = xgb.XGBClassifier()
            loaded.load_model(os.path.join(tmpdir, "clf.json"))

        clf = xgb.XGBClassifier(n_estimators=2)
        # change metric to error
        clf.fit(X, y, eval_set=[(X, y)], eval_metric="error")
        assert tm.non_increasing(clf.evals_result()["validation_0"]["error"])
Example #6
    def run_updater_test(self, client, params, num_rounds, dataset,
                         tree_method):
        params['tree_method'] = tree_method
        params = dataset.set_params(params)
        # It doesn't make sense to distribute a completely
        # empty dataset.
        if dataset.X.shape[0] == 0:
            return

        chunk = 128
        X = da.from_array(dataset.X, chunks=(chunk, dataset.X.shape[1]))
        y = da.from_array(dataset.y, chunks=(chunk, ))
        if dataset.w is not None:
            w = da.from_array(dataset.w, chunks=(chunk, ))
        else:
            w = None

        m = xgb.dask.DaskDMatrix(client, data=X, label=y, weight=w)
        history = xgb.dask.train(client,
                                 params=params,
                                 dtrain=m,
                                 num_boost_round=num_rounds,
                                 evals=[(m, 'train')])['history']
        note(history)
        history = history['train'][dataset.metric]
        assert tm.non_increasing(history)
        # Make sure that it's decreasing
        assert history[-1] < history[0]
Example #7
    def run_categorical_basic(self, cat, onehot, label, rounds):
        by_etl_results = {}
        by_builtin_results = {}

        parameters = {
            'tree_method': 'gpu_hist',
            'predictor': 'gpu_predictor',
            'enable_experimental_json_serialization': True
        }

        m = xgb.DMatrix(onehot, label, enable_categorical=True)
        xgb.train(parameters,
                  m,
                  num_boost_round=rounds,
                  evals=[(m, 'Train')],
                  evals_result=by_etl_results)

        m = xgb.DMatrix(cat, label, enable_categorical=True)
        xgb.train(parameters,
                  m,
                  num_boost_round=rounds,
                  evals=[(m, 'Train')],
                  evals_result=by_builtin_results)
        np.testing.assert_allclose(np.array(by_etl_results['Train']['rmse']),
                                   np.array(
                                       by_builtin_results['Train']['rmse']),
                                   rtol=1e-3)
        assert tm.non_increasing(by_builtin_results['Train']['rmse'])
Example #8
 def test_approx(self, param, hist_param, num_rounds, dataset):
     param["tree_method"] = "approx"
     param = dataset.set_params(param)
     param.update(hist_param)
     result = train_result(param, dataset.get_dmat(), num_rounds)
     note(result)
     assert tm.non_increasing(result["train"][dataset.metric])
Example #9
 def test_external_memory(self, param, num_rounds, dataset):
     # We cannot handle empty dataset yet
     assume(len(dataset.y) > 0)
     param['tree_method'] = 'gpu_hist'
     param = dataset.set_params(param)
     external_result = train_result(param, dataset.get_external_dmat(), num_rounds)
     assert tm.non_increasing(external_result['train'][dataset.metric])
Example #10
 def test_coordinate(self, param, num_rounds, dataset, coord_param):
     param['updater'] = 'coord_descent'
     param.update(coord_param)
     param = dataset.set_params(param)
     result = train_result(param, dataset.get_dmat(),
                           num_rounds)['train'][dataset.metric]
     assert tm.non_increasing(result, 5e-4)
Example #11
def run_gpu_hist(params, num_rounds, dataset, DMatrixT, client):
    params['tree_method'] = 'gpu_hist'
    params = dataset.set_params(params)
    # It doesn't make sense to distribute a completely
    # empty dataset.
    if dataset.X.shape[0] == 0:
        return

    chunk = 128
    X = to_cp(dataset.X, DMatrixT)
    X = da.from_array(X,
                      chunks=(chunk, dataset.X.shape[1]))
    y = to_cp(dataset.y, DMatrixT)
    y = da.from_array(y, chunks=(chunk, ))
    if dataset.w is not None:
        w = to_cp(dataset.w, DMatrixT)
        w = da.from_array(w, chunks=(chunk, ))
    else:
        w = None

    if DMatrixT is dxgb.DaskDeviceQuantileDMatrix:
        m = DMatrixT(client, data=X, label=y, weight=w,
                     max_bin=params.get('max_bin', 256))
    else:
        m = DMatrixT(client, data=X, label=y, weight=w)
    history = dxgb.train(client, params=params, dtrain=m,
                         num_boost_round=num_rounds,
                         evals=[(m, 'train')])['history']
    note(history)
    assert tm.non_increasing(history['train'][dataset.metric])
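
The to_cp helper above moves host arrays onto the GPU when a device-backed DMatrix is requested. A hedged sketch, assuming dxgb is xgboost.dask and that only the DaskDeviceQuantileDMatrix path needs CuPy input:

def to_cp(x, DMatrixT):
    # Copy NumPy data to the GPU only for the device quantile DMatrix;
    # the plain DaskDMatrix path can consume host arrays as-is.
    import cupy
    if x is not None and DMatrixT is dxgb.DaskDeviceQuantileDMatrix:
        return cupy.array(x)
    return x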
Example #12
    def test_gpu_hist(self, params, num_rounds, dataset):
        with LocalCUDACluster(n_workers=2) as cluster:
            with Client(cluster) as client:
                params['tree_method'] = 'gpu_hist'
                params = dataset.set_params(params)
                # Multi-class doesn't handle an empty dataset well ("empty"
                # here means at least one worker ends up with no data).
                if params['objective'] == "multi:softmax":
                    return
                # It doesn't make sense to distribute a completely
                # empty dataset.
                if dataset.X.shape[0] == 0:
                    return

                chunk = 128
                X = da.from_array(dataset.X,
                                  chunks=(chunk, dataset.X.shape[1]))
                y = da.from_array(dataset.y, chunks=(chunk, ))
                if dataset.w is not None:
                    w = da.from_array(dataset.w, chunks=(chunk, ))
                else:
                    w = None

                m = dxgb.DaskDMatrix(client, data=X, label=y, weight=w)
                history = dxgb.train(client,
                                     params=params,
                                     dtrain=m,
                                     num_boost_round=num_rounds,
                                     evals=[(m, 'train')])['history']
                note(history)
                assert tm.non_increasing(history['train'][dataset.metric])
Example #13
 def test_exact(self, param, num_rounds, dataset):
     if dataset.name.endswith("-l1"):
         return
     param['tree_method'] = 'exact'
     param = dataset.set_params(param)
     result = train_result(param, dataset.get_dmat(), num_rounds)
     assert tm.non_increasing(result['train'][dataset.metric])
Example #14
 def test_hist(self, param, hist_param, num_rounds, dataset):
     param['tree_method'] = 'hist'
     param = dataset.set_params(param)
     param.update(hist_param)
     result = train_result(param, dataset.get_dmat(), num_rounds)
     note(result)
     assert tm.non_increasing(result['train'][dataset.metric])
Example #15
 def test_gpu_coordinate_regularised(self, param, num_rounds, dataset, alpha, lambd):
     assume(len(dataset.y) > 0)
     param['updater'] = 'gpu_coord_descent'
     param['alpha'] = alpha
     param['lambda'] = lambd
     param = dataset.set_params(param)
     result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
     assert tm.non_increasing([result[0], result[-1]])
Example #16
 def test_gpu_hist_device_dmatrix(self, param, num_rounds, dataset):
     # We cannot handle empty dataset yet
     assume(len(dataset.y) > 0)
     param['tree_method'] = 'gpu_hist'
     param = dataset.set_params(param)
     result = train_result(param, dataset.get_device_dmat(), num_rounds)
     note(result)
     assert tm.non_increasing(result['train'][dataset.metric])
Example #17
 def test_gpu_coordinate(self, param, num_rounds, dataset):
     assume(len(dataset.y) > 0)
     param['updater'] = 'gpu_coord_descent'
     param = dataset.set_params(param)
     result = train_result(param, dataset.get_dmat(),
                           num_rounds)['train'][dataset.metric]
     note(result)
     assert tm.non_increasing(result)
Example #18
 def test_shotgun_regularised(self, param, num_rounds, dataset, alpha,
                              lambd):
     param['updater'] = 'shotgun'
     param['alpha'] = alpha
     param['lambda'] = lambd
     param = dataset.set_params(param)
     result = train_result(param, dataset.get_dmat(),
                           num_rounds)['train'][dataset.metric]
     assert tm.non_increasing([result[0], result[-1]])
Example #19
def run_gpu_hist(
    params: Dict,
    num_rounds: int,
    dataset: tm.TestDataset,
    DMatrixT: Type,
    client: Client,
) -> None:
    params["tree_method"] = "gpu_hist"
    params = dataset.set_params(params)
    # It doesn't make sense to distribute a completely
    # empty dataset.
    if dataset.X.shape[0] == 0:
        return

    chunk = 128
    X = to_cp(dataset.X, DMatrixT)
    X = da.from_array(X, chunks=(chunk, dataset.X.shape[1]))
    y = to_cp(dataset.y, DMatrixT)
    y_chunk = chunk if len(dataset.y.shape) == 1 else (chunk,
                                                       dataset.y.shape[1])
    y = da.from_array(y, chunks=y_chunk)

    if dataset.w is not None:
        w = to_cp(dataset.w, DMatrixT)
        w = da.from_array(w, chunks=(chunk, ))
    else:
        w = None

    if DMatrixT is dxgb.DaskDeviceQuantileDMatrix:
        m = DMatrixT(client,
                     data=X,
                     label=y,
                     weight=w,
                     max_bin=params.get("max_bin", 256))
    else:
        m = DMatrixT(client, data=X, label=y, weight=w)
    history = dxgb.train(
        client,
        params=params,
        dtrain=m,
        num_boost_round=num_rounds,
        evals=[(m, "train")],
    )["history"]["train"][dataset.metric]
    note(history)

    # See note on `ObjFunction::UpdateTreeLeaf`.
    update_leaf = dataset.name.endswith("-l1")
    if update_leaf and len(history) == 2:
        assert history[0] + 1e-2 >= history[-1]
    elif update_leaf and len(history) > 2:
        assert history[0] >= history[-1]
    else:
        assert tm.non_increasing(history)
Example #20
 def test_coordinate_regularised(self, param, num_rounds, dataset,
                                 coord_param, alpha, lambd):
     param['updater'] = 'coord_descent'
     param['alpha'] = alpha
     param['lambda'] = lambd
     param.update(coord_param)
     param = dataset.set_params(param)
     result = train_result(param, dataset.get_dmat(),
                           num_rounds)['train'][dataset.metric]
     note(result)
     assert tm.non_increasing([result[0], result[-1]])
Example #21
 def test_external_memory(self, param, num_rounds, dataset):
     pytest.xfail(reason='TestGPUUpdaters::test_external_memory is flaky')
     # We cannot handle empty dataset yet
     assume(len(dataset.y) > 0)
     param['tree_method'] = 'gpu_hist'
     param = dataset.set_params(param)
     m = dataset.get_external_dmat()
     external_result = train_result(param, m, num_rounds)
     del m
     gc.collect()
     assert tm.non_increasing(external_result['train'][dataset.metric])
Example #22
 def test_external_memory(self, param, num_rounds, dataset):
     if dataset.name.endswith("-l1"):
         return
     # We cannot handle empty dataset yet
     assume(len(dataset.y) > 0)
     param['tree_method'] = 'gpu_hist'
     param = dataset.set_params(param)
     m = dataset.get_external_dmat()
     external_result = train_result(param, m, num_rounds)
     del m
     gc.collect()
     assert tm.non_increasing(external_result['train'][dataset.metric])
Example #23
 def test_shotgun(self, param, num_rounds, dataset):
     param['updater'] = 'shotgun'
     param = dataset.set_params(param)
     result = train_result(param, dataset.get_dmat(),
                           num_rounds)['train'][dataset.metric]
     # shotgun is non-deterministic, so we relax the test by only using first and last
     # iteration.
     if len(result) > 2:
         sampled_result = (result[0], result[-1])
     else:
         sampled_result = result
     assert tm.non_increasing(sampled_result)
Example #24
def test_categorical(local_cuda_cluster: LocalCUDACluster) -> None:
    with Client(local_cuda_cluster) as client:
        import dask_cudf

        rounds = 10
        X, y = make_categorical(client, 10000, 30, 13)
        X = dask_cudf.from_dask_dataframe(X)

        X_onehot, _ = make_categorical(client, 10000, 30, 13, True)
        X_onehot = dask_cudf.from_dask_dataframe(X_onehot)

        parameters = {"tree_method": "gpu_hist"}

        m = dxgb.DaskDMatrix(client, X_onehot, y, enable_categorical=True)
        by_etl_results = dxgb.train(
            client,
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
        )["history"]

        m = dxgb.DaskDMatrix(client, X, y, enable_categorical=True)
        output = dxgb.train(
            client,
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
        )
        by_builtin_results = output["history"]

        np.testing.assert_allclose(
            np.array(by_etl_results["Train"]["rmse"]),
            np.array(by_builtin_results["Train"]["rmse"]),
            rtol=1e-3,
        )
        assert tm.non_increasing(by_builtin_results["Train"]["rmse"])

        model = output["booster"]
        with tempfile.TemporaryDirectory() as tempdir:
            path = os.path.join(tempdir, "model.json")
            model.save_model(path)
            with open(path, "r") as fd:
                categorical = json.load(fd)

            categories_sizes = np.array(
                categorical["learner"]["gradient_booster"]["model"]["trees"]
                [-1]["categories_sizes"])
            assert categories_sizes.shape[0] != 0
            np.testing.assert_allclose(categories_sizes, 1)
Example #25
    def test_early_stopping(self, client: "Client") -> None:
        from sklearn.datasets import load_breast_cancer
        X, y = load_breast_cancer(return_X_y=True)
        X, y = da.from_array(X), da.from_array(y)
        m = xgb.dask.DaskDMatrix(client, X, y)

        valid = xgb.dask.DaskDMatrix(client, X, y)
        early_stopping_rounds = 5
        booster = xgb.dask.train(
            client, {
                'objective': 'binary:logistic',
                'eval_metric': 'error',
                'tree_method': 'hist'
            },
            m,
            evals=[(valid, 'Valid')],
            num_boost_round=1000,
            early_stopping_rounds=early_stopping_rounds)['booster']
        assert hasattr(booster, 'best_score')
        dump = booster.get_dump(dump_format='json')
        assert len(dump) - booster.best_iteration == early_stopping_rounds + 1

        valid_X, valid_y = load_breast_cancer(return_X_y=True)
        valid_X, valid_y = da.from_array(valid_X), da.from_array(valid_y)
        cls = xgb.dask.DaskXGBClassifier(objective='binary:logistic',
                                         tree_method='hist',
                                         n_estimators=1000)
        cls.client = client
        cls.fit(X,
                y,
                early_stopping_rounds=early_stopping_rounds,
                eval_set=[(valid_X, valid_y)])
        booster = cls.get_booster()
        dump = booster.get_dump(dump_format='json')
        assert len(dump) - booster.best_iteration == early_stopping_rounds + 1

        # Specify the metric
        cls = xgb.dask.DaskXGBClassifier(objective='binary:logistic',
                                         tree_method='hist',
                                         n_estimators=1000)
        cls.client = client
        cls.fit(X,
                y,
                early_stopping_rounds=early_stopping_rounds,
                eval_set=[(valid_X, valid_y)],
                eval_metric='error')
        assert tm.non_increasing(cls.evals_result()['validation_0']['error'])
        booster = cls.get_booster()
        dump = booster.get_dump(dump_format='json')
        assert len(cls.evals_result()['validation_0']['error']) < 20
        assert len(dump) - booster.best_iteration == early_stopping_rounds + 1
Example #26
 def test_shotgun(self, param, num_rounds, dataset):
     param['updater'] = 'shotgun'
     param = dataset.set_params(param)
     result = train_result(param, dataset.get_dmat(),
                           num_rounds)['train'][dataset.metric]
     # shotgun is non-deterministic, so we relax the test by sampling
     # result.
     if len(result) > 2:
         sampled_result = [
             score for i, score in enumerate(result) if i % 2 == 0
         ]
         sampled_result[-1] = result[-1]  # make sure the last one is used
     else:
         sampled_result = result
     assert tm.non_increasing(sampled_result, 1e-3)
Example #27
        def run(max_cat_to_onehot: int):
            # Test with onehot splits
            parameters["max_cat_to_onehot"] = max_cat_to_onehot

            evals_result: Dict[str, Dict] = {}
            booster = xgb.train(parameters,
                                Xy,
                                num_boost_round=16,
                                evals=[(Xy, "Train")],
                                evals_result=evals_result)
            assert tm.non_increasing(evals_result["Train"]["rmse"])
            y_predt = booster.predict(Xy)

            rmse = tm.root_mean_square(label, y_predt)
            np.testing.assert_allclose(rmse, evals_result["Train"]["rmse"][-1])
Example #28
    def run_categorical_basic(self, rows, cols, rounds, cats):
        import pandas as pd
        rng = np.random.RandomState(1994)

        pd_dict = {}
        for i in range(cols):
            c = rng.randint(low=0, high=cats + 1, size=rows)
            pd_dict[str(i)] = pd.Series(c, dtype=np.int64)

        df = pd.DataFrame(pd_dict)
        label = df.iloc[:, 0]
        for i in range(0, cols - 1):
            label += df.iloc[:, i]
        label += 1
        df = df.astype('category')
        onehot = pd.get_dummies(df)
        cat = df

        by_etl_results = {}
        by_builtin_results = {}

        parameters = {
            'tree_method': 'gpu_hist',
            'predictor': 'gpu_predictor',
            'enable_experimental_json_serialization': True
        }

        m = xgb.DMatrix(onehot, label, enable_categorical=True)
        xgb.train(parameters,
                  m,
                  num_boost_round=rounds,
                  evals=[(m, 'Train')],
                  evals_result=by_etl_results)

        m = xgb.DMatrix(cat, label, enable_categorical=True)
        xgb.train(parameters,
                  m,
                  num_boost_round=rounds,
                  evals=[(m, 'Train')],
                  evals_result=by_builtin_results)
        np.testing.assert_allclose(np.array(by_etl_results['Train']['rmse']),
                                   np.array(
                                       by_builtin_results['Train']['rmse']),
                                   rtol=1e-3)
        assert tm.non_increasing(by_builtin_results['Train']['rmse'])
Example #29
    def run_max_cat(self, tree_method: str) -> None:
        """Test data with size smaller than number of categories."""
        import pandas as pd
        n_cat = 100
        n = 5
        X = pd.Series(
            ["".join(choice(ascii_lowercase) for i in range(3)) for i in range(n_cat)],
            dtype="category",
        )[:n].to_frame()

        reg = xgb.XGBRegressor(
            enable_categorical=True,
            tree_method=tree_method,
            n_estimators=10,
        )
        y = pd.Series(range(n))
        reg.fit(X=X, y=y, eval_set=[(X, y)])
        assert tm.non_increasing(reg.evals_result()["validation_0"]["rmse"])
Example #30
def run_gpu_hist(
    params: Dict,
    num_rounds: int,
    dataset: tm.TestDataset,
    DMatrixT: Type,
    client: Client,
) -> None:
    params["tree_method"] = "gpu_hist"
    params = dataset.set_params(params)
    # It doesn't make sense to distribute a completely
    # empty dataset.
    if dataset.X.shape[0] == 0:
        return

    chunk = 128
    X = to_cp(dataset.X, DMatrixT)
    X = da.from_array(X, chunks=(chunk, dataset.X.shape[1]))
    y = to_cp(dataset.y, DMatrixT)
    y = da.from_array(y, chunks=(chunk, ))
    if dataset.w is not None:
        w = to_cp(dataset.w, DMatrixT)
        w = da.from_array(w, chunks=(chunk, ))
    else:
        w = None

    if DMatrixT is dxgb.DaskDeviceQuantileDMatrix:
        m = DMatrixT(client,
                     data=X,
                     label=y,
                     weight=w,
                     max_bin=params.get("max_bin", 256))
    else:
        m = DMatrixT(client, data=X, label=y, weight=w)
    history = dxgb.train(
        client,
        params=params,
        dtrain=m,
        num_boost_round=num_rounds,
        evals=[(m, "train")],
    )["history"]
    note(history)
    assert tm.non_increasing(history["train"][dataset.metric])