Example #1
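These snippets are test methods excerpted from what appears to be the xgboost_ray test suite; they rely on shared module-level imports and test-class helpers (self._init_ray(), self.params, self.rng, self.seed, softprob_obj) that are not shown in the excerpts. A plausible set of the assumed module-level imports:

    # Assumed module-level imports for these snippets (not part of the
    # original excerpts; reconstructed from the names the tests use).
    import json
    import os
    from tempfile import TemporaryDirectory

    import numpy as np
    import xgboost as xgb
    from sklearn.model_selection import train_test_split

    from xgboost_ray import RayDMatrix, RayXGBClassifier

Verifies that constructor parameters set on RayXGBClassifier are reported by get_params(), and that an nthread change made via set_params() propagates to the saved booster configuration.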
    def test_parameters_access(self):
        self._init_ray()

        from sklearn import datasets

        params = {"updater": "grow_gpu_hist", "subsample": 0.5, "n_jobs": -1}
        clf = RayXGBClassifier(n_estimators=1000, **params)
        assert clf.get_params()["updater"] == "grow_gpu_hist"
        assert clf.get_params()["subsample"] == 0.5
        assert clf.get_params()["n_estimators"] == 1000

        clf = RayXGBClassifier(n_estimators=1, nthread=4)
        X, y = datasets.load_iris(return_X_y=True)
        clf.fit(X, y)

        config = json.loads(clf.get_booster().save_config())
        assert int(config["learner"]["generic_param"]["nthread"]) == 4

        clf.set_params(nthread=16)
        config = json.loads(clf.get_booster().save_config())
        assert int(config["learner"]["generic_param"]["nthread"]) == 16

        clf.predict(X)
        config = json.loads(clf.get_booster().save_config())
        assert int(config["learner"]["generic_param"]["nthread"]) == 16
Example #2
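Boosts from an existing model's predictions: model_0's margin output is passed as base_margin when fitting and predicting with model_1 (4 rounds each), and the result is asserted to match cls_2 trained from scratch for the combined 8 rounds.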
    def run_boost_from_prediction(self, tree_method):
        from sklearn.datasets import load_breast_cancer

        X, y = load_breast_cancer(return_X_y=True)
        model_0 = RayXGBClassifier(
            learning_rate=0.3,
            random_state=0,
            n_estimators=4,
            tree_method=tree_method,
        )
        model_0.fit(X=X, y=y)
        margin = model_0.predict(X, output_margin=True)

        model_1 = RayXGBClassifier(
            learning_rate=0.3,
            random_state=0,
            n_estimators=4,
            tree_method=tree_method,
        )
        model_1.fit(X=X, y=y, base_margin=margin)
        predictions_1 = model_1.predict(X, base_margin=margin)

        cls_2 = RayXGBClassifier(
            learning_rate=0.3,
            random_state=0,
            n_estimators=8,
            tree_method=tree_method,
        )
        cls_2.fit(X=X, y=y)
        predictions_2 = cls_2.predict(X)
        assert np.all(predictions_1 == predictions_2)
Example #3
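Multiclass classification on iris with 2-fold cross-validation. Exercises predict() with output_margin and ntree_limit, checks predict_proba() output shapes, and repeats the probability check with a custom softprob objective (softprob_obj is a helper from the test module).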
    def test_multiclass_classification(self):
        self._init_ray()

        from sklearn.datasets import load_iris
        from sklearn.model_selection import KFold

        def check_pred(preds, labels, output_margin):
            if output_margin:
                err = sum(1 for i in range(len(preds))
                          if preds[i].argmax() != labels[i]) / float(
                              len(preds))
            else:
                err = sum(1 for i in range(len(preds))
                          if preds[i] != labels[i]) / float(len(preds))
            assert err < 0.4

        iris = load_iris()
        y = iris["target"]
        X = iris["data"]
        kf = KFold(n_splits=2, shuffle=True, random_state=self.rng)
        for train_index, test_index in kf.split(X, y):
            xgb_model = RayXGBClassifier().fit(X[train_index], y[train_index])
            if hasattr(xgb_model.get_booster(), "num_boosted_rounds"):
                assert (xgb_model.get_booster().num_boosted_rounds() ==
                        xgb_model.n_estimators)
            preds = xgb_model.predict(X[test_index])
            # test other params in XGBClassifier().fit
            preds2 = xgb_model.predict(
                X[test_index], output_margin=True, ntree_limit=3)
            preds3 = xgb_model.predict(
                X[test_index], output_margin=True, ntree_limit=0)
            preds4 = xgb_model.predict(
                X[test_index], output_margin=False, ntree_limit=3)
            labels = y[test_index]

            check_pred(preds, labels, output_margin=False)
            check_pred(preds2, labels, output_margin=True)
            check_pred(preds3, labels, output_margin=True)
            check_pred(preds4, labels, output_margin=False)

        cls = RayXGBClassifier(n_estimators=4).fit(X, y)
        assert cls.n_classes_ == 3
        proba = cls.predict_proba(X)
        assert proba.shape[0] == X.shape[0]
        assert proba.shape[1] == cls.n_classes_

        # custom objective, the default is multi:softprob
        # so no transformation is required.
        cls = RayXGBClassifier(
            n_estimators=4, objective=softprob_obj(3)).fit(X, y)
        proba = cls.predict_proba(X)
        assert proba.shape[0] == X.shape[0]
        assert proba.shape[1] == cls.n_classes_
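Example #4
Exercises use_label_encoder and num_class handling when fitting from a RayDMatrix: mismatched label and eval_set types raise, while passing num_class explicitly allows fitting and predicting directly from RayDMatrix objects.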
    def testClassifierLabelEncoder(self, n_class=2):
        self._init_ray()

        from sklearn.datasets import load_digits

        digits = load_digits(n_class=n_class)
        y = digits["target"]
        X = digits["data"]

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.5)

        train_matrix = RayDMatrix(X_train, y_train)
        test_matrix = RayDMatrix(X_test, y_test)

        with self.assertRaisesRegex(Exception, "use_label_encoder"):
            RayXGBClassifier(use_label_encoder=True,
                             **self.params).fit(train_matrix, None)

        with self.assertRaisesRegex(Exception, "num_class"):
            RayXGBClassifier(use_label_encoder=False,
                             **self.params).fit(train_matrix, None)

        with self.assertRaisesRegex(Exception, r"must be \(RayDMatrix, str\)"):
            RayXGBClassifier(use_label_encoder=False,
                             **self.params).fit(train_matrix,
                                                None,
                                                eval_set=[(X_test, y_test)])

        with self.assertRaisesRegex(Exception,
                                    r"must be \(array_like, array_like\)"):
            RayXGBClassifier(use_label_encoder=False,
                             **self.params).fit(X_train,
                                                y_train,
                                                eval_set=[(test_matrix, "eval")
                                                          ])

        RayXGBClassifier(use_label_encoder=False,
                         num_class=n_class,
                         **self.params).fit(train_matrix, None)

        clf = RayXGBClassifier(use_label_encoder=False,
                               num_class=n_class,
                               **self.params).fit(train_matrix,
                                                  None,
                                                  eval_set=[(test_matrix,
                                                             "eval")])

        clf.predict(test_matrix)
        clf.predict_proba(test_matrix)
Example #5
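A nested helper (X and y are assumed to come from the enclosing test) that checks best_ntree_limit after early stopping: with num_parallel_tree set it equals rounds * forest, while under gblinear it is 0 and predict() ignores it.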
        def train(booster, forest):
            rounds = 4
            cls = RayXGBClassifier(
                n_estimators=rounds, num_parallel_tree=forest,
                booster=booster).fit(
                    X, y, eval_set=[(X, y)], early_stopping_rounds=3)

            if forest:
                assert cls.best_ntree_limit == rounds * forest
            else:
                assert cls.best_ntree_limit == 0

            # best_ntree_limit is used by default,
            # assert that under gblinear it's
            # automatically ignored due to being 0.
            cls.predict(X)
Example #6
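Resumes training from a saved model, both via a file written by the sklearn wrapper's save_model() and via the underlying Booster's save_model(), asserting that continued training changes the predictions and lowers the log loss.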
    def test_XGBClassifier_resume(self):
        self._init_ray()

        from sklearn.datasets import load_breast_cancer
        from sklearn.metrics import log_loss

        with TemporaryDirectory() as tempdir:
            model1_path = os.path.join(tempdir, "test_XGBClassifier.model")
            model1_booster_path = os.path.join(tempdir,
                                               "test_XGBClassifier.booster")

            X, Y = load_breast_cancer(return_X_y=True)

            model1 = RayXGBClassifier(
                learning_rate=0.3, random_state=0, n_estimators=8)
            model1.fit(X, Y)

            pred1 = model1.predict(X)
            # sklearn's log_loss signature is (y_true, y_pred)
            log_loss1 = log_loss(Y, pred1)

            # file name of stored xgb model
            model1.save_model(model1_path)
            model2 = RayXGBClassifier(
                learning_rate=0.3, random_state=0, n_estimators=8)
            model2.fit(X, Y, xgb_model=model1_path)

            pred2 = model2.predict(X)
            log_loss2 = log_loss(Y, pred2)

            assert np.any(pred1 != pred2)
            assert log_loss1 > log_loss2

            # file name of 'Booster' instance Xgb model
            model1.get_booster().save_model(model1_booster_path)
            model2 = RayXGBClassifier(
                learning_rate=0.3, random_state=0, n_estimators=8)
            model2.fit(X, Y, xgb_model=model1_booster_path)

            pred2 = model2.predict(X)
            log_loss2 = log_loss(Y, pred2)

            assert np.any(pred1 != pred2)
            assert log_loss1 > log_loss2
Example #7
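Round-trips a fitted classifier through save_model()/load_model() and verifies the restored wrapper's attributes and error rate, agreement with a native xgb.Booster loaded from the same file, and that loading the file into a plain xgb.XGBModel raises TypeError.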
    def save_load_model(self, model_path):
        from sklearn.datasets import load_digits
        from sklearn.model_selection import KFold

        digits = load_digits(n_class=2)
        y = digits["target"]
        X = digits["data"]
        kf = KFold(n_splits=2, shuffle=True, random_state=self.rng)
        for train_index, test_index in kf.split(X, y):
            xgb_model = RayXGBClassifier(use_label_encoder=False).fit(
                X[train_index], y[train_index])
            xgb_model.save_model(model_path)

            xgb_model = RayXGBClassifier()
            xgb_model.load_model(model_path)

            assert xgb_model.use_label_encoder is False
            assert isinstance(xgb_model.classes_, np.ndarray)
            assert isinstance(xgb_model._Booster, xgb.Booster)

            preds = xgb_model.predict(X[test_index])
            labels = y[test_index]
            err = sum(1 for i in range(len(preds))
                      if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
            assert err < 0.1
            assert xgb_model.get_booster().attr("scikit_learn") is None

            # test native booster
            preds = xgb_model.predict(X[test_index], output_margin=True)
            booster = xgb.Booster(model_file=model_path)
            predt_1 = booster.predict(
                xgb.DMatrix(X[test_index]), output_margin=True)
            assert np.allclose(preds, predt_1)

            with self.assertRaises(TypeError):
                xgb_model = xgb.XGBModel()
                xgb_model.load_model(model_path)
Example #8
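A smoke test of the sklearn API with the gblinear booster on iris, asserting the error rate stays below 0.5.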
    def test_sklearn_api_gblinear(self):
        self._init_ray()

        from sklearn.datasets import load_iris
        from sklearn.model_selection import train_test_split

        iris = load_iris()
        tr_d, te_d, tr_l, te_l = train_test_split(
            iris.data, iris.target, train_size=120)

        classifier = RayXGBClassifier(
            booster="gblinear", n_estimators=100, random_state=self.seed)
        classifier.fit(tr_d, tr_l)

        preds = classifier.predict(te_d)
        labels = te_l
        err = sum(1 for p, l in zip(preds, labels) if p != l) / len(te_l)
        assert err < 0.5
Example #9
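Runs the save/load round trip for both binary and JSON model formats, then checks that a natively trained xgb Booster can be loaded into RayXGBClassifier (and xgb.XGBModel) with matching probability outputs.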
    def test_save_load_model(self):
        self._init_ray()

        with TemporaryDirectory() as tempdir:
            model_path = os.path.join(tempdir, "digits.model")
            self.save_load_model(model_path)

        with TemporaryDirectory() as tempdir:
            model_path = os.path.join(tempdir, "digits.model.json")
            self.save_load_model(model_path)

        from sklearn.datasets import load_digits

        with TemporaryDirectory() as tempdir:
            model_path = os.path.join(tempdir, "digits.model.json")
            digits = load_digits(n_class=2)
            y = digits["target"]
            X = digits["data"]
            booster = xgb.train(
                {
                    "tree_method": "hist",
                    "objective": "binary:logistic"
                },
                dtrain=xgb.DMatrix(X, y),
                num_boost_round=4,
            )
            predt_0 = booster.predict(xgb.DMatrix(X))
            booster.save_model(model_path)
            cls = RayXGBClassifier()
            cls.load_model(model_path)

            proba = cls.predict_proba(X)
            assert proba.shape[0] == X.shape[0]
            assert proba.shape[1] == 2  # binary

            predt_1 = cls.predict_proba(X)[:, 1]
            assert np.allclose(predt_0, predt_1)

            cls = xgb.XGBModel()
            cls.load_model(model_path)
            predt_1 = cls.predict(X)
            assert np.allclose(predt_0, predt_1)
Example #10
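Trains with a custom logistic objective (logregobj) and asserts a low error rate, then confirms the custom objective is actually invoked by fitting with an objective that raises, which surfaces as a RuntimeError (see the TODO in the test).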
    def test_classification_with_custom_objective(self):
        self._init_ray()

        from sklearn.datasets import load_digits
        from sklearn.model_selection import KFold

        def logregobj(y_true, y_pred):
            y_pred = 1.0 / (1.0 + np.exp(-y_pred))
            grad = y_pred - y_true
            hess = y_pred * (1.0 - y_pred)
            return grad, hess

        digits = load_digits(n_class=2)
        y = digits["target"]
        X = digits["data"]
        kf = KFold(n_splits=2, shuffle=True, random_state=self.rng)
        for train_index, test_index in kf.split(X, y):
            xgb_model = RayXGBClassifier(objective=logregobj)
            xgb_model.fit(X[train_index], y[train_index])
            preds = xgb_model.predict(X[test_index])
            labels = y[test_index]
            err = sum(1 for i in range(len(preds))
                      if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
            assert err < 0.1

        # Test that the custom objective function is actually used
        class XGBCustomObjectiveException(Exception):
            pass

        def dummy_objective(y_true, y_preds):
            raise XGBCustomObjectiveException()

        xgb_model = RayXGBClassifier(objective=dummy_objective)
        # TODO figure out how to assertRaises XGBCustomObjectiveException
        with self.assertRaises(RuntimeError):
            xgb_model.fit(X, y)