Example #1
    def test_pandas_input(self):
        self._init_ray()

        import pandas as pd
        from sklearn.calibration import CalibratedClassifierCV

        rng = np.random.RandomState(self.seed)

        kRows = 100
        kCols = 6

        X = rng.randint(low=0, high=2, size=kRows * kCols)
        X = X.reshape(kRows, kCols)

        df = pd.DataFrame(X)
        feature_names = ["k" + str(i) for i in range(1, kCols)]

        df.columns = ["status"] + feature_names

        target = df["status"]
        train = df.drop(columns=["status"])
        model = RayXGBClassifier()
        model.fit(train, target)
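        # calibrate the already-fitted classifier; cv="prefit" reuses it as-is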
        clf_isotonic = CalibratedClassifierCV(
            model, cv="prefit", method="isotonic")
        clf_isotonic.fit(train, target)
        assert isinstance(
            clf_isotonic.calibrated_classifiers_[0].base_estimator,
            RayXGBClassifier,
        )
        self.assertTrue(
            np.allclose(np.array(clf_isotonic.classes_), np.array([0, 1])))
Example #2
    def test_sklearn_clone(self):
        self._init_ray()

        from sklearn.base import clone

        clf = RayXGBClassifier(n_jobs=2)
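        # mutate a parameter after construction; clone() must pick up the -1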
        clf.n_jobs = -1
        clone(clf)
Example #3
    def test_select_feature(self):
        self._init_ray()

        from sklearn.datasets import load_digits
        from sklearn.feature_selection import SelectFromModel

        digits = load_digits(n_class=2)
        y = digits["target"]
        X = digits["data"]
        cls = RayXGBClassifier()
        cls.fit(X, y)
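        # prefit=True reuses the fitted model's feature importances directly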
        selector = SelectFromModel(cls, prefit=True, max_features=1)
        X_selected = selector.transform(X)
        assert X_selected.shape[1] == 1
Example #4
    def test_sklearn_random_state(self):
        self._init_ray()

        clf = RayXGBClassifier(random_state=402)
        assert clf.get_xgb_params()["random_state"] == 402

        clf = RayXGBClassifier(random_state=401)
        assert clf.get_xgb_params()["random_state"] == 401

        random_state = np.random.RandomState(seed=403)
        clf = RayXGBClassifier(random_state=random_state)
        assert isinstance(clf.get_xgb_params()["random_state"], int)
Example #5
    def test_kwargs_error(self):
        self._init_ray()

        params = {"updater": "grow_gpu_hist", "subsample": 0.5, "n_jobs": -1}
        with self.assertRaises(TypeError):
            clf = RayXGBClassifier(n_jobs=1000, **params)
            assert isinstance(clf, RayXGBClassifier)
Example #6
    def test_stacking_classification(self):
        self._init_ray()

        from sklearn.model_selection import train_test_split
        from sklearn.datasets import load_iris
        from sklearn.svm import LinearSVC
        from sklearn.linear_model import LogisticRegression
        from sklearn.preprocessing import StandardScaler
        from sklearn.pipeline import make_pipeline
        from sklearn.ensemble import StackingClassifier

        X, y = load_iris(return_X_y=True)
        estimators = [
            ("gbm", RayXGBClassifier()),
            (
                "svr",
                make_pipeline(StandardScaler(), LinearSVC(random_state=42)),
            ),
        ]
        clf = StackingClassifier(
            estimators=estimators, final_estimator=LogisticRegression())

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=42)
        clf.fit(X_train, y_train).score(X_test, y_test)
Example #7
        def train(booster, forest):
            rounds = 4
            cls = RayXGBClassifier(
                n_estimators=rounds, num_parallel_tree=forest,
                booster=booster).fit(
                    X, y, eval_set=[(X, y)], early_stopping_rounds=3)

            if forest:
                assert cls.best_ntree_limit == rounds * forest
            else:
                assert cls.best_ntree_limit == 0

            # best_ntree_limit is used by default; assert that under gblinear
            # it's automatically ignored due to being 0.
            cls.predict(X)
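
The helper above is an inner function from a larger test: `X` and `y` come from the enclosing scope. A minimal sketch of the assumed surrounding setup (the dataset and the (booster, forest) combinations are illustrative assumptions, not part of this snippet):

        from sklearn.datasets import load_iris

        X, y = load_iris(return_X_y=True)

        num_parallel_tree = 4
        train("gbtree", num_parallel_tree)  # best_ntree_limit == 4 * 4
        train("dart", num_parallel_tree)  # best_ntree_limit == 4 * 4
        train("gblinear", None)  # no trees, so best_ntree_limit stays 0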
Example #8
    def test_validation_weights_xgbclassifier(self):
        self._init_ray()

        from sklearn.datasets import make_hastie_10_2

        # prepare training and test data
        X, y = make_hastie_10_2(n_samples=2000, random_state=42)
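        # map the {-1, 1} Hastie labels onto {0, 1}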
        labels, y = np.unique(y, return_inverse=True)
        X_train, X_test = X[:1600], X[1600:]
        y_train, y_test = y[:1600], y[1600:]

        # instantiate model
        param_dist = {
            "objective": "binary:logistic",
            "n_estimators": 2,
            "random_state": 123,
        }
        clf = RayXGBClassifier(**param_dist)

        # train it using instance weights only in the training set
        weights_train = np.random.choice([1, 2], len(X_train))
        clf.fit(
            X_train,
            y_train,
            sample_weight=weights_train,
            eval_set=[(X_test, y_test)],
            eval_metric="logloss",
            verbose=False,
        )

        # evaluate logloss metric on test set *without* using weights
        evals_result_without_weights = clf.evals_result()
        logloss_without_weights = evals_result_without_weights["validation_0"][
            "logloss"]

        # now use weights for the test set
        np.random.seed(0)
        weights_test = np.random.choice([1, 2], len(X_test))
        clf.fit(
            X_train,
            y_train,
            sample_weight=weights_train,
            eval_set=[(X_test, y_test)],
            sample_weight_eval_set=[weights_test],
            eval_metric="logloss",
            verbose=False,
        )
        evals_result_with_weights = clf.evals_result()
        logloss_with_weights = evals_result_with_weights["validation_0"][
            "logloss"]

        # check that the logloss in the test set is actually different
        # when using weights than when not using them
        assert all((logloss_with_weights[i] != logloss_without_weights[i]
                    for i in [0, 1]))
Example #9
    def test_sklearn_api_gblinear(self):
        self._init_ray()

        from sklearn.datasets import load_iris
        from sklearn.model_selection import train_test_split

        iris = load_iris()
        tr_d, te_d, tr_l, te_l = train_test_split(
            iris.data, iris.target, train_size=120)

        classifier = RayXGBClassifier(
            booster="gblinear", n_estimators=100, random_state=self.seed)
        classifier.fit(tr_d, tr_l)

        preds = classifier.predict(te_d)
        labels = te_l
        err = sum(1 for p, l in zip(preds, labels) if p != l) / len(te_l)
        assert err < 0.5
Example #10
    def testClassifierNoLabelEncoder(self, n_class=2):
        self._init_ray()

        from sklearn.datasets import load_digits
        from sklearn.model_selection import train_test_split

        digits = load_digits(n_class=n_class)
        y = digits["target"]
        X = digits["data"]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.5)

        train_matrix = RayDMatrix(X_train, y_train)
        test_matrix = RayDMatrix(X_test, y_test)
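        # the labels travel inside each RayDMatrix, so fit() receives y=None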

        with self.assertRaisesRegex(Exception, "num_class"):
            RayXGBClassifier(**self.params).fit(train_matrix, None)

        with self.assertRaisesRegex(Exception, r"must be \(RayDMatrix, str\)"):
            RayXGBClassifier(**self.params).fit(
                train_matrix, None, eval_set=[(X_test, y_test)])

        with self.assertRaisesRegex(Exception,
                                    r"must be \(array_like, array_like\)"):
            RayXGBClassifier(**self.params).fit(
                X_train, y_train, eval_set=[(test_matrix, "eval")])

        RayXGBClassifier(num_class=n_class, **self.params).fit(
            train_matrix, None)

        clf = RayXGBClassifier(num_class=n_class, **self.params).fit(
            train_matrix, None, eval_set=[(test_matrix, "eval")])

        clf.predict(test_matrix)
        clf.predict_proba(test_matrix)
Example #11
    def test_kwargs_grid_search(self):
        self._init_ray()

        from sklearn.model_selection import GridSearchCV
        from sklearn import datasets

        params = {"tree_method": "hist"}
        clf = RayXGBClassifier(n_estimators=1, learning_rate=1.0, **params)
        assert clf.get_params()["tree_method"] == "hist"
        # 'max_leaves' is not a default argument of XGBClassifier
        # Check we can still do grid search over this parameter
        search_params = {"max_leaves": range(2, 5)}
        grid_cv = GridSearchCV(clf, search_params, cv=5)
        iris = datasets.load_iris()
        grid_cv.fit(iris.data, iris.target)

        # Expect unique results for each parameter value
        # This confirms sklearn is able to successfully update the parameter
        means = grid_cv.cv_results_["mean_test_score"]
        assert len(means) == len(set(means))
Example #12
    def test_sklearn_n_jobs(self):
        self._init_ray()

        clf = RayXGBClassifier(n_jobs=1)
        assert clf.get_xgb_params()["n_jobs"] == 1

        clf = RayXGBClassifier(n_jobs=2)
        assert clf.get_xgb_params()["n_jobs"] == 2
Example #13
    def test_save_load_model(self):
        self._init_ray()

        with TemporaryDirectory() as tempdir:
            model_path = os.path.join(tempdir, "digits.model")
            self.save_load_model(model_path)

        with TemporaryDirectory() as tempdir:
            model_path = os.path.join(tempdir, "digits.model.json")
            self.save_load_model(model_path)

        from sklearn.datasets import load_digits

        with TemporaryDirectory() as tempdir:
            model_path = os.path.join(tempdir, "digits.model.json")
            digits = load_digits(n_class=2)
            y = digits["target"]
            X = digits["data"]
            booster = xgb.train(
                {
                    "tree_method": "hist",
                    "objective": "binary:logistic"
                },
                dtrain=xgb.DMatrix(X, y),
                num_boost_round=4,
            )
            predt_0 = booster.predict(xgb.DMatrix(X))
            booster.save_model(model_path)
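            # reload the natively-saved booster through the sklearn wrapper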
            cls = RayXGBClassifier()
            cls.load_model(model_path)

            proba = cls.predict_proba(X)
            assert proba.shape[0] == X.shape[0]
            assert proba.shape[1] == 2  # binary

            predt_1 = cls.predict_proba(X)[:, 1]
            assert np.allclose(predt_0, predt_1)

            cls = xgb.XGBModel()
            cls.load_model(model_path)
            predt_1 = cls.predict(X)
            assert np.allclose(predt_0, predt_1)
Example #14
    def test_sklearn_get_default_params(self):
        self._init_ray()

        from sklearn.datasets import load_digits

        digits_2class = load_digits(n_class=2)
        X = digits_2class["data"]
        y = digits_2class["target"]
        cls = RayXGBClassifier()
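        # base_score is only resolved from the data during fit()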
        assert cls.get_params()["base_score"] is None
        cls.fit(X[:4, ...], y[:4, ...])
        assert cls.get_params()["base_score"] is not None
Example #15
    def test_classification_with_custom_objective(self):
        self._init_ray()

        from sklearn.datasets import load_digits
        from sklearn.model_selection import KFold

        def logregobj(y_true, y_pred):
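            # gradient and hessian of the binary log loss w.r.t. the raw margin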
            y_pred = 1.0 / (1.0 + np.exp(-y_pred))
            grad = y_pred - y_true
            hess = y_pred * (1.0 - y_pred)
            return grad, hess

        digits = load_digits(n_class=2)
        y = digits["target"]
        X = digits["data"]
        kf = KFold(n_splits=2, shuffle=True, random_state=self.rng)
        for train_index, test_index in kf.split(X, y):
            xgb_model = RayXGBClassifier(objective=logregobj)
            xgb_model.fit(X[train_index], y[train_index])
            preds = xgb_model.predict(X[test_index])
            labels = y[test_index]
            err = sum(1 for i in range(len(preds))
                      if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
            assert err < 0.1

        # Test that the custom objective function is actually used
        class XGBCustomObjectiveException(Exception):
            pass

        def dummy_objective(y_true, y_preds):
            raise XGBCustomObjectiveException()

        xgb_model = RayXGBClassifier(objective=dummy_objective)
        # TODO figure out how to assertRaises XGBCustomObjectiveException
        with self.assertRaises(RuntimeError):
            xgb_model.fit(X, y)
Example #16
    def test_estimator_type(self):
        self._init_ray()

        assert RayXGBClassifier._estimator_type == "classifier"
        assert RayXGBRFClassifier._estimator_type == "classifier"
        assert RayXGBRegressor._estimator_type == "regressor"
        assert RayXGBRFRegressor._estimator_type == "regressor"
        assert RayXGBRanker._estimator_type == "ranker"

        from sklearn.datasets import load_digits

        X, y = load_digits(n_class=2, return_X_y=True)
        cls = RayXGBClassifier(n_estimators=2).fit(X, y)
        with tempfile.TemporaryDirectory() as tmpdir:
            path = os.path.join(tmpdir, "cls.json")
            cls.save_model(path)
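            # a model saved as a classifier must refuse to load as a regressor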

            reg = RayXGBRegressor()
            with self.assertRaises(TypeError):
                reg.load_model(path)

            cls = RayXGBClassifier()
            cls.load_model(path)  # no error
Example #17
    def test_XGBClassifier_resume(self):
        self._init_ray()

        from sklearn.datasets import load_breast_cancer
        from sklearn.metrics import log_loss

        with TemporaryDirectory() as tempdir:
            model1_path = os.path.join(tempdir, "test_XGBClassifier.model")
            model1_booster_path = os.path.join(tempdir,
                                               "test_XGBClassifier.booster")

            X, Y = load_breast_cancer(return_X_y=True)

            model1 = RayXGBClassifier(
                learning_rate=0.3, random_state=0, n_estimators=8)
            model1.fit(X, Y)

            pred1 = model1.predict(X)
            # sklearn's log_loss signature is (y_true, y_pred)
            log_loss1 = log_loss(Y, pred1)

            # file name of stored xgb model
            model1.save_model(model1_path)
            model2 = RayXGBClassifier(
                learning_rate=0.3, random_state=0, n_estimators=8)
            model2.fit(X, Y, xgb_model=model1_path)

            pred2 = model2.predict(X)
            log_loss2 = log_loss(Y, pred2)

            assert np.any(pred1 != pred2)
            assert log_loss1 > log_loss2

            # file name of 'Booster' instance Xgb model
            model1.get_booster().save_model(model1_booster_path)
            model2 = RayXGBClassifier(
                learning_rate=0.3, random_state=0, n_estimators=8)
            model2.fit(X, Y, xgb_model=model1_booster_path)

            pred2 = model2.predict(X)
            log_loss2 = log_loss(Y, pred2)

            assert np.any(pred1 != pred2)
            assert log_loss1 > log_loss2
Example #18
    def save_load_model(self, model_path):
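        """Fit, save, reload and re-check a classifier at `model_path`."""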
        from sklearn.datasets import load_digits
        from sklearn.model_selection import KFold

        digits = load_digits(n_class=2)
        y = digits["target"]
        X = digits["data"]
        kf = KFold(n_splits=2, shuffle=True, random_state=self.rng)
        for train_index, test_index in kf.split(X, y):
            xgb_model = RayXGBClassifier(use_label_encoder=False).fit(
                X[train_index], y[train_index])
            xgb_model.save_model(model_path)

            xgb_model = RayXGBClassifier()
            xgb_model.load_model(model_path)

            assert xgb_model.use_label_encoder is False
            assert isinstance(xgb_model.classes_, np.ndarray)
            assert isinstance(xgb_model._Booster, xgb.Booster)

            preds = xgb_model.predict(X[test_index])
            labels = y[test_index]
            err = sum(1 for i in range(len(preds))
                      if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
            assert err < 0.1
            assert xgb_model.get_booster().attr("scikit_learn") is None

            # test native booster
            preds = xgb_model.predict(X[test_index], output_margin=True)
            booster = xgb.Booster(model_file=model_path)
            predt_1 = booster.predict(
                xgb.DMatrix(X[test_index]), output_margin=True)
            assert np.allclose(preds, predt_1)

            with self.assertRaises(TypeError):
                xgb_model = xgb.XGBModel()
                xgb_model.load_model(model_path)
Example #19
    def test_multiclass_classification(self):
        self._init_ray()

        from sklearn.datasets import load_iris
        from sklearn.model_selection import KFold

        def check_pred(preds, labels, output_margin):
            if output_margin:
                err = sum(1 for i in range(len(preds))
                          if preds[i].argmax() != labels[i]) / float(
                              len(preds))
            else:
                err = sum(1 for i in range(len(preds))
                          if preds[i] != labels[i]) / float(len(preds))
            assert err < 0.4

        iris = load_iris()
        y = iris["target"]
        X = iris["data"]
        kf = KFold(n_splits=2, shuffle=True, random_state=self.rng)
        for train_index, test_index in kf.split(X, y):
            xgb_model = RayXGBClassifier().fit(X[train_index], y[train_index])
            if hasattr(xgb_model.get_booster(), "num_boosted_rounds"):
                assert (xgb_model.get_booster().num_boosted_rounds() ==
                        xgb_model.n_estimators)
            preds = xgb_model.predict(X[test_index])
            # test other params in XGBClassifier().fit
            preds2 = xgb_model.predict(
                X[test_index], output_margin=True, ntree_limit=3)
            preds3 = xgb_model.predict(
                X[test_index], output_margin=True, ntree_limit=0)
            preds4 = xgb_model.predict(
                X[test_index], output_margin=False, ntree_limit=3)
            labels = y[test_index]

            check_pred(preds, labels, output_margin=False)
            check_pred(preds2, labels, output_margin=True)
            check_pred(preds3, labels, output_margin=True)
            check_pred(preds4, labels, output_margin=False)

        cls = RayXGBClassifier(n_estimators=4).fit(X, y)
        assert cls.n_classes_ == 3
        proba = cls.predict_proba(X)
        assert proba.shape[0] == X.shape[0]
        assert proba.shape[1] == cls.n_classes_

        # custom objective, the default is multi:softprob
        # so no transformation is required.
        cls = RayXGBClassifier(
            n_estimators=4, objective=softprob_obj(3)).fit(X, y)
        proba = cls.predict_proba(X)
        assert proba.shape[0] == X.shape[0]
        assert proba.shape[1] == cls.n_classes_
Example #20
    def test_parameters_access(self):
        self._init_ray()

        from sklearn import datasets

        params = {"updater": "grow_gpu_hist", "subsample": 0.5, "n_jobs": -1}
        clf = RayXGBClassifier(n_estimators=1000, **params)
        assert clf.get_params()["updater"] == "grow_gpu_hist"
        assert clf.get_params()["subsample"] == 0.5
        assert clf.get_params()["n_estimators"] == 1000

        clf = RayXGBClassifier(n_estimators=1, nthread=4)
        X, y = datasets.load_iris(return_X_y=True)
        clf.fit(X, y)

        config = json.loads(clf.get_booster().save_config())
        assert int(config["learner"]["generic_param"]["nthread"]) == 4

        clf.set_params(nthread=16)
        config = json.loads(clf.get_booster().save_config())
        assert int(config["learner"]["generic_param"]["nthread"]) == 16

        clf.predict(X)
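        # predicting must not reset the configured nthread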
        config = json.loads(clf.get_booster().save_config())
        assert int(config["learner"]["generic_param"]["nthread"]) == 16
Example #21
    def run_boost_from_prediction(self, tree_method):
        from sklearn.datasets import load_breast_cancer

        X, y = load_breast_cancer(return_X_y=True)
        model_0 = RayXGBClassifier(
            learning_rate=0.3,
            random_state=0,
            n_estimators=4,
            tree_method=tree_method,
        )
        model_0.fit(X=X, y=y)
        margin = model_0.predict(X, output_margin=True)

        model_1 = RayXGBClassifier(
            learning_rate=0.3,
            random_state=0,
            n_estimators=4,
            tree_method=tree_method,
        )
        model_1.fit(X=X, y=y, base_margin=margin)
        predictions_1 = model_1.predict(X, base_margin=margin)

        # model_1 (4 rounds on top of model_0's margin) should match an
        # 8-round model trained from scratch
        cls_2 = RayXGBClassifier(
            learning_rate=0.3,
            random_state=0,
            n_estimators=8,
            tree_method=tree_method,
        )
        cls_2.fit(X=X, y=y)
        predictions_2 = cls_2.predict(X)
        assert np.all(predictions_1 == predictions_2)