    def test_configured(self):
        X_train, X_test, y_train, y_test, feature_names = self.load_data(
            multiclass=False)

        actual = GradientBoostingClassifier(random_state=42)
        config = self.get_config(actual)

        actual.set_hyperparameters(config)
        actual.fit(X_train, y_train)
        y_actual = actual.predict(X_test)

        config['max_depth'] = max(
            resolve_factor(config['max_depth_factor'], X_train.shape[1]), 2)
        del config['max_depth_factor']

        config['max_leaf_nodes'] = max(
            resolve_factor(config['max_leaf_nodes_factor'], X_train.shape[0]),
            2)
        del config['max_leaf_nodes_factor']

        config['min_samples_leaf'] = resolve_factor(
            config['min_samples_leaf_factor'], X_train.shape[0])
        del config['min_samples_leaf_factor']

        expected = HistGradientBoostingClassifier(**config,
                                                  scoring='f1_weighted',
                                                  random_state=42)
        expected.fit(X_train, y_train)
        y_expected = expected.predict(X_test)

        assert actual.get_feature_names_out(feature_names).tolist() == [
            'prediction'
        ]
        assert repr(actual.estimator_) == repr(expected)
        assert np.allclose(y_actual, y_expected)
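
Both the test above and the to_sklearn() method later in this collection rely on a resolve_factor helper whose definition is not included here. The following is a hypothetical reconstruction of its semantics, assuming a relative factor is scaled by n and that a factor equal to the configuration-space default (cs_default) falls back to default:

def resolve_factor(value, n, default=None, cs_default=None):
    # Hypothetical reconstruction -- the real helper is not part of this
    # collection. A missing factor, or one equal to the config-space
    # default, falls back to `default`; otherwise the factor is scaled
    # by n and clamped to at least 1.
    if value is None or value == cs_default:
        return default
    return max(int(round(value * n)), 1)
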
    def _run_GB_trees_classifier_converter(self, num_classes, extra_config={}, labels_shift=0):
        warnings.filterwarnings("ignore")
        for max_depth in [2, 3, 8, 10, 12, None]:
            model = HistGradientBoostingClassifier(max_iter=10, max_depth=max_depth)
            X = [[0, 1], [1, 1], [2, 0]]
            X = np.array(X, dtype=np.float32)
            y = np.array([100, -10, 50]) + labels_shift

            model.fit(X, y)
            pytorch_model = hummingbird.ml.convert(model, "pytorch", extra_config=extra_config)
            self.assertTrue(pytorch_model is not None)
            np.testing.assert_allclose(model.predict_proba(X), pytorch_model.predict_proba(X), rtol=1e-06, atol=1e-06)

    def test_float64_GB_trees_classifier_converter(self):
        warnings.filterwarnings("ignore")
        num_classes = 3
        for max_depth in [2, 3, 8, 10, 12, None]:
            model = HistGradientBoostingClassifier(max_iter=10, max_depth=max_depth)
            np.random.seed(0)
            X = np.random.rand(100, 200)
            y = np.random.randint(num_classes, size=100)

            model.fit(X, y)
            torch_model = hummingbird.ml.convert(model, "torch", extra_config={})
            self.assertTrue(torch_model is not None)
            np.testing.assert_allclose(model.predict_proba(X), torch_model.predict_proba(X), rtol=1e-06, atol=1e-06)

    def _run_GB_trees_classifier_converter(self, num_classes, extra_config={}, labels_shift=0):
        warnings.filterwarnings("ignore")
        for max_depth in [2, 3, 8, 10, 12, None]:
            model = HistGradientBoostingClassifier(max_iter=10, max_depth=max_depth)
            np.random.seed(0)
            X = np.random.rand(100, 200)
            X = np.array(X, dtype=np.float32)
            y = np.random.randint(num_classes, size=100) + labels_shift

            model.fit(X, y)
            torch_model = hummingbird.ml.convert(model, "torch", extra_config=extra_config)
            self.assertTrue(torch_model is not None)
            np.testing.assert_allclose(model.predict_proba(X), torch_model.predict_proba(X), rtol=1e-06, atol=1e-06)
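
In the hummingbird test suite, a helper like the one above is usually exercised through thin test wrappers. A sketch of plausible call sites (the wrapper names below are illustrative, not taken from this collection):

    def test_GB_trees_classifier_binary_converter(self):
        # Illustrative: binary classification round-trip
        self._run_GB_trees_classifier_converter(2)

    def test_GB_trees_classifier_multi_converter(self):
        # Illustrative: three-class round-trip
        self._run_GB_trees_classifier_converter(3)

    def test_GB_trees_classifier_shifted_labels_converter(self):
        # Illustrative: labels that do not start at 0
        self._run_GB_trees_classifier_converter(3, labels_shift=2)
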
    def test_HistGBDT_raises_wrong_type(self):
        warnings.filterwarnings("ignore")
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(3, size=100).astype(np.float32)  # y must be int, not float; conversion should raise
        model = HistGradientBoostingClassifier(max_iter=10).fit(X, y)
        self.assertRaises(RuntimeError, hummingbird.ml.convert, model, "pytorch")

    def test_default(self):
        X_train, X_test, y_train, y_test, feature_names = self.load_data()

        actual = GradientBoostingClassifier(random_state=42)
        config = self.get_default(actual)

        actual.set_hyperparameters(config)
        actual.fit(X_train, y_train)
        y_actual = actual.predict(X_test)

        expected = HistGradientBoostingClassifier(n_iter_no_change=10,
                                                  l2_regularization=1e-10,
                                                  scoring='f1_weighted',
                                                  random_state=42)
        expected.fit(X_train, y_train)
        y_expected = expected.predict(X_test)

        assert actual.get_feature_names_out(feature_names).tolist() == [
            'prediction'
        ]
        assert repr(actual.estimator_) == repr(expected)
        assert np.allclose(y_actual, y_expected)
    def to_sklearn(self, n_samples: int = 0, n_features: int = 0, **kwargs):
        # Private import path predates scikit-learn 1.0; the public equivalent
        # is `from sklearn.ensemble import HistGradientBoostingClassifier`.
        from sklearn.ensemble._hist_gradient_boosting.gradient_boosting import HistGradientBoostingClassifier

        if check_none(self.scoring):
            self.scoring = None

        # Heuristic to set the tree depth
        max_depth = resolve_factor(self.max_depth_factor, n_features, cs_default=1.)
        if max_depth is not None:
            max_depth = max(max_depth, 2)

        # The configuration-space default (1e-07) stands in for sklearn's default of 0.
        l2_regularization = 0. if self.l2_regularization == 1e-07 else self.l2_regularization

        # Heuristic to set the tree width
        if isinstance(self.max_leaf_nodes_factor, int):
            max_leaf_nodes = self.max_leaf_nodes_factor
        else:
            max_leaf_nodes = resolve_factor(self.max_leaf_nodes_factor, n_samples, default=31, cs_default=1.)
        if max_leaf_nodes is not None:
            max_leaf_nodes = max(max_leaf_nodes, 2)

        # Heuristic to set the minimum number of samples per leaf
        if isinstance(self.min_samples_leaf_factor, int):
            min_samples_leaf = self.min_samples_leaf_factor
        else:
            min_samples_leaf = resolve_factor(self.min_samples_leaf_factor, n_samples, default=20, cs_default=0.0001)

        n_iter_no_change = None if self.n_iter_no_change == 0 else self.n_iter_no_change

        # Correct a misspelled scoring value to the sklearn metric name
        if self.scoring == 'balanced_accurary':
            self.scoring = 'balanced_accuracy'

        return HistGradientBoostingClassifier(
            loss=self.loss,
            learning_rate=self.learning_rate,
            max_iter=self.max_iter,
            min_samples_leaf=min_samples_leaf,
            max_depth=max_depth,
            max_leaf_nodes=max_leaf_nodes,
            max_bins=self.max_bins,
            l2_regularization=l2_regularization,
            tol=self.tol,
            scoring=self.scoring,
            n_iter_no_change=n_iter_no_change,
            validation_fraction=self.validation_fraction,
            random_state=self.random_state,
        )
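
A usage sketch for the factory above; component stands for an instance of the surrounding hyperparameter class (not shown in this collection) with its fields already populated:

# Illustrative only: resolve the relative factors against the
# training data's shape, then fit the resulting sklearn estimator.
estimator = component.to_sklearn(n_samples=X_train.shape[0],
                                 n_features=X_train.shape[1])
estimator.fit(X_train, y_train)
y_pred = estimator.predict(X_test)
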
pmml_pipeline = Pipeline([("mapper", pmml_mapper),
                          ("classifier",
                           HistGradientBoostingClassifier(learning_rate=0.1,
                                                          max_depth=3,
                                                          random_state=13))])

sklearn_mapper = ColumnTransformer(
    [(str(cat_index), PMMLLabelBinarizer(sparse_output=False), [cat_index])
     for cat_index in range(0, len(cat_columns))] +
    [(str(cont_index), "passthrough", [cont_index])
     for cont_index in range(len(cat_columns), len(cat_columns + cont_columns))
     ],
    remainder="drop")

sklearn_pipeline = Pipeline([("mapper", sklearn_mapper),
                             ("classifier",
                              HistGradientBoostingClassifier(max_iter=31,
                                                             max_depth=3,
                                                             random_state=13))
                             ])

final_estimator = LogisticRegression(multi_class="ovr", random_state=13)


class DisabledCV:
    """Single-split 'cross-validator' whose train and test folds both cover
    the full dataset, effectively disabling cross-validation."""

    def __init__(self):
        self.n_splits = 1

    def split(self, X, y, groups=None):
        # One fold: every sample is used for both training and testing
        yield (numpy.arange(len(X)), numpy.arange(len(y)))

    def get_n_splits(self, X, y, groups=None):
        return self.n_splits
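
DisabledCV and final_estimator only make sense as inputs to a stacking ensemble. A sketch of how they could be wired into scikit-learn's StackingClassifier (the estimator list is illustrative):

from sklearn.ensemble import StackingClassifier

stacking_classifier = StackingClassifier(
    estimators=[("hgb", sklearn_pipeline)],
    final_estimator=final_estimator,
    # A single fold with train == test, so the meta-features are
    # computed on the full training set.
    cv=DisabledCV())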