Esempio n. 1
0
def test_gbm_regressor_backupsklearn(backend='auto'):
    """Compare h2o4gpu's GradientBoostingRegressor wrapper against sklearn.

    Fits both the h2o4gpu wrapper (with the requested ``backend``) and the
    bundled ``GradientBoostingRegressorSklearn`` on the same data.  The
    equality assertions are only executed when ``backend == "sklearn"``;
    for any other backend the test only checks that both fits complete.

    :param backend: backend name forwarded to the h2o4gpu solver.
    """
    # sep=r"\s+" is the non-deprecated spelling of delim_whitespace=True.
    df = pd.read_csv("./open_data/simple.txt", sep=r"\s+")
    # The last column is the target, everything before it is a feature.
    target_col = df.shape[1] - 1
    X = np.array(df.iloc[:, :target_col], dtype='float32', order='C')
    y = np.array(df.iloc[:, target_col], dtype='float32', order='C')
    import h2o4gpu
    Solver = h2o4gpu.GradientBoostingRegressor

    # Run h2o4gpu version of Gradient Boosting Regression
    gbm = Solver(backend=backend,
                 random_state=1234,
                 n_gpus=n_gpus(),
                 n_jobs=-1)
    print("h2o4gpu fit()")
    gbm.fit(X, y)

    # Run Sklearn version of Gradient Boosting Regression
    from h2o4gpu.ensemble import GradientBoostingRegressorSklearn
    gbm_sk = GradientBoostingRegressorSklearn(random_state=1234, max_depth=3)
    print("Scikit fit()")
    gbm_sk.fit(X, y)

    if backend == "sklearn":
        assert (gbm.predict(X) == gbm_sk.predict(X)).all()
        # Materialise the per-stage comparison; printing the bare generator
        # expression would only show "<generator object ...>".
        print([
            (a == b).all()
            for a, b in zip(gbm.staged_predict(X), gbm_sk.staged_predict(X))
        ])
        assert np.allclose(list(gbm.staged_predict(X)),
                           list(gbm_sk.staged_predict(X)))
        # score() yields a scalar; compare directly instead of calling
        # .all(), which does not exist on a plain Python bool.
        assert gbm.score(X, y) == gbm_sk.score(X, y)
        assert (gbm.apply(X) == gbm_sk.apply(X)).all()

        print("Estimators")
        print(gbm.estimators_)
        print(gbm_sk.estimators_)

        print("loss")
        print(gbm.loss_)
        print(gbm_sk.loss_)
        assert gbm.loss_.__dict__ == gbm_sk.loss_.__dict__

        print("init_")
        print(gbm.init)
        print(gbm_sk.init)

        print("Feature importance")
        print(gbm.feature_importances_)
        print(gbm_sk.feature_importances_)
        assert (gbm.feature_importances_ == gbm_sk.feature_importances_).all()

        print("train_score_")
        print(gbm.train_score_)
        print(gbm_sk.train_score_)
        assert (gbm.train_score_ == gbm_sk.train_score_).all()
Esempio n. 2
0
class GradientBoostingRegressor(object):
    """H2O GradientBoostingRegressor Solver

    Selects between h2o4gpu.solvers.xgboost.GradientBoostingRegressor
    and h2o4gpu.ensemble.gradient_boosting.GradientBoostingRegressorSklearn
    Documentation:
    import h2o4gpu.solvers ; help(h2o4gpu.xgboost.GradientBoostingRegressorO)
    help(h2o4gpu.ensemble.gradient_boosting.GradientBoostingRegressorSklearn)

    :param: backend : Which backend to use.  Options are 'auto', 'sklearn',
        'h2o4gpu'.  Default is 'auto'.
        Saves as attribute for actual backend used.
        If the H2O4GPU_BACKEND environment variable is set, its value
        replaces this argument.

    With backend='auto' the xgboost-based model is used unless one of the
    sklearn-only parameters (see ``_SKLEARN_ONLY_DEFAULTS``) differs from
    its default, in which case the sklearn model is used instead.
    Parameters marked ``# h2o4gpu`` in the signature are forwarded to the
    xgboost-based model; all sklearn parameters are forwarded to the
    sklearn model.
    """

    # Parameters that only the sklearn implementation honours, paired with
    # the defaults declared in __init__'s signature.  Order determines the
    # order in which warnings are printed.
    _SKLEARN_ONLY_DEFAULTS = (
        ('loss', 'ls'),
        ('criterion', 'friedman_mse'),
        ('min_samples_split', 2),
        ('min_samples_leaf', 1),
        ('min_weight_fraction_leaf', 0.0),
        ('min_impurity_decrease', 0.0),
        ('min_impurity_split', None),
        ('init', None),
        ('max_features', 'auto'),
        ('alpha', 0.9),
        ('max_leaf_nodes', None),
        ('presort', 'auto'),
    )

    @classmethod
    def _changed_sklearn_params(cls, values):
        """Return ``[(name, value), ...]`` for sklearn-only params that
        differ from their defaults.

        :param values: mapping from every name in ``_SKLEARN_ONLY_DEFAULTS``
            to the value the caller supplied.
        """
        return [(name, values[name])
                for name, default in cls._SKLEARN_ONLY_DEFAULTS
                if values[name] != default]

    def __init__(
            self,
            loss='ls',
            learning_rate=0.1,  # h2o4gpu
            n_estimators=100,  # h2o4gpu
            subsample=1.0,  # h2o4gpu
            criterion='friedman_mse',
            min_samples_split=2,
            min_samples_leaf=1,
            min_weight_fraction_leaf=0.0,
            max_depth=3,  # h2o4gpu
            min_impurity_decrease=0.0,
            min_impurity_split=None,
            init=None,
            random_state=None,  # h2o4gpu
            max_features='auto',
            alpha=0.9,
            verbose=0,  # h2o4gpu
            max_leaf_nodes=None,
            warm_start=False,
            presort='auto',
            # XGBoost specific params
            colsample_bytree=1.0,  # h2o4gpu
            num_parallel_tree=100,  # h2o4gpu
            tree_method='gpu_hist',  # h2o4gpu
            n_gpus=-1,  # h2o4gpu
            predictor='gpu_predictor',  # h2o4gpu
            backend='auto'):  # h2o4gpu
        """Build both candidate models and pick the active one.

        Both the sklearn and the xgboost-based model are always
        constructed; ``self.model`` points at the one selected by
        ``backend`` (see the class docstring).
        """
        import os
        # The environment variable takes precedence over the argument.
        _backend = os.environ.get('H2O4GPU_BACKEND', None)
        if _backend is not None:
            backend = _backend
        from ..typecheck.typechecks import assert_is_type
        assert_is_type(backend, str)

        # Fall back to Sklearn
        # Can remove if fully implement sklearn functionality
        self.do_sklearn = False
        if backend == 'auto':
            changed = self._changed_sklearn_params({
                'loss': loss,
                'criterion': criterion,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf,
                'min_weight_fraction_leaf': min_weight_fraction_leaf,
                'min_impurity_decrease': min_impurity_decrease,
                'min_impurity_split': min_impurity_split,
                'init': init,
                'max_features': max_features,
                'alpha': alpha,
                'max_leaf_nodes': max_leaf_nodes,
                'presort': presort,
            })
            self.do_sklearn = bool(changed)
            if verbose > 0:
                for name, value in changed:
                    print("WARNING: The sklearn parameter " + name +
                          " has been changed from default to " + str(value) +
                          ". Will run Sklearn GradientBoostingRegressor.")
        elif backend == 'sklearn':
            self.do_sklearn = True
        elif backend == 'h2o4gpu':
            self.do_sklearn = False
        self.backend = backend

        from h2o4gpu.ensemble import GradientBoostingRegressorSklearn
        self.model_sklearn = GradientBoostingRegressorSklearn(
            loss=loss,
            learning_rate=learning_rate,  # h2o4gpu
            n_estimators=n_estimators,  # h2o4gpu
            subsample=subsample,  # h2o4gpu
            criterion=criterion,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_depth=max_depth,  # h2o4gpu
            min_impurity_decrease=min_impurity_decrease,
            min_impurity_split=min_impurity_split,
            init=init,
            random_state=random_state,  # h2o4gpu
            max_features=max_features,
            alpha=alpha,
            verbose=verbose,  # h2o4gpu
            max_leaf_nodes=max_leaf_nodes,
            warm_start=warm_start,
            presort=presort)

        # Parameters for gbm
        # xgboost should stay quiet exactly when no verbosity was requested.
        silent = verbose == 0
        # The sklearn model above received the caller's random_state
        # unchanged; only the xgboost model gets the 0 substitute for None.
        if random_state is None:
            random_state = 0

        import xgboost as xgb
        # A regressor must wrap the xgboost regressor, not the classifier.
        self.model_h2o4gpu = xgb.XGBRegressor(
            learning_rate=learning_rate,  # h2o4gpu
            n_estimators=n_estimators,  # h2o4gpu
            subsample=subsample,  # h2o4gpu
            max_depth=max_depth,  # h2o4gpu
            random_state=random_state,  # h2o4gpu
            silent=silent,  # h2o4gpu
            colsample_bytree=colsample_bytree,  # h2o4gpu
            num_parallel_tree=num_parallel_tree,  # h2o4gpu
            tree_method=tree_method,  # h2o4gpu
            n_gpus=n_gpus,  # h2o4gpu
            predictor=predictor,  # h2o4gpu
            backend=backend)  # h2o4gpu

        if self.do_sklearn:
            print("Running sklearn GradientBoostingRegressor")
            self.model = self.model_sklearn
        else:
            print("Running h2o4gpu GradientBoostingRegressor")
            self.model = self.model_h2o4gpu

    def apply(self, X):
        """Delegate to the sklearn model's ``apply``.

        Always uses ``self.model_sklearn``, which ``fit`` only trains when
        the sklearn backend is the active one.
        """
        print("WARNING: apply() is using sklearn")
        return self.model_sklearn.apply(X)

    def fit(self, X, y=None, sample_weight=None):
        """Fit the active model and refresh the mirrored attributes.

        :return: whatever the active model's ``fit`` returns.
        """
        res = self.model.fit(X, y, sample_weight)
        self.set_attributes()
        return res

    def get_params(self):
        """Return the active model's parameters."""
        return self.model.get_params()

    def predict(self, X):
        """Return the active model's predictions for ``X``.

        The predictions are returned as produced by the active model for
        both backends; regression outputs are not thresholded.
        """
        res = self.model.predict(X)
        self.set_attributes()
        return res

    def score(self, X, y, sample_weight=None):
        """Score ``(X, y)`` with the sklearn model.

        When the sklearn backend is not the active one, the sklearn model
        is first fitted on the very ``(X, y)`` passed here, so the result
        describes that freshly fitted sklearn model rather than the
        xgboost-based one.
        """
        # TODO add for h2o4gpu
        print("WARNING: score() is using sklearn")
        if not self.do_sklearn:
            self.model_sklearn.fit(X, y)  # Need to re-fit
        res = self.model_sklearn.score(X, y, sample_weight)
        return res

    def set_params(self, **params):
        """Forward ``params`` to the active model's ``set_params``."""
        return self.model.set_params(**params)

    def staged_predict(self, X):
        """Delegate to the sklearn model's ``staged_predict``.

        Always uses ``self.model_sklearn``, which ``fit`` only trains when
        the sklearn backend is the active one.
        """
        print("WARNING: staged_predict() is using sklearn")
        return self.model_sklearn.staged_predict(X)

    def set_attributes(self):
        """ Set attributes for class"""
        from ..solvers.utils import _setter
        # NOTE(review): presumably _setter executes each assignment string
        # and ignores the two listed exception types, so attributes the
        # active model lacks are skipped - confirm in solvers.utils.
        s = _setter(oself=self, e1=NameError, e2=AttributeError)

        s('oself.feature_importances_ = oself.model.feature_importances_')
        s('oself.oob_improvement_ = oself.model.oob_improvement_')
        s('oself.train_score_ = oself.model.train_score_')
        s('oself.loss_ = oself.model.loss_')
        s('oself.init = oself.model.init')
        s('oself.estimators_ = oself.model.estimators_')
Esempio n. 3
0
class GradientBoostingRegressor(object):
    """H2O GradientBoostingRegressor Solver

    Selects between h2o4gpu.solvers.xgboost.GradientBoostingRegressor
    and h2o4gpu.ensemble.gradient_boosting.GradientBoostingRegressorSklearn
    Documentation:
    import h2o4gpu.solvers ; help(h2o4gpu.xgboost.GradientBoostingRegressorO)
    help(h2o4gpu.ensemble.gradient_boosting.GradientBoostingRegressorSklearn)

    :param: backend : Which backend to use.  Options are 'auto', 'sklearn',
        'h2o4gpu'.  Default is 'auto'.
        Saves as attribute for actual backend used.
        If the H2O4GPU_BACKEND environment variable is set, its value
        replaces this argument.

    With backend='auto' the xgboost-based model is used unless one of the
    sklearn-only parameters (see ``_SKLEARN_ONLY_DEFAULTS``) differs from
    its default, in which case the sklearn model is used instead.
    Parameters marked ``# h2o4gpu`` in the signature are forwarded to the
    xgboost-based model; all sklearn parameters are forwarded to the
    sklearn model.
    """

    # Parameters that only the sklearn implementation honours, paired with
    # the defaults declared in __init__'s signature.  Order determines the
    # order in which warnings are printed.
    _SKLEARN_ONLY_DEFAULTS = (
        ('loss', 'ls'),
        ('criterion', 'friedman_mse'),
        ('min_samples_split', 2),
        ('min_samples_leaf', 1),
        ('min_weight_fraction_leaf', 0.0),
        ('min_impurity_decrease', 0.0),
        ('min_impurity_split', None),
        ('init', None),
        ('max_features', 'auto'),
        ('alpha', 0.9),
        ('max_leaf_nodes', None),
        ('presort', 'auto'),
    )

    @classmethod
    def _changed_sklearn_params(cls, values):
        """Return ``[(name, value), ...]`` for sklearn-only params that
        differ from their defaults.

        :param values: mapping from every name in ``_SKLEARN_ONLY_DEFAULTS``
            to the value the caller supplied.
        """
        return [(name, values[name])
                for name, default in cls._SKLEARN_ONLY_DEFAULTS
                if values[name] != default]

    def __init__(
            self,
            loss='ls',
            learning_rate=0.1,  # h2o4gpu
            n_estimators=100,  # h2o4gpu
            subsample=1.0,  # h2o4gpu
            criterion='friedman_mse',
            min_samples_split=2,
            min_samples_leaf=1,
            min_weight_fraction_leaf=0.0,
            max_depth=3,  # h2o4gpu
            min_impurity_decrease=0.0,
            min_impurity_split=None,
            init=None,
            random_state=None,  # h2o4gpu
            max_features='auto',
            alpha=0.9,
            verbose=0,  # h2o4gpu
            max_leaf_nodes=None,
            warm_start=False,
            presort='auto',
            # XGBoost specific params
            colsample_bytree=1.0,  # h2o4gpu
            num_parallel_tree=100,  # h2o4gpu
            tree_method='gpu_hist',  # h2o4gpu
            n_gpus=-1,  # h2o4gpu
            predictor='gpu_predictor',  # h2o4gpu
            backend='auto'):  # h2o4gpu
        """Build both candidate models and pick the active one.

        Both the sklearn and the xgboost-based model are always
        constructed; ``self.model`` points at the one selected by
        ``backend`` (see the class docstring).
        """
        import os
        # The environment variable takes precedence over the argument.
        _backend = os.environ.get('H2O4GPU_BACKEND', None)
        if _backend is not None:
            backend = _backend
        from ..typecheck.typechecks import assert_is_type
        assert_is_type(backend, str)

        # Fall back to Sklearn
        # Can remove if fully implement sklearn functionality
        self.do_sklearn = False
        if backend == 'auto':
            changed = self._changed_sklearn_params({
                'loss': loss,
                'criterion': criterion,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf,
                'min_weight_fraction_leaf': min_weight_fraction_leaf,
                'min_impurity_decrease': min_impurity_decrease,
                'min_impurity_split': min_impurity_split,
                'init': init,
                'max_features': max_features,
                'alpha': alpha,
                'max_leaf_nodes': max_leaf_nodes,
                'presort': presort,
            })
            self.do_sklearn = bool(changed)
            if verbose > 0:
                for name, value in changed:
                    print("WARNING: The sklearn parameter " + name +
                          " has been changed from default to " + str(value) +
                          ". Will run Sklearn GradientBoostingRegressor.")
        elif backend == 'sklearn':
            self.do_sklearn = True
        elif backend == 'h2o4gpu':
            self.do_sklearn = False
        self.backend = backend

        from h2o4gpu.ensemble import GradientBoostingRegressorSklearn
        self.model_sklearn = GradientBoostingRegressorSklearn(
            loss=loss,
            learning_rate=learning_rate,  # h2o4gpu
            n_estimators=n_estimators,  # h2o4gpu
            subsample=subsample,  # h2o4gpu
            criterion=criterion,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_depth=max_depth,  # h2o4gpu
            min_impurity_decrease=min_impurity_decrease,
            min_impurity_split=min_impurity_split,
            init=init,
            random_state=random_state,  # h2o4gpu
            max_features=max_features,
            alpha=alpha,
            verbose=verbose,  # h2o4gpu
            max_leaf_nodes=max_leaf_nodes,
            warm_start=warm_start,
            presort=presort)

        # Parameters for gbm
        # xgboost should stay quiet exactly when no verbosity was requested.
        silent = verbose == 0
        # The sklearn model above received the caller's random_state
        # unchanged; only the xgboost model gets the 0 substitute for None.
        if random_state is None:
            random_state = 0

        import xgboost as xgb
        # A regressor must wrap the xgboost regressor, not the classifier.
        self.model_h2o4gpu = xgb.XGBRegressor(
            learning_rate=learning_rate,  # h2o4gpu
            n_estimators=n_estimators,  # h2o4gpu
            subsample=subsample,  # h2o4gpu
            max_depth=max_depth,  # h2o4gpu
            random_state=random_state,  # h2o4gpu
            silent=silent,  # h2o4gpu
            colsample_bytree=colsample_bytree,  # h2o4gpu
            num_parallel_tree=num_parallel_tree,  # h2o4gpu
            tree_method=tree_method,  # h2o4gpu
            n_gpus=n_gpus,  # h2o4gpu
            predictor=predictor,  # h2o4gpu
            backend=backend)  # h2o4gpu

        if self.do_sklearn:
            print("Running sklearn GradientBoostingRegressor")
            self.model = self.model_sklearn
        else:
            print("Running h2o4gpu GradientBoostingRegressor")
            self.model = self.model_h2o4gpu

    def apply(self, X):
        """Delegate to the sklearn model's ``apply``.

        Always uses ``self.model_sklearn``, which ``fit`` only trains when
        the sklearn backend is the active one.
        """
        print("WARNING: apply() is using sklearn")
        return self.model_sklearn.apply(X)

    def fit(self, X, y=None, sample_weight=None):
        """Fit the active model and refresh the mirrored attributes.

        :return: whatever the active model's ``fit`` returns.
        """
        res = self.model.fit(X, y, sample_weight)
        self.set_attributes()
        return res

    def get_params(self):
        """Return the active model's parameters."""
        return self.model.get_params()

    def predict(self, X):
        """Return the active model's predictions for ``X``.

        The predictions are returned as produced by the active model for
        both backends; regression outputs are not thresholded.
        """
        res = self.model.predict(X)
        self.set_attributes()
        return res

    def score(self, X, y, sample_weight=None):
        """Score ``(X, y)`` with the sklearn model.

        When the sklearn backend is not the active one, the sklearn model
        is first fitted on the very ``(X, y)`` passed here, so the result
        describes that freshly fitted sklearn model rather than the
        xgboost-based one.
        """
        # TODO add for h2o4gpu
        print("WARNING: score() is using sklearn")
        if not self.do_sklearn:
            self.model_sklearn.fit(X, y)  # Need to re-fit
        res = self.model_sklearn.score(X, y, sample_weight)
        return res

    def set_params(self, **params):
        """Forward ``params`` to the active model's ``set_params``."""
        return self.model.set_params(**params)

    def staged_predict(self, X):
        """Delegate to the sklearn model's ``staged_predict``.

        Always uses ``self.model_sklearn``, which ``fit`` only trains when
        the sklearn backend is the active one.
        """
        print("WARNING: staged_predict() is using sklearn")
        return self.model_sklearn.staged_predict(X)

    def set_attributes(self):
        """ Set attributes for class"""
        from ..solvers.utils import _setter
        # NOTE(review): presumably _setter executes each assignment string
        # and ignores the two listed exception types, so attributes the
        # active model lacks are skipped - confirm in solvers.utils.
        s = _setter(oself=self, e1=NameError, e2=AttributeError)

        s('oself.feature_importances_ = oself.model.feature_importances_')
        s('oself.oob_improvement_ = oself.model.oob_improvement_')
        s('oself.train_score_ = oself.model.train_score_')
        s('oself.loss_ = oself.model.loss_')
        s('oself.init = oself.model.init')
        s('oself.estimators_ = oself.model.estimators_')