Example #1
def test_bin_mapper_idempotence(n_bins_small, n_bins_large):
    assert n_bins_large >= n_bins_small
    data = np.random.RandomState(42).normal(size=30000).reshape(-1, 1)
    mapper_small = BinMapper(max_bins=n_bins_small)
    mapper_large = BinMapper(max_bins=n_bins_large)
    binned_small = mapper_small.fit_transform(data)
    binned_large = mapper_large.fit_transform(binned_small)
    assert_array_equal(binned_small, binned_large)
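A minimal usage sketch of the BinMapper API exercised above (max_bins, fit_transform and bin_thresholds_ all appear in these examples; note that the later pygbm version shown in Example #2 exposes numerical_thresholds_ instead of bin_thresholds_):

import numpy as np

from pygbm.binning import BinMapper

data = np.random.RandomState(0).normal(size=(1000, 2))
mapper = BinMapper(max_bins=16)
binned = mapper.fit_transform(data)

print(binned.dtype)                  # uint8 bin indices, one column per feature
print(binned.max())                  # at most max_bins - 1, i.e. 15 here
print(len(mapper.bin_thresholds_))   # one threshold array per feature, i.e. 2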
Example #2
def test_pre_binned_data():
    # Make sure ValueError is raised when predictor.predict() is called while
    # the predictor does not have any numerical thresholds.

    X, y = make_regression()

    # Init gradients and hessians to that of least squares loss
    gradients = -y.astype(np.float32)
    hessians = np.ones(1, dtype=np.float32)

    mapper = BinMapper(random_state=0)
    X_binned = mapper.fit_transform(X)
    grower = TreeGrower(X_binned,
                        gradients,
                        hessians,
                        n_bins_per_feature=mapper.n_bins_per_feature_)
    grower.grow()
    predictor = grower.make_predictor(numerical_thresholds=None)

    assert_raises_regex(ValueError,
                        'This predictor does not have numerical thresholds',
                        predictor.predict, X)

    assert_raises_regex(ValueError, 'binned_data dtype should be uint8',
                        predictor.predict_binned, X)

    predictor.predict_binned(X_binned)  # No error

    predictor = grower.make_predictor(
        numerical_thresholds=mapper.numerical_thresholds_)
    assert_raises_regex(ValueError, 'X has uint8 dtype', predictor.predict,
                        X_binned)
Example #3
def test_boston_dataset():
    boston = load_boston()
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=42)

    mapper = BinMapper(random_state=42)
    X_train_binned = mapper.fit_transform(X_train)
    X_test_binned = mapper.transform(X_test)

    gradients = y_train.astype(np.float32)
    hessians = np.ones(1, dtype=np.float32)

    min_samples_leaf = 8
    max_leaf_nodes = 31
    grower = TreeGrower(X_train_binned,
                        gradients,
                        hessians,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=max_leaf_nodes)
    grower.grow()

    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    assert r2_score(y_train, predictor.predict_binned(X_train_binned)) > 0.75
    assert r2_score(y_test, predictor.predict_binned(X_test_binned)) > 0.65

    assert_allclose(predictor.predict(X_train),
                    predictor.predict_binned(X_train_binned))

    assert_allclose(predictor.predict(X_test),
                    predictor.predict_binned(X_test_binned))

    assert r2_score(y_train, predictor.predict(X_train)) > 0.75
    assert r2_score(y_test, predictor.predict(X_test)) > 0.65
Example #4
def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins,
                          constant_hessian, noise):
    rng = np.random.RandomState(seed=0)
    # data = linear target, 3 features, 1 irrelevant.
    X = rng.normal(size=(n_samples, 3))
    y = X[:, 0] - X[:, 1]
    if noise:
        y_scale = y.std()
        y += rng.normal(scale=noise, size=n_samples) * y_scale
    mapper = BinMapper(max_bins=n_bins)
    X = mapper.fit_transform(X)

    all_gradients = y.astype(np.float32)
    if constant_hessian:
        all_hessians = np.ones(shape=1, dtype=np.float32)
    else:
        all_hessians = np.ones_like(all_gradients)
    grower = TreeGrower(X,
                        all_gradients,
                        all_hessians,
                        max_bins=n_bins,
                        shrinkage=1.,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=n_samples)
    grower.grow()
    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    if n_samples >= min_samples_leaf:
        for node in predictor.nodes:
            if node['is_leaf']:
                assert node['count'] >= min_samples_leaf
    else:
        assert predictor.nodes.shape[0] == 1
        assert predictor.nodes[0]['is_leaf']
        assert predictor.nodes[0]['count'] == n_samples
Example #5
def test_min_samples_leaf_root(n_samples, min_samples_leaf):
    # Make sure root node isn't split if n_samples is not at least twice
    # min_samples_leaf
    rng = np.random.RandomState(seed=0)

    max_bins = 255

    # data = linear target, 3 features, 1 irrelevant.
    X = rng.normal(size=(n_samples, 3))
    y = X[:, 0] - X[:, 1]
    mapper = BinMapper(max_bins=max_bins)
    X = mapper.fit_transform(X)

    all_gradients = y.astype(np.float32)
    all_hessians = np.ones(shape=1, dtype=np.float32)
    grower = TreeGrower(X,
                        all_gradients,
                        all_hessians,
                        max_bins=max_bins,
                        shrinkage=1.,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=n_samples)
    grower.grow()
    if n_samples >= min_samples_leaf * 2:
        assert len(grower.finalized_leaves) >= 2
    else:
        assert len(grower.finalized_leaves) == 1
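The root-split condition tested here is simple arithmetic: splitting the root creates two children that each need at least min_samples_leaf samples. A standalone illustration of that check (not pygbm code):

def root_can_split(n_samples, min_samples_leaf):
    # Both children must satisfy the leaf-size constraint.
    return n_samples >= 2 * min_samples_leaf

assert root_can_split(20, 10)
assert not root_can_split(19, 10)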
Example #6
def test_boston_dataset(max_bins):
    boston = load_boston()
    X_train, X_test, y_train, y_test = train_test_split(
        boston.data, boston.target, random_state=42)

    mapper = BinMapper(max_bins=max_bins, random_state=42)
    X_train_binned = mapper.fit_transform(X_train)
    X_test_binned = mapper.transform(X_test)

    # Init gradients and hessians to that of least squares loss
    gradients = -y_train.astype(np.float32)
    hessians = np.ones(1, dtype=np.float32)

    min_samples_leaf = 8
    max_leaf_nodes = 31
    grower = TreeGrower(X_train_binned, gradients, hessians,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=max_leaf_nodes, max_bins=max_bins,
                        n_bins_per_feature=mapper.n_bins_per_feature_)
    grower.grow()

    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    assert r2_score(y_train, predictor.predict_binned(X_train_binned)) > 0.85
    assert r2_score(y_test, predictor.predict_binned(X_test_binned)) > 0.70

    assert_allclose(predictor.predict(X_train),
                    predictor.predict_binned(X_train_binned))

    assert_allclose(predictor.predict(X_test),
                    predictor.predict_binned(X_test_binned))

    assert r2_score(y_train, predictor.predict(X_train)) > 0.85
    assert r2_score(y_test, predictor.predict(X_test)) > 0.70
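The "Init gradients and hessians to that of least squares loss" comment follows from basic calculus: with L(y, p) = (p - y)^2 / 2, dL/dp = p - y and d^2L/dp^2 = 1, so at the zero initial prediction the gradients equal -y and the hessian is a constant 1. A plain-numpy sketch of that first iteration (no pygbm API involved):

import numpy as np

y_train = np.array([3.0, -1.0, 2.0], dtype=np.float32)
raw_predictions = np.zeros_like(y_train)      # boosting starts from 0
gradients = raw_predictions - y_train         # equals -y_train at iteration 0
hessians = np.ones(1, dtype=np.float32)       # constant hessian for least squares
print(gradients)                              # [-3.  1. -2.]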
Example #7
def test_bin_mapper_small_random_data(n_samples, n_bins):
    data = np.random.RandomState(42).normal(size=n_samples).reshape(-1, 1)
    assert len(np.unique(data)) == n_samples

    mapper = BinMapper(max_bins=n_bins, random_state=42)
    binned = mapper.fit_transform(data)

    assert binned.shape == data.shape
    assert binned.dtype == np.uint8
    assert_array_equal(binned.ravel()[np.argsort(data.ravel())],
                       np.arange(n_samples))
Example #8
def test_bin_mapper_repeated_values_invariance(n_distinct):
    rng = np.random.RandomState(42)
    distinct_values = rng.normal(size=n_distinct)
    assert len(np.unique(distinct_values)) == n_distinct

    repeated_indices = rng.randint(low=0, high=n_distinct, size=1000)
    data = distinct_values[repeated_indices]
    rng.shuffle(data)
    assert_array_equal(np.unique(data), np.sort(distinct_values))

    data = data.reshape(-1, 1)

    mapper_1 = BinMapper(max_bins=n_distinct)
    binned_1 = mapper_1.fit_transform(data)
    assert_array_equal(np.unique(binned_1[:, 0]), np.arange(n_distinct))

    # Adding more bins to the mapper yields the same results (same thresholds)
    mapper_2 = BinMapper(max_bins=min(256, n_distinct * 3))
    binned_2 = mapper_2.fit_transform(data)

    assert_allclose(mapper_1.bin_thresholds_[0], mapper_2.bin_thresholds_[0])
    assert_array_equal(binned_1, binned_2)
Example #9
def test_pre_binned_data():
    # Make sure that:
    # - training on numerical data and predicting on numerical data is the
    #   same as training on binned data and predicting on binned data
    # - training on numerical data and predicting on numerical data is the
    #   same as training on numerical data and predicting on binned data
    # - training on binned data and predicting on numerical data is not
    #   possible.

    X, y = make_regression(random_state=0)
    gbdt = GradientBoostingRegressor(scoring=None, random_state=0)
    mapper = BinMapper(random_state=0)
    X_binned = mapper.fit_transform(X)

    fit_num_pred_num = gbdt.fit(X, y).predict(X)
    fit_binned_pred_binned = gbdt.fit(X_binned, y).predict(X_binned)
    fit_num_pred_binned = gbdt.fit(X, y).predict(X_binned)

    assert_allclose(fit_num_pred_num, fit_binned_pred_binned)
    assert_allclose(fit_num_pred_num, fit_num_pred_binned)

    assert_raises_regex(ValueError,
                        'This estimator was fitted with pre-binned data ',
                        gbdt.fit(X_binned, y).predict, X)
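An end-to-end usage sketch based on the API exercised in this test. It assumes GradientBoostingRegressor is importable from the top-level pygbm package (the import path is not shown in the test itself):

import numpy as np

from sklearn.datasets import make_regression

from pygbm import GradientBoostingRegressor
from pygbm.binning import BinMapper

X, y = make_regression(random_state=0)
gbdt = GradientBoostingRegressor(scoring=None, random_state=0)

# Fit and predict on numerical data: the estimator bins X internally.
pred_num = gbdt.fit(X, y).predict(X)

# Pre-bin once and reuse the uint8 data for both fit and predict.
X_binned = BinMapper(random_state=0).fit_transform(X)
pred_binned = gbdt.fit(X_binned, y).predict(X_binned)

np.testing.assert_allclose(pred_num, pred_binned)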
Example #10
import numpy as np

from sklearn.datasets import make_classification

from pygbm.binning import BinMapper
from pygbm.grower import TreeGrower
from pygbm import plotting

rng = np.random.RandomState(0)

n_samples = int(1e7)
n_leaf_nodes = 5
X, y = make_classification(n_samples=n_samples,
                           n_classes=2,
                           n_features=5,
                           n_informative=3,
                           n_redundant=0,
                           random_state=rng)

bin_mapper_ = BinMapper(random_state=rng)
X_binned = bin_mapper_.fit_transform(X)

gradients = np.asarray(y, dtype=np.float32).copy()
hessians = np.ones(1, dtype=np.float32)

# First run to trigger the compilation of numba jit methods to avoid recording
# the compiler overhead in the profile report.
TreeGrower(X_binned, gradients, hessians, max_leaf_nodes=n_leaf_nodes).grow()

# New run to collect timing statistics that will be included in the plot.
grower = TreeGrower(X_binned, gradients, hessians, max_leaf_nodes=n_leaf_nodes)
grower.grow()
plotting.plot_tree(grower)
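The warm-up trick used above is not specific to pygbm: calling a numba-jitted function once on a small input triggers compilation, so a later timed call measures pure execution. A minimal sketch with a hypothetical jitted function:

from time import time

import numpy as np
from numba import njit


@njit
def total(values):
    # Trivial jitted reduction, standing in for the pygbm kernels.
    s = 0.0
    for v in values:
        s += v
    return s


x = np.random.rand(10_000_000)
total(x[:10])             # warm-up call: pays the compilation cost
tic = time()
total(x)                  # timed call: pure execution
print(f"{time() - tic:.3f} s")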
Example #11
class BaseGradientBoostingMachine(BaseEstimator, ABC):
    """Base class for gradient boosting estimators."""

    @abstractmethod
    def __init__(self, loss, learning_rate, max_iter, max_leaf_nodes,
                 max_depth, min_samples_leaf, l2_regularization, max_bins,
                 scoring, validation_split, n_iter_no_change, tol, verbose,
                 random_state):
        self.loss = loss
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.max_leaf_nodes = max_leaf_nodes
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.l2_regularization = l2_regularization
        self.max_bins = max_bins
        self.n_iter_no_change = n_iter_no_change
        self.validation_split = validation_split
        self.scoring = scoring
        self.tol = tol
        self.verbose = verbose
        self.random_state = random_state

    def _validate_parameters(self):
        """Validate parameters passed to __init__.

        The parameters that are directly passed to the grower are checked in
        TreeGrower."""

        if self.loss not in self._VALID_LOSSES:
            raise ValueError(
                "Loss {} is not supported for {}. Accepted losses"
                "are {}.".format(self.loss, self.__class__.__name__,
                                 ', '.join(self._VALID_LOSSES)))

        if self.learning_rate <= 0:
            raise ValueError(f'learning_rate={self.learning_rate} must '
                             f'be strictly positive')
        if self.max_iter < 1:
            raise ValueError(f'max_iter={self.max_iter} must '
                             f'not be smaller than 1.')
        if self.n_iter_no_change < 2:
            raise ValueError(f'n_iter_no_change={self.n_iter_no_change} '
                             f'must not be smaller than 2.')
        if self.validation_split is not None and self.validation_split <= 0:
            raise ValueError(f'validation_split={self.validation_split} '
                             f'must be strictly positive, or None.')
        if self.tol is not None and self.tol < 0:
            raise ValueError(f'tol={self.tol} '
                             f'must not be smaller than 0.')

    def fit(self, X, y):
        """Fit the gradient boosting model.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            The input samples.

        y : array-like, shape=(n_samples,)
            Target values.

        Returns
        -------
        self : object
        """

        fit_start_time = time()
        acc_find_split_time = 0.  # time spent finding the best splits
        acc_apply_split_time = 0.  # time spent splitting nodes
        # time spent predicting X for gradient and hessians update
        acc_prediction_time = 0.
        # TODO: add support for mixed-typed (numerical + categorical) data
        # TODO: add support for missing data
        # TODO: add support for pre-binned data (pass-through)?
        # TODO: test input checking
        X, y = check_X_y(X, y, dtype=[np.float32, np.float64])
        y = self._encode_y(y)
        if X.shape[0] == 1 or X.shape[1] == 1:
            raise ValueError(
                'Passing only one sample or one feature is not supported yet. '
                'See numba issue #3569.'
            )
        rng = check_random_state(self.random_state)

        self._validate_parameters()
        self.n_features_ = X.shape[1]  # used for validation in predict()

        if self.verbose:
            print(f"Binning {X.nbytes / 1e9:.3f} GB of data: ", end="",
                  flush=True)
        tic = time()
        self.bin_mapper_ = BinMapper(max_bins=self.max_bins, random_state=rng)
        X_binned = self.bin_mapper_.fit_transform(X)
        toc = time()
        if self.verbose:
            duration = toc - tic
            throughput = X.nbytes / duration
            print(f"{duration:.3f} s ({throughput / 1e6:.3f} MB/s)")

        self.loss_ = self._get_loss()

        if self.scoring is not None and self.validation_split is not None:
            # stratify for classification
            stratify = y if hasattr(self.loss_, 'predict_proba') else None

            X_binned_train, X_binned_val, y_train, y_val = train_test_split(
                X_binned, y, test_size=self.validation_split,
                stratify=stratify, random_state=rng)
            if X_binned_train.size == 0 or X_binned_val.size == 0:
                raise ValueError(
                    f'Not enough data (n_samples={X_binned.shape[0]}) to '
                    f'perform early stopping with validation_split='
                    f'{self.validation_split}. Use more training data or '
                    f'adjust validation_split.'
                )
            # Histogram computation is faster on feature-aligned data.
            X_binned_train = np.asfortranarray(X_binned_train)
        else:
            X_binned_train, y_train = X_binned, y
            X_binned_val, y_val = None, None

        # Subsample the training set for score-based monitoring.
        subsample_size = 10000
        if X_binned_train.shape[0] < subsample_size:
            X_binned_small_train = np.ascontiguousarray(X_binned_train)
            y_small_train = y_train
        else:
            indices = rng.choice(
                np.arange(X_binned_train.shape[0]), subsample_size)
            X_binned_small_train = X_binned_train[indices]
            y_small_train = y_train[indices]

        if self.verbose:
            print("Fitting gradient boosted rounds:")

        n_samples = X_binned_train.shape[0]
        # values predicted by the trees. Used as-is in regression, and
        # transformed into probas and / or classes for classification
        raw_predictions = np.zeros(
            shape=(n_samples, self.n_trees_per_iteration_),
            dtype=y_train.dtype
        )
        # gradients and hessians are 1D arrays of size
        # n_samples * n_trees_per_iteration
        gradients, hessians = self.loss_.init_gradients_and_hessians(
            n_samples=n_samples,
            n_trees_per_iteration=self.n_trees_per_iteration_
        )
        # predictors_ is a matrix of TreePredictor objects with shape
        # (n_iter_, n_trees_per_iteration)
        self.predictors_ = predictors = []

        scorer = check_scoring(self, self.scoring)
        self.train_scores_ = []
        if self.scoring is not None:
            # Add predictions of the initial model (before the first tree)
            predicted_train = self._predict_binned(X_binned_train)
            score_train = scorer._sign * scorer._score_func(y_train,
                                                            predicted_train)
            self.train_scores_.append(score_train)

            if self.validation_split is not None:
                self.validation_scores_ = []
                predicted_val = self._predict_binned(X_binned_val)
                score_val = scorer._sign * scorer._score_func(y_val,
                                                              predicted_val)
                self.validation_scores_.append(score_val)

        for iteration in range(self.max_iter):

            if self.verbose:
                iteration_start_time = time()
                print(f"[{iteration + 1}/{self.max_iter}] ", end='',
                      flush=True)

            # Update gradients and hessians, inplace
            self.loss_.update_gradients_and_hessians(gradients, hessians,
                                                     y_train, raw_predictions)

            predictors.append([])

            # Build `n_trees_per_iteration` trees.
            for k, (gradients_at_k, hessians_at_k) in enumerate(zip(
                    np.array_split(gradients, self.n_trees_per_iteration_),
                    np.array_split(hessians, self.n_trees_per_iteration_))):
                # the xxxx_at_k arrays are **views** on the original arrays.
                # Note that for binary classif and regressions,
                # n_trees_per_iteration is 1 and xxxx_at_k is equivalent to the
                # whole array.

                grower = TreeGrower(
                    X_binned_train, gradients_at_k, hessians_at_k,
                    max_bins=self.max_bins,
                    n_bins_per_feature=self.bin_mapper_.n_bins_per_feature_,
                    max_leaf_nodes=self.max_leaf_nodes,
                    max_depth=self.max_depth,
                    min_samples_leaf=self.min_samples_leaf,
                    l2_regularization=self.l2_regularization,
                    shrinkage=self.learning_rate)
                grower.grow()

                acc_apply_split_time += grower.total_apply_split_time
                acc_find_split_time += grower.total_find_split_time

                predictor = grower.make_predictor(
                    bin_thresholds=self.bin_mapper_.bin_thresholds_)
                predictors[-1].append(predictor)

                tic_pred = time()

                # prepare leaves_data so that _update_raw_predictions can be
                # @njitted
                leaves_data = [(l.value, l.sample_indices)
                               for l in grower.finalized_leaves]
                _update_raw_predictions(leaves_data, raw_predictions[:, k])
                toc_pred = time()
                acc_prediction_time += toc_pred - tic_pred

            should_stop = self._check_early_stopping(
                scorer, X_binned_small_train, y_small_train,
                X_binned_val, y_val)

            if self.verbose:
                self._print_iteration_stats(iteration_start_time)

            if should_stop:
                break

        if self.verbose:
            duration = time() - fit_start_time
            n_total_leaves = sum(
                predictor.get_n_leaf_nodes()
                for predictors_at_ith_iteration in self.predictors_
                for predictor in predictors_at_ith_iteration)
            n_predictors = sum(
                len(predictors_at_ith_iteration)
                for predictors_at_ith_iteration in self.predictors_)
            print(f"Fit {n_predictors} trees in {duration:.3f} s, "
                  f"({n_total_leaves} total leaves)")
            print(f"{'Time spent finding best splits:':<32} "
                  f"{acc_find_split_time:.3f}s")
            print(f"{'Time spent applying splits:':<32} "
                  f"{acc_apply_split_time:.3f}s")
            print(f"{'Time spent predicting:':<32} "
                  f"{acc_prediction_time:.3f}s")

        self.train_scores_ = np.asarray(self.train_scores_)
        if self.scoring is not None and self.validation_split is not None:
            self.validation_scores_ = np.asarray(self.validation_scores_)
        return self

    def _check_early_stopping(self, scorer, X_binned_train, y_train,
                              X_binned_val, y_val):
        """Check if fitting should be early-stopped.

        Return True (do early stopping) if the score at iteration i hasn't
        improved on any of the last n_iter_no_change scores by at least a
        relative margin of tol.

        Scores are computed on validation data or on training data.
        """

        if self.scoring is None:
            # no early stopping.
            # In sklearn early stopping is not done if n_iter_no_change is None
            return False

        def _should_stop(scores):
            if len(scores) - 1 < self.n_iter_no_change:
                # - 1 because scores[0] is for the init model before the first
                # tree.
                return False

            current_score = scores[-1]  # score at current iteration
            previous_scores = scores[-self.n_iter_no_change:-1]
            return all(
                current_score < prev_score * (1 + self.tol * scorer._sign)
                for prev_score in previous_scores
            )

        # TODO: make sure that self.predict can work on binned data and
        # then only use the public scorer.__call__.
        predicted_train = self._predict_binned(X_binned_train)
        score_train = scorer._sign * scorer._score_func(y_train,
                                                        predicted_train)
        self.train_scores_.append(score_train)

        if self.validation_split is not None:
            predicted_val = self._predict_binned(X_binned_val)
            score_val = scorer._sign * scorer._score_func(y_val,
                                                          predicted_val)
            self.validation_scores_.append(score_val)
            return _should_stop(self.validation_scores_)

        return _should_stop(self.train_scores_)

    def _print_iteration_stats(self, iteration_start_time):
        """Print info about the current fitting iteration."""
        log_msg = ''

        predictors_of_ith_iteration = [
            predictors_list for predictors_list in self.predictors_[-1]
            if predictors_list
        ]
        n_trees = len(predictors_of_ith_iteration)
        max_depth = max(predictor.get_max_depth()
                        for predictor in predictors_of_ith_iteration)
        n_leaves = sum(predictor.get_n_leaf_nodes()
                       for predictor in predictors_of_ith_iteration)

        if n_trees == 1:
            log_msg += (f"{n_trees} tree, {n_leaves} leaves, ")
        else:
            log_msg += (f"{n_trees} trees, {n_leaves} leaves ")
            log_msg += (f"({int(n_leaves / n_trees)} on avg), ")

        log_msg += f"max depth = {max_depth}, "

        if self.scoring is not None:
            log_msg += f"{self.scoring} train: {self.train_scores_[-1]:.5f}, "
            if self.validation_split is not None:
                log_msg += (f"{self.scoring} val: "
                            f"{self.validation_scores_[-1]:.5f}, ")

        iteration_time = time() - iteration_start_time
        log_msg += f"in {iteration_time:0.3f}s"

        print(log_msg)

    def _raw_predict(self, X, binned=False):
        """Return the sum of the leaves values over all predictors.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            The input samples.
        binned : bool, optional (default=False)
            If True, X is considered to be already binned.

        Returns
        -------
        raw_predictions : array, shape (n_samples, n_trees_per_iteration)
            The raw predicted values.
        """
        X = check_array(X)
        check_is_fitted(self, 'predictors_')
        if X.shape[1] != self.n_features_:
            raise ValueError(
                f'X has {X.shape[1]} features but this estimator was '
                f'trained with {self.n_features_} features.'
            )
        n_samples = X.shape[0]
        raw_predictions = np.zeros(
            shape=(n_samples, self.n_trees_per_iteration_),
            dtype=np.float32
        )
        # Should we parallelize this?
        for predictors_of_ith_iteration in self.predictors_:
            for k, predictor in enumerate(predictors_of_ith_iteration):
                predict = (predictor.predict_binned if binned
                           else predictor.predict)
                raw_predictions[:, k] += predict(X)

        return raw_predictions

    @abstractmethod
    def _get_loss(self):
        pass

    @abstractmethod
    def _encode_y(self, y=None):
        pass

    @property
    def n_iter_(self):
        check_is_fitted(self, 'predictors_')
        return len(self.predictors_)

    @abstractmethod
    def _predict_binned(self, X_binned):
        pass
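To make the np.array_split bookkeeping in the per-iteration loop above concrete: the flat gradient array of size n_samples * n_trees_per_iteration is cut into n_trees_per_iteration contiguous chunks, one per tree. A standalone numpy illustration with made-up sizes:

import numpy as np

n_samples, n_trees_per_iteration = 4, 3
gradients = np.arange(n_samples * n_trees_per_iteration, dtype=np.float32)
for k, gradients_at_k in enumerate(np.array_split(gradients,
                                                  n_trees_per_iteration)):
    print(k, gradients_at_k)   # three chunks of four values each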
Example #12
class BaseGradientBoostingMachine(BaseEstimator, ABC):
    """Base class for gradient boosting estimators."""

    multi_output = False
    prediction_dim = 1

    @abstractmethod
    def __init__(self, loss, learning_rate, max_iter, max_leaf_nodes,
                 max_depth, min_samples_leaf, l2_regularization, max_bins,
                 scoring, validation_split, n_iter_no_change, tol, verbose,
                 random_state):
        self.loss = loss
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.max_leaf_nodes = max_leaf_nodes
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.l2_regularization = l2_regularization
        self.max_bins = max_bins
        self.n_iter_no_change = n_iter_no_change
        self.validation_split = validation_split
        self.scoring = scoring
        self.tol = tol
        self.verbose = verbose
        self.random_state = random_state

    def _validate_parameters(self, X):
        """Validate parameters passed to __init__.

        The parameters that are directly passed to the grower are checked in
        TreeGrower."""

        if self.loss not in self._VALID_LOSSES:
            raise ValueError("Loss {} is not supported for {}. Accepted losses"
                             "are {}.".format(self.loss,
                                              self.__class__.__name__,
                                              ', '.join(self._VALID_LOSSES)))

        if self.learning_rate <= 0:
            raise ValueError(f'learning_rate={self.learning_rate} must '
                             f'be strictly positive')
        if self.max_iter < 1:
            raise ValueError(f'max_iter={self.max_iter} must '
                             f'not be smaller than 1.')
        if self.n_iter_no_change is not None and self.n_iter_no_change < 0:
            raise ValueError(f'n_iter_no_change={self.n_iter_no_change} '
                             f'must be positive.')
        if self.validation_split is not None and self.validation_split <= 0:
            raise ValueError(f'validation_split={self.validation_split} '
                             f'must be strictly positive, or None.')
        if self.tol is not None and self.tol < 0:
            raise ValueError(f'tol={self.tol} ' f'must not be smaller than 0.')
        if X.dtype == np.uint8:  # pre-binned data
            max_bin_index = X.max()
            if self.max_bins < max_bin_index + 1:
                raise ValueError(
                    f'max_bins is set to {self.max_bins} but the data is '
                    f'pre-binned with {max_bin_index + 1} bins.')

    def fit(self, X, y):
        """Fit the gradient boosting model.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            The input samples. If ``X.dtype == np.uint8``, the data is
            assumed to be pre-binned and the prediction methods
            (``predict``, ``predict_proba``) will only accept pre-binned
            data as well.

        y : array-like, shape=(n_samples,)
            Target values.

        Returns
        -------
        self : object
        """

        fit_start_time = time()
        acc_find_split_time = 0.  # time spent finding the best splits
        acc_apply_split_time = 0.  # time spent splitting nodes
        # time spent predicting X for gradient and hessians update
        acc_prediction_time = 0.
        # TODO: add support for mixed-typed (numerical + categorical) data
        # TODO: add support for missing data
        self.multi_output = len(y.ravel()) != len(y)
        if self.multi_output:
            self.prediction_dim = y.shape[1]
        else:
            self.prediction_dim = 1
        X, y = check_X_y(X,
                         y,
                         dtype=[np.float32, np.float64, np.uint8],
                         multi_output=self.multi_output)
        y = self._encode_y(y)
        if X.shape[0] == 1 or X.shape[1] == 1:
            raise ValueError(
                'Passing only one sample or one feature is not supported yet. '
                'See numba issue #3569.')
        rng = check_random_state(self.random_state)

        self._validate_parameters(X)
        self.n_features_ = X.shape[1]  # used for validation in predict()

        if X.dtype == np.uint8:  # data is pre-binned
            if self.verbose:
                print("X is pre-binned.")
            X_binned = X
            self.bin_mapper_ = None
            numerical_thresholds = None
            n_bins_per_feature = X.max(axis=0).astype(np.uint32)
        else:
            if self.verbose:
                print(f"Binning {X.nbytes / 1e9:.3f} GB of data: ",
                      end="",
                      flush=True)
            tic = time()
            self.bin_mapper_ = BinMapper(max_bins=self.max_bins,
                                         random_state=rng)
            X_binned = self.bin_mapper_.fit_transform(X)
            numerical_thresholds = self.bin_mapper_.numerical_thresholds_
            n_bins_per_feature = self.bin_mapper_.n_bins_per_feature_
            toc = time()

            if self.verbose:
                duration = toc - tic
                throughput = X.nbytes / duration
                print(f"{duration:.3f} s ({throughput / 1e6:.3f} MB/s)")

        self.loss_ = self._get_loss()

        do_early_stopping = (self.n_iter_no_change is not None
                             and self.n_iter_no_change > 0)

        if do_early_stopping and self.validation_split is not None:
            # stratify for classification
            stratify = y if hasattr(self.loss_, 'predict_proba') else None

            X_binned_train, X_binned_val, y_train, y_val = train_test_split(
                X_binned,
                y,
                test_size=self.validation_split,
                stratify=stratify,
                random_state=rng)
            if X_binned_train.size == 0 or X_binned_val.size == 0:
                raise ValueError(
                    f'Not enough data (n_samples={X_binned.shape[0]}) to '
                    f'perform early stopping with validation_split='
                    f'{self.validation_split}. Use more training data or '
                    f'adjust validation_split.')
            # Predicting is faster on C-contiguous arrays, training is faster
            # on Fortran arrays.
            X_binned_val = np.ascontiguousarray(X_binned_val)
            X_binned_train = np.asfortranarray(X_binned_train)
        else:
            X_binned_train, y_train = X_binned, y
            X_binned_val, y_val = None, None

        # Subsample the training set for score-based monitoring.
        if do_early_stopping:
            subsample_size = 10000
            n_samples_train = X_binned_train.shape[0]
            if n_samples_train > subsample_size:
                indices = rng.choice(X_binned_train.shape[0], subsample_size)
                X_binned_small_train = X_binned_train[indices]
                y_small_train = y_train[indices]
            else:
                X_binned_small_train = X_binned_train
                y_small_train = y_train
            # Predicting is faster on C-contiguous arrays.
            X_binned_small_train = np.ascontiguousarray(X_binned_small_train)

        if self.verbose:
            print("Fitting gradient boosted rounds:")

        n_samples = X_binned_train.shape[0]
        self.baseline_prediction_ = self.loss_.get_baseline_prediction(
            y_train, self.prediction_dim)
        # raw_predictions are the accumulated values predicted by the trees
        # for the training data.
        raw_predictions = np.zeros(shape=(n_samples, self.prediction_dim),
                                   dtype=self.baseline_prediction_.dtype)
        if not self.multi_output:
            raw_predictions = raw_predictions.ravel()
        raw_predictions += self.baseline_prediction_

        # gradients and hessians are 1D arrays of size
        # n_samples * n_trees_per_iteration
        gradients, hessians = self.loss_.init_gradients_and_hessians(
            n_samples=n_samples, prediction_dim=self.prediction_dim)
        if not self.multi_output:
            gradients = gradients.ravel()
        # predictors_ is a matrix of TreePredictor objects with shape
        # (n_iter_, n_trees_per_iteration)
        self.predictors_ = predictors = []

        # scorer_ is a callable with signature (est, X, y) and calls
        # est.predict() or est.predict_proba() depending on its nature.
        self.scorer_ = check_scoring(self, self.scoring)
        self.train_scores_ = []
        self.validation_scores_ = []
        if do_early_stopping:
            # Add predictions of the initial model (before the first tree)
            self.train_scores_.append(self._get_scores(X_binned_train,
                                                       y_train))

            if self.validation_split is not None:
                self.validation_scores_.append(
                    self._get_scores(X_binned_val, y_val))

        for iteration in range(self.max_iter):

            if self.verbose:
                iteration_start_time = time()
                print(f"[{iteration + 1}/{self.max_iter}] ",
                      end='',
                      flush=True)

            # Update gradients and hessians, inplace
            self.loss_.update_gradients_and_hessians(gradients, hessians,
                                                     y_train, raw_predictions)

            predictors.append([])
            if self.multi_output:
                proj_gradients, proj_hessians = self.randomly_project_gradients_and_hessians(
                    gradients, hessians)
            else:
                proj_gradients = gradients.ravel()
                proj_hessians = hessians.ravel()

            # Build `n_trees_per_iteration` trees.
            for k, (gradients_at_k, hessians_at_k) in enumerate(
                    zip(
                        np.array_split(proj_gradients,
                                       self.n_trees_per_iteration_),
                        np.array_split(proj_hessians,
                                       self.n_trees_per_iteration_))):
                # the xxxx_at_k arrays are **views** on the original arrays.
                # Note that for binary classif and regressions,
                # n_trees_per_iteration is 1 and xxxx_at_k is equivalent to the
                # whole array.

                grower = TreeGrower(X_binned_train,
                                    gradients_at_k,
                                    hessians_at_k,
                                    max_bins=self.max_bins,
                                    n_bins_per_feature=n_bins_per_feature,
                                    max_leaf_nodes=self.max_leaf_nodes,
                                    max_depth=self.max_depth,
                                    min_samples_leaf=self.min_samples_leaf,
                                    l2_regularization=self.l2_regularization,
                                    shrinkage=self.learning_rate)
                grower.grow()

                if self.multi_output:
                    for l in grower.finalized_leaves:
                        l.residual = (
                            -self.learning_rate *
                            np.sum(a=gradients[l.sample_indices, :], axis=0) /
                            (l.sum_hessians + self.l2_regularization +
                             np.finfo(np.float64).eps))
                    leaves_data = [(l.residual, l.sample_indices)
                                   for l in grower.finalized_leaves]
                else:
                    leaves_data = [(l.value, l.sample_indices)
                                   for l in grower.finalized_leaves]

                acc_apply_split_time += grower.total_apply_split_time
                acc_find_split_time += grower.total_find_split_time

                predictor = grower.make_predictor(numerical_thresholds)
                predictors[-1].append(predictor)

                tic_pred = time()

                # prepare leaves_data so that _update_raw_predictions can be
                # @njitted

                _update_raw_predictions(leaves_data, raw_predictions)
                toc_pred = time()
                acc_prediction_time += toc_pred - tic_pred

            should_early_stop = False
            if do_early_stopping:
                should_early_stop = self._check_early_stopping(
                    X_binned_small_train, y_small_train, X_binned_val, y_val)

            if self.verbose:
                self._print_iteration_stats(iteration_start_time,
                                            do_early_stopping)

            if should_early_stop:
                break

        if self.verbose:
            duration = time() - fit_start_time
            n_total_leaves = sum(
                predictor.get_n_leaf_nodes()
                for predictors_at_ith_iteration in self.predictors_
                for predictor in predictors_at_ith_iteration)
            n_predictors = sum(
                len(predictors_at_ith_iteration)
                for predictors_at_ith_iteration in self.predictors_)
            print(f"Fit {n_predictors} trees in {duration:.3f} s, "
                  f"({n_total_leaves} total leaves)")
            print(f"{'Time spent finding best splits:':<32} "
                  f"{acc_find_split_time:.3f}s")
            print(f"{'Time spent applying splits:':<32} "
                  f"{acc_apply_split_time:.3f}s")
            print(f"{'Time spent predicting:':<32} "
                  f"{acc_prediction_time:.3f}s")

        self.train_scores_ = np.asarray(self.train_scores_)
        self.validation_scores_ = np.asarray(self.validation_scores_)
        return self

    def _check_early_stopping(self, X_binned_train, y_train, X_binned_val,
                              y_val):
        """Check if fitting should be early-stopped.

        Scores are computed on validation data or on training data.
        """

        self.train_scores_.append(self._get_scores(X_binned_train, y_train))

        if self.validation_split is not None:
            self.validation_scores_.append(
                self._get_scores(X_binned_val, y_val))
            return self._should_stop(self.validation_scores_)

        return self._should_stop(self.train_scores_)

    def _should_stop(self, scores):
        """
        Return True (do early stopping) if none of the last n scores is
        better than the (n+1)th-to-last score, up to some tolerance.
        """
        reference_position = self.n_iter_no_change + 1
        if len(scores) < reference_position:
            return False

        # A higher score is always better. A higher tol means that it will be
        # harder for subsequent iterations to be considered an improvement
        # upon the reference score, and therefore it is more likely to early
        # stop because of the lack of significant improvement.
        tol = 0 if self.tol is None else self.tol
        reference_score = scores[-reference_position] + tol
        recent_scores = scores[-reference_position + 1:]
        recent_improvements = [
            score > reference_score for score in recent_scores
        ]
        return not any(recent_improvements)

    def _get_scores(self, X, y):
        """Compute scores on data X with target y.

        Scores are either computed with a scorer if scoring parameter is not
        None, else with the loss. As higher is always better, we return
        -loss_value.
        """
        if self.scoring is not None:
            return self.scorer_(self, X, y)

        # Else, use the negative loss as score.
        if self.multi_output:
            raw_predictions = self._raw_predict_multi(X)
        else:
            raw_predictions = self._raw_predict(X)
        return -self.loss_(y, raw_predictions)

    def _print_iteration_stats(self, iteration_start_time, do_early_stopping):
        """Print info about the current fitting iteration."""
        log_msg = ''

        predictors_of_ith_iteration = [
            predictors_list for predictors_list in self.predictors_[-1]
            if predictors_list
        ]
        n_trees = len(predictors_of_ith_iteration)
        max_depth = max(predictor.get_max_depth()
                        for predictor in predictors_of_ith_iteration)
        n_leaves = sum(predictor.get_n_leaf_nodes()
                       for predictor in predictors_of_ith_iteration)

        if n_trees == 1:
            log_msg += (f"{n_trees} tree, {n_leaves} leaves, ")
        else:
            log_msg += (f"{n_trees} trees, {n_leaves} leaves ")
            log_msg += (f"({int(n_leaves / n_trees)} on avg), ")

        log_msg += f"max depth = {max_depth}, "

        if do_early_stopping:
            log_msg += f"{self.scoring} train: {self.train_scores_[-1]:.5f}, "
            if self.validation_split is not None:
                log_msg += (f"{self.scoring} val: "
                            f"{self.validation_scores_[-1]:.5f}, ")

        iteration_time = time() - iteration_start_time
        log_msg += f"in {iteration_time:0.3f}s"

        print(log_msg)

    def _raw_predict(self, X):
        """Return the sum of the leaves values over all predictors.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            The input samples. If ``X.dtype == np.uint8``, the data is assumed
            to be pre-binned and the estimator must have been fitted with
            pre-binned data.

        Returns
        -------
        raw_predictions : array, shape (n_samples, n_trees_per_iteration)
            The raw predicted values.
        """
        X = check_array(X)
        check_is_fitted(self, 'predictors_')
        if X.shape[1] != self.n_features_:
            raise ValueError(
                f'X has {X.shape[1]} features but this estimator was '
                f'trained with {self.n_features_} features.')
        is_binned = X.dtype == np.uint8
        if not is_binned and self.bin_mapper_ is None:
            raise ValueError(
                'This estimator was fitted with pre-binned data and '
                'can only predict pre-binned data as well. If your data *is* '
                'already pre-binned, convert it to uint8 using e.g. '
                'X.astype(np.uint8). If the data passed to fit() was *not* '
                'pre-binned, convert it to float32 and call fit() again.')
        n_samples = X.shape[0]
        raw_predictions = np.zeros(shape=(n_samples,
                                          self.n_trees_per_iteration_),
                                   dtype=self.baseline_prediction_.dtype)
        raw_predictions += self.baseline_prediction_
        # Should we parallelize this?
        for predictors_of_ith_iteration in self.predictors_:
            for k, predictor in enumerate(predictors_of_ith_iteration):
                predict = (predictor.predict_binned
                           if is_binned else predictor.predict)
                raw_predictions[:, k] += predict(X)

        return raw_predictions

    def _raw_predict_multi(self, X):
        """Return the sum of the leaves values over all predictors.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            The input samples. If ``X.dtype == np.uint8``, the data is assumed
            to be pre-binned and the estimator must have been fitted with
            pre-binned data.

        Returns
        -------
        raw_predictions : array, shape (n_samples, prediction_dim)
            The raw predicted values.
        """
        X = check_array(X)
        check_is_fitted(self, 'predictors_')
        if X.shape[1] != self.n_features_:
            raise ValueError(
                f'X has {X.shape[1]} features but this estimator was '
                f'trained with {self.n_features_} features.')
        is_binned = X.dtype == np.uint8
        if not is_binned and self.bin_mapper_ is None:
            raise ValueError(
                'This estimator was fitted with pre-binned data and '
                'can only predict pre-binned data as well. If your data *is* '
                'already pre-binned, convert it to uint8 using e.g. '
                'X.astype(np.uint8). If the data passed to fit() was *not* '
                'pre-binned, convert it to float32 and call fit() again.')
        n_samples = X.shape[0]
        raw_predictions = np.zeros(shape=(n_samples, self.prediction_dim),
                                   dtype=self.baseline_prediction_.dtype)
        raw_predictions += self.baseline_prediction_
        # Should we parallelize this?
        for predictors_of_ith_iteration in self.predictors_:
            for k, predictor in enumerate(predictors_of_ith_iteration):
                predict = (predictor.predict_binned_multi
                           if is_binned else predictor.predict_multi)
                # Accumulate the contribution of the k-th tree of this
                # iteration.
                raw_predictions = np.add(raw_predictions,
                                         predict(X, self.prediction_dim))

        return raw_predictions

    def randomly_project_gradients_and_hessians(self, gradients, hessians):
        proj_g = SparseRandomProjection(
            n_components=1,
            random_state=self.random_state).fit_transform(X=gradients)
        proj_h = hessians  #SparseRandomProjection(n_components=1, random_state=self.random_state).fit_transform(X=hessians)
        return proj_g.ravel().astype(np.float32), proj_h.astype(np.float32)

    @abstractmethod
    def _get_loss(self):
        pass

    @abstractmethod
    def _encode_y(self, y=None):
        pass

    @property
    def n_iter_(self):
        check_is_fitted(self, 'predictors_')
        return len(self.predictors_)
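The _should_stop rule above can be restated as a small standalone function, which makes the reference-score logic easier to follow (the score sequences below are made up):

def should_stop(scores, n_iter_no_change=3, tol=0.01):
    # Stop when none of the last n_iter_no_change scores improves on the
    # score recorded just before them by more than tol (higher is better).
    reference_position = n_iter_no_change + 1
    if len(scores) < reference_position:
        return False
    reference_score = scores[-reference_position] + tol
    recent_scores = scores[-reference_position + 1:]
    return not any(score > reference_score for score in recent_scores)

print(should_stop([0.5, 0.80, 0.805, 0.806, 0.807]))  # True: no gain > 0.01 over 0.80
print(should_stop([0.5, 0.80, 0.805, 0.82, 0.807]))   # False: 0.82 beats 0.80 + 0.01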
Example #13
class GradientBoostingMachine(BaseEstimator, RegressorMixin):
    def __init__(self,
                 learning_rate=0.1,
                 max_iter=100,
                 max_leaf_nodes=31,
                 max_depth=None,
                 min_samples_leaf=20,
                 l2_regularization=0.,
                 max_bins=255,
                 max_no_improvement=5,
                 validation_split=0.1,
                 scoring='neg_mean_squared_error',
                 tol=1e-7,
                 verbose=0,
                 random_state=None):
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.max_leaf_nodes = max_leaf_nodes
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.l2_regularization = l2_regularization
        self.max_bins = max_bins
        self.max_no_improvement = max_no_improvement
        self.validation_split = validation_split
        self.scoring = scoring
        self.tol = tol
        self.verbose = verbose
        self.random_state = random_state

    def fit(self, X, y):
        fit_start_time = time()
        acc_find_split_time = 0.  # time spent finding the best splits
        acc_apply_split_time = 0.  # time spent splitting nodes
        # time spent predicting X for gradient and hessians update
        acc_prediction_time = 0.
        # TODO: add support for mixed-typed (numerical + categorical) data
        # TODO: add support for missing data
        # TODO: add support for pre-binned data (pass-through)?
        X, y = check_X_y(X, y, dtype=[np.float32, np.float64])
        y = y.astype(np.float32, copy=False)
        rng = check_random_state(self.random_state)
        if self.verbose:
            print(f"Binning {X.nbytes / 1e9:.3f} GB of data: ",
                  end="",
                  flush=True)
        tic = time()
        self.bin_mapper_ = BinMapper(max_bins=self.max_bins, random_state=rng)
        X_binned = self.bin_mapper_.fit_transform(X)
        toc = time()
        if self.verbose:
            duration = toc - tic
            throughput = X.nbytes / duration
            print(f"{duration:.3f} s ({throughput / 1e6:.3f} MB/s)")
        if self.validation_split is not None:
            X_binned_train, X_binned_val, y_train, y_val = train_test_split(
                X_binned,
                y,
                test_size=self.validation_split,
                stratify=y,
                random_state=rng)
            # Histogram computation is faster on feature-aligned data.
            X_binned_train = np.asfortranarray(X_binned_train)
        else:
            X_binned_train, y_train = X_binned, y
            X_binned_val, y_val = None, None

        # Subsample the training set for score-based monitoring.
        subsample_size = 10000
        if X_binned_train.shape[0] < subsample_size:
            X_binned_small_train = np.ascontiguousarray(X_binned_train)
            y_small_train = y_train
        else:
            indices = rng.choice(np.arange(X_binned_train.shape[0]),
                                 subsample_size)
            X_binned_small_train = X_binned_train[indices]
            y_small_train = y_train[indices]

        if self.verbose:
            print("Fitting gradient boosted rounds:")
        # TODO: plug custom loss functions
        y_pred = np.zeros_like(y_train, dtype=np.float32)
        gradients = np.asarray(y_train, dtype=np.float32).copy()
        hessians = np.ones(1, dtype=np.float32)
        self.predictors_ = predictors = []
        self.train_scores_ = []
        if self.validation_split is not None:
            self.validation_scores_ = []
        scorer = check_scoring(self, self.scoring)
        gb_start_time = time()
        # TODO: compute training loss and use it for early stopping if no
        # validation data is provided?
        self.n_iter_ = 0
        while True:
            should_stop = self._stopping_criterion(gb_start_time, scorer,
                                                   X_binned_small_train,
                                                   y_small_train, X_binned_val,
                                                   y_val)
            if should_stop or self.n_iter_ == self.max_iter:
                break
            shrinkage = 1. if self.n_iter_ == 0 else self.learning_rate
            grower = TreeGrower(X_binned_train,
                                gradients,
                                hessians,
                                n_bins=self.max_bins,
                                max_leaf_nodes=self.max_leaf_nodes,
                                max_depth=self.max_depth,
                                min_samples_leaf=self.min_samples_leaf,
                                shrinkage=shrinkage)
            grower.grow()
            predictor = grower.make_predictor(
                bin_thresholds=self.bin_mapper_.bin_thresholds_)
            predictors.append(predictor)
            self.n_iter_ += 1
            tic_pred = time()
            leaves_data = [(l.value, l.sample_indices)
                           for l in grower.finalized_leaves]
            _update_y_pred(leaves_data, y_pred)
            gradients = y_train - y_pred
            toc_pred = time()
            acc_prediction_time += toc_pred - tic_pred

            acc_apply_split_time += grower.total_apply_split_time
            acc_find_split_time += grower.total_find_split_time
        if self.verbose:
            duration = time() - fit_start_time
            n_leaf_nodes = sum(p.get_n_leaf_nodes() for p in self.predictors_)
            print(f"Fit {len(self.predictors_)} trees in {duration:.3f} s, "
                  f"({n_leaf_nodes} total leaf nodes)")
            print('{:<32} {:.3f}s'.format('Time spent finding best splits:',
                                          acc_find_split_time))
            print('{:<32} {:.3f}s'.format('Time spent applying splits:',
                                          acc_apply_split_time))
            print('{:<32} {:.3f}s'.format('Time spent predicting:',
                                          acc_prediction_time))
        self.train_scores_ = np.asarray(self.train_scores_)
        if self.validation_split is not None:
            self.validation_scores_ = np.asarray(self.validation_scores_)
        return self

    def predict(self, X):
        # TODO: check input / check_fitted
        # TODO: make predictor behave correctly on pre-binned data
        # TODO: handle classification and output class labels in this case
        predicted = np.zeros(X.shape[0], dtype=np.float32)
        for predictor in self.predictors_:
            predicted += predictor.predict(X)
        return predicted

    def _predict_binned(self, X_binned):
        predicted = np.zeros(X_binned.shape[0], dtype=np.float32)
        for predictor in self.predictors_:
            predicted += predictor.predict_binned(X_binned)
        return predicted

    def _stopping_criterion(self, start_time, scorer, X_binned_train, y_train,
                            X_binned_val, y_val):
        log_msg = f"[{self.n_iter_}/{self.max_iter}]"

        if self.scoring is not None:
            # TODO: make sure that self.predict can work on binned data and
            # then only use the public scorer.__call__.
            predicted_train = self._predict_binned(X_binned_train)
            score_train = scorer._score_func(y_train, predicted_train)
            self.train_scores_.append(score_train)
            log_msg += f" {self.scoring} train: {score_train:.5f},"

            if self.validation_split is not None:
                predicted_val = self._predict_binned(X_binned_val)
                score_val = scorer._score_func(y_val, predicted_val)
                self.validation_scores_.append(score_val)
                log_msg += f", {self.scoring} val: {score_val:.5f},"

        if self.n_iter_ > 0:
            iteration_time = (time() - start_time) / self.n_iter_
            predictor_nodes = self.predictors_[-1].nodes
            max_depth = predictor_nodes['depth'].max()
            n_leaf_nodes = predictor_nodes['is_leaf'].sum()
            log_msg += (f" {n_leaf_nodes} leaf nodes, max depth {max_depth}"
                        f" in {iteration_time:0.3f}s")

        if self.verbose:
            print(log_msg)

        if self.validation_split is not None:
            return self._should_stop(self.validation_scores_)
        else:
            return self._should_stop(self.train_scores_)

    def _should_stop(self, scores):
        if (len(scores) == 0 or (self.max_no_improvement
                                 and len(scores) < self.max_no_improvement)):
            return False
        context_scores = scores[-self.max_no_improvement:]
        candidate = scores[-self.max_no_improvement]
        tol = 0. if self.tol is None else self.tol
        # sklearn scores: higher is always better.
        best_with_tol = max(context_scores) * (1 - tol)
        return candidate >= best_with_tol
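Both Example #11 and Example #13 update the raw predictions from leaves_data pairs of (leaf value, sample indices) rather than re-running predict() on the training data. A plain-numpy illustration of that pattern (not the njitted helper itself):

import numpy as np

raw_predictions = np.zeros(6, dtype=np.float32)
leaves_data = [
    (0.5, np.array([0, 2, 3])),    # (leaf value, indices of its samples)
    (-1.0, np.array([1, 4, 5])),
]
for leaf_value, sample_indices in leaves_data:
    raw_predictions[sample_indices] += leaf_value
print(raw_predictions)             # [ 0.5 -1.   0.5  0.5 -1.  -1. ]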