def _fit_stages(self, X, y, y_pred, random_state, begin_at_stage, n_shop):
        n_samples = len(n_shop)
        do_oob = self.subsample < 1.0
        sample_mask = np.ones((n_samples, ), dtype=bool)
        n_inbag = max(1, int(self.subsample * n_samples))
        loss_ = self.loss_

        # init criterion and splitter
        criterion = FriedmanMSE(1)
        splitter = PresortBestSplitter(criterion,
                                       self.max_features_,
                                       self.min_samples_leaf,
                                       random_state)

        if self.verbose:
            verbose_reporter = VerboseReporter(self.verbose)
            verbose_reporter.init(self, begin_at_stage)

        # perform boosting iterations
        i = begin_at_stage
        for i in range(begin_at_stage, self.n_estimators):

            # subsampling
            # first randomly truncate the shopping history
            if not self.fix_history:
                trunc_idx = get_truncated_shopping_indices(n_shop)
                X_trunc = X[trunc_idx]
            else:
                X_trunc = X

            if do_oob:
                sample_mask = _random_sample_mask(n_samples, n_inbag,
                                                  random_state)
                # OOB score before adding this stage
                old_oob_score = loss_(y[~sample_mask],
                                      y_pred[~sample_mask])

            # fit next stage of trees
            y_pred = self._fit_stage(i, X_trunc, y, y_pred, sample_mask,
                                     criterion, splitter, random_state)

            # track deviance (= loss)
            if do_oob:
                self.train_score_[i] = loss_(y[sample_mask],
                                             y_pred[sample_mask])
                self._oob_score_[i] = loss_(y[~sample_mask],
                                            y_pred[~sample_mask])
                self.oob_improvement_[i] = old_oob_score - self._oob_score_[i]
            else:
                # no need to fancy index w/ no subsampling
                self.train_score_[i] = loss_(y, y_pred)

            if self.verbose > 0:
                verbose_reporter.update(i, self)

        return i + 1
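Every variant below leans on a ``_random_sample_mask`` helper to draw the in-bag subsample. In scikit-learn this is a compiled routine; a pure-NumPy sketch of the contract the callers assume (a boolean mask with exactly ``n_inbag`` True entries) could look like this:

import numpy as np

def random_sample_mask(n_samples, n_inbag, random_state):
    """Return a boolean mask with exactly `n_inbag` in-bag (True) entries.

    Pure-NumPy sketch of the contract assumed by the callers above;
    the library helper is compiled and may sample differently.
    """
    mask = np.zeros(n_samples, dtype=bool)
    in_bag = random_state.choice(n_samples, size=n_inbag, replace=False)
    mask[in_bag] = True
    return mask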
    def _fit_stages(self, X, y, y_pred, sample_weight, random_state,
                    X_val, y_val, sample_weight_val,
                    begin_at_stage=0, monitor=None, X_idx_sorted=None):
        """Iteratively fits the stages.
        For each stage it computes the progress (OOB, train score)
        and delegates to ``_fit_stage``.
        Returns the number of stages fit; might differ from ``n_estimators``
        due to early stopping.
        """
        n_samples = X.shape[0]
        do_oob = self.subsample < 1.0
        sample_mask = np.ones((n_samples, ), dtype=bool)
        n_inbag = max(1, int(self.subsample * n_samples))
        loss_ = self.loss_

        # Set min_weight_leaf from min_weight_fraction_leaf
        if self.min_weight_fraction_leaf != 0. and sample_weight is not None:
            min_weight_leaf = (self.min_weight_fraction_leaf *
                               np.sum(sample_weight))
        else:
            min_weight_leaf = 0.

        if self.verbose:
            verbose_reporter = VerboseReporter(self.verbose)
            verbose_reporter.init(self, begin_at_stage)

        X_csc = csc_matrix(X) if issparse(X) else None
        X_csr = csr_matrix(X) if issparse(X) else None

        if self.n_iter_no_change is not None:
            loss_history = np.ones(self.n_iter_no_change) * np.inf
            # We create a generator to get the predictions for X_val after
            # the addition of each successive stage
            y_val_pred_iter = self._staged_decision_function(X_val)

        # perform boosting iterations
        i = begin_at_stage
        for i in range(begin_at_stage, self.n_estimators):

            # subsampling
            if do_oob:
                sample_mask = _random_sample_mask(n_samples, n_inbag,
                                                  random_state)
                # OOB score before adding this stage
                old_oob_score = loss_(y[~sample_mask],
                                      y_pred[~sample_mask],
                                      sample_weight[~sample_mask])

            # fit next stage of trees
            y_pred = self._fit_stage(i, X, y, y_pred, sample_weight,
                                     sample_mask, random_state, X_idx_sorted,
                                     X_csc, X_csr)

            # track deviance (= loss)
            if do_oob:
                self.train_score_[i] = loss_(y[sample_mask],
                                             y_pred[sample_mask],
                                             sample_weight[sample_mask])
                self.oob_improvement_[i] = (
                    old_oob_score - loss_(y[~sample_mask],
                                          y_pred[~sample_mask],
                                          sample_weight[~sample_mask]))
            else:
                # no need to fancy index w/ no subsampling
                self.train_score_[i] = loss_(y, y_pred, sample_weight)

            if self.verbose > 0:
                verbose_reporter.update(i, self)

            if monitor is not None:
                early_stopping = monitor(i, self, locals())
                if early_stopping:
                    break

            # We also provide early stopping based on the score on the
            # validation set (X_val, y_val), if n_iter_no_change is set
            if self.n_iter_no_change is not None:
                # By calling next(y_val_pred_iter), we get the predictions
                # for X_val after the addition of the current stage
                validation_loss = loss_(y_val, next(y_val_pred_iter),
                                        sample_weight_val)

                # Require validation_loss to be lower than at least one of
                # the last n_iter_no_change evaluations
                if np.any(validation_loss + self.tol < loss_history):
                    loss_history[i % len(loss_history)] = validation_loss
                else:
                    break

        return i + 1
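The ``n_iter_no_change`` branch above keeps a fixed-size ring buffer of recent validation losses and stops as soon as the newest loss fails to beat any of them by at least ``tol``. That mechanism in isolation, replayed over a precomputed loss sequence (hypothetical helper, same indexing as above):

import numpy as np

def stopping_stage(validation_losses, n_iter_no_change=5, tol=1e-4):
    """Replay the ring-buffer test used above on a precomputed loss
    sequence: return the stage at which early stopping would trigger,
    or len(validation_losses) if it never does."""
    loss_history = np.full(n_iter_no_change, np.inf)
    for i, validation_loss in enumerate(validation_losses):
        if np.any(validation_loss + tol < loss_history):
            loss_history[i % len(loss_history)] = validation_loss
        else:
            return i
    return len(validation_losses)

# Example: with a window of 2, the third consecutive non-improving loss
# triggers the stop at stage 2.
# stopping_stage([1.0, 1.0, 1.0], n_iter_no_change=2)  -> 2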
    def fit(self, X, y, sample_weight=None):
        shuffler = Shuffler(X, random_state=self.random_state)
        X, y = check_arrays(X, y, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)
        y = column_or_1d(y, warn=True)
        n_samples = len(X)
        n_inbag = int(self.subsample * n_samples)
        sample_weight = check_sample_weight(y, sample_weight=sample_weight).copy()
        self.random_state = check_random_state(self.random_state)

        # skipping all checks
        assert self.update_on in ['all', 'same', 'other', 'random']
        y_pred = numpy.zeros(len(y), dtype=float)

        self.classifiers = []
        self.learning_rates = []
        self.loss_values = []
        self.loss = copy.copy(self.loss)
        self.loss.fit(X, y, sample_weight=sample_weight)
        iter_X = shuffler.generate(0.)

        prev_smearing = 1
        for iteration in range(self.n_estimators):
            if iteration % self.recount_step == 0:
                if prev_smearing > 0:
                    iter_smearing = interpolate(self.smearing, iteration, self.n_estimators)
                    prev_smearing = iter_smearing
                    iter_X = shuffler.generate(iter_smearing)
                    iter_X, = check_arrays(iter_X, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)
                    y_pred = numpy.zeros(len(y))
                    y_pred += sum(cl.predict(X) * rate for rate, cl in zip(self.learning_rates, self.classifiers))

            self.loss_values.append(self.loss(y, y_pred, sample_weight=sample_weight))
            tree = DecisionTreeRegressor(
                criterion=self.criterion,
                splitter=self.splitter,
                max_depth=interpolate(self.max_depth, iteration, self.n_estimators),
                min_samples_split=self.min_samples_split,
                min_samples_leaf=interpolate(self.min_samples_leaf, iteration, self.n_estimators, use_log=True),
                max_features=self.max_features,
                random_state=self.random_state)

            sample_mask = _random_sample_mask(n_samples, n_inbag, self.random_state)
            loss_weight = sample_weight if self.weights_in_loss else numpy.ones(len(sample_weight))
            tree_weight = sample_weight if not self.weights_in_loss else numpy.ones(len(sample_weight))
            residual = self.loss.negative_gradient(y, y_pred, sample_weight=loss_weight)

            tree.fit(numpy.array(iter_X)[sample_mask, :],
                     residual[sample_mask],
                     sample_weight=tree_weight[sample_mask], check_input=False)
            # update tree leaves
            if self.update_tree:
                if self.update_on == 'all':
                    update_mask = numpy.ones(len(sample_mask), dtype=bool)
                elif self.update_on == 'same':
                    update_mask = sample_mask
                elif self.update_on == 'other':
                    update_mask = ~sample_mask
                else:  # random
                    update_mask = _random_sample_mask(n_samples, n_inbag, self.random_state)
                self.loss.update_terminal_regions(tree.tree_, X=iter_X, y=y, residual=residual, pred=y_pred,
                                                  sample_mask=update_mask, sample_weight=sample_weight)
            iter_learning_rate = interpolate(self.learning_rate, iteration, self.n_estimators, use_log=True)
            y_pred += iter_learning_rate * tree.predict(X)
            self.classifiers.append(tree)
            self.learning_rates.append(iter_learning_rate)

        return self
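This ``fit`` assumes an ``interpolate(param, iteration, n_estimators, use_log=...)`` helper that schedules hyperparameters over the boosting run. Its exact semantics are not shown here; one plausible sketch, assuming ``param`` is either a scalar or a ``(start, end)`` pair:

import numpy as np

def interpolate(param, iteration, n_estimators, use_log=False):
    """Hypothetical schedule helper matching the call sites above:
    pass scalars through unchanged, and interpolate a (start, end)
    pair across the boosting run, linearly or in log space."""
    if np.isscalar(param):
        return param
    start, end = param
    frac = iteration / max(1, n_estimators - 1)
    if use_log:
        return float(np.exp(np.log(start) + frac * (np.log(end) - np.log(start))))
    return start + frac * (end - start)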
Example 4
    def _fit_stages(self,
                    X,
                    y,
                    raw_predictions,
                    sample_weight,
                    random_state,
                    begin_at_stage=0,
                    monitor=None,
                    X_idx_sorted=None):
        """Iteratively fits the stages.

        For each stage it computes the progress (OOB, train score)
        and delegates to ``_fit_stage``.
        Returns the number of stages fit; might differ from ``n_estimators``
        due to early stopping.
        """
        n_samples = X.shape[0]
        do_oob = self.subsample < 1.0
        sample_mask = numpy.ones((n_samples, ), dtype=bool)
        n_inbag = max(1, int(self.subsample * n_samples))
        loss_ = self.loss_

        if self.verbose:
            verbose_reporter = VerboseReporter(self.verbose)
            verbose_reporter.init(self, begin_at_stage)

        X_csc = csc_matrix(X) if issparse(X) else None
        X_csr = csr_matrix(X) if issparse(X) else None

        if self.dropout_rate > 0.:
            scale = numpy.ones(self.n_estimators, dtype=float)
        else:
            scale = None

        # perform boosting iterations
        i = begin_at_stage
        for i in range(begin_at_stage, self.n_estimators):

            # subsampling
            if do_oob:
                sample_mask = _random_sample_mask(n_samples, n_inbag,
                                                  random_state)
                # OOB score before adding this stage
                y_oob_sample = y[~sample_mask]
                old_oob_score = loss_(y_oob_sample,
                                      raw_predictions[~sample_mask],
                                      sample_weight[~sample_mask])

            # fit next stage of trees
            raw_predictions = self._fit_stage(i, X, y, raw_predictions,
                                              sample_weight, sample_mask,
                                              random_state, scale,
                                              X_idx_sorted, X_csc, X_csr)

            # track deviance (= loss)
            if do_oob:
                self.train_score_[i] = loss_(y[sample_mask],
                                             raw_predictions[sample_mask],
                                             sample_weight[sample_mask])
                self.oob_improvement_[i] = (
                    old_oob_score -
                    loss_(y_oob_sample, raw_predictions[~sample_mask],
                          sample_weight[~sample_mask]))
            else:
                # no need to fancy index w/ no subsampling
                self.train_score_[i] = loss_(y, raw_predictions, sample_weight)

            if self.verbose > 0:
                verbose_reporter.update(i, self)

            if monitor is not None:
                early_stopping = monitor(i, self, locals())
                if early_stopping:
                    break

        if self.dropout_rate > 0.:
            self.scale_ = scale

        return i + 1
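When ``dropout_rate > 0``, this variant stores the per-tree ``scale`` array as ``self.scale_``. A sketch of how such a scale would enter the final decision function, mirroring the weighted re-accumulation in Example 5 below (an illustration, not the library's predict path):

import numpy as np

def dropout_decision_function(X, trees, scale, learning_rate):
    """Sketch of how a stored `scale_` would enter prediction when
    dropout_rate > 0: each tree's contribution is shrunk by both the
    learning rate and the scale accumulated over dropout rounds."""
    raw = np.zeros(X.shape[0], dtype=float)
    for tree, tree_scale in zip(trees, scale):
        raw += learning_rate * tree_scale * tree.predict(X)
    return raw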
Example 5
    def _fit(self, X, event, time, sample_weight, random_state):  # noqa: C901
        n_samples = X.shape[0]
        # account for intercept
        Xi = numpy.column_stack((numpy.ones(n_samples), X))
        y = numpy.fromiter(zip(event, time),
                           dtype=[('event', bool),
                                  ('time', numpy.float64)])
        y_pred = numpy.zeros(n_samples)

        do_oob = self.subsample < 1.0
        if do_oob:
            n_inbag = max(1, int(self.subsample * n_samples))

        do_dropout = self.dropout_rate > 0
        if do_dropout:
            scale = numpy.ones(int(self.n_estimators), dtype=float)

        if self.verbose:
            verbose_reporter = VerboseReporter(self.verbose)
            verbose_reporter.init(self, 0)

        for num_iter in range(int(self.n_estimators)):
            if do_oob:
                sample_mask = _random_sample_mask(n_samples, n_inbag,
                                                  random_state)
                subsample_weight = sample_weight * sample_mask.astype(
                    numpy.float64)

                # OOB score before adding this stage
                old_oob_score = self.loss_(y[~sample_mask],
                                           y_pred[~sample_mask],
                                           sample_weight[~sample_mask])
            else:
                subsample_weight = sample_weight

            residuals = self.loss_.negative_gradient(
                y, y_pred, sample_weight=sample_weight)

            best_learner = _fit_stage_componentwise(Xi, residuals,
                                                    subsample_weight)
            self.estimators_.append(best_learner)

            if do_dropout:
                drop_model, n_dropped = _sample_binomial_plus_one(
                    self.dropout_rate, num_iter + 1, random_state)

                scale[num_iter] = 1. / (n_dropped + 1.)

                y_pred[:] = 0
                for m in range(num_iter + 1):
                    if drop_model[m] == 1:
                        scale[m] *= n_dropped / (n_dropped + 1.)
                    else:
                        y_pred += (self.learning_rate * scale[m] *
                                   self.estimators_[m].predict(Xi))
            else:
                y_pred += self.learning_rate * best_learner.predict(Xi)

            # track deviance (= loss)
            if do_oob:
                self.train_score_[num_iter] = self.loss_(
                    y[sample_mask], y_pred[sample_mask],
                    sample_weight[sample_mask])
                self.oob_improvement_[num_iter] = (
                    old_oob_score -
                    self.loss_(y[~sample_mask], y_pred[~sample_mask],
                               sample_weight[~sample_mask]))
            else:
                # no need to fancy index w/ no subsampling
                self.train_score_[num_iter] = self.loss_(
                    y, y_pred, sample_weight)

            if self.verbose > 0:
                verbose_reporter.update(num_iter, self)
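``_fit_stage_componentwise`` is the distinctive base learner in this variant: instead of a tree, each stage fits one coefficient per column of ``Xi`` and keeps the single best component. A minimal sketch under that assumption (names hypothetical, not the scikit-survival implementation):

import numpy as np

class _BestComponent:
    """Single-feature linear model; a stand-in for whatever
    `_fit_stage_componentwise` returns (assumed, not the library's class)."""

    def __init__(self, component, coef):
        self.component = component
        self.coef = coef

    def predict(self, Xi):
        return self.coef * Xi[:, self.component]


def fit_stage_componentwise(Xi, residuals, sample_weight):
    """Weighted univariate least squares per column; keep the best fit."""
    best, best_err = None, np.inf
    for j in range(Xi.shape[1]):
        x = Xi[:, j]
        denom = np.sum(sample_weight * x * x)
        if denom == 0.0:
            continue
        coef = np.sum(sample_weight * x * residuals) / denom
        err = np.sum(sample_weight * (residuals - coef * x) ** 2)
        if err < best_err:
            best, best_err = _BestComponent(j, coef), err
    return best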
Example 7
    def _fit_stages(self,
                    X,
                    y,
                    y_pred,
                    sample_weight,
                    random_state,
                    begin_at_stage=0,
                    monitor=None):
        """Iteratively fits the stages.

        For each stage it computes the progress (OOB, train score)
        and delegates to ``_fit_stage``.
        Returns the number of stages fit; might differ from ``n_estimators``
        due to early stopping.
        """
        n_samples = X.shape[0]
        do_oob = self.subsample < 1.0
        sample_mask = numpy.ones((n_samples, ), dtype=bool)
        n_inbag = max(1, int(self.subsample * n_samples))
        loss_ = self.loss_

        # Set min_weight_leaf from min_weight_fraction_leaf
        if self.min_weight_fraction_leaf != 0. and sample_weight is not None:
            min_weight_leaf = (self.min_weight_fraction_leaf *
                               numpy.sum(sample_weight))
        else:
            min_weight_leaf = 0.

        # init criterion and splitter
        criterion = FriedmanMSE(1)
        splitter = PresortBestSplitter(criterion, self.max_features_,
                                       self.min_samples_leaf, min_weight_leaf,
                                       random_state)

        if self.verbose:
            verbose_reporter = VerboseReporter(self.verbose)
            verbose_reporter.init(self, begin_at_stage)

        if self.dropout_rate > 0.:
            scale = numpy.ones(self.n_estimators, dtype=float)
        else:
            scale = None

        # perform boosting iterations
        i = begin_at_stage
        for i in range(begin_at_stage, self.n_estimators):

            # subsampling
            if do_oob:
                sample_mask = _random_sample_mask(n_samples, n_inbag,
                                                  random_state)
                # OOB score before adding this stage
                y_oob_sample = y[~sample_mask]
                old_oob_score = loss_(y_oob_sample, y_pred[~sample_mask],
                                      sample_weight[~sample_mask])

            # fit next stage of trees
            y_pred = self._fit_stage(i, X, y, y_pred, sample_weight,
                                     sample_mask, criterion, splitter,
                                     random_state, scale)

            # track deviance (= loss)
            if do_oob:
                self.train_score_[i] = loss_(y[sample_mask],
                                             y_pred[sample_mask],
                                             sample_weight[sample_mask])
                self.oob_improvement_[i] = (
                    old_oob_score - loss_(y_oob_sample, y_pred[~sample_mask],
                                          sample_weight[~sample_mask]))
            else:
                # no need to fancy index w/ no subsampling
                self.train_score_[i] = loss_(y, y_pred, sample_weight)

            if self.verbose > 0:
                verbose_reporter.update(i, self)

            if monitor is not None:
                early_stopping = monitor(i, self, locals())
                if early_stopping:
                    break
        return i + 1
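Several of these loops expose a ``monitor(i, self, locals())`` hook and break as soon as it returns a truthy value. A minimal example of such a callback, stopping once OOB improvement stalls (the patience rule is an assumption; any callable with this signature works):

def make_oob_monitor(patience=10):
    """Build a callback for the `monitor(i, self, locals())` hook used
    above: request early stopping after `patience` consecutive stages
    of non-positive OOB improvement. Requires subsample < 1.0 so that
    oob_improvement_ is populated."""
    state = {"stalled": 0}

    def monitor(i, model, local_vars):
        if model.oob_improvement_[i] <= 0:
            state["stalled"] += 1
        else:
            state["stalled"] = 0
        return state["stalled"] >= patience

    return monitor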
Example 8
    def _random_sample_mask(self, n_samples, n_inbag, random_state, **kwargs):
        return _random_sample_mask(n_samples, n_inbag, random_state)
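Finally, the ``oob_improvement_`` array that most of these variants populate is typically consumed after fitting, e.g. to choose a stage count. One common recipe as a sketch (a heuristic, not part of the code above):

import numpy as np

def n_stages_from_oob(oob_improvement):
    """Pick a stage count from a fitted oob_improvement_ array: the
    cumulative sum approximates the OOB loss-reduction curve, and its
    argmax is a cheap stopping estimate."""
    return int(np.argmax(np.cumsum(oob_improvement))) + 1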