Example #1
    def _fit_stages(self, X, y, y_pred, random_state, begin_at_stage=0,
                    monitor=None, **kargs):
        """Iteratively fits the stages.
        For each stage it computes the progress (OOB, train score)
        and delegates to ``_fit_stage``.
        Returns the number of stages fit; might differ from ``n_estimators``
        due to early stopping.
        """
        n_samples = X.shape[0]
        do_oob = self.subsample < 1.0
        sample_mask = np.ones((n_samples, ), dtype=bool)
        n_inbag = self._inbag_samples(n_samples, **kargs)
        loss_ = self.loss_

        # init criterion and splitter
        criterion = FriedmanMSE(1)
        splitter = PresortBestSplitter(criterion,
                                       self.max_features_,
                                       self.min_samples_leaf,
                                       0,  # min_weight_leaf
                                       random_state)
        if self.verbose:
            verbose_reporter = VerboseReporter(self.verbose)
            verbose_reporter.init(self, begin_at_stage)

        # perform boosting iterations
        i = begin_at_stage
        for i in range(begin_at_stage, self.n_estimators):

            # subsampling
            if do_oob:
                sample_mask = self._random_sample_mask(n_samples, n_inbag,
                                                       random_state, **kargs)
                # OOB score before adding this stage
                old_oob_score = loss_(y, y_pred, mask=~sample_mask, **kargs)

            # fit next stage of trees
            y_pred = self._fit_stage(i, X, y, y_pred, sample_mask,
                                     criterion, splitter, random_state, **kargs)

            # track deviance (= loss)
            if do_oob:
                self.train_score_[i] = loss_(y, y_pred, mask=sample_mask,
                                             **kargs)
                self._oob_score_[i] = loss_(y, y_pred, mask=~sample_mask,
                                            **kargs)
                self.oob_improvement_[i] = old_oob_score - self._oob_score_[i]
            else:
                # no need to fancy index w/ no subsampling
                self.train_score_[i] = self.loss_(y, y_pred, **kargs)

            if self.verbose > 0:
                verbose_reporter.update(i, self)

            if monitor is not None:
                early_stopping = monitor(i, self, locals())
                if early_stopping:
                    break
        return i + 1
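The ``monitor`` hook above receives the stage index, the estimator, and the loop's ``locals()``; returning a truthy value breaks out of the boosting loop. A minimal sketch of such a callback, not from the original code: ``make_oob_monitor`` and its patience rule are illustrative, and it assumes the estimator was fit with ``subsample < 1.0`` so that ``oob_improvement_`` is populated.

import numpy as np

def make_oob_monitor(patience=5):
    """Stop boosting once OOB improvement is non-positive for `patience` stages."""
    def monitor(i, est, local_vars):
        if i + 1 < patience:
            return False
        recent = est.oob_improvement_[i - patience + 1:i + 1]
        return bool(np.all(recent <= 0))
    return monitor

# e.g. est.fit(X, y, monitor=make_oob_monitor(10))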
Example #2
    def _fit_stages(self, X, y, y_pred, random_state, begin_at_stage, n_shop):
        n_samples = len(n_shop)
        do_oob = self.subsample < 1.0
        sample_mask = np.ones((n_samples, ), dtype=bool)
        n_inbag = max(1, int(self.subsample * n_samples))
        loss_ = self.loss_

        # init criterion and splitter
        criterion = FriedmanMSE(1)
        splitter = PresortBestSplitter(criterion,
                                       self.max_features_,
                                       self.min_samples_leaf,
                                       0,  # min_weight_leaf
                                       random_state)

        if self.verbose:
            verbose_reporter = VerboseReporter(self.verbose)
            verbose_reporter.init(self, begin_at_stage)

        # perform boosting iterations
        i = begin_at_stage
        for i in range(begin_at_stage, self.n_estimators):

            # subsampling
            # first randomly truncate the shopping history
            if not self.fix_history:
                trunc_idx = get_truncated_shopping_indices(n_shop)
                X_trunc = X[trunc_idx]
            else:
                X_trunc = X

            if do_oob:
                sample_mask = _random_sample_mask(n_samples, n_inbag,
                                                  random_state)
                # OOB score before adding this stage
                old_oob_score = loss_(y[~sample_mask],
                                      y_pred[~sample_mask])

            # fit next stage of trees
            y_pred = self._fit_stage(i, X_trunc, y, y_pred, sample_mask,
                                     criterion, splitter, random_state)

            # track deviance (= loss)
            if do_oob:
                self.train_score_[i] = loss_(y[sample_mask],
                                             y_pred[sample_mask])
                self._oob_score_[i] = loss_(y[~sample_mask],
                                            y_pred[~sample_mask])
                self.oob_improvement_[i] = old_oob_score - self._oob_score_[i]
            else:
                # no need to fancy index w/ no subsampling
                self.train_score_[i] = loss_(y, y_pred)

            if self.verbose > 0:
                verbose_reporter.update(i, self)

        return i + 1
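Both examples above rely on ``_random_sample_mask`` to draw the in-bag/out-of-bag split. The real helper is a compiled routine in scikit-learn; the pure-NumPy reimplementation below is only illustrative of what it computes, and the name ``random_sample_mask`` is this sketch's own.

import numpy as np

def random_sample_mask(n_total, n_in_bag, random_state):
    """Return a boolean mask with exactly `n_in_bag` True (in-bag) entries."""
    mask = np.zeros(n_total, dtype=bool)
    in_bag = random_state.choice(n_total, size=n_in_bag, replace=False)
    mask[in_bag] = True
    return mask

rng = np.random.RandomState(0)
mask = random_sample_mask(10, 6, rng)
# rows where mask is True train the stage; ~mask rows give the OOB estimate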
Example #3
    def _fit_stages(self,
                    X,
                    y,
                    raw_predictions,
                    sample_weight,
                    random_state,
                    begin_at_stage=0,
                    monitor=None,
                    X_idx_sorted=None):
        """Iteratively fits the stages.

        For each stage it computes the progress (OOB, train score)
        and delegates to ``_fit_stage``.
        Returns the number of stages fit; might differ from ``n_estimators``
        due to early stopping.
        """
        n_samples = X.shape[0]
        do_oob = self.subsample < 1.0
        sample_mask = numpy.ones((n_samples, ), dtype=bool)
        n_inbag = max(1, int(self.subsample * n_samples))
        loss_ = self.loss_

        if self.verbose:
            verbose_reporter = VerboseReporter(self.verbose)
            verbose_reporter.init(self, begin_at_stage)

        X_csc = csc_matrix(X) if issparse(X) else None
        X_csr = csr_matrix(X) if issparse(X) else None

        if self.dropout_rate > 0.:
            scale = numpy.ones(self.n_estimators, dtype=float)
        else:
            scale = None

        # perform boosting iterations
        i = begin_at_stage
        for i in range(begin_at_stage, self.n_estimators):

            # subsampling
            if do_oob:
                sample_mask = _random_sample_mask(n_samples, n_inbag,
                                                  random_state)
                # OOB score before adding this stage
                y_oob_sample = y[~sample_mask]
                old_oob_score = loss_(y_oob_sample,
                                      raw_predictions[~sample_mask],
                                      sample_weight[~sample_mask])

            # fit next stage of trees
            raw_predictions = self._fit_stage(i, X, y, raw_predictions,
                                              sample_weight, sample_mask,
                                              random_state, scale,
                                              X_idx_sorted, X_csc, X_csr)

            # track deviance (= loss)
            if do_oob:
                self.train_score_[i] = loss_(y[sample_mask],
                                             raw_predictions[sample_mask],
                                             sample_weight[sample_mask])
                self.oob_improvement_[i] = (
                    old_oob_score -
                    loss_(y_oob_sample, raw_predictions[~sample_mask],
                          sample_weight[~sample_mask]))
            else:
                # no need to fancy index w/ no subsampling
                self.train_score_[i] = loss_(y, raw_predictions, sample_weight)

            if self.verbose > 0:
                verbose_reporter.update(i, self)

            if monitor is not None:
                early_stopping = monitor(i, self, locals())
                if early_stopping:
                    break

        if self.dropout_rate > 0.:
            self.scale_ = scale

        return i + 1
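Since ``oob_improvement_[i]`` stores the drop in OOB loss contributed by stage ``i``, its cumulative sum peaks at the stage count where the held-out loss bottoms out. A hedged usage sketch; it assumes a fitted estimator ``est`` trained with ``subsample < 1.0``:

import numpy as np

# cumulative OOB improvement equals (initial OOB loss - OOB loss after stage i)
# up to a constant, so its argmax estimates the best number of stages
cum_improvement = np.cumsum(est.oob_improvement_)
best_n_estimators = int(np.argmax(cum_improvement)) + 1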
Example #4
    def _fit(self, X, event, time, sample_weight, random_state):  # noqa: C901
        n_samples = X.shape[0]
        # account for intercept
        Xi = numpy.column_stack((numpy.ones(n_samples), X))
        y = numpy.fromiter(zip(event, time),
                           dtype=[('event', bool),
                                  ('time', numpy.float64)])
        y_pred = numpy.zeros(n_samples)

        do_oob = self.subsample < 1.0
        if do_oob:
            n_inbag = max(1, int(self.subsample * n_samples))

        do_dropout = self.dropout_rate > 0
        if do_dropout:
            scale = numpy.ones(int(self.n_estimators), dtype=float)

        if self.verbose:
            verbose_reporter = VerboseReporter(self.verbose)
            verbose_reporter.init(self, 0)

        for num_iter in range(int(self.n_estimators)):
            if do_oob:
                sample_mask = _random_sample_mask(n_samples, n_inbag,
                                                  random_state)
                subsample_weight = sample_weight * sample_mask.astype(
                    numpy.float64)

                # OOB score before adding this stage
                old_oob_score = self.loss_(y[~sample_mask],
                                           y_pred[~sample_mask],
                                           sample_weight[~sample_mask])
            else:
                subsample_weight = sample_weight

            residuals = self.loss_.negative_gradient(
                y, y_pred, sample_weight=sample_weight)

            best_learner = _fit_stage_componentwise(Xi, residuals,
                                                    subsample_weight)
            self.estimators_.append(best_learner)

            if do_dropout:
                drop_model, n_dropped = _sample_binomial_plus_one(
                    self.dropout_rate, num_iter + 1, random_state)

                scale[num_iter] = 1. / (n_dropped + 1.)

                y_pred[:] = 0
                for m in range(num_iter + 1):
                    if drop_model[m] == 1:
                        scale[m] *= n_dropped / (n_dropped + 1.)
                    else:
                        y_pred += self.learning_rate * scale[
                            m] * self.estimators_[m].predict(Xi)
            else:
                y_pred += self.learning_rate * best_learner.predict(Xi)

            # track deviance (= loss)
            if do_oob:
                self.train_score_[num_iter] = self.loss_(
                    y[sample_mask], y_pred[sample_mask],
                    sample_weight[sample_mask])
                self.oob_improvement_[num_iter] = (
                    old_oob_score -
                    self.loss_(y[~sample_mask], y_pred[~sample_mask],
                               sample_weight[~sample_mask]))
            else:
                # no need to fancy index w/ no subsampling
                self.train_score_[num_iter] = self.loss_(
                    y, y_pred, sample_weight)

            if self.verbose > 0:
                verbose_reporter.update(num_iter, self)
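The dropout branch above leans on ``_sample_binomial_plus_one``, which picks which of the ``num_iter + 1`` fitted models to drop and guarantees that at least one is dropped (hence the "plus one"). The sketch below is a plausible reimplementation consistent with how the return values are used above, not code taken from the original source:

import numpy as np

def sample_binomial_plus_one(p, size, random_state):
    """Draw a 0/1 drop indicator per model; force at least one drop."""
    drop_model = random_state.binomial(1, p, size=size)
    n_dropped = int(drop_model.sum())
    if n_dropped == 0:
        # dropout must affect at least one model for the rescaling to apply
        drop_model[random_state.randint(0, size)] = 1
        n_dropped = 1
    return drop_model, n_dropped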
Example #5
    def _fit_stages(self,
                    X,
                    y,
                    y_pred,
                    sample_weight,
                    random_state,
                    begin_at_stage=0,
                    monitor=None):
        """Iteratively fits the stages.

        For each stage it computes the progress (OOB, train score)
        and delegates to ``_fit_stage``.
        Returns the number of stages fit; might differ from ``n_estimators``
        due to early stopping.
        """
        n_samples = X.shape[0]
        do_oob = self.subsample < 1.0
        sample_mask = numpy.ones((n_samples, ), dtype=bool)
        n_inbag = max(1, int(self.subsample * n_samples))
        loss_ = self.loss_

        # Set min_weight_leaf from min_weight_fraction_leaf
        if self.min_weight_fraction_leaf != 0. and sample_weight is not None:
            min_weight_leaf = (self.min_weight_fraction_leaf *
                               numpy.sum(sample_weight))
        else:
            min_weight_leaf = 0.

        # init criterion and splitter
        criterion = FriedmanMSE(1)
        splitter = PresortBestSplitter(criterion, self.max_features_,
                                       self.min_samples_leaf, min_weight_leaf,
                                       random_state)

        if self.verbose:
            verbose_reporter = VerboseReporter(self.verbose)
            verbose_reporter.init(self, begin_at_stage)

        if self.dropout_rate > 0.:
            scale = numpy.ones(self.n_estimators, dtype=float)
        else:
            scale = None

        # perform boosting iterations
        i = begin_at_stage
        for i in range(begin_at_stage, self.n_estimators):

            # subsampling
            if do_oob:
                sample_mask = _random_sample_mask(n_samples, n_inbag,
                                                  random_state)
                # OOB score before adding this stage
                y_oob_sample = y[~sample_mask]
                old_oob_score = loss_(y_oob_sample, y_pred[~sample_mask],
                                      sample_weight[~sample_mask])

            # fit next stage of trees
            y_pred = self._fit_stage(i, X, y, y_pred, sample_weight,
                                     sample_mask, criterion, splitter,
                                     random_state, scale)

            # track deviance (= loss)
            if do_oob:
                self.train_score_[i] = loss_(y[sample_mask],
                                             y_pred[sample_mask],
                                             sample_weight[sample_mask])
                self.oob_improvement_[i] = (
                    old_oob_score - loss_(y_oob_sample, y_pred[~sample_mask],
                                          sample_weight[~sample_mask]))
            else:
                # no need to fancy index w/ no subsampling
                self.train_score_[i] = loss_(y, y_pred, sample_weight)

            if self.verbose > 0:
                verbose_reporter.update(i, self)

            if monitor is not None:
                early_stopping = monitor(i, self, locals())
                if early_stopping:
                    break
        return i + 1
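In all five variants the return value ``i + 1`` is the number of stages actually fit, so the caller can shrink the per-stage bookkeeping when the monitor stops boosting early. A minimal caller-side sketch of that pattern; the attribute names are taken from the examples above, and the helper itself is hypothetical:

def trim_after_early_stopping(est, n_stages):
    """Drop unused trailing slots when fewer than n_estimators stages were fit."""
    if n_stages < est.n_estimators:
        est.estimators_ = est.estimators_[:n_stages]
        est.train_score_ = est.train_score_[:n_stages]
        if hasattr(est, 'oob_improvement_'):
            est.oob_improvement_ = est.oob_improvement_[:n_stages]
    return est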