def _fit_stages(self, X, y, y_pred, random_state, begin_at_stage, n_shop):
    n_samples = len(n_shop)
    do_oob = self.subsample < 1.0
    sample_mask = np.ones((n_samples,), dtype=bool)
    n_inbag = max(1, int(self.subsample * n_samples))
    loss_ = self.loss_

    # init criterion and splitter
    criterion = FriedmanMSE(1)
    splitter = PresortBestSplitter(criterion,
                                   self.max_features_,
                                   self.min_samples_leaf,
                                   random_state)

    if self.verbose:
        verbose_reporter = VerboseReporter(self.verbose)
        verbose_reporter.init(self, begin_at_stage)

    # perform boosting iterations
    for i in range(begin_at_stage, self.n_estimators):

        # first randomly truncate the shopping history
        if not self.fix_history:
            trunc_idx = get_truncated_shopping_indices(n_shop)
            X_trunc = X[trunc_idx]
        else:
            X_trunc = X

        # subsampling
        if do_oob:
            sample_mask = _random_sample_mask(n_samples, n_inbag,
                                              random_state)
            # OOB score before adding this stage
            old_oob_score = loss_(y[~sample_mask], y_pred[~sample_mask])

        # fit next stage of trees
        y_pred = self._fit_stage(i, X_trunc, y, y_pred, sample_mask,
                                 criterion, splitter, random_state)

        # track deviance (= loss)
        if do_oob:
            self.train_score_[i] = loss_(y[sample_mask],
                                         y_pred[sample_mask])
            self._oob_score_[i] = loss_(y[~sample_mask],
                                        y_pred[~sample_mask])
            self.oob_improvement_[i] = old_oob_score - self._oob_score_[i]
        else:
            # no need to fancy index w/ no subsampling
            self.train_score_[i] = loss_(y, y_pred)

        if self.verbose > 0:
            verbose_reporter.update(i, self)

    return i + 1
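# A minimal sketch of the truncation helper used above, assuming X stores each
# customer's shopping history as consecutive rows and n_shop[i] (>= 1) gives the
# number of rows for customer i. The real get_truncated_shopping_indices may
# differ; this is an illustration, not the original implementation.
import numpy as np

def get_truncated_shopping_indices_sketch(n_shop, random_state=None):
    rng = np.random.RandomState(random_state)
    n_shop = np.asarray(n_shop)
    # row offset where each customer's history starts
    offsets = np.concatenate(([0], np.cumsum(n_shop)[:-1]))
    # pick one random truncation point within each history
    trunc = rng.randint(0, n_shop)  # one index per customer, in [0, n_shop[i])
    return offsets + trunc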
def _fit_stages(self, X, y, y_pred, sample_weight, random_state,
                X_val, y_val, sample_weight_val,
                begin_at_stage=0, monitor=None, X_idx_sorted=None):
    """Iteratively fits the stages.

    For each stage it computes the progress (OOB, train score)
    and delegates to ``_fit_stage``.
    Returns the number of stages fit; might differ from ``n_estimators``
    due to early stopping.
    """
    n_samples = X.shape[0]
    do_oob = self.subsample < 1.0
    sample_mask = np.ones((n_samples,), dtype=bool)
    n_inbag = max(1, int(self.subsample * n_samples))
    loss_ = self.loss_

    # Set min_weight_leaf from min_weight_fraction_leaf
    if self.min_weight_fraction_leaf != 0. and sample_weight is not None:
        min_weight_leaf = (self.min_weight_fraction_leaf *
                           np.sum(sample_weight))
    else:
        min_weight_leaf = 0.

    if self.verbose:
        verbose_reporter = VerboseReporter(self.verbose)
        verbose_reporter.init(self, begin_at_stage)

    X_csc = csc_matrix(X) if issparse(X) else None
    X_csr = csr_matrix(X) if issparse(X) else None

    if self.n_iter_no_change is not None:
        loss_history = np.ones(self.n_iter_no_change) * np.inf
        # We create a generator to get the predictions for X_val after
        # the addition of each successive stage
        y_val_pred_iter = self._staged_decision_function(X_val)

    # perform boosting iterations
    i = begin_at_stage
    for i in range(begin_at_stage, self.n_estimators):

        # subsampling
        if do_oob:
            sample_mask = _random_sample_mask(n_samples, n_inbag,
                                              random_state)
            # OOB score before adding this stage
            old_oob_score = loss_(y[~sample_mask],
                                  y_pred[~sample_mask],
                                  sample_weight[~sample_mask])

        # fit next stage of trees
        y_pred = self._fit_stage(i, X, y, y_pred, sample_weight,
                                 sample_mask, random_state, X_idx_sorted,
                                 X_csc, X_csr)

        # track deviance (= loss)
        if do_oob:
            self.train_score_[i] = loss_(y[sample_mask],
                                         y_pred[sample_mask],
                                         sample_weight[sample_mask])
            self.oob_improvement_[i] = (
                old_oob_score - loss_(y[~sample_mask],
                                      y_pred[~sample_mask],
                                      sample_weight[~sample_mask]))
        else:
            # no need to fancy index w/ no subsampling
            self.train_score_[i] = loss_(y, y_pred, sample_weight)

        if self.verbose > 0:
            verbose_reporter.update(i, self)

        if monitor is not None:
            early_stopping = monitor(i, self, locals())
            if early_stopping:
                break

        # We also provide an early stopping based on the score from
        # validation set (X_val, y_val), if n_iter_no_change is set
        if self.n_iter_no_change is not None:
            # By calling next(y_val_pred_iter), we get the predictions
            # for X_val after the addition of the current stage
            validation_loss = loss_(y_val, next(y_val_pred_iter),
                                    sample_weight_val)

            # Require validation_loss to be better (less) than at least
            # one of the last n_iter_no_change evaluations
            if np.any(validation_loss + self.tol < loss_history):
                loss_history[i % len(loss_history)] = validation_loss
            else:
                break

    return i + 1
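# The n_iter_no_change logic above keeps a ring buffer of the last
# n_iter_no_change validation losses and stops as soon as a new loss fails to
# improve on *any* of them by at least tol. A self-contained illustration of
# the same update rule, with made-up loss values:
import numpy as np

def should_continue(loss_history, validation_loss, i, tol):
    """Update the ring buffer in place; return False when training should stop."""
    if np.any(validation_loss + tol < loss_history):
        loss_history[i % len(loss_history)] = validation_loss
        return True
    return False

loss_history = np.full(5, np.inf)  # n_iter_no_change = 5
for i, loss in enumerate([0.9, 0.8, 0.79, 0.79, 0.79, 0.79, 0.79, 0.79]):
    if not should_continue(loss_history, loss, i, tol=1e-2):
        print("early stop at stage", i)  # fires once no recent loss is beaten
        break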
def fit(self, X, y, sample_weight=None):
    shuffler = Shuffler(X, random_state=self.random_state)
    X, y = check_arrays(X, y, dtype=DTYPE, sparse_format="dense",
                        check_ccontiguous=True)
    y = column_or_1d(y, warn=True)
    n_samples = len(X)
    n_inbag = int(self.subsample * n_samples)
    sample_weight = check_sample_weight(y, sample_weight=sample_weight).copy()
    self.random_state = check_random_state(self.random_state)

    # skipping all checks
    assert self.update_on in ['all', 'same', 'other', 'random']
    y_pred = numpy.zeros(len(y), dtype=float)

    self.classifiers = []
    self.learning_rates = []
    self.loss_values = []
    self.loss = copy.copy(self.loss)
    self.loss.fit(X, y, sample_weight=sample_weight)
    iter_X = shuffler.generate(0.)

    prev_smearing = 1
    for iteration in range(self.n_estimators):
        if iteration % self.recount_step == 0:
            if prev_smearing > 0:
                iter_smearing = interpolate(self.smearing, iteration,
                                            self.n_estimators)
                prev_smearing = iter_smearing
                iter_X = shuffler.generate(iter_smearing)
                iter_X, = check_arrays(iter_X, dtype=DTYPE,
                                       sparse_format="dense",
                                       check_ccontiguous=True)
            # recompute predictions from scratch to avoid drift
            y_pred = numpy.zeros(len(y))
            y_pred += sum(cl.predict(X) * rate
                          for rate, cl in zip(self.learning_rates,
                                              self.classifiers))

        self.loss_values.append(
            self.loss(y, y_pred, sample_weight=sample_weight))

        tree = DecisionTreeRegressor(
            criterion=self.criterion,
            splitter=self.splitter,
            max_depth=interpolate(self.max_depth, iteration,
                                  self.n_estimators),
            min_samples_split=self.min_samples_split,
            min_samples_leaf=interpolate(self.min_samples_leaf, iteration,
                                         self.n_estimators, use_log=True),
            max_features=self.max_features,
            random_state=self.random_state)

        sample_mask = _random_sample_mask(n_samples, n_inbag,
                                          self.random_state)
        loss_weight = (sample_weight if self.weights_in_loss
                       else numpy.ones(len(sample_weight)))
        tree_weight = (sample_weight if not self.weights_in_loss
                       else numpy.ones(len(sample_weight)))

        residual = self.loss.negative_gradient(y, y_pred,
                                               sample_weight=loss_weight)

        tree.fit(numpy.array(iter_X)[sample_mask, :],
                 residual[sample_mask],
                 sample_weight=tree_weight[sample_mask],
                 check_input=False)

        # update tree leaves
        if self.update_tree:
            if self.update_on == 'all':
                update_mask = numpy.ones(len(sample_mask), dtype=bool)
            elif self.update_on == 'same':
                update_mask = sample_mask
            elif self.update_on == 'other':
                update_mask = ~sample_mask
            else:  # random
                update_mask = _random_sample_mask(n_samples, n_inbag,
                                                  self.random_state)
            self.loss.update_terminal_regions(tree.tree_, X=iter_X, y=y,
                                              residual=residual, pred=y_pred,
                                              sample_mask=update_mask,
                                              sample_weight=sample_weight)

        iter_learning_rate = interpolate(self.learning_rate, iteration,
                                         self.n_estimators, use_log=True)
        y_pred += iter_learning_rate * tree.predict(X)
        self.classifiers.append(tree)
        self.learning_rates.append(iter_learning_rate)

    return self
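# `interpolate` above schedules a hyper-parameter over the boosting iterations
# (e.g. shrinking learning_rate or min_samples_leaf). A plausible minimal
# version, assuming a scalar is returned as-is and a (start, end) pair is
# interpolated linearly (geometrically when use_log=True, for positive values);
# the original helper may differ.
import numpy

def interpolate_sketch(param, iteration, n_iterations, use_log=False):
    if numpy.isscalar(param):
        return param
    start, end = param
    t = iteration / max(1, n_iterations - 1)  # progress in [0, 1]
    if use_log:
        return float(numpy.exp(numpy.log(start) * (1 - t)
                               + numpy.log(end) * t))
    return start * (1 - t) + end * t

# e.g. interpolate_sketch((0.5, 0.02), iteration=10, n_iterations=100,
#                         use_log=True)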
def _fit_stages(self, X, y, raw_predictions, sample_weight, random_state,
                begin_at_stage=0, monitor=None, X_idx_sorted=None):
    """Iteratively fits the stages.

    For each stage it computes the progress (OOB, train score)
    and delegates to ``_fit_stage``.
    Returns the number of stages fit; might differ from ``n_estimators``
    due to early stopping.
    """
    n_samples = X.shape[0]
    do_oob = self.subsample < 1.0
    sample_mask = numpy.ones((n_samples,), dtype=bool)
    n_inbag = max(1, int(self.subsample * n_samples))
    loss_ = self.loss_

    if self.verbose:
        verbose_reporter = VerboseReporter(self.verbose)
        verbose_reporter.init(self, begin_at_stage)

    X_csc = csc_matrix(X) if issparse(X) else None
    X_csr = csr_matrix(X) if issparse(X) else None

    if self.dropout_rate > 0.:
        scale = numpy.ones(self.n_estimators, dtype=float)
    else:
        scale = None

    # perform boosting iterations
    i = begin_at_stage
    for i in range(begin_at_stage, self.n_estimators):

        # subsampling
        if do_oob:
            sample_mask = _random_sample_mask(n_samples, n_inbag,
                                              random_state)
            # OOB score before adding this stage
            y_oob_sample = y[~sample_mask]
            old_oob_score = loss_(y_oob_sample,
                                  raw_predictions[~sample_mask],
                                  sample_weight[~sample_mask])

        # fit next stage of trees
        raw_predictions = self._fit_stage(
            i, X, y, raw_predictions, sample_weight, sample_mask,
            random_state, scale, X_idx_sorted, X_csc, X_csr)

        # track deviance (= loss)
        if do_oob:
            self.train_score_[i] = loss_(y[sample_mask],
                                         raw_predictions[sample_mask],
                                         sample_weight[sample_mask])
            self.oob_improvement_[i] = (
                old_oob_score - loss_(y_oob_sample,
                                      raw_predictions[~sample_mask],
                                      sample_weight[~sample_mask]))
        else:
            # no need to fancy index w/ no subsampling
            self.train_score_[i] = loss_(y, raw_predictions, sample_weight)

        if self.verbose > 0:
            verbose_reporter.update(i, self)

        if monitor is not None:
            early_stopping = monitor(i, self, locals())
            if early_stopping:
                break

    if self.dropout_rate > 0.:
        self.scale_ = scale

    return i + 1
def _fit(self, X, event, time, sample_weight, random_state):  # noqa: C901
    n_samples = X.shape[0]
    # account for intercept
    Xi = numpy.column_stack((numpy.ones(n_samples), X))
    y = numpy.fromiter(zip(event, time),
                       dtype=[('event', bool), ('time', numpy.float64)])
    y_pred = numpy.zeros(n_samples)

    do_oob = self.subsample < 1.0
    if do_oob:
        n_inbag = max(1, int(self.subsample * n_samples))

    do_dropout = self.dropout_rate > 0
    if do_dropout:
        scale = numpy.ones(int(self.n_estimators), dtype=float)

    if self.verbose:
        verbose_reporter = VerboseReporter(self.verbose)
        verbose_reporter.init(self, 0)

    for num_iter in range(int(self.n_estimators)):
        if do_oob:
            sample_mask = _random_sample_mask(n_samples, n_inbag,
                                              random_state)
            subsample_weight = sample_weight * sample_mask.astype(
                numpy.float64)

            # OOB score before adding this stage
            old_oob_score = self.loss_(y[~sample_mask],
                                       y_pred[~sample_mask],
                                       sample_weight[~sample_mask])
        else:
            subsample_weight = sample_weight

        residuals = self.loss_.negative_gradient(
            y, y_pred, sample_weight=sample_weight)

        best_learner = _fit_stage_componentwise(Xi, residuals,
                                                subsample_weight)
        self.estimators_.append(best_learner)

        if do_dropout:
            drop_model, n_dropped = _sample_binomial_plus_one(
                self.dropout_rate, num_iter + 1, random_state)

            scale[num_iter] = 1. / (n_dropped + 1.)

            y_pred[:] = 0
            for m in range(num_iter + 1):
                if drop_model[m] == 1:
                    scale[m] *= n_dropped / (n_dropped + 1.)
                else:
                    y_pred += (self.learning_rate * scale[m]
                               * self.estimators_[m].predict(Xi))
        else:
            y_pred += self.learning_rate * best_learner.predict(Xi)

        # track deviance (= loss)
        if do_oob:
            self.train_score_[num_iter] = self.loss_(
                y[sample_mask], y_pred[sample_mask],
                sample_weight[sample_mask])
            self.oob_improvement_[num_iter] = (
                old_oob_score - self.loss_(y[~sample_mask],
                                           y_pred[~sample_mask],
                                           sample_weight[~sample_mask]))
        else:
            # no need to fancy index w/ no subsampling
            self.train_score_[num_iter] = self.loss_(y, y_pred,
                                                     sample_weight)

        if self.verbose > 0:
            verbose_reporter.update(num_iter, self)
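# The dropout branch above implements DART-style scaling: some of the existing
# learners are "dropped" before fitting the new one, and the survivors are
# rescaled. A hedged sketch of what _sample_binomial_plus_one plausibly does,
# assuming it draws a Bernoulli(dropout_rate) mask over the n_model existing
# learners and forces at least one drop; the real helper may differ.
import numpy

def sample_binomial_plus_one_sketch(p, n_model, random_state):
    drop_model = random_state.binomial(1, p, size=n_model)
    n_dropped = int(drop_model.sum())
    if n_dropped == 0:
        # force at least one dropped model so the new tree compensates
        drop_model[random_state.randint(0, n_model)] = 1
        n_dropped = 1
    return drop_model, n_dropped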
def _fit_stages(self, X, y, y_pred, sample_weight, random_state,
                begin_at_stage=0, monitor=None):
    """Iteratively fits the stages.

    For each stage it computes the progress (OOB, train score)
    and delegates to ``_fit_stage``.
    Returns the number of stages fit; might differ from ``n_estimators``
    due to early stopping.
    """
    n_samples = X.shape[0]
    do_oob = self.subsample < 1.0
    sample_mask = numpy.ones((n_samples,), dtype=bool)
    n_inbag = max(1, int(self.subsample * n_samples))
    loss_ = self.loss_

    # Set min_weight_leaf from min_weight_fraction_leaf
    if self.min_weight_fraction_leaf != 0. and sample_weight is not None:
        min_weight_leaf = (self.min_weight_fraction_leaf *
                           numpy.sum(sample_weight))
    else:
        min_weight_leaf = 0.

    # init criterion and splitter
    criterion = FriedmanMSE(1)
    splitter = PresortBestSplitter(criterion,
                                   self.max_features_,
                                   self.min_samples_leaf,
                                   min_weight_leaf,
                                   random_state)

    if self.verbose:
        verbose_reporter = VerboseReporter(self.verbose)
        verbose_reporter.init(self, begin_at_stage)

    if self.dropout_rate > 0.:
        scale = numpy.ones(self.n_estimators, dtype=float)
    else:
        scale = None

    # perform boosting iterations
    i = begin_at_stage
    for i in range(begin_at_stage, self.n_estimators):

        # subsampling
        if do_oob:
            sample_mask = _random_sample_mask(n_samples, n_inbag,
                                              random_state)
            # OOB score before adding this stage
            y_oob_sample = y[~sample_mask]
            old_oob_score = loss_(y_oob_sample, y_pred[~sample_mask],
                                  sample_weight[~sample_mask])

        # fit next stage of trees
        y_pred = self._fit_stage(i, X, y, y_pred, sample_weight,
                                 sample_mask, criterion, splitter,
                                 random_state, scale)

        # track deviance (= loss)
        if do_oob:
            self.train_score_[i] = loss_(y[sample_mask],
                                         y_pred[sample_mask],
                                         sample_weight[sample_mask])
            self.oob_improvement_[i] = (
                old_oob_score - loss_(y_oob_sample, y_pred[~sample_mask],
                                      sample_weight[~sample_mask]))
        else:
            # no need to fancy index w/ no subsampling
            self.train_score_[i] = loss_(y, y_pred, sample_weight)

        if self.verbose > 0:
            verbose_reporter.update(i, self)

        if monitor is not None:
            early_stopping = monitor(i, self, locals())
            if early_stopping:
                break

    return i + 1
def _random_sample_mask(self, n_samples, n_inbag, random_state, **kwargs):
    # thin wrapper around the module-level helper; extra keyword
    # arguments are accepted for interface compatibility and ignored
    return _random_sample_mask(n_samples, n_inbag, random_state)
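# For reference, the module-level _random_sample_mask (a private Cython helper
# in scikit-learn) returns a boolean mask with n_inbag entries set to True,
# chosen at random. An equivalent pure-numpy sketch, assuming exactly n_inbag
# in-bag samples without replacement:
import numpy

def random_sample_mask_sketch(n_samples, n_inbag, random_state):
    mask = numpy.zeros(n_samples, dtype=bool)
    inbag = random_state.choice(n_samples, size=n_inbag, replace=False)
    mask[inbag] = True
    return mask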