def calc_outofbag(n_samples, rf):
    """
    Recover the out-of-bag sample indices of every tree in a fitted forest.

    See https://github.com/scikit-learn-contrib/forest-confidence-interval

    Parameters
    ----------
    n_samples : int
        The number of samples used to fit the scikit-learn RandomForest
        object.
    rf : RandomForest
        Regressor or Classifier object that is already fit by scikit-learn.
        It must have been trained with ``bootstrap=True``.

    Returns
    -------
    sample_idx : list
        One entry per fitted tree, each holding the result of
        ``_generate_unsampled_indices`` for that tree, i.e. the indices
        that were left out of the tree's bootstrap draw.

    Raises
    ------
    AssertionError
        If the forest was not trained with bootstrapping.
    """
    # Raise explicitly instead of using ``assert`` so the check is not
    # stripped when Python runs with ``-O``; the exception type is unchanged.
    if not rf.bootstrap:
        raise AssertionError("Forest was not trained with bootstrapping.")

    n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, rf.max_samples)

    # Iterate the fitted trees themselves rather than range(n_estimators), so
    # the result always matches what is actually stored in ``estimators_``.
    return [
        _generate_unsampled_indices(
            tree.random_state, n_samples, n_samples_bootstrap)
        for tree in rf.estimators_
    ]
def _set_oob_score(self, X, y):
    """Compute out-of-bag predictions and score them by concordance index.

    For every fitted estimator the rows it did not see during bootstrapping
    are predicted, and the predictions are averaged per row. The averaged
    predictions are stored in ``oob_prediction_`` and the first element of
    ``concordance_index_censored(event, time, predictions)`` is stored in
    ``oob_score_``.

    Parameters
    ----------
    X : array-like
        Training data; validated with ``check_array``.
    y : pair
        Unpacked as ``event, time``.
    """
    X = check_array(X, dtype=DTYPE)
    n_samples = X.shape[0]
    event, time = y

    oob_total = np.zeros(n_samples)
    oob_count = np.zeros(n_samples)
    n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, self.max_samples)

    for tree in self.estimators_:
        oob_rows = _generate_unsampled_indices(
            tree.random_state, n_samples, n_samples_bootstrap)
        oob_total[oob_rows] += tree.predict(X[oob_rows, :], check_input=False)
        oob_count[oob_rows] += 1

    never_oob = oob_count == 0
    if never_oob.any():
        warnings.warn("Some inputs do not have OOB scores. "
                      "This probably means too few trees were used "
                      "to compute any reliable oob estimates.")
        # Avoid dividing by zero; those rows keep a prediction of 0.
        oob_count[never_oob] = 1

    oob_total /= oob_count

    self.oob_prediction_ = oob_total
    self.oob_score_ = concordance_index_censored(event, time, oob_total)[0]
def _set_oob_score(self, X, y):
    """Compute the out-of-bag decision function and accuracy.

    Each estimator is paired with the sampler that produced its training
    subset. Out-of-bag rows are determined inside that subset and mapped back
    to positions in the full data through ``sampler.sample_indices_``.

    Sets ``oob_decision_function_`` (a single array when there is one output,
    otherwise a list with one array per output) and ``oob_score_`` (the mean
    accuracy over outputs, computed only on rows that received at least one
    out-of-bag prediction).

    Parameters
    ----------
    X : array-like or CSR sparse matrix
        Training data; validated with ``check_array``.
    y : ndarray
        Targets, indexed as ``y[rows, k]`` for output ``k``.
    """
    X = check_array(X, dtype=DTYPE, accept_sparse="csr")

    n_outputs = self.n_outputs_
    class_counts = self.n_classes_
    n_samples = y.shape[0]

    proba_sums = [
        np.zeros((n_samples, class_counts[k])) for k in range(n_outputs)
    ]

    for sampler, tree in zip(self.samplers_, self.estimators_):
        kept = sampler.sample_indices_
        X_subset = X[kept]
        subset_size = y[kept].shape[0]

        n_samples_bootstrap = _get_n_samples_bootstrap(
            subset_size, self.max_samples)
        oob_local = _generate_unsampled_indices(
            tree.random_state, subset_size, n_samples_bootstrap)

        tree_proba = tree.predict_proba(
            X_subset[oob_local, :], check_input=False)
        if n_outputs == 1:
            tree_proba = [tree_proba]

        # Positions of the out-of-bag rows in the full (non-resampled) data.
        oob_global = kept[oob_local]
        for k in range(n_outputs):
            proba_sums[k][oob_global, :] += tree_proba[k]

    decisions = []
    accuracy_total = 0.0
    for k in range(n_outputs):
        row_totals = proba_sums[k].sum(axis=1)
        if (row_totals == 0).any():
            warn("Some inputs do not have OOB scores. "
                 "This probably means too few trees were used "
                 "to compute any reliable oob estimates.")

        with np.errstate(invalid="ignore", divide="ignore"):
            # with the resampling, we are likely to have rows not included
            # for the OOB score leading to division by zero
            decision = proba_sums[k] / row_totals[:, np.newaxis]

        # Rows never predicted out-of-bag come out as NaN; exclude them.
        scored = ~np.isnan(np.sum(decision, axis=1))
        decisions.append(decision)
        accuracy_total += np.mean(
            y[scored, k] == np.argmax(proba_sums[k][scored], axis=1),
            axis=0,
        )

    if n_outputs == 1:
        self.oob_decision_function_ = decisions[0]
    else:
        self.oob_decision_function_ = decisions

    self.oob_score_ = accuracy_total / n_outputs
def _set_oob_score(self, X, y):
    """Compute the out-of-bag decision function and accuracy.

    When the first output has more than two classes, the per-estimator
    predictions come from ``predict_cum_proba`` and the accumulators have
    one column fewer than the number of classes. The predicted class index
    is then the number of columns whose accumulated value exceeds 0.5.
    With two classes or fewer, the class index is the arg-max column.

    Sets ``oob_decision_function_`` (a single array when there is one output,
    otherwise a list with one array per output) and ``oob_score_`` (mean
    accuracy over outputs).

    Parameters
    ----------
    X : array-like or CSR sparse matrix
        Training data; validated with ``check_array``.
    y : ndarray
        Targets, indexed as ``y[:, k]`` for output ``k``.
    """
    X = check_array(X, dtype=DTYPE, accept_sparse='csr')

    many_classes = self.n_classes_[0] > 2
    if many_classes:
        # CHANGED TO K-1: one column fewer than the number of classes.
        n_classes_ = list(np.asarray(self.n_classes_) - 1)
    else:
        n_classes_ = self.n_classes_

    n_samples = y.shape[0]
    n_samples_bootstrap = _get_n_samples_bootstrap(n_samples,
                                                   self.max_samples)

    oob_decision_function = []
    oob_score = 0.0
    predictions = [
        np.zeros((n_samples, n_classes_[k])) for k in range(self.n_outputs_)
    ]

    for estimator in self.estimators_:
        unsampled_indices = _generate_unsampled_indices(
            estimator.random_state, n_samples, n_samples_bootstrap)
        p_estimator = estimator.predict_cum_proba(X[unsampled_indices, :],
                                                  check_input=False)

        if self.n_outputs_ == 1:
            p_estimator = [p_estimator]

        for k in range(self.n_outputs_):
            predictions[k][unsampled_indices, :] += p_estimator[k]

    for k in range(self.n_outputs_):
        if (predictions[k].sum(axis=1) == 0).any():
            warn("Some inputs do not have OOB scores. "
                 "This probably means too few trees were used "
                 "to compute any reliable oob estimates.")

        decision = (predictions[k] /
                    predictions[k].sum(axis=1)[:, np.newaxis])
        oob_decision_function.append(decision)

        if not many_classes:
            oob_score += np.mean(
                y[:, k] == np.argmax(predictions[k], axis=1), axis=0)
        else:
            # ``np.int`` was removed from NumPy (1.24); summing the boolean
            # mask directly yields the same integer count per row.
            # NOTE(review): ``predictions[k]`` holds sums over estimators,
            # not averages, so 0.5 is compared against an unnormalised
            # total - confirm this is the intended threshold.
            class_index = np.sum(predictions[k] > 0.5, axis=1)
            oob_score += np.mean(y[:, k] == class_index, axis=0)

    if self.n_outputs_ == 1:
        self.oob_decision_function_ = oob_decision_function[0]
    else:
        self.oob_decision_function_ = oob_decision_function

    self.oob_score_ = oob_score / self.n_outputs_
def _get_unsampled_indices(tree, n_samples):
    """
    An interface to get unsampled indices regardless of sklearn version.

    The helper ``_get_n_samples_bootstrap`` is looked up by attempting the
    import instead of comparing version strings, so no version-parsing
    utility is needed:

    * ``sklearn.ensemble._forest`` is tried first (current module path);
    * ``sklearn.ensemble.forest`` is tried next (older module path);
    * if neither provides the helper, ``_generate_unsampled_indices`` is
      called with its older two-argument signature.

    Parameters
    ----------
    tree : fitted tree estimator
        Only its ``random_state`` attribute is read.
    n_samples : int
        Number of samples the forest was fitted on. It is also passed as the
        ``max_samples`` argument of ``_get_n_samples_bootstrap``.

    Returns
    -------
    The value returned by ``_generate_unsampled_indices`` for this tree.
    """
    try:
        # Module path used once the forest module became private.
        from sklearn.ensemble._forest import _get_n_samples_bootstrap
    except ImportError:
        try:
            # Public module path used by older releases.
            from sklearn.ensemble.forest import _get_n_samples_bootstrap
        except ImportError:
            # Version 0.21 or older uses only two arguments.
            return _generate_unsampled_indices(tree.random_state, n_samples)

    # Newer releases use 3 arguments.
    n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)
    return _generate_unsampled_indices(
        tree.random_state, n_samples, n_samples_bootstrap)
def _set_oob_score(self, X, y):
    """Compute the out-of-bag decision function and accuracy.

    Every member of ``estimators_`` is expected to expose ``steps``; the
    random state of its last step decides which rows were out-of-bag, while
    the member itself produces the class probabilities for those rows.

    Sets ``oob_decision_function_`` (a single array when there is one output,
    otherwise a list with one array per output) and ``oob_score_`` (mean
    accuracy over outputs).

    Parameters
    ----------
    X : object supporting ``.iloc`` row selection
        Training data; checked with ``check_X_y`` and ``check_X``.
    y : ndarray
        Targets, indexed as ``y[:, k]`` for output ``k``.
    """
    check_X_y(X, y)
    check_X(X, enforce_univariate=True)

    n_outputs = self.n_outputs_
    class_counts = self.n_classes_
    n_samples = y.shape[0]

    proba_sums = [
        np.zeros((n_samples, class_counts[k])) for k in range(n_outputs)
    ]
    n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, self.max_samples)

    for member in self.estimators_:
        last_step = member.steps[-1][1]
        oob_rows = _generate_unsampled_indices(
            last_step.random_state, n_samples, n_samples_bootstrap)
        member_proba = member.predict_proba(X.iloc[oob_rows, :])

        if n_outputs == 1:
            member_proba = [member_proba]

        for k in range(n_outputs):
            proba_sums[k][oob_rows, :] += member_proba[k]

    decisions = []
    accuracy_total = 0.0
    for k in range(n_outputs):
        row_totals = proba_sums[k].sum(axis=1)
        if (row_totals == 0).any():
            warn("Some inputs do not have OOB scores. "
                 "This probably means too few trees were used "
                 "to compute any reliable oob estimates.")

        decisions.append(proba_sums[k] / row_totals[:, np.newaxis])
        accuracy_total += np.mean(
            y[:, k] == np.argmax(proba_sums[k], axis=1), axis=0)

    if n_outputs == 1:
        self.oob_decision_function_ = decisions[0]
    else:
        self.oob_decision_function_ = decisions

    self.oob_score_ = accuracy_total / n_outputs
def _set_oob_score(self, X, y):
    """Compute out-of-bag predictions and their R^2 score.

    Every member of ``estimators_`` is expected to expose ``steps``; the
    random state of its last step decides which rows were out-of-bag, while
    the member itself predicts those rows. Predictions are averaged per row.

    Sets ``oob_prediction_`` (flattened to shape ``(n_samples,)`` when there
    is a single output) and ``oob_score_`` (``r2_score`` averaged over
    outputs).

    Parameters
    ----------
    X, y
        Training data and targets; both are replaced by the return value of
        ``check_X_y(X, y, enforce_univariate=True)``.
    """
    X, y = check_X_y(X, y, enforce_univariate=True)

    n_outputs = self.n_outputs_
    n_samples = y.shape[0]

    oob_total = np.zeros((n_samples, n_outputs))
    oob_count = np.zeros((n_samples, n_outputs))
    n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, self.max_samples)

    for member in self.estimators_:
        last_step = member.steps[-1][1]
        oob_rows = _generate_unsampled_indices(
            last_step.random_state, n_samples, n_samples_bootstrap)
        member_pred = member.predict(X[oob_rows, :], check_input=False)

        if n_outputs == 1:
            # Make the single-output prediction a column.
            member_pred = member_pred[:, np.newaxis]

        oob_total[oob_rows, :] += member_pred
        oob_count[oob_rows, :] += 1

    never_oob = oob_count == 0
    if never_oob.any():
        warn("Some inputs do not have OOB scores. "
             "This probably means too few trees were used "
             "to compute any reliable oob estimates.")
        # Avoid dividing by zero; those entries keep a prediction of 0.
        oob_count[never_oob] = 1

    oob_total /= oob_count

    if n_outputs == 1:
        self.oob_prediction_ = oob_total.reshape((n_samples, ))
    else:
        self.oob_prediction_ = oob_total

    score_total = 0.0
    for k in range(n_outputs):
        score_total += r2_score(y[:, k], oob_total[:, k])
    self.oob_score_ = score_total / n_outputs
def _get_oof_pred_proba(self, X, y, **kwargs):
    """Return out-of-fold predictions derived from out-of-bag predictions.

    If the wrapped model has neither ``oob_decision_function_`` nor
    ``oob_prediction_`` set and exposes a callable ``_set_oob_score``, that
    method is invoked here to compute them. The resulting array is taken off
    the model (the attribute is reset to ``None`` to save memory), rows with
    no out-of-bag prediction are filled with the mean of the available rows,
    and the result is passed through ``_convert_proba_to_unified_form``.

    Parameters
    ----------
    X : training features; run through ``self.preprocess`` before use.
    y : training targets.
    **kwargs : accepted for interface compatibility; not used here.

    Returns
    -------
    The value of ``self._convert_proba_to_unified_form(y_oof_pred_proba)``.

    Raises
    ------
    AssertionError
        If the DAAL backend is in use, or the model provides neither
        ``oob_decision_function_`` nor ``oob_prediction_``.
    ValueError
        If the model was not configured with ``bootstrap=True``.
    """
    if self._daal:
        raise AssertionError(
            'DAAL forest backend does not support out-of-bag predictions.')
    if not self.model.bootstrap:
        raise ValueError(
            'Forest models must set `bootstrap=True` to compute out-of-fold predictions via out-of-bag predictions.'
        )

    # TODO: This can also be done via setting `oob_score=True` in model params,
    #  but getting the correct `pred_time_val` that way is not easy, since we can't time the internal call.
    oob_not_computed = (
        getattr(self.model, "oob_decision_function_", None) is None
        and getattr(self.model, "oob_prediction_", None) is None
    )
    if oob_not_computed and callable(getattr(self.model, "_set_oob_score", None)):
        # Import before mutating the model so an import failure cannot leave
        # ``n_classes_`` in its temporary wrapped form.
        from sklearn.tree._tree import DTYPE, DOUBLE

        X = self.preprocess(X)

        # A single-output classifier's ``n_classes_`` is wrapped in a list for
        # the duration of the call (presumably because ``_set_oob_score``
        # indexes it per output - verify against the sklearn version in use).
        wrap_n_classes = (
            getattr(self.model, "n_classes_", None) is not None
            and self.model.n_outputs_ == 1
        )
        if wrap_n_classes:
            self.model.n_classes_ = [self.model.n_classes_]
        try:
            X, y = self.model._validate_data(
                X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE)
            if y.ndim == 1:
                # reshape is necessary to preserve the data contiguity against vs
                # [:, np.newaxis] that does not.
                y = np.reshape(y, (-1, 1))
            if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
                y = np.ascontiguousarray(y, dtype=DOUBLE)
            self.model._set_oob_score(X, y)
        finally:
            # Always undo the temporary wrapping, even when validation or the
            # OOB computation raises, so the fitted model stays usable.
            if wrap_n_classes:
                self.model.n_classes_ = self.model.n_classes_[0]

    if getattr(self.model, "oob_decision_function_", None) is not None:
        y_oof_pred_proba = self.model.oob_decision_function_
        self.model.oob_decision_function_ = None  # save memory
    elif getattr(self.model, "oob_prediction_", None) is not None:
        y_oof_pred_proba = self.model.oob_prediction_
        self.model.oob_prediction_ = None  # save memory
    else:
        raise AssertionError(
            f'Model class {type(self.model)} does not support out-of-fold prediction generation.'
        )

    # TODO: Regression does not return NaN for missing rows, instead it sets them to 0. This makes life hard.
    #  The below code corrects the missing rows to NaN instead of 0.
    # Don't bother if >60 trees, near impossible to have missing
    # If using 68% of data for training, chance of missing for each row is 1 in 11 billion.
    if self.problem_type == REGRESSION and self.model.n_estimators <= 60:
        from sklearn.ensemble._forest import _get_n_samples_bootstrap, _generate_unsampled_indices

        n_samples = len(y)
        n_predictions = np.zeros(n_samples)
        n_samples_bootstrap = _get_n_samples_bootstrap(
            n_samples, self.model.max_samples)
        # Count, per row, how many estimators had it out-of-bag.
        for estimator in self.model.estimators_:
            unsampled_indices = _generate_unsampled_indices(
                estimator.random_state, n_samples, n_samples_bootstrap)
            n_predictions[unsampled_indices] += 1
        missing_row_mask = n_predictions == 0
        y_oof_pred_proba[missing_row_mask] = np.nan

    # fill missing prediction rows with average of non-missing rows
    if np.isnan(np.sum(y_oof_pred_proba)):
        if len(y_oof_pred_proba.shape) == 1:
            col_mean = np.nanmean(y_oof_pred_proba)
            y_oof_pred_proba[np.isnan(y_oof_pred_proba)] = col_mean
        else:
            col_mean = np.nanmean(y_oof_pred_proba, axis=0)
            inds = np.where(np.isnan(y_oof_pred_proba))
            # Each NaN cell takes the mean of its own column.
            y_oof_pred_proba[inds] = np.take(col_mean, inds[1])

    return self._convert_proba_to_unified_form(y_oof_pred_proba)
def _compute_oob_predictions(self, X, y):
    """Compute the out-of-bag predictions of the fitted ensemble.

    Each estimator is paired with the sampler that produced its training
    subset. Out-of-bag rows are determined inside that subset, predicted via
    ``self._get_oob_predictions`` and accumulated at their positions in the
    full data; the accumulated values are finally divided by the number of
    contributions per row.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data matrix.
    y : ndarray of shape (n_samples, n_outputs)
        The target matrix.

    Returns
    -------
    oob_pred : ndarray of shape (n_samples, n_classes, n_outputs) or \
            (n_samples, 1, n_outputs)
        The OOB predictions.
    """
    # Prediction requires X to be in CSR format
    if issparse(X):
        X = X.tocsr()

    n_outputs = self.n_outputs_
    n_samples = y.shape[0]

    if is_classifier(self) and hasattr(self, "n_classes_"):
        # n_classes_ is a ndarray at this stage
        # all the supported type of target will have the same number of
        # classes in all outputs
        middle_axis = self.n_classes_[0]
    else:
        # for regression, n_classes_ does not exist and we create an empty
        # axis to be consistent with the classification case and make
        # the array operations compatible with the 2 settings
        middle_axis = 1

    oob_pred = np.zeros(
        shape=(n_samples, middle_axis, n_outputs), dtype=np.float64)
    n_oob_pred = np.zeros((n_samples, n_outputs), dtype=np.int64)

    for sampler, estimator in zip(self.samplers_, self.estimators_):
        kept = sampler.sample_indices_
        X_subset = X[kept]
        subset_size = y[kept].shape[0]

        n_samples_bootstrap = _get_n_samples_bootstrap(
            subset_size, self.max_samples)
        oob_local = _generate_unsampled_indices(
            estimator.random_state, subset_size, n_samples_bootstrap)

        y_pred = self._get_oob_predictions(estimator, X_subset[oob_local, :])

        # Positions of the out-of-bag rows in the full (non-resampled) data.
        oob_global = kept[oob_local]
        oob_pred[oob_global, ...] += y_pred
        n_oob_pred[oob_global, :] += 1

    for k in range(n_outputs):
        never_oob = n_oob_pred == 0
        if never_oob.any():
            warn(
                "Some inputs do not have OOB scores. This probably means "
                "too few trees were used to compute any reliable OOB "
                "estimates.",
                UserWarning,
            )
            # Avoid dividing by zero; those rows keep a prediction of 0.
            n_oob_pred[never_oob] = 1
        oob_pred[..., k] /= n_oob_pred[..., [k]]

    return oob_pred
def get_oob_indices(tree, n_samples):
    """Return the unsampled (out-of-bag) indices for a single fitted tree.

    Parameters
    ----------
    tree : fitted tree estimator
        Only its ``random_state`` attribute is read.
    n_samples : int
        Number of samples the forest was fitted on. It is also passed as the
        ``max_samples`` argument of ``_get_n_samples_bootstrap``.

    Returns
    -------
    The value returned by ``_generate_unsampled_indices`` for this tree.
    """
    bootstrap_size = _get_n_samples_bootstrap(n_samples, n_samples)
    oob_indices = _generate_unsampled_indices(
        tree.random_state, n_samples, bootstrap_size)
    return oob_indices