def estimators_samples_(self):
    """Return one sample mask per base estimator.

    Each mask is produced by ``indices_to_mask`` from the sample indices
    yielded by ``self._get_estimators_indices()`` and has length
    ``self._n_samples``. The list is rebuilt on every call.
    """
    # The first item of each yielded pair is not needed here.
    return [
        indices_to_mask(drawn, self._n_samples)
        for _, drawn in self._get_estimators_indices()
    ]
def _set_oob_score(self, X, y):
    """Compute the out-of-bag prediction and R^2 score for the ensemble.

    For every fitted estimator, the samples that are *not* listed in its
    entry of ``self.sequentially_bootstrapped_samples_`` are predicted using
    only that estimator's feature subset. Predictions are averaged per
    sample over all estimators for which the sample was out-of-bag.

    Sets ``self.oob_prediction_`` (per-sample averaged prediction) and
    ``self.oob_score_`` (``r2_score`` of ``y`` against that prediction).
    A warning is emitted if some sample was never out-of-bag; such samples
    keep a prediction of 0.
    """
    n_samples = y.shape[0]
    oob_sum = np.zeros((n_samples, ))
    oob_count = np.zeros((n_samples, ))

    members = zip(self.estimators_,
                  self.sequentially_bootstrapped_samples_,
                  self.estimators_features_)
    for estimator, samples, features in members:
        # Samples this estimator did not see during fitting.
        out_of_bag = ~indices_to_mask(samples, n_samples)

        oob_sum[out_of_bag] += estimator.predict(
            (X[out_of_bag, :])[:, features])
        oob_count[out_of_bag] += 1

    never_out_of_bag = oob_count == 0
    if never_out_of_bag.any():
        warn("Some inputs do not have OOB scores. "
             "This probably means too few estimators were used "
             "to compute any reliable oob estimates.")
        # Avoid dividing by zero; the accumulated sum for these rows is 0.
        oob_count[never_out_of_bag] = 1

    oob_sum /= oob_count

    self.oob_prediction_ = oob_sum
    self.oob_score_ = r2_score(y, oob_sum)
def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,
                               seeds, total_n_estimators, verbose):
    """Fit one job's batch of base estimators and return them.

    For each of the ``n_estimators`` requested, a ``RandomState`` seeded
    with ``seeds[i]`` is used both to create the estimator and to draw its
    feature and sample indices via ``_generate_bagging_indices``.

    Returns a pair ``(estimators, estimators_features)`` of equal-length
    lists: the fitted estimators and the feature indices each was fit on.

    Raises ``ValueError`` if ``sample_weight`` is given but the base
    estimator's ``fit`` has no ``sample_weight`` parameter.
    """
    n_samples, n_features = X.shape
    max_features = ensemble._max_features
    max_samples = ensemble._max_samples
    with_replacement = ensemble.bootstrap
    features_with_replacement = ensemble.bootstrap_features

    support_sample_weight = has_fit_parameter(ensemble.base_estimator_,
                                              "sample_weight")
    if sample_weight is not None and not support_sample_weight:
        raise ValueError("The base estimator doesn't support sample weight")

    fitted = []
    fitted_features = []

    for position in range(1, n_estimators + 1):
        if verbose > 1:
            print("Building estimator %d of %d for this parallel run "
                  "(total %d)..." % (position, n_estimators,
                                     total_n_estimators))

        rng = np.random.RandomState(seeds[position - 1])
        estimator = ensemble._make_estimator(append=False, random_state=rng)

        features, indices = _generate_bagging_indices(
            rng, features_with_replacement, with_replacement,
            n_features, n_samples, max_features, max_samples)

        # NOTE(review): the flag is unconditionally overridden here, so the
        # weighted branch below never runs and any `sample_weight` passed in
        # is ignored during fitting. Kept as-is to preserve behaviour.
        support_sample_weight = False

        if not support_sample_weight:
            # Fit on the explicitly subsampled rows and columns.
            estimator.fit((X[indices])[:, features], y[indices])
        else:
            # Emulate the draw through per-sample weights.
            if sample_weight is None:
                weights = np.ones((n_samples,))
            else:
                weights = sample_weight.copy()

            if with_replacement:
                weights *= np.bincount(indices, minlength=n_samples)
            else:
                not_drawn = ~indices_to_mask(indices, n_samples)
                weights[not_drawn] = 1.E-6

            estimator.fit(X[:, features], y, sample_weight=weights)

        fitted.append(estimator)
        fitted_features.append(features)

    return fitted, fitted_features
def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,
                               seeds, total_n_estimators, verbose):
    """Build a batch of estimators, always keeping every positive sample.

    Samples with ``y == 1`` are included in every estimator's training
    set; only the samples with ``y < 1`` are subsampled through
    ``_generate_bagging_indices``.

    Parameters
    ----------
    n_estimators : int
        Number of estimators to build in this batch.
    ensemble : object
        Provides ``_max_features``, ``_max_samples``, ``bootstrap``,
        ``bootstrap_features``, ``base_estimator_`` and ``_make_estimator``.
    X : 2-D array
        Training data; indexed as ``X[rows]`` and ``X[:, features]``.
    y : 1-D array
        Targets; compared element-wise against 1.
    sample_weight : array or None
        Optional per-sample weights.
    seeds : sequence of int
        ``seeds[i]`` seeds the random state of the i-th estimator.
    total_n_estimators : int
        Only used in the verbose progress message.
    verbose : int
        A progress line is printed per estimator when greater than 1.

    Returns
    -------
    estimators : list
        The fitted estimators.
    estimators_features : list
        The feature indices each estimator was fit on.

    Raises
    ------
    ValueError
        If ``sample_weight`` is given but the base estimator's ``fit`` has
        no ``sample_weight`` parameter.
    """
    n_samples, n_features = X.shape
    max_features = ensemble._max_features
    max_samples = ensemble._max_samples
    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = has_fit_parameter(ensemble.base_estimator_,
                                              "sample_weight")
    if not support_sample_weight and sample_weight is not None:
        raise ValueError("The base estimator doesn't support sample weight")

    # The positive / unlabeled split depends only on `y`, so compute it once
    # (vectorised) instead of re-scanning `y` in Python for every estimator.
    y_values = np.asarray(y)
    positive_idx = np.flatnonzero(y_values == 1)
    unlabeled_idx = np.flatnonzero(y_values < 1)
    n_unlabeled = len(unlabeled_idx)

    estimators = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("Building estimator %d of %d for this parallel run "
                  "(total %d)..." % (i + 1, n_estimators, total_n_estimators))

        random_state = np.random.RandomState(seeds[i])
        estimator = ensemble._make_estimator(append=False,
                                             random_state=random_state)

        # Draw features over all columns, but samples only among the
        # unlabeled rows; the draw is expressed in positions of
        # `unlabeled_idx`, not in row numbers of X.
        features, drawn = _generate_bagging_indices(random_state,
                                                    bootstrap_features,
                                                    bootstrap, n_features,
                                                    n_unlabeled, max_features,
                                                    max_samples)
        # Map the draw back to row numbers and append every positive row.
        indices = np.concatenate((unlabeled_idx[drawn], positive_idx))

        if support_sample_weight:
            # Express the draw through per-sample weights on the full data.
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples, ))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                sample_counts = np.bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts
            else:
                not_indices_mask = ~indices_to_mask(indices, n_samples)
                curr_sample_weight[not_indices_mask] = 0

            estimator.fit(X[:, features], y, sample_weight=curr_sample_weight)
        else:
            # Fit directly on the selected rows and columns.
            estimator.fit((X[indices])[:, features], y[indices])

        estimators.append(estimator)
        estimators_features.append(features)

    return estimators, estimators_features
def _predict_score_single_estimator(estimator, X, features, samples, split,
                                    n_samples):
    """Predict and score one estimator on its out-of-bag samples.

    Returns ``(predictions, oob_score)``. ``predictions`` is an ``int8``
    array of length ``n_samples`` that is only filled at out-of-bag
    positions; ``oob_score`` is ``accuracy_score`` of ``split`` against the
    predictions, both restricted to those positions.
    """
    in_bag = indices_to_mask(samples, n_samples)
    out_of_bag = ~in_bag

    # np.empty leaves the in-bag positions uninitialised; only the
    # out-of-bag entries written below are meaningful.
    predictions = np.empty(n_samples, dtype=np.int8)
    oob_block = X[np.ix_(out_of_bag, features)]
    predictions[out_of_bag] = estimator.predict(oob_block)

    # Score the stored (int8) values rather than the raw predict() output.
    score = accuracy_score(split[out_of_bag], predictions[out_of_bag])
    return predictions, score
def _masked_bagging_indices(random_state, bootstrap_features,
                            bootstrap_samples, n_features, n_samples,
                            max_features, max_samples):
    """Wrap ``old_generate`` so that samples come back as a mask.

    All arguments are forwarded unchanged to ``old_generate``; the feature
    indices are returned as-is and the sample indices are converted with
    ``indices_to_mask`` to a mask of length ``n_samples``.
    """
    drawn = old_generate(random_state, bootstrap_features, bootstrap_samples,
                         n_features, n_samples, max_features, max_samples)
    feature_indices, sample_indices = drawn
    return feature_indices, indices_to_mask(sample_indices, n_samples)
def estimators_samples_(self):
    """The subset of drawn samples for each base estimator.

    Returns a freshly built list with one boolean mask per ensemble member;
    a mask marks the in-bag samples, i.e. those used to fit that member.

    Note: nothing is cached. The masks are regenerated on every access so
    that the sampling data does not have to be stored on the object, which
    keeps its memory footprint small at the cost of slower access.
    """
    # The first item of each yielded pair is not needed here.
    return [
        indices_to_mask(drawn, self._n_samples)
        for _, drawn in self._get_estimators_indices()
    ]
def score_precision_recall(X, y,
                           rules: List[List[str]],
                           samples: List[List[int]],
                           features: List[List[int]],
                           feature_names: List[str],
                           oob: bool = True) -> List[Rule]:
    """Score each estimator's rules on that estimator's out-of-bag samples.

    ``rules``, ``samples`` and ``features`` are parallel lists with one
    entry per estimator. For each entry, the rows of ``X`` not listed in
    ``samples`` are selected, restricted to the estimator's feature
    columns, and every distinct rule is evaluated on them with
    ``_eval_rule_perf``. If an estimator has no out-of-bag rows, the
    in-bag rows are used instead (with a warning when ``oob`` is true).

    Returns the list of scored ``Rule`` objects, or an empty list as soon
    as an estimator with at most one feature column is encountered.
    """
    n_rows = X.shape[0]
    all_names = np.array(feature_names)
    scored_rules = []

    for curr_rules, curr_samples, curr_features in zip(rules, samples,
                                                       features):
        # Rows this estimator was not trained on.
        mask = ~indices_to_mask(curr_samples, n_rows)
        if not mask.any():
            if oob:
                warn(
                    "OOB evaluation not possible: doing it in-bag. Performance evaluation is likely to be wrong"
                    " (overfitting) and selected rules are likely to not perform well! Please use max_samples < 1."
                )
            # Fall back to the in-bag rows.
            mask = curr_samples

        # XXX todo: idem without dataframe
        X_oob = pd.DataFrame((X[mask, :])[:, curr_features],
                             columns=all_names[curr_features])

        # otherwise pandas bug (cf. issue #16363)
        # NOTE(review): this returns an empty list for the whole call,
        # dropping rules already scored for earlier estimators — confirm
        # that skipping only this estimator is not what was intended.
        if X_oob.shape[1] <= 1:
            return []

        y_oob = np.array(y[mask] != 0)

        # Add OOB performances to rules:
        for rule in set(curr_rules):
            scored_rules.append(
                Rule(rule, args=_eval_rule_perf(rule, X_oob, y_oob)))

    return scored_rules
def _set_oob_score(self, X, y):
    """Compute the out-of-bag decision function and accuracy.

    For every fitted estimator, the samples not listed in its entry of
    ``self.sequentially_bootstrapped_samples_`` are scored using only that
    estimator's feature subset: class probabilities are accumulated when
    the estimator has ``predict_proba``, otherwise one vote is added for
    the predicted class.

    Sets ``self.oob_decision_function_`` (row-normalised accumulated
    scores, all-zero for samples that were never out-of-bag) and
    ``self.oob_score_`` (accuracy of the arg-max class against ``y``).
    """
    n_samples = y.shape[0]
    votes = np.zeros((n_samples, self.n_classes_))

    members = zip(self.estimators_,
                  self.sequentially_bootstrapped_samples_,
                  self.estimators_features_)
    for estimator, samples, features in members:
        # Samples this estimator did not see during fitting.
        out_of_bag = ~indices_to_mask(samples, n_samples)
        X_oob = (X[out_of_bag, :])[:, features]

        if hasattr(estimator, "predict_proba"):
            votes[out_of_bag, :] += estimator.predict_proba(X_oob)
            continue

        # Hard voting: the k-th prediction belongs to the k-th OOB row.
        predicted = estimator.predict(X_oob)
        for k, row in enumerate(np.flatnonzero(out_of_bag)):
            votes[row, predicted[k]] += 1

    totals = votes.sum(axis=1)
    if (totals == 0).any():
        warn("Some inputs do not have OOB scores. "
             "This probably means too few estimators were used "
             "to compute any reliable oob estimates.")

    # Normalise each row; rows with no votes stay at zero instead of NaN.
    column_totals = totals[:, np.newaxis]
    self.oob_decision_function_ = np.divide(votes,
                                            column_totals,
                                            out=np.zeros_like(votes),
                                            where=column_totals != 0)
    self.oob_score_ = accuracy_score(y, np.argmax(votes, axis=1))
def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight, seeds_features, seeds_samples, seeds_max_features, total_n_estimators, verbose, start_index, draw_max_features=False, circular_features=False):
    """Private function used to build a batch of estimators within a job.

    Each estimator gets three independent random states, seeded from
    ``seeds_max_features[i]``, ``seeds_features[i]`` and
    ``seeds_samples[i]``; the last one is also handed to the estimator
    itself. Features are drawn inside a window and then shifted by
    ``start_index[i]`` (wrapping modulo ``n_features``). Every estimator is
    fit against ``random_binarizer(y)`` rather than ``y`` itself.

    Returns four equal-length lists: the fitted estimators, the feature
    indices of each, the sample indices of each, and the binarized target
    each one was fit against.

    Raises ``ValueError`` if ``sample_weight`` is given but the base
    estimator's ``fit`` has no ``sample_weight`` parameter.
    """
    # Retrieve settings
    n_samples, n_features = X.shape
    max_features = ensemble._max_features
    max_samples = ensemble._max_samples
    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = has_fit_parameter(ensemble.base_estimator_, "sample_weight")
    if not support_sample_weight and sample_weight is not None:
        raise ValueError("The base estimator doesn't support sample weight")

    # Build estimators
    estimators = []
    estimators_features = []
    estimators_samples = []
    estimators_splits = []

    for i in range(n_estimators):
        if verbose > 2:
            print(
                "Building estimator %d of %d for this parallel run (total %d)..."
                % (i + 1, n_estimators, total_n_estimators))

        # One random state per concern, each with its own seed sequence.
        random_state_max_features = np.random.RandomState(
            seeds_max_features[i])
        random_state_features = np.random.RandomState(seeds_features[i])
        random_state = np.random.RandomState(seeds_samples[i])
        estimator = ensemble._make_estimator(append=False, random_state=random_state)

        # Draw random feature, sample indices
        if circular_features:
            # Full window; indices past the end wrap around via np.mod below.
            n_features_window = ensemble.window_size
            max_features_window = max_features
        else:
            # Clip the window so that start_index[i] + offset stays in range.
            n_features_window = min(ensemble.window_size, n_features - start_index[i])
            max_features_window = min(max_features, n_features - start_index[i])

        features, indices = _generate_bagging_indices(
            random_state_features, random_state, random_state_max_features,
            bootstrap_features, bootstrap, n_features_window, n_samples,
            max_features_window, max_samples,
            draw_max_features=draw_max_features)

        # Shift window-relative feature indices to absolute column indices
        # (in place).
        features += start_index[i]
        # ensure not going outside range, take the first ones instead
        np.mod(features, n_features, out=features)

        # Draw samples, using sample weights, and then fit
        # NOTE(review): random_binarizer receives no random state here;
        # confirm whether its output is reproducible across runs.
        y_binary = random_binarizer(y)
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples, ))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                # Weight each sample by how many times it was drawn.
                sample_counts = bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts
            else:
                # Zero out the samples that were not drawn.
                not_indices_mask = ~indices_to_mask(indices, n_samples)
                curr_sample_weight[not_indices_mask] = 0

            estimator.fit(X[:, features], y_binary, sample_weight=curr_sample_weight)

        # Draw samples, using a mask, and then fit
        else:
            estimator.fit((X[indices])[:, features], y_binary[indices])

        estimators.append(estimator)
        estimators_features.append(features)
        estimators_samples.append(indices)
        estimators_splits.append(y_binary)

    return estimators, estimators_features, estimators_samples, estimators_splits
def ospa_single(y_true, y_pred, minipatch=None):
    """
    OSPA score on single patch. See docstring of `ospa` for more info.

    Parameters
    ----------
    y_true, y_pred : ndarray of shape (3, x)
        arrays of (x, y, radius)
    minipatch : [row_min, row_max, col_min, col_max], optional
        Bounds of the internal scoring region (default is None)

    Returns
    -------
    (iou_sum, n_pred, n_total)
        float - sum of ious of matched entries
        int - number of matched entries
        int - total number of entries

    """
    n_true, n_pred = len(y_true), len(y_pred)

    # Nothing to find and nothing found.
    if n_true == 0 and n_pred == 0:
        return 0, 0, 0

    # Which entries of each set lie inside the scoring region.
    if minipatch is None:
        true_in_minipatch = np.ones(n_true, dtype=bool)
        pred_in_minipatch = np.ones(n_pred, dtype=bool)
    else:
        true_in_minipatch = _select_minipatch_tuples(y_true, minipatch)
        pred_in_minipatch = _select_minipatch_tuples(y_pred, minipatch)

    n_minipatch = true_in_minipatch.sum() + pred_in_minipatch.sum()

    # One of the two sets is empty: nothing can be matched.
    if n_true == 0 or n_pred == 0:
        return 0, 0, n_minipatch

    id_true, id_pred, ious = _match_tuples(y_true, y_pred)

    # Per-entry IoU, zero for entries that were not matched.
    iou_true = np.zeros(n_true)
    iou_true[id_true] = ious
    iou_pred = np.zeros(n_pred)
    iou_pred[id_pred] = ious

    # Only entries that are both matched and inside the region count.
    true_count = indices_to_mask(id_true, n_true) & true_in_minipatch
    pred_count = indices_to_mask(id_pred, n_pred) & pred_in_minipatch

    iou_global = iou_true[true_count].sum() + iou_pred[pred_count].sum()
    n_count = true_count.sum() + pred_count.sum()

    return iou_global, n_count, n_minipatch
def fit(self, X, y, sample_weight=None):
    """Fit the model according to the given training data.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples,)
        Target vector relative to X. Has to follow the convention 0 for
        normal data, 1 for anomalies.

    sample_weight : array-like, shape (n_samples,) optional
        Array of weights that are assigned to individual samples, typically
        the amount in case of transactions data. Used to grow regression
        trees producing further rules to be tested.
        If not provided, then each sample is given unit weight.

    Returns
    -------
    self : object
        Returns self.

    Raises
    ------
    ValueError
        If ``y`` contains a single class, if ``max_depth_duplication`` is
        neither an integer nor None, if ``max_samples`` is a string, or if
        a float ``max_samples`` is outside (0, 1].
    """
    X, y = check_X_y(X, y)
    check_classification_targets(y)
    self.n_features_ = X.shape[1]
    self.classes_ = np.unique(y)
    n_classes = len(self.classes_)

    if n_classes < 2:
        raise ValueError("This method needs samples of at least 2 classes"
                         " in the data, but the data contains only one"
                         " class: %r" % self.classes_[0])

    if not isinstance(self.max_depth_duplication, int) \
            and self.max_depth_duplication is not None:
        raise ValueError("max_depth_duplication should be an integer")

    if not set(self.classes_) == {0, 1}:
        warn("Found labels %s. This method assumes target class to be"
             " labeled as 1 and normal data to be labeled as 0. Any label"
             " different from 0 will be considered as being from the"
             " target class." % set(self.classes_))
        y = (y > 0)

    # ensure that max_samples is in [1, n_samples]:
    n_samples = X.shape[0]

    if isinstance(self.max_samples, six.string_types):
        # The adjacent literals previously ran together without spaces
        # ("supported.Valid", "orfloat").
        raise ValueError('max_samples (%s) is not supported. '
                         'Valid choices are: "auto", int or '
                         'float' % self.max_samples)
    elif isinstance(self.max_samples, INTEGER_TYPES):
        if self.max_samples > n_samples:
            warn("max_samples (%s) is greater than the "
                 "total number of samples (%s). max_samples "
                 "will be set to n_samples for estimation."
                 % (self.max_samples, n_samples))
            max_samples = n_samples
        else:
            max_samples = self.max_samples
    else:  # float
        if not (0. < self.max_samples <= 1.):
            raise ValueError("max_samples must be in (0, 1], got %r"
                             % self.max_samples)
        max_samples = int(self.max_samples * X.shape[0])

    self.max_samples_ = max_samples

    self.rules_ = {}
    self.estimators_ = []
    self.estimators_samples_ = []
    self.estimators_features_ = []

    # default columns names :
    feature_names_ = [
        BASE_FEATURE_NAME + x for x in np.arange(X.shape[1]).astype(str)
    ]
    # Map generic column names to user-facing names (or to themselves when
    # no names were supplied).
    if self.feature_names is not None:
        self.feature_dict_ = {
            BASE_FEATURE_NAME + str(i): feat
            for i, feat in enumerate(self.feature_names)
        }
    else:
        self.feature_dict_ = {
            BASE_FEATURE_NAME + str(i): feat
            for i, feat in enumerate(feature_names_)
        }
    self.feature_names_ = feature_names_

    clfs = []
    regs = []

    # One classifier bag and one regressor bag per requested depth.
    self._max_depths = self.max_depth \
        if isinstance(self.max_depth, Iterable) else [self.max_depth]

    for max_depth in self._max_depths:
        bagging_clf = BaggingClassifier(
            base_estimator=DecisionTreeClassifier(
                max_depth=max_depth,
                max_features=self.max_features,
                min_samples_split=self.min_samples_split),
            n_estimators=self.n_estimators,
            max_samples=self.max_samples_,
            max_features=self.max_samples_features,
            bootstrap=self.bootstrap,
            bootstrap_features=self.bootstrap_features,
            # oob_score=... XXX may be added
            # if selection on tree perf needed.
            # warm_start=... XXX may be added to increase computation perf.
            n_jobs=self.n_jobs,
            random_state=self.random_state,
            verbose=self.verbose)

        bagging_reg = BaggingRegressor(
            base_estimator=DecisionTreeRegressor(
                max_depth=max_depth,
                max_features=self.max_features,
                min_samples_split=self.min_samples_split),
            n_estimators=self.n_estimators,
            max_samples=self.max_samples_,
            max_features=self.max_samples_features,
            bootstrap=self.bootstrap,
            bootstrap_features=self.bootstrap_features,
            # oob_score=... XXX may be added
            # if selection on tree perf needed.
            # warm_start=... XXX may be added to increase computation perf.
            n_jobs=self.n_jobs,
            random_state=self.random_state,
            verbose=self.verbose)

        clfs.append(bagging_clf)
        regs.append(bagging_reg)

    # define regression target:
    if sample_weight is not None:
        sample_weight = check_array(sample_weight, ensure_2d=False)
        weights = sample_weight - sample_weight.min()
        contamination = float(np.sum(y)) / len(y)
        y_reg = (pow(weights, 0.5) * 0.5 / contamination * (y > 0)
                 - pow((weights).mean(), 0.5) * (y == 0))
        y_reg = 1. / (1 + np.exp(-y_reg))  # sigmoid
    else:
        y_reg = y  # same as an other classification bagging

    for clf in clfs:
        clf.fit(X, y)
        self.estimators_ += clf.estimators_
        self.estimators_samples_ += clf.estimators_samples_
        self.estimators_features_ += clf.estimators_features_

    for reg in regs:
        reg.fit(X, y_reg)
        self.estimators_ += reg.estimators_
        self.estimators_samples_ += reg.estimators_samples_
        self.estimators_features_ += reg.estimators_features_

    rules_ = []
    for estimator, samples, features in zip(self.estimators_,
                                            self.estimators_samples_,
                                            self.estimators_features_):

        # Create mask for OOB samples
        mask = ~indices_to_mask(samples, n_samples)
        if not mask.any():
            warn("OOB evaluation not possible: doing it in-bag."
                 " Performance evaluation is likely to be wrong"
                 " (overfitting) and selected rules are likely to"
                 " not perform well! Please use max_samples < 1.")
            mask = samples
        rules_from_tree = self._tree_to_rules(
            estimator, np.array(self.feature_names_)[features])

        # XXX todo: idem without dataframe
        X_oob = pandas.DataFrame(
            (X[mask, :])[:, features],
            columns=np.array(self.feature_names_)[features])

        if X_oob.shape[1] > 1:  # otherwise pandas bug (cf. issue #16363)
            y_oob = y[mask]
            y_oob = np.array((y_oob != 0))

            # Add OOB performances to rules:
            rules_from_tree = [(r, self._eval_rule_perf(r, X_oob, y_oob))
                               for r in set(rules_from_tree)]
            rules_ += rules_from_tree

    # Factorize rules before semantic tree filtering
    rules_ = [
        tuple(rule)
        for rule in [Rule(r, args=args) for r, args in rules_]
    ]

    # keep only rules verifying precision_min and recall_min:
    for rule, score in rules_:
        if score[0] >= self.precision_min and score[1] >= self.recall_min:
            if rule in self.rules_:
                # update the score to the new mean
                # Stored value is (precision, recall, occurrence count).
                c = self.rules_[rule][2] + 1
                b = self.rules_[rule][1] + 1. / c * (
                    score[1] - self.rules_[rule][1])
                a = self.rules_[rule][0] + 1. / c * (
                    score[0] - self.rules_[rule][0])

                self.rules_[rule] = (a, b, c)
            else:
                self.rules_[rule] = (score[0], score[1], 1)

    self.rules_ = sorted(self.rules_.items(),
                         key=lambda x: (x[1][0], x[1][1]), reverse=True)

    # Deduplicate the rule using semantic tree
    if self.max_depth_duplication is not None:
        self.rules_ = self.deduplicate(self.rules_)

    self.rules_ = sorted(self.rules_, key=lambda x: -self.f1_score(x))
    self.rules_without_feature_names_ = self.rules_

    # Replace generic feature names by real feature names
    self.rules_ = [(replace_feature_name(rule, self.feature_dict_), perf)
                   for rule, perf in self.rules_]

    return self
def bootstrap_prediction(X, y, score_func, base_estimator=None,
                         n_estimators=10, max_samples=1.0, max_features=1.0,
                         bootstrap=True, bootstrap_features=False,
                         n_jobs=None, random_state=None):
    """Bootstrap the scores from an `sklearn` estimator.

    A ``BaggingClassifier`` is fit on ``(X, y)``; each of its members is
    then evaluated with ``score_func`` on the samples it did not see
    during fitting (its out-of-bag samples), using only the feature
    columns that member was trained on.

    Args:
        X (array-like, dtype=float64, size=[n_samples, n_features]):
            Feature matrix. Must support boolean-mask row indexing.
        y (array, dtype=float64, size=[n_samples]): Target vector.
        score_func (callable): Score function (or loss function) with
            signature score_func(y, y_pred, **kwargs).
        base_estimator (object or None, optional): Defaults to None.
            The base estimator to fit on random subsets of the dataset.
            If None, then the base estimator is a decision tree.
        n_estimators (int, optional): Defaults to 10. The number of base
            estimators in the ensemble.
        max_samples (int or float, optional): Defaults to 1.0. The number
            of samples to draw from X to train each base estimator.
            If int, then draw max_samples samples. If float, then draw
            max_samples * X.shape[0] samples.
        max_features (int or float, optional): Defaults to 1.0. The number
            of features to draw from X to train each base estimator.
            If int, then draw max_features features. If float, then draw
            max_features * X.shape[1] features.
        bootstrap (bool, optional): Defaults to True. Whether samples are
            drawn with replacement.
        bootstrap_features (bool, optional): Defaults to False. Whether
            features are drawn with replacement.
        n_jobs (int or None, optional): Defaults to None. The number of
            jobs to run in parallel for both fit and predict. None means 1
            unless in a joblib.parallel_backend context.
        random_state (int, RandomState instance or None, optional):
            Defaults to None. If int, random_state is the seed used by the
            random number generator; If RandomState instance, random_state
            is the random number generator; If None, the random number
            generator is the RandomState instance used by np.random.

    Returns:
        numpy.ndarray: Distribution of score function statistic, one entry
        per base estimator.
    """
    bag = BaggingClassifier(base_estimator=base_estimator,
                            n_estimators=n_estimators,
                            max_samples=max_samples,
                            max_features=max_features,
                            bootstrap=bootstrap,
                            bootstrap_features=bootstrap_features,
                            n_jobs=n_jobs,
                            random_state=random_state)
    bag.fit(X, y)

    n_samples = len(y)
    stats = []
    for estimator, samples, features in zip(bag.estimators_,
                                            bag.estimators_samples_,
                                            bag.estimators_features_):
        # Create mask for OOB samples
        mask = ~indices_to_mask(samples, n_samples)

        # Each member was fit on its own feature subset, so it must be
        # given exactly those columns; passing all of X is wrong whenever
        # max_features < 1.0 or bootstrap_features is True.
        X_oob = X[mask]
        if hasattr(X_oob, "iloc"):
            # pandas input: select columns by position.
            X_oob = X_oob.iloc[:, features]
        else:
            X_oob = X_oob[:, features]

        # Compute predictions on out-of-bag samples
        y_pred = estimator.predict(X_oob)

        # Compute statistic
        stats.append(score_func(y[mask], y_pred))

    return np.array(stats)
def _get_support_mask(self):
    """Return the mask built from ``self.indices_``.

    Checks that the ``scores_`` attribute is present (via
    ``check_is_fitted``) and then converts ``self.indices_`` into a mask
    of length ``self.num_features_`` with ``indices_to_mask``.
    """
    check_is_fitted(self, 'scores_')
    return indices_to_mask(self.indices_, self.num_features_)