def test_has_fit_parameter():
    assert not has_fit_parameter(KNeighborsClassifier, "sample_weight")
    assert has_fit_parameter(RandomForestRegressor, "sample_weight")
    assert has_fit_parameter(SVR, "sample_weight")
    assert has_fit_parameter(SVR(), "sample_weight")

    class TestClassWithDeprecatedFitMethod:
        @deprecated("Deprecated for the purpose of testing has_fit_parameter")
        def fit(self, X, y, sample_weight=None):
            pass

    assert has_fit_parameter(TestClassWithDeprecatedFitMethod,
                             "sample_weight"), \
        "has_fit_parameter fails for class with deprecated fit method."
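All of the snippets below apply the same guard: query ``has_fit_parameter(estimator, "sample_weight")`` before forwarding weights to ``fit``. A minimal, self-contained sketch of that pattern, assuming nothing beyond scikit-learn itself; the helper name ``fit_with_optional_weights`` is illustrative and does not come from any snippet here.

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.utils.validation import has_fit_parameter


def fit_with_optional_weights(estimator, X, y, sample_weight=None):
    # Illustrative helper (not taken from the snippets in this section):
    # forward sample_weight to fit() only when the estimator accepts it.
    if sample_weight is not None and has_fit_parameter(estimator, "sample_weight"):
        return estimator.fit(X, y, sample_weight=sample_weight)
    return estimator.fit(X, y)


X = np.random.rand(20, 3)
y = np.random.rand(20)
fit_with_optional_weights(LinearRegression(), X, y, sample_weight=np.ones(20))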
def fit(self, X, y, sample_weight=None):
    """Fit a separate classifier for each output variable."""
    for _, clf in self.classifiers:
        if not hasattr(clf, 'fit'):
            raise ValueError(
                'Every base classifier should implement a fit method.')

    X, y = check_X_y(X, y, multi_output=True, accept_sparse=True)

    if is_classifier(self):
        check_classification_targets(y)

    if y.ndim == 1:
        raise ValueError(
            'Output y must have at least two dimensions for '
            'multi-output classification but has only one.')

    if sample_weight is not None and any([
            not has_fit_parameter(clf, 'sample_weight')
            for _, clf in self.classifiers
    ]):
        raise ValueError(
            'One of base classifiers does not support sample weights.')

    self.classifiers_ = Parallel(n_jobs=self.n_jobs)(
        delayed(_fit_estimator)(clf, X, y[:, i], sample_weight)
        for i, (_, clf) in zip(range(y.shape[1]), self.classifiers))

    return self
def fit(self, X, y, sample_weight=None):
    """Performs ``self.reweigher_.fit_transform(X, y, sample_weight)`` and
    then ``self.estimator_.fit(X, y, sample_weight)`` using the reweighed
    samples.

    Args:
        X (pandas.DataFrame): Training samples.
        y (array-like): Training labels.
        sample_weight (array-like, optional): Sample weights.

    Returns:
        self
    """
    if not has_fit_parameter(self.estimator, 'sample_weight'):
        raise TypeError(
            "`estimator` (type: {}) does not have fit parameter"
            " `sample_weight`.".format(type(self.estimator)))

    self.reweigher_ = clone(self.reweigher)
    self.estimator_ = clone(self.estimator)

    X, sample_weight = self.reweigher_.fit_transform(
        X, y, sample_weight=sample_weight)
    self.estimator_.fit(X, y, sample_weight=sample_weight)
    return self
def fit(self, X, y, *, sample_weight=None, **kwargs):
    """Build the ensemble classifier from the training set (X, y)."""
    # Check random state
    self.random_state = check_random_state(self.random_state)

    # Convert data (X is required to be 2d and indexable)
    X, y = self._validate_data(X, y, **self.check_x_y_args)
    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X,
                                             dtype=np.float64)
        sample_weight /= sample_weight.sum()
        if np.any(sample_weight < 0):
            raise ValueError("sample_weight cannot contain negative weights")

    # Remap output
    n_samples, self.n_features_ = X.shape
    self.features_ = np.arange(self.n_features_)
    self._n_samples = n_samples
    y = self._validate_y(y)

    # Check parameters
    self._validate_estimator(default=DecisionTreeClassifier())

    # If the base estimator does not support sample weights and
    # sample_weight is not None, raise a ValueError
    support_sample_weight = has_fit_parameter(self.base_estimator_,
                                              "sample_weight")
    if not support_sample_weight and sample_weight is not None:
        raise ValueError("The base estimator doesn't support sample weight")

    self.estimators_, self.estimators_features_ = [], []

    return self._fit(X, y, sample_weight=sample_weight, **kwargs)
def fit(self, X, y, sample_weight=None):
    """ Fit the model to data.
    Fit a separate model for each output variable.

    Parameters
    ----------
    X : (sparse) array-like, shape (n_samples, n_features)
        Data.

    y : (sparse) array-like, shape (n_samples, n_outputs)
        Multi-output targets. An indicator matrix turns on multilabel
        estimation.

    sample_weight : array-like, shape = (n_samples) or \
            (n_samples, n_outputs) or None
        Sample weights. If None, then samples are equally weighted.
        Only supported if the underlying estimator supports sample
        weights.

    Returns
    -------
    self : object
    """
    if not hasattr(self.estimator, "fit"):
        raise ValueError("The base estimator should implement"
                         " a fit method")

    X, y = check_X_y(X, y, multi_output=True, accept_sparse=True)

    if is_classifier(self):
        check_classification_targets(y)

    if y.ndim == 1:
        raise ValueError("y must have at least two dimensions for "
                         "multi-output estimator but has only one.")

    if (sample_weight is not None and
            not has_fit_parameter(self.estimator, 'sample_weight')):
        raise ValueError("Underlying estimator does not support"
                         " sample weights.")

    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)

    if sample_weight is None or sample_weight.ndim == 1:
        sample_weight = [sample_weight] * y.shape[1]
    elif sample_weight.ndim == 2:
        sample_weight = sample_weight.T
    else:
        raise ValueError("sample weight must have at most two dimensions "
                         "for multi-output estimator but has more than "
                         "two.")

    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
        delayed(_fit_estimator)(
            self.estimator, X, np.ascontiguousarray(y[:, i]),
            sample_weight[i])
        for i in range(y.shape[1]))

    return self
def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,
                               seeds, total_n_estimators, verbose):
    """Private function used to build a batch of estimators within a job."""
    # Retrieve settings
    n_samples, n_features = X.shape
    max_features = ensemble._max_features
    max_samples = ensemble._max_samples
    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = has_fit_parameter(ensemble.base_estimator_,
                                              "sample_weight")
    if not support_sample_weight and sample_weight is not None:
        raise ValueError("The base estimator doesn't support sample weight")

    # Build estimators
    estimators = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("Building estimator %d of %d for this parallel run "
                  "(total %d)..." % (i + 1, n_estimators, total_n_estimators))

        random_state = np.random.RandomState(seeds[i])
        estimator = ensemble._make_estimator(append=False,
                                             random_state=random_state)

        # Draw random feature, sample indices
        features, indices = _generate_bagging_indices(random_state,
                                                      bootstrap_features,
                                                      bootstrap, n_features,
                                                      n_samples, max_features,
                                                      max_samples)

        # Draw samples, using sample weights, and then fit
        # (the override below forces the unweighted, index-based path)
        support_sample_weight = False
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples,))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                sample_counts = np.bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts
            else:
                not_indices_mask = ~indices_to_mask(indices, n_samples)
                curr_sample_weight[not_indices_mask] = 1.E-6

            estimator.fit(X[:, features], y, sample_weight=curr_sample_weight)

        # Draw samples, using a mask, and then fit
        else:
            estimator.fit((X[indices])[:, features], y[indices])

        estimators.append(estimator)
        estimators_features.append(features)

    return estimators, estimators_features
def _validate_estimator(self):
    """Check the estimator and set the base_estimator_ attribute."""
    BaseWeightBoosting._validate_estimator(
        self, default=DecisionTreeClassifier(max_depth=1))

    if not has_fit_parameter(self.base_estimator_, "sample_weight"):
        raise ValueError("%s doesn't support sample_weight."
                         % self.base_estimator_.__class__.__name__)
def _validate_estimator(self):
    """Check the estimator and set the base_estimator_ attribute."""
    BaseWeightBoosting._validate_estimator(
        self, default=DecisionTreeClassifier(max_depth=1))

    if not has_fit_parameter(self.base_estimator_, "sample_weight"):
        raise ValueError("%s doesn't support sample_weight."
                         % self.base_estimator_.__class__.__name__)
def fit(self, X, Y, sample_weight=None, **fit_params):
    """Fit the model to data matrix X and targets Y.

    Parameters
    ----------
    X : (sparse) array-like, shape (n_samples, n_features)
        Input data.

    Y : (sparse) array-like, shape (n_samples, n_outputs)
        Multi-output targets. An indicator matrix turns on multilabel
        estimation.

    sample_weight : array-like of shape (n_samples,) or None
        Sample weights. If None, then samples are equally weighted.
        Only supported if the underlying classifier supports sample
        weights.

    **fit_params : dict of string -> object
        Parameters passed to the ``estimator.fit`` method of each step.

    Returns
    -------
    self : object
    """
    self._validate_estimators()

    for _, est in self.estimators:
        if not hasattr(est, 'fit'):
            raise AttributeError(
                'Every base estimator should implement a fit method.')

    X, Y = self._validate_data(X, Y, multi_output=True, accept_sparse=True)

    if is_classifier(self):
        check_classification_targets(Y)

    if Y.ndim == 1:
        raise ValueError(
            'Output Y must have at least two dimensions for '
            'multi-output classification but has only one.')

    if sample_weight is not None and any([
            not has_fit_parameter(clf, 'sample_weight')
            for _, clf in self.estimators
    ]):
        raise ValueError(
            'One of base estimators does not support sample weights.')

    fit_params_validated = _check_fit_params(X, fit_params)

    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
        delayed(_fit_estimator)(clf, X, Y[:, i], sample_weight,
                                **fit_params_validated)
        for i, (_, clf) in zip(range(Y.shape[1]), self.estimators))

    self.classes_ = [est.classes_ for est in self.estimators_]

    return self
def fit(self, X, y, sample_weight=None):
    """ Fit the estimators.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape = [n_samples]
        Target values.

    sample_weight : array-like, shape = [n_samples] or None
        Sample weights. If None, then samples are equally weighted.
        Note that this is supported only if all underlying estimators
        support sample weights.

    Returns
    -------
    self : object
    """
    if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
        raise NotImplementedError('Multilabel and multi-output'
                                  ' classification is not supported.')

    if self.estimators is None or len(self.estimators) == 0:
        raise AttributeError('Invalid `estimators` attribute, `estimators`'
                             ' should be a list of (string, estimator)'
                             ' tuples')

    if sample_weight is not None:
        for name, step in self.estimators:
            if not has_fit_parameter(step, 'sample_weight'):
                raise ValueError('Underlying estimator \'%s\' does not'
                                 ' support sample weights.' % name)

    names, clfs = zip(*self.estimators)
    self._validate_names(names)

    n_isnone = np.sum([clf is None for _, clf in self.estimators])
    if n_isnone == len(self.estimators):
        raise ValueError('All estimators are None. At least one is '
                         'required to be a classifier!')

    self.le_ = LabelEncoder().fit(y)
    self.classes_ = self.le_.classes_
    self.estimators_ = []

    transformed_y = self.le_.transform(y)

    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
        delayed(_parallel_fit_estimator)(
            clone(clf), X, transformed_y, sample_weight=sample_weight)
        for clf in clfs if clf is not None)

    self.named_estimators_ = Bunch(**dict())
    for k, e in zip(self.estimators, self.estimators_):
        self.named_estimators_[k[0]] = e
    return self
def _validate_estimator(self):
    """Check the estimator and set the base_estimator_ attribute."""
    super(AdaCC, self)._validate_estimator(
        default=DecisionTreeClassifier(max_depth=1, criterion='entropy'))

    # Boosting reweights samples at each iteration, so the base estimator
    # must accept a ``sample_weight`` fit parameter
    if not has_fit_parameter(self.base_estimator_, "sample_weight"):
        raise ValueError("%s doesn't support sample_weight."
                         % self.base_estimator_.__class__.__name__)
def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,
                               seeds, total_n_estimators, verbose):
    n_samples, n_features = X.shape
    max_features = ensemble._max_features
    max_samples = ensemble._max_samples
    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = has_fit_parameter(ensemble.base_estimator_,
                                              "sample_weight")
    if not support_sample_weight and sample_weight is not None:
        raise ValueError("The base estimator doesn't support sample weight")

    estimators = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("Building estimator %d of %d for this parallel run "
                  "(total %d)..." % (i + 1, n_estimators, total_n_estimators))

        random_state = np.random.RandomState(seeds[i])
        estimator = ensemble._make_estimator(append=False,
                                             random_state=random_state)

        # Indices with y == 1 and indices with y < 1
        iP = [pair[0] for pair in enumerate(y) if pair[1] == 1]
        iU = [pair[0] for pair in enumerate(y) if pair[1] < 1]

        features, indices = _generate_bagging_indices(random_state,
                                                      bootstrap_features,
                                                      bootstrap, n_features,
                                                      len(iU), max_features,
                                                      max_samples)
        # Bag over the y < 1 indices only; always keep all y == 1 samples
        indices = [iU[i] for i in indices] + iP

        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples,))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                sample_counts = np.bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts
            else:
                not_indices_mask = ~indices_to_mask(indices, n_samples)
                curr_sample_weight[not_indices_mask] = 0

            estimator.fit(X[:, features], y, sample_weight=curr_sample_weight)
        else:
            estimator.fit((X[indices])[:, features], y[indices])

        estimators.append(estimator)
        estimators_features.append(features)

    return estimators, estimators_features
def __init__(self, base_estimator=SVC(), n_iters=10, domain_column='domain',
             verbose=False):
    assert getattr(base_estimator, 'fit', None) is not None
    assert getattr(base_estimator, 'predict', None) is not None
    assert isinstance(n_iters, int) and n_iters > 0
    assert has_fit_parameter(base_estimator, "sample_weight")

    self.base_estimator = base_estimator
    self.n_iters = n_iters
    self.verbose = verbose
    self.domain_column = domain_column
def partial_fit(self, X, y, classes=None, sample_weight=None):
    """Incrementally fit the model to data.
    Fit a separate model for each output variable.

    Parameters
    ----------
    X : (sparse) array-like, shape (n_samples, n_features)
        Data.

    y : (sparse) array-like, shape (n_samples, n_outputs)
        Multi-output targets.

    classes : list of numpy arrays, shape (n_outputs)
        Each array contains the unique classes (str/int) for one output.
        Can be obtained via
        ``[np.unique(y[:, i]) for i in range(y.shape[1])]``, where y is the
        target matrix of the entire dataset.
        This argument is required for the first call to partial_fit
        and can be omitted in the subsequent calls.
        Note that y doesn't need to contain all labels in `classes`.

    sample_weight : array-like, shape = (n_samples) or None
        Sample weights. If None, then samples are equally weighted.
        Only supported if the underlying regressor supports sample
        weights.

    Returns
    -------
    self : object
        Returns self.
    """
    X, y = check_X_y(X, y, multi_output=True, accept_sparse=True)

    if y.ndim == 1:
        raise ValueError("y must have at least two dimensions for "
                         "multi-output regression but has only one.")

    if (sample_weight is not None and
            not has_fit_parameter(self.estimator, 'sample_weight')):
        raise ValueError("Underlying estimator does not support"
                         " sample weights.")

    first_time = not hasattr(self, 'estimators_')

    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
        delayed(_partial_fit_estimator)(
            self.estimators_[i] if not first_time else self.estimator,
            X, y[:, i],
            classes[i] if classes is not None else None,
            sample_weight, first_time)
        for i in range(y.shape[1]))

    return self
def check_sample_weights_list(name, estimator_orig):
    # check that estimators will accept a 'sample_weight' parameter of
    # type list in the 'fit' function.
    if has_fit_parameter(estimator_orig, "sample_weight"):
        estimator = clone(estimator_orig)
        rnd = np.random.RandomState(0)
        X = pairwise_estimator_convert_X(rnd.uniform(size=(10, 3)),
                                         estimator_orig)
        y = np.arange(10) % 2
        y = multioutput_estimator_convert_y_2d(estimator, y)
        sample_weight = [3] * 10
        # Test that estimators don't raise any exception
        estimator.fit(X, y, sample_weight=sample_weight)
def _careful_parallel_build_estimators(n_estimators, ensemble, X, y,
                                       sample_weight, seeds,
                                       total_n_estimators, verbose):
    """
    Modified from sklearn.ensemble._parallel_build_estimators()
    Private function used to build a batch of estimators within a job.
    """
    # Retrieve settings
    n_samples, n_features = X.shape
    max_features = ensemble._max_features
    max_samples = ensemble._max_samples
    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = has_fit_parameter(ensemble.base_estimator_,
                                              "sample_weight")
    if not support_sample_weight and sample_weight is not None:
        raise ValueError("The base estimator doesn't support sample weight")

    # Build estimators
    estimators = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("Building estimator %d of %d for this parallel run "
                  "(total %d)..." % (i + 1, n_estimators, total_n_estimators))

        random_state = np.random.RandomState(seeds[i])
        estimator = ensemble._make_estimator(append=False,
                                             random_state=random_state)

        ''' UPDATED SAMPLING SECTION '''
        # Draw random feature, sample indices
        features, indices = _generate_bagging_indices(
            random_state, bootstrap_features, bootstrap,
            n_features, n_samples, max_features, max_samples)

        while len(np.unique(y[indices])) < 2:
            # Resample until training set is not single-class
            features, indices = _generate_bagging_indices(
                random_state, bootstrap_features, bootstrap,
                n_features, n_samples, max_features, max_samples)

        # Don't use sample weights, to be compatible with LinearSVC
        estimator.fit((X[indices])[:, features], y[indices])
        ''' END OF MODIFIED SECTION '''

        estimators.append(estimator)
        estimators_features.append(features)

    return estimators, estimators_features
def _validate_estimator(self):
    """Check the estimator and set the base_estimator_ attribute."""
    super(AdaCostClassifier, self)._validate_estimator(
        default=DecisionTreeClassifier(max_depth=1))

    # SAMME-R requires predict_proba-enabled base estimators
    if self.algorithm == 'SAMME.R':
        if not hasattr(self.base_estimator_, 'predict_proba'):
            raise TypeError(
                "AdaCostClassifier with algorithm='SAMME.R' requires "
                "that the weak learner supports the calculation of class "
                "probabilities with a predict_proba method.\n"
                "Please change the base estimator or set "
                "algorithm='SAMME' instead.")
    if not has_fit_parameter(self.base_estimator_, "sample_weight"):
        raise ValueError("%s doesn't support sample_weight."
                         % self.base_estimator_.__class__.__name__)
def _validate_estimator(self):
    """Check the estimator and set the base_estimator_ attribute."""
    super(AdaBoostClassifier, self)._validate_estimator(
        default=DecisionTreeClassifier(max_depth=1))

    # SAMME-R requires predict_proba-enabled base estimators
    if self.algorithm == 'SAMME.R':
        if not hasattr(self.base_estimator_, 'predict_proba'):
            raise TypeError(
                "AdaBoostClassifier with algorithm='SAMME.R' requires "
                "that the weak learner supports the calculation of class "
                "probabilities with a predict_proba method.\n"
                "Please change the base estimator or set "
                "algorithm='SAMME' instead.")
    if not has_fit_parameter(self.base_estimator_, "sample_weight"):
        raise ValueError("%s doesn't support sample_weight."
                         % self.base_estimator_.__class__.__name__)
def fit(self, X, y, sample_weight=None):
    """ Fit the model to data.
    Fit a separate model for each output variable.

    Parameters
    ----------
    X : (sparse) array-like, shape (n_samples, n_features)
        Data.

    y : (sparse) array-like, shape (n_samples, n_outputs)
        Multi-output targets. An indicator matrix turns on multilabel
        estimation.

    sample_weight : array-like, shape = (n_samples) or None
        Sample weights. If None, then samples are equally weighted.
        Only supported if the underlying regressor supports sample
        weights.

    Returns
    -------
    self : object
        Returns self.
    """
    if not hasattr(self.estimator, "fit"):
        raise ValueError("The base estimator should implement a fit method")

    # Ignore this because it chokes on NAs
    # X, y = check_X_y(X, y, multi_output=True, accept_sparse=True)
    X, y = np.array(X), np.array(y)

    if y.ndim == 1:
        raise ValueError("y must have at least two dimensions for "
                         "multi target regression but has only one.")

    if (sample_weight is not None and
            not has_fit_parameter(self.estimator, 'sample_weight')):
        raise ValueError("Underlying regressor does not support"
                         " sample weights.")

    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
        delayed(_fit_estimator)(self.estimator, X, y[:, i], sample_weight)
        for i in range(y.shape[1]))
    return self
def compute_cache_classifier_predictions(X, y, sample_weights, estimator,
                                         population):
    support_predict_proba = hasattr(estimator, "predict_proba")
    support_sample_weight = has_fit_parameter(estimator, "sample_weight")

    for organizm in population:
        estimator.random_state = organizm.random_state
        # Use an explicit None check: truth-testing a numpy array raises
        if sample_weights is not None and support_sample_weight:
            estimator.fit(
                X[organizm.genome_samples, :][:, organizm.genome_features],
                y[organizm.genome_samples],
                sample_weights[organizm.genome_samples],
            )
        else:
            estimator.fit(
                X[organizm.genome_samples, :][:, organizm.genome_features],
                y[organizm.genome_samples])

        if support_predict_proba:
            organizm.cache_predictions = estimator.predict_proba(
                X[:, organizm.genome_features])
        else:
            predictions = estimator.predict(X[:, organizm.genome_features])
            organizm.cache_predictions = np.zeros(
                (predictions.shape[0], len(estimator.classes_)))
            for i in range(predictions.shape[0]):
                organizm.cache_predictions[i, predictions[i]] += 1
def fit(self, X, y, sample_weight=None):
    """ Fit the model to data.
    Fit a separate model for each output variable.

    Parameters
    ----------
    X : (sparse) array-like, shape (n_samples, n_features)
        Data.

    y : (sparse) array-like, shape (n_samples, n_outputs)
        Multi-output targets. An indicator matrix turns on multilabel
        estimation.

    sample_weight : array-like, shape = (n_samples) or None
        Sample weights. If None, then samples are equally weighted.
        Only supported if the underlying regressor supports sample
        weights.

    Returns
    -------
    self
    """
    if not hasattr(self.estimator, "fit"):
        raise ValueError("The base estimator should implement a fit method")

    X, y = check_X_y(X, y, multi_output=True, accept_sparse=True)

    if y.ndim == 1:
        raise ValueError("y must have at least two dimensions for "
                         "multi target regression but has only one.")

    if (sample_weight is not None and
            not has_fit_parameter(self.estimator, 'sample_weight')):
        raise ValueError("Underlying regressor does not support"
                         " sample weights.")

    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
        delayed(_fit_estimator)(self.estimator, X, y[:, i], sample_weight)
        for i in range(y.shape[1]))
    return self
def _grow(self, X, y, weights=None): """Grow and prune a Linear Tree from the training set (X, y). Parameters ---------- X : array-like of shape (n_samples, n_features) The training input samples. y : array-like of shape (n_samples, ) The target values (class labels in classification, real numbers in regression). weights : array-like of shape (n_samples, ), default=None Sample weights. If None, then samples are equally weighted. Note that if the base estimator does not support sample weighting, the sample weights are still used to evaluate the splits. Returns ------- self : object """ n_sample, self.n_features_in_ = X.shape self.feature_importances_ = np.zeros((self.n_features_in_, )) # extract quantiles bins = np.linspace(0, 1, self.max_bins)[1:-1] bins = np.quantile(X, bins, axis=0, interpolation="midpoint") bins = list(bins.T) bins = [ np.unique(X[:, c]) if c in self._categorical_features else np.unique(q) for c, q in enumerate(bins) ] # check if base_estimator supports fitting with sample_weights support_sample_weight = has_fit_parameter(self.base_estimator, "sample_weight") queue = [""] # queue of the nodes to evaluate for splitting # store the results of each node in dicts self._nodes = {} self._leaves = {} # initialize first fit largs = {"classes": None} model = deepcopy(self.base_estimator) if weights is None or not support_sample_weight: model.fit(X[:, self._linear_features], y) else: model.fit(X[:, self._linear_features], y, sample_weight=weights) if hasattr(self, "classes_"): largs["classes"] = self.classes_ loss = CRITERIA[self.criterion](model, X[:, self._linear_features], y, weights=weights, **largs) self._nodes[""] = Node(id=0, n_samples=n_sample, model=model, loss=loss, classes=largs["classes"]) # in the beginning consider all the samples start = np.repeat(True, n_sample) mask = start.copy() i = 1 while len(queue) > 0: if weights is None: split_t, split_col, left_node, right_node = self._split( X[mask], y[mask], bins, support_sample_weight, loss=loss) else: split_t, split_col, left_node, right_node = self._split( X[mask], y[mask], bins, support_sample_weight, weights[mask], loss=loss, ) # no utility in splitting if split_col is None or len(queue[-1]) >= self.max_depth: self._leaves[queue[-1]] = self._nodes[queue[-1]] del self._nodes[queue[-1]] queue.pop() else: model_left, loss_left, wloss_left, n_left, class_left = left_node model_right, loss_right, wloss_right, n_right, class_right = right_node self.feature_importances_[ split_col] += loss - wloss_left - wloss_right self._nodes[queue[-1] + "L"] = Node( id=i, parent=queue[-1], model=model_left, loss=loss_left, w_loss=wloss_left, n_samples=n_left, threshold=self._nodes[queue[-1]].threshold[:] + [(split_col, "L", split_t)], ) self._nodes[queue[-1] + "R"] = Node( id=i + 1, parent=queue[-1], model=model_right, loss=loss_right, w_loss=wloss_right, n_samples=n_right, threshold=self._nodes[queue[-1]].threshold[:] + [(split_col, "R", split_t)], ) if hasattr(self, "classes_"): self._nodes[queue[-1] + "L"].classes = class_left self._nodes[queue[-1] + "R"].classes = class_right self._nodes[queue[-1]].children = (queue[-1] + "L", queue[-1] + "R") i += 2 q = queue[-1] queue.pop() queue.extend([q + "R", q + "L"]) if len(queue) > 0: loss = self._nodes[queue[-1]].loss mask = _predict_branch(X, self._nodes[queue[-1]].threshold, start.copy()) self.node_count = i return self
def fit(self, X, y, sample_weight=None): random_state = check_random_state(self.random_state) # Convert data X, y = check_X_y(X, y, ["csr", "csc", "coo"]) # Remap output n_samples, self.n_features_ = X.shape y = self._validate_y(y) # Check parameters self._validate_estimator() if isinstance(self.min_samples, (numbers.Integral, np.integer)): min_samples = self.min_samples else: # float min_samples = int(self.min_samples * X.shape[0]) if not (0 < min_samples <= X.shape[0]): raise ValueError("min_samples must be in (0, n_samples]") if isinstance(self.min_features, (numbers.Integral, np.integer)): min_features = self.min_features else: # float min_features = int(self.min_features * self.n_features_) if not (0 < min_features <= self.n_features_): raise ValueError("min_features must be in (0, n_features]") if self.min_estimators <= 0: raise ValueError("min_estimators must be greater than 0") support_predict_proba = hasattr(self.base_estimator_, "predict_proba") support_sample_weight = has_fit_parameter(self.base_estimator_, "sample_weight") if not support_sample_weight and sample_weight is not None: raise ValueError("The base estimator doesn't support sample weight") self.estimators_ = [] accuracy_per_generation = np.zeros((self.tmax,), dtype=float) _best_accuracy = 0 _best_organizms = [] _populations = [] _contributions = [] _estimator = self._make_estimator(append=False) _offsprings = [Organizm(n_samples, self.n_features_, self.ps, self.pf, random_state) for _ in range(self.N1)] def _append_population(): population = [Organizm(n_samples, self.n_features_, self.ps, self.pf, random_state) for _ in range(self.N0)] _populations.append(population) _contributions.append(np.arange(self.d1, dtype=float)) self.compute_population_predictions(X, y, sample_weight, _estimator, population) def _remove_population(idx): del _populations[idx] del _contributions[idx] def _compute_cache_accuracy_contribution(fitness_func, population): accuracy_without_target = fitness_func() for organizm in population: organizm.cache_accuracy = fitness_func(organizm) organizm.cache_contribution = organizm.cache_accuracy - accuracy_without_target def genchoices(): res = set() while len(res) <= self.N1: first = random_state.randint(0, self.N0 - 1) second = random_state.randint(first + 1, self.N0) res.add((first, second)) return res for _ in range(self.min_estimators): _append_population() for generation in range(self.tmax): idx = 0 print("Generation #{0}".format(generation)) while idx < len(_populations): population = _populations[idx] fitness_func = self.get_estimator_fitness_func( [o[0] for i, o in enumerate(_populations) if i != idx], y, sample_weight ) # crossover _compute_cache_accuracy_contribution(fitness_func, population) choices = genchoices() for offspring, (first, second) in zip(_offsprings, choices): offspring.crossover(population[first], population[second]) offspring.mutation(self.pm) self.compute_population_predictions(X, y, sample_weight, _estimator, _offsprings) _compute_cache_accuracy_contribution(fitness_func, _offsprings) population.sort(reverse=True, key=lambda x: x.cache_accuracy) a = sorted(population[: self.N2] + _offsprings, reverse=True, key=lambda x: x.cache_accuracy) dead = population[self.N2 :] _populations[idx] = a[: self.N0] _offsprings = dead + a[self.N0 :] accuracy_per_generation[generation] = max(accuracy_per_generation[generation], a[0].cache_accuracy) print( "Estimator #{0} from {1}, accuracy {2}, contribution {3}".format( idx, len(_populations), a[0].cache_accuracy, a[0].cache_contribution ) ) 
_contributions[idx][generation % self.d1] = a[0].cache_contribution if len(_populations) > self.min_estimators and _contributions[idx].mean() < self.eps1: print("Estimator #{0} removed, contribution was {1}".format(idx, _contributions[idx].mean())) _remove_population(idx) else: idx += 1 if ( generation - self.d3 - 1 >= 0 and ( accuracy_per_generation[generation - self.d2 : generation].max() - accuracy_per_generation[: generation - self.d3].max() ) < self.eps3 ): print("Seems that adding new population doesn't helps, stopping...") break if ( generation - self.d2 - 1 >= 0 and ( accuracy_per_generation[generation - self.d2 : generation].max() - accuracy_per_generation[: generation - self.d2].max() ) < self.eps2 ): _append_population() print("Stagnation, let's add new population") self.estimators_ = [] self.estimators_features_ = [] self.estimators_weights_ = [] for population in _populations: organizm = population[0] estimator = self._make_estimator(append=False) estimator.random_state = organizm.random_state self.estimators_.append( estimator.fit( X[organizm.genome_samples, :][:, organizm.genome_features], y[organizm.genome_samples], sample_weight[organizm.genome_samples] if sample_weight is not None else None, ) ) self.estimators_features_.append(organizm.genome_features) self.estimators_weights_.append(organizm.cache_est_weight) return self
def fit(self, X, y, sample_weight=None): """Fit all base estimators. Parameters ---------- X : 2d numpy array or sparse matrix of shape [n_samples, n_features] Training data y : 1d numpy array of shape [n_samples] Target values. sample_weight : 1d numpy array of shape [n_samples] Individual weights for each sample. Passed to fit method of each estimator. Note: will be split automatically for each fold. Returns ------- self : object Fitted StackingTransformer instance. """ # --------------------------------------------------------------------- # Validation # --------------------------------------------------------------------- # --------------------------------------------------------------------- # Check input data # --------------------------------------------------------------------- # Check X and y # ``check_estimator`` does not allow ``force_all_finite=False`` X, y = check_X_y(X, y, accept_sparse=['csr'], # allow csr, cast all others to csr force_all_finite=True, # do not allow nan and inf multi_output=False) # allow only one column in y_train # Check X and sample_weight # X is alredy checked, but we need it to compare length of sample_weight if sample_weight is not None: X, sample_weight = check_X_y(X, sample_weight, accept_sparse=['csr'], force_all_finite=True, multi_output=False) # --------------------------------------------------------------------- # Check ``estimators`` # --------------------------------------------------------------------- if self.estimators is None: if self.regression: self.estimators_ = [('dumregr', DummyRegressor(strategy='constant', constant=5.5))] else: self.estimators_ = [('dumclf', DummyClassifier(strategy='constant', constant=1))] # warnings.warn('No estimators were specified. ' # 'Using single dummy estimator as demo.', UserWarning) else: if 0 == len(self.estimators): raise ValueError('List of estimators is empty') else: # Clone self.estimators_ = [(name, clone(estim)) for name, estim in self.estimators] # Check names of estimators names, estims = zip(*self.estimators_) self._validate_names(names) # Check if all estimators support ``sample_weight`` if sample_weight is not None: for name, estim in self.estimators_: if not has_fit_parameter(estim, 'sample_weight'): raise ValueError('Underlying estimator [%s] does not ' 'support sample weights.' 
% name) # --------------------------------------------------------------------- # Check other StackingTransformer parameters # --------------------------------------------------------------------- # ``variant`` if self.variant not in ['A', 'B']: raise ValueError('Parameter ``variant`` must be set properly') # ``n_folds`` if not isinstance(self.n_folds, int): raise ValueError('Parameter ``n_folds`` must be integer') if not self.n_folds > 1: raise ValueError('Parameter ``n_folds`` must be not less than 2') # ``verbose`` if self.verbose not in [0, 1, 2]: raise ValueError('Parameter ``verbose`` must be 0, 1, or 2') # Additional check for inapplicable parameter combinations # If ``regression=True`` we ignore classification-specific # parameters and issue user warning if self.regression and (self.needs_proba or self.stratified): warn_str = ('This is regression task hence classification-specific' 'parameters set to ``True`` were ignored:') if self.needs_proba: self.needs_proba = False warn_str += ' ``needs_proba``' if self.stratified: self.stratified = False warn_str += ' ``stratified``' warnings.warn(warn_str, UserWarning) # --------------------------------------------------------------------- # Compute attributes (basic properties of data, number of estimators, etc.) # --------------------------------------------------------------------- self.train_shape_ = X.shape self.n_train_examples_ = X.shape[0] self.n_features_ = X.shape[1] if not self.regression: self.n_classes_ = len(np.unique(y)) else: self.n_classes_ = None self.n_estimators_ = len(self.estimators_) self.train_footprint_ = self._get_footprint(X) # --------------------------------------------------------------------- # Specify default metric # --------------------------------------------------------------------- if self.metric is None and self.regression: self.metric_ = mean_absolute_error elif self.metric is None and not self.regression: if self.needs_proba: self.metric_ = log_loss else: self.metric_ = accuracy_score else: self.metric_ = self.metric # --------------------------------------------------------------------- # Create report header strings and print report header # --------------------------------------------------------------------- if self.verbose > 0: if self.regression: task_str = 'task: [regression]' else: task_str = 'task: [classification]' n_classes_str = 'n_classes: [%d]' % self.n_classes_ metric_str = 'metric: [%s]' % self.metric_.__name__ variant_str = 'variant: [%s]' % self.variant n_estimators_str = 'n_estimators: [%d]' % self.n_estimators_ print(task_str) if not self.regression: print(n_classes_str) print(metric_str) print(variant_str) print(n_estimators_str + '\n') # --------------------------------------------------------------------- # Initialize cross-validation split # Stratified can be used only for classification # --------------------------------------------------------------------- if not self.regression and self.stratified: self.kf_ = StratifiedKFold(n_splits=self.n_folds, shuffle=self.shuffle, random_state=self.random_state) # Save target to be able to create stratified split in ``transform`` method # This is more efficient than to save split indices self._y_ = y.copy() else: self.kf_ = KFold(n_splits=self.n_folds, shuffle=self.shuffle, random_state=self.random_state) self._y_ = None # --------------------------------------------------------------------- # Compute implicit number of classes to create appropriate empty arrays. # !!! Important. 
In order to unify array creation # variable ``n_classes_implicit_`` is always equal to 1, except the case # when we performing classification task with ``needs_proba=True`` # --------------------------------------------------------------------- if not self.regression and self.needs_proba: self.n_classes_implicit_ = len(np.unique(y)) self.action_ = 'predict_proba' else: self.n_classes_implicit_ = 1 self.action_ = 'predict' # --------------------------------------------------------------------- # Create empty numpy array for train predictions (OOF) # !!! Important. We have to implicitly predict during fit # in order to compute CV scores, because # the most reasonable place to print out CV scores is fit method # --------------------------------------------------------------------- S_train = np.zeros((X.shape[0], self.n_estimators_ * self.n_classes_implicit_)) # --------------------------------------------------------------------- # Prepare (clone) estmators for fitting and storing # We need models_A_ for both variant A and varian B # We need models_B_ for varian B only (in variant A attribute models_B_ is None) # --------------------------------------------------------------------- self.models_A_ = [] self.models_B_ = None for n, est in self.estimators_: self.models_A_.append([clone(est) for _ in range(self.n_folds)]) if self.variant in ['B']: self.models_B_ = [clone(est) for n, est in self.estimators_] # --------------------------------------------------------------------- # Create empty numpy array to store scores for each estimator and each fold # --------------------------------------------------------------------- self.scores_ = np.zeros((self.n_estimators_, self.n_folds)) # --------------------------------------------------------------------- # Create empty list to store name, mean and std for each estimator # --------------------------------------------------------------------- self.mean_std_ = [] # --------------------------------------------------------------------- # MAIN FIT PROCEDURE # --------------------------------------------------------------------- # Loop across estimators # --------------------------------------------------------------------- for estimator_counter, (name, estimator) in enumerate(self.estimators_): if self.verbose > 0: estimator_str = 'estimator %2d: [%s: %s]' % (estimator_counter, name, estimator.__class__.__name__) print(estimator_str) # ----------------------------------------------------------------- # Loop across folds # ----------------------------------------------------------------- for fold_counter, (tr_index, te_index) in enumerate(self.kf_.split(X, y)): # Split data and target X_tr = X[tr_index] y_tr = y[tr_index] X_te = X[te_index] y_te = y[te_index] # Split sample weights accordingly (if passed) if sample_weight is not None: sample_weight_tr = sample_weight[tr_index] # sample_weight_te = sample_weight[te_index] else: sample_weight_tr = None # sample_weight_te = None # Fit estimator _ = self._estimator_action(self.models_A_[estimator_counter][fold_counter], X_tr, y_tr, None, sample_weight=sample_weight_tr, action='fit', transform=self.transform_target) # Predict out-of-fold part of train set if 'predict_proba' == self.action_: col_slice_estimator = slice(estimator_counter * self.n_classes_implicit_, estimator_counter * self.n_classes_implicit_ + self.n_classes_implicit_) else: col_slice_estimator = estimator_counter S_train[te_index, col_slice_estimator] = self._estimator_action(self.models_A_[estimator_counter][fold_counter], None, None, X_te, 
action=self.action_, transform=self.transform_pred) # Compute score score = self.metric_(y_te, S_train[te_index, col_slice_estimator]) self.scores_[estimator_counter, fold_counter] = score # Print fold score if self.verbose > 1: fold_str = ' fold %2d: [%.8f]' % (fold_counter, score) print(fold_str) # Compute mean and std and save in dict estim_name = self.estimators_[estimator_counter][0] estim_mean = np.mean(self.scores_[estimator_counter]) estim_std = np.std(self.scores_[estimator_counter]) self.mean_std_.append((estim_name, estim_mean, estim_std)) if self.verbose > 1: sep_str = ' ----' print(sep_str) # Compute mean + std (and full) if self.verbose > 0: mean_str = ' MEAN: [%.8f] + [%.8f]\n' % (estim_mean, estim_std) print(mean_str) # Fit estimator on full train set if self.variant in ['B']: if self.verbose > 0: print(' Fitting on full train set...\n') _ = self._estimator_action(self.models_B_[estimator_counter], X, y, None, sample_weight=sample_weight, action='fit', transform=self.transform_target) # --------------------------------------------------------------------- # --------------------------------------------------------------------- # Return fitted StackingTransformer instance return self
def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,
                               seeds, total_n_estimators, verbose):
    """Private function used to build a batch of estimators within a job."""
    # Retrieve settings
    n_samples, n_features = X.shape
    max_features = ensemble._max_features
    max_samples = ensemble._max_samples
    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features

    # Check if the base_estimator supports sample_weight
    base_estimator_ = ensemble.base_estimator_
    while isinstance(base_estimator_, skPipeline):  # for Pipelines
        base_estimator_ = base_estimator_._final_estimator
    support_sample_weight = has_fit_parameter(base_estimator_,
                                              "sample_weight")
    if not support_sample_weight and sample_weight is not None:
        raise ValueError("The base estimator doesn't support sample weight")

    # Build estimators
    estimators = []
    estimators_features = []
    estimators_n_training_samples = []

    for i in range(n_estimators):
        if verbose > 1:
            print("Building estimator %d of %d for this parallel run "
                  "(total %d)..." % (i + 1, n_estimators, total_n_estimators))

        random_state = seeds[i]
        estimator = ensemble._make_estimator(append=False,
                                             random_state=random_state)

        # Draw random feature, sample indices
        features, indices = _generate_bagging_indices(random_state,
                                                      bootstrap_features,
                                                      bootstrap, n_features,
                                                      n_samples, max_features,
                                                      max_samples)

        # Draw samples, using sample weights, and then fit
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples,))
            else:
                curr_sample_weight = sample_weight.copy()

            estimator.fit((X[indices])[:, features], y[indices],
                          sample_weight=curr_sample_weight[indices])
        else:
            estimator.fit((X[indices])[:, features], y[indices])

        if hasattr(estimator, 'n_training_samples_'):
            n_training_samples = getattr(estimator, 'n_training_samples_')
        else:
            n_training_samples = len(indices)

        estimators.append(estimator)
        estimators_features.append(features)
        estimators_n_training_samples.append(n_training_samples)

    return estimators, estimators_features, estimators_n_training_samples
def partial_fit(self, X, Y, classes=None, sample_weight=None):
    """Incrementally fit the model to data.
    Fit a separate model for each output variable.

    Parameters
    ----------
    X : (sparse) array-like, shape (n_samples, n_features)
        The input data.

    Y : (sparse) array-like, shape (n_samples, n_outputs)
        Multi-output targets.

    classes : list of numpy arrays, shape (n_outputs)
        Each array contains the unique classes for one output.
        Can be obtained via
        ``[np.unique(Y[:, i]) for i in range(Y.shape[1])]``, where ``Y``
        is the target matrix of the entire dataset.
        This argument is required for the first call to partial_fit
        and can be omitted in the subsequent calls.
        Note that ``Y`` doesn't need to contain all labels in ``classes``.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights. If ``None``, then samples are equally weighted.
        Only supported if the underlying classifier supports sample
        weights.

    Returns
    -------
    self : object
    """
    self._validate_estimators()

    for _, est in self.estimators:
        if not hasattr(est, 'partial_fit'):
            raise AttributeError(
                'Every base estimator should implement a partial_fit method.')

    X, Y = self._validate_data(X, Y, multi_output=True, accept_sparse=True)

    if Y.ndim == 1:
        raise ValueError(
            'Output Y must have at least two dimensions for '
            'multi-output classification but has only one.')

    if sample_weight is not None and any([
            not has_fit_parameter(clf, 'sample_weight')
            for _, clf in self.estimators
    ]):
        raise ValueError(
            'One of base estimators does not support sample weights.')

    first_time = not hasattr(self, 'estimators_')

    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
        delayed(_partial_fit_estimator)(
            self.estimators_[i] if not first_time else clf,
            X, Y[:, i],
            classes[i] if classes is not None else None,
            sample_weight, first_time)
        for i, (_, clf) in zip(range(Y.shape[1]), self.estimators))

    return self
def _check_sample_weight(self):
    if not has_fit_parameter(self.base_estimator_, "sample_weight"):
        raise ValueError("%s doesn't support sample_weight."
                         % self.base_estimator_.__class__.__name__)
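The guard above works because ``has_fit_parameter`` inspects the signature of the estimator's ``fit`` method. A small sketch of what passes and what fails that check, using standard scikit-learn estimators:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.validation import has_fit_parameter

# DecisionTreeClassifier.fit accepts sample_weight; KNeighborsClassifier.fit
# does not, so a guard like the one above would raise for the latter.
assert has_fit_parameter(DecisionTreeClassifier(), "sample_weight")
assert not has_fit_parameter(KNeighborsClassifier(), "sample_weight")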
def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight, seeds_features, seeds_samples, seeds_max_features, total_n_estimators, verbose, start_index, draw_max_features=False, circular_features=False): """Private function used to build a batch of estimators within a job.""" # Retrieve settings n_samples, n_features = X.shape max_features = ensemble._max_features max_samples = ensemble._max_samples bootstrap = ensemble.bootstrap bootstrap_features = ensemble.bootstrap_features support_sample_weight = has_fit_parameter(ensemble.base_estimator_, "sample_weight") if not support_sample_weight and sample_weight is not None: raise ValueError("The base estimator doesn't support sample weight") # Build estimators estimators = [] estimators_features = [] estimators_samples = [] estimators_splits = [] for i in range(n_estimators): if verbose > 2: print( "Building estimator %d of %d for this parallel run (total %d)..." % (i + 1, n_estimators, total_n_estimators)) random_state_max_features = np.random.RandomState( seeds_max_features[i]) random_state_features = np.random.RandomState(seeds_features[i]) random_state = np.random.RandomState(seeds_samples[i]) estimator = ensemble._make_estimator(append=False, random_state=random_state) # Draw random feature, sample indices if circular_features: n_features_window = ensemble.window_size max_features_window = max_features else: n_features_window = min(ensemble.window_size, n_features - start_index[i]) max_features_window = min(max_features, n_features - start_index[i]) features, indices = _generate_bagging_indices( random_state_features, random_state, random_state_max_features, bootstrap_features, bootstrap, n_features_window, n_samples, max_features_window, max_samples, draw_max_features=draw_max_features) features += start_index[i] # ensure not going outside range, take the first ones instead np.mod(features, n_features, out=features) # Draw samples, using sample weights, and then fit y_binary = random_binarizer(y) if support_sample_weight: if sample_weight is None: curr_sample_weight = np.ones((n_samples, )) else: curr_sample_weight = sample_weight.copy() if bootstrap: sample_counts = bincount(indices, minlength=n_samples) curr_sample_weight *= sample_counts else: not_indices_mask = ~indices_to_mask(indices, n_samples) curr_sample_weight[not_indices_mask] = 0 estimator.fit(X[:, features], y_binary, sample_weight=curr_sample_weight) # Draw samples, using a mask, and then fit else: estimator.fit((X[indices])[:, features], y_binary[indices]) estimators.append(estimator) estimators_features.append(features) estimators_samples.append(indices) estimators_splits.append(y_binary) return estimators, estimators_features, estimators_samples, estimators_splits
def fit(self, X, y, sample_weight=None): """ Fit the estimators. Parameters ---------- X : {array-like, sparse matrix}, shape = [n_samples, n_features] or list of {array-like, sparse matrix} with shape = [n_samples, n_features], of length len(self.estimators). Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] or [n_samples,n_labels] Target values. sample_weight : array-like, shape = [n_samples] or None Sample weights. If None, then samples are equally weighted. Note that this is supported only if all underlying estimators support sample weights. Returns ------- self : object """ if len(X) == 0: raise ValueError("X must contain at least one entry; got (X=%r)" % X) if self.voting not in ('soft', 'hard'): raise ValueError("Voting must be 'soft' or 'hard'; got (voting=%r)" % self.voting) if self.estimators is None or len(self.estimators) == 0: raise AttributeError('Invalid `estimators` attribute, `estimators`' ' should be a list of (string, estimator)' ' tuples') if self.weights and len(self.weights) != len(self.estimators): raise ValueError('Number of classifiers and weights must be equal' '; got %d weights, %d estimators' % (len(self.weights), len(self.estimators))) if sample_weight is not None: for name, step in self.estimators: if not has_fit_parameter(step, 'sample_weight'): raise ValueError('Underlying estimator \'%s\' does not support' ' sample weights.' % name) if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1: self.multilabel_ = True self.le_ = MultiLabelBinarizer() self.le_.fit([range(y.shape[1])]) else: self.multilabel_ = False self.le_ = LabelEncoder() self.le_.fit(y) self.classes_ = self.le_.classes_ self.estimators_ = [] transformed_y = self.le_.transform(y) # Check whether we have X.shape = [n_samples,n_features] or X = [[n_samples,n_features_1],...,[n_samples,n_features_k]] measure = X[0] if not isinstance(measure, np.ndarray): measure = np.array(measure) self.multiple_features_ = len(measure.shape) == 2 if self.multiple_features_ and len(X) != len(self.estimators): raise ValueError("For voters requiring different data, X must be a list of" "data arrays, with the same length as the number of voters. Got X of length %s" % len(X)) if self.multiple_features_ and not isinstance(X,list): raise ValueError("For voters requiring different data, X must be a list of" "data arrays, with the same length as the number of voters. Got %s " % type(X)) if self.multiple_features_: self.estimators_ = Parallel(n_jobs=self.n_jobs)( delayed(_parallel_fit_estimator)(clone(clf), XX, transformed_y, sample_weight) for XX, _, clf in zip(X,*zip(*self.estimators))) else: self.estimators_ = Parallel(n_jobs=self.n_jobs)( delayed(_parallel_fit_estimator)(clone(clf), X, transformed_y, sample_weight) for _, clf in self.estimators) return self
def fit(self, X, y, sample_weight=None): """ Fit the estimators. Parameters ---------- X : {array-like, sparse matrix}, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. sample_weight : array-like, shape = [n_samples] or None Sample weights. If None, then samples are equally weighted. Note that this is supported only if all underlying estimators support sample weights. Returns ------- self : object """ if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1: raise NotImplementedError('Multilabel and multi-output' ' classification is not supported.') if self.voting not in ('soft', 'hard'): raise ValueError("Voting must be 'soft' or 'hard'; got (voting=%r)" % self.voting) if self.estimators is None or len(self.estimators) == 0: raise AttributeError('Invalid `estimators` attribute, `estimators`' ' should be a list of (string, estimator)' ' tuples') if self.selectors is None or len(self.selectors) == 0: raise AttributeError('Invalid `selectors` attribute, `selectors`' ' should be a list of (string, np.array)' ' tuples') if len(self.selectors) != len(self.estimators): raise ValueError('Number of selectors and estimators must be equal' '; got %d selectors, %d estimators' % (len(self.selectors), len(self.estimators))) if not isinstance(self.weights, type(None)) and len(self.weights) != len(self.estimators): raise ValueError('Number of classifiers and weights must be equal' '; got %d weights, %d estimators' % (len(self.weights), len(self.estimators))) if sample_weight is not None: for name, step in self.estimators: if not has_fit_parameter(step, 'sample_weight'): raise ValueError('Underlying estimator \'%s\' does not support' ' sample weights.' % name) self.le_ = LabelEncoder() self.le_.fit(y) self.classes_ = self.le_.classes_ self.estimators_ = [] transformed_y = self.le_.transform(y) self.estimators_ = Parallel(n_jobs=self.n_jobs)( delayed(_parallel_fit_estimator)(clone(clf), X[:, self.named_selectors[name]], transformed_y, sample_weight) for name, clf in self.estimators) return self
def _parallel_build_ranking_estimators(n_estimators, ensemble, X, y, Q, sample_weight, seeds, verbose): """Private function used to build a batch of estimators within a job. Now it supports queries and querywise sampling. It also breaks the PEP8 line length constraint now""" # Retrieve settings n_samples, n_features = X.shape max_samples = ensemble.max_samples max_features = ensemble.max_features uQueries = np.unique(Q) sample_whole_queries = False if hasattr(ensemble, "sample_whole_queries"): sample_whole_queries = ensemble.sample_whole_queries if not isinstance(max_samples, (numbers.Integral, np.integer)) and (0.0 < max_samples <= 1.0): if sample_whole_queries: max_samples = int(max_samples * len(uQueries)) else: max_samples = int(max_samples * n_samples) if not isinstance(max_features, (numbers.Integral, np.integer)) and (0.0 < max_features <= 1.0): max_features = int(max_features * n_features) bootstrap = ensemble.bootstrap bootstrap_features = ensemble.bootstrap_features support_sample_weight = has_fit_parameter(ensemble.base_estimator_, "sample_weight") # Build estimators estimators = [] estimators_samples = [] estimators_features = [] for i in range(n_estimators): if verbose > 1: print("building estimator %d of %d" % (i + 1, n_estimators)) random_state = check_random_state(seeds[i]) seed = check_random_state(random_state.randint(MAX_INT)) estimator = ensemble._make_estimator(append=False) try: # Not all estimator accept a random_state estimator.set_params(random_state=seed) except ValueError: pass # Draw features if bootstrap_features: features = random_state.randint(0, n_features, max_features) else: features = sample_without_replacement(n_features, max_features, random_state=random_state) # Draw samples, using sample weights, and then fit if support_sample_weight: if sample_weight is None: curr_sample_weight = np.ones((n_samples,)) else: curr_sample_weight = sample_weight.copy() if bootstrap: if sample_whole_queries: Qindices = uQueries[random_state.randint(0, len(uQueries), max_samples)] Qindices.sort() indices = reduce(np.append, [np.where(Q == i) for i in Qindices]) else: indices = random_state.randint(0, n_samples, max_samples) sample_counts = bincount(indices, minlength=n_samples) curr_sample_weight *= sample_counts else: if sample_whole_queries: notQindices = uQueries[random_state.randint(0, len(uQueries), len(uQueries) - max_samples)] notQindices.sort() not_indices = reduce(np.append, [np.where(Q == i) for i in Qindices]) else: not_indices = sample_without_replacement( n_samples, n_samples - max_samples, random_state=random_state ) curr_sample_weight[not_indices] = 0 estimator.fit(X[:, features], y, Q=Q, sample_weight=curr_sample_weight) samples = curr_sample_weight > 0.0 # Draw samples, using a mask, and then fit else: if bootstrap: if sample_whole_queries: Qindices = uQueries[random_state.randint(0, len(uQueries), max_samples)] Qindices.sort() indices = reduce(np.append, [np.where(Q == i) for i in Qindices]) else: indices = random_state.randint(0, n_samples, max_samples) else: if sample_whole_queries: Qindices = uQueries[ sample_without_replacement(len(uQueries), max_samples, random_state=random_state) ] Qindices.sort() indices = reduce(np.append, [np.where(Q == i) for i in Qindices]) else: indices = sample_without_replacement(n_samples, max_samples, random_state=random_state) sample_counts = bincount(indices, minlength=n_samples) estimator.fit((X[indices])[:, features], y[indices], Q=Q[indices]) samples = sample_counts > 0.0 estimators.append(estimator) 
estimators_samples.append(samples) estimators_features.append(features) return estimators, estimators_samples, estimators_features
def fit(self, X, y, sample_weight=None): """ Fit the estimators. Parameters ---------- X : {array-like, sparse matrix}, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. sample_weight : array-like, shape = [n_samples] or None Sample weights. If None, then samples are equally weighted. Note that this is supported only if all underlying estimators support sample weights. Returns ------- self : object """ # if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1: # raise NotImplementedError('Multilabel and multi-output' # ' classification is not supported.') if self.voting not in ('soft', 'hard'): raise ValueError( "Voting must be 'soft' or 'hard'; got (voting=%r)" % self.voting) if self.estimators is None or len(self.estimators) == 0: raise AttributeError('Invalid `estimators` attribute, `estimators`' ' should be a list of (string, estimator)' ' tuples') if self.weights and len(self.weights) != len(self.estimators): raise ValueError('Number of classifiers and weights must be equal' '; got %d weights, %d estimators' % (len(self.weights), len(self.estimators))) if sample_weight is not None: for name, step in self.estimators: if not has_fit_parameter(step, 'sample_weight'): raise ValueError( 'Underlying estimator \'%s\' does not support' ' sample weights.' % name) # self.le_ = LabelEncoder() # self.le_.fit(y) # self.classes_ = self.le_.classes_ self.estimators_ = [] # transformed_y = self.le_.transform(y) transformed_y = y # self.estimators_ = Parallel(n_jobs=self.n_jobs)( # delayed(_parallel_fit_estimator)(clone(clf), X, transformed_y, # sample_weight) # for _, clf in self.estimators) for name, clf in self.estimators: self.estimators_.append(clone(clf)) for clf in self.estimators_: clf.fit(X, transformed_y) # self.estimators_ = Parallel(n_jobs=self.n_jobs)( # delayed(_parallel_fit_estimator)(clone(clf), X, transformed_y, # sample_weight) # for _, clf in self.estimators) return self
def ransac_fit_with_weights(self, X, y, sample_weight=None, residual_threshold=None): """ Modified sklearn.linear_model.RANSACRegressor.fit() sample_weight is used in sampling base points, fitting the regressor, and calculating score for candidate model """ X = check_array(X, accept_sparse='csr') y = check_array(y, ensure_2d=False) check_consistent_length(X, y) if self.base_estimator is not None: base_estimator = clone(self.base_estimator) else: base_estimator = LinearRegression() if self.min_samples is None: # assume linear model by default min_samples = X.shape[1] + 1 elif 0 < self.min_samples < 1: min_samples = np.ceil(self.min_samples * X.shape[0]) elif self.min_samples >= 1: if self.min_samples % 1 != 0: raise ValueError("Absolute number of samples must be an " "integer value.") min_samples = self.min_samples else: raise ValueError("Value for `min_samples` must be scalar and " "positive.") if min_samples > X.shape[0]: raise ValueError("`min_samples` may not be larger than number " "of samples: n_samples = %d." % (X.shape[0])) if self.stop_probability < 0 or self.stop_probability > 1: raise ValueError("`stop_probability` must be in range [0, 1].") if residual_threshold is None: if self.residual_threshold is None: # MAD (median absolute deviation) residual_threshold = np.median(np.abs(y - np.median(y))) else: residual_threshold = self.residual_threshold if self.loss == "absolute_loss": if y.ndim == 1: loss_function = lambda y_true, y_pred: np.abs(y_true - y_pred) else: loss_function = lambda \ y_true, y_pred: np.sum(np.abs(y_true - y_pred), axis=1) elif self.loss == "squared_loss": if y.ndim == 1: loss_function = lambda y_true, y_pred: (y_true - y_pred) ** 2 else: loss_function = lambda \ y_true, y_pred: np.sum((y_true - y_pred) ** 2, axis=1) elif callable(self.loss): loss_function = self.loss else: raise ValueError( "loss should be 'absolute_loss', 'squared_loss' or a callable." "Got %s. " % self.loss) random_state = check_random_state(self.random_state) try: # Not all estimator accept a random_state base_estimator.set_params(random_state=random_state) except ValueError: pass estimator_fit_has_sample_weight = has_fit_parameter(base_estimator, "sample_weight") estimator_name = type(base_estimator).__name__ if (sample_weight is not None and not estimator_fit_has_sample_weight): raise ValueError("%s does not support sample_weight. Samples" " weights are only used for the calibration" " itself." 
                         % estimator_name)
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)

    n_inliers_best = 1
    score_best = -np.inf
    inlier_mask_best = None
    X_inlier_best = None
    y_inlier_best = None
    weight_inlier_best = None
    self.n_skips_no_inliers_ = 0
    self.n_skips_invalid_data_ = 0
    self.n_skips_invalid_model_ = 0

    # number of data samples
    n_samples = X.shape[0]
    sample_idxs = np.arange(n_samples)
    n_samples, _ = X.shape

    self.n_trials_ = 0
    max_trials = self.max_trials
    while self.n_trials_ < max_trials:
        self.n_trials_ += 1

        if (self.n_skips_no_inliers_ + self.n_skips_invalid_data_ +
                self.n_skips_invalid_model_) > self.max_skips:
            break

        # choose random sample set: use a weighted draw when sample weights
        # are given (sample_without_replacement cannot sample with
        # probabilities), otherwise fall back to the uniform draw
        if sample_weight is None:
            subset_idxs = sample_without_replacement(n_samples, min_samples,
                                                     random_state=random_state)
        else:
            subset_idxs = random_state.choice(n_samples, min_samples, False,
                                              sample_weight / np.sum(sample_weight))
        X_subset = X[subset_idxs]
        y_subset = y[subset_idxs]

        # check if random sample set is valid
        if (self.is_data_valid is not None
                and not self.is_data_valid(X_subset, y_subset)):
            self.n_skips_invalid_data_ += 1
            continue

        # fit model for current random sample set
        if sample_weight is None:
            base_estimator.fit(X_subset, y_subset)
        else:
            base_estimator.fit(X_subset, y_subset,
                               sample_weight=sample_weight[subset_idxs])

        # check if estimated model is valid
        if (self.is_model_valid is not None and not
                self.is_model_valid(base_estimator, X_subset, y_subset)):
            self.n_skips_invalid_model_ += 1
            continue

        # residuals of all data for current random sample model
        y_pred = base_estimator.predict(X)
        residuals_subset = loss_function(y, y_pred)

        # classify data into inliers and outliers
        inlier_mask_subset = residuals_subset < residual_threshold
        n_inliers_subset = np.sum(inlier_mask_subset)

        # less inliers -> skip current random sample
        if n_inliers_subset < n_inliers_best:
            self.n_skips_no_inliers_ += 1
            continue

        # extract inlier data set
        inlier_idxs_subset = sample_idxs[inlier_mask_subset]
        X_inlier_subset = X[inlier_idxs_subset]
        y_inlier_subset = y[inlier_idxs_subset]
        if sample_weight is None:
            weight_inlier_subset = None
        else:
            weight_inlier_subset = sample_weight[inlier_idxs_subset]

        # score of inlier data set (a None weight is handled by score())
        score_subset = base_estimator.score(X_inlier_subset, y_inlier_subset,
                                            weight_inlier_subset)

        # same number of inliers but worse score -> skip current random
        # sample
        if (n_inliers_subset == n_inliers_best
                and score_subset < score_best):
            continue

        # save current random sample as best sample
        n_inliers_best = n_inliers_subset
        score_best = score_subset
        inlier_mask_best = inlier_mask_subset
        X_inlier_best = X_inlier_subset
        y_inlier_best = y_inlier_subset
        weight_inlier_best = weight_inlier_subset

        max_trials = min(
            max_trials,
            _dynamic_max_trials(n_inliers_best, n_samples,
                                min_samples, self.stop_probability))

        # break if sufficient number of inliers or score is reached
        if n_inliers_best >= self.stop_n_inliers or \
                score_best >= self.stop_score:
            break

    # if none of the iterations met the required criteria
    if inlier_mask_best is None:
        if ((self.n_skips_no_inliers_ + self.n_skips_invalid_data_ +
                self.n_skips_invalid_model_) > self.max_skips):
            raise ValueError(
                "RANSAC skipped more iterations than `max_skips` without"
                " finding a valid consensus set. Iterations were skipped"
                " because each randomly chosen sub-sample failed the"
                " passing criteria. See estimator attributes for"
                " diagnostics (n_skips*).")
        else:
            raise ValueError(
                "RANSAC could not find a valid consensus set. All"
                " `max_trials` iterations were skipped because each"
                " randomly chosen sub-sample failed the passing criteria."
                " See estimator attributes for diagnostics (n_skips*).")
    else:
        if (self.n_skips_no_inliers_ + self.n_skips_invalid_data_ +
                self.n_skips_invalid_model_) > self.max_skips:
            warnings.warn("RANSAC found a valid consensus set but exited"
                          " early due to skipping more iterations than"
                          " `max_skips`. See estimator attributes for"
                          " diagnostics (n_skips*).",
                          ConvergenceWarning)

    # estimate final model using all inliers
    base_estimator.fit(X_inlier_best, y_inlier_best, weight_inlier_best)

    self.estimator_ = base_estimator
    self.inlier_mask_ = inlier_mask_best
    return self
def fitSegm(self, X, y, segmList, sample_weight=None):
    merged = list(itertools.chain.from_iterable(segmList))
    X = check_array(X, accept_sparse='csr')
    y = check_array(y, ensure_2d=False)
    check_consistent_length(X, y)

    if self.base_estimator is not None:
        base_estimator = clone(self.base_estimator)
    else:
        base_estimator = LinearRegression()

    if self.min_samples is None:
        # assume linear model by default (minimum samples); require one
        # extra sample unless fewer than two segments are available
        min_samples = X.shape[1] + 1 if len(segmList) < 2 else X.shape[1]
    elif 0 < self.min_samples < 1:
        min_samples = np.ceil(self.min_samples * X.shape[0])
    elif self.min_samples >= 1:
        if self.min_samples % 1 != 0:
            raise ValueError("Absolute number of samples must be an "
                             "integer value.")
        min_samples = self.min_samples
    else:
        raise ValueError("Value for `min_samples` must be scalar and "
                         "positive.")
    if min_samples > X.shape[0]:
        raise ValueError("`min_samples` may not be larger than number "
                         "of samples: n_samples = %d." % (X.shape[0]))

    if self.stop_probability < 0 or self.stop_probability > 1:
        raise ValueError("`stop_probability` must be in range [0, 1].")

    if self.residual_threshold is None:
        # MAD (median absolute deviation)
        residual_threshold = np.median(np.abs(y - np.median(y)))
    else:
        residual_threshold = self.residual_threshold

    if self.loss == "absolute_loss":
        if y.ndim == 1:
            loss_function = lambda y_true, y_pred: np.abs(y_true - y_pred)
        else:
            loss_function = lambda \
                y_true, y_pred: np.sum(np.abs(y_true - y_pred), axis=1)
    elif self.loss == "squared_loss":
        if y.ndim == 1:
            loss_function = lambda y_true, y_pred: (y_true - y_pred) ** 2
        else:
            loss_function = lambda \
                y_true, y_pred: np.sum((y_true - y_pred) ** 2, axis=1)
    elif callable(self.loss):
        loss_function = self.loss
    else:
        raise ValueError(
            "loss should be 'absolute_loss', 'squared_loss' or a callable. "
            "Got %s." % self.loss)

    random_state = check_random_state(self.random_state)

    try:  # Not all estimators accept a random_state
        base_estimator.set_params(random_state=random_state)
    except ValueError:
        pass

    estimator_fit_has_sample_weight = has_fit_parameter(base_estimator,
                                                        "sample_weight")
    estimator_name = type(base_estimator).__name__
    if (sample_weight is not None and not estimator_fit_has_sample_weight):
        raise ValueError("%s does not support sample_weight. Samples"
                         " weights are only used for the calibration"
                         " itself." % estimator_name)
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)

    n_inliers_best = 1
    score_best = -np.inf
    inlier_mask_best = None
    X_inlier_best = None
    y_inlier_best = None
    aicc_ = None
    self.n_skips_no_inliers_ = 0
    self.n_skips_invalid_data_ = 0
    self.n_skips_invalid_model_ = 0

    # Generate a list of indices for each segment
    size_sl = [len(s) - 1 for s in segmList]
    n_segments = len(size_sl)

    # number of data samples
    n_samples = X.shape[0]
    sample_idxs = np.arange(n_samples)
    n_samples, _ = X.shape

    self.n_trials_ = 0
    max_trials = self.max_trials
    while self.n_trials_ < max_trials:
        self.n_trials_ += 1

        if (self.n_skips_no_inliers_ + self.n_skips_invalid_data_ +
                self.n_skips_invalid_model_) > self.max_skips:
            break

        # choose random sample set
        ## before:
        ## subset_idxs = sample_without_replacement(n_samples, min_samples,
        ##                                          random_state=random_state)
        ## now: draw one point from each of min_samples randomly chosen
        ## segments (random_state keeps the draw reproducible)
        subset_idx_entries = sample_without_replacement(
            n_segments, min_samples, random_state=random_state)
        subset_idxs = np.asarray(
            [segmList[ss][random_state.randint(0, size_sl[ss] + 1)]
             for ss in subset_idx_entries])
        X_subset = X[subset_idxs]
        y_subset = y[subset_idxs]

        # check if random sample set is valid
        if (self.is_data_valid is not None
                and not self.is_data_valid(X_subset, y_subset)):
            self.n_skips_invalid_data_ += 1
            continue

        # fit model for current random sample set
        if sample_weight is None:
            base_estimator.fit(X_subset, y_subset)
        else:
            base_estimator.fit(X_subset, y_subset,
                               sample_weight=sample_weight[subset_idxs])

        # check if estimated model is valid
        if (self.is_model_valid is not None and not self.is_model_valid(
                base_estimator, X_subset, y_subset)):
            self.n_skips_invalid_model_ += 1
            continue

        # check if estimated model is valid (ii)
        y_pred_subset = base_estimator.predict(X_subset)
        residuals_ii = loss_function(y_subset, y_pred_subset)
        inlier_mask_subset_ii = residuals_ii < residual_threshold
        if np.sum(inlier_mask_subset_ii) < X.shape[1]:
            self.n_skips_invalid_model_ += 1
            continue

        ########################## Inlier evaluation
        # residuals of all data for current random sample model
        y_pred = base_estimator.predict(X[merged])
        residuals_subset = loss_function(y[merged], y_pred)

        # classify data into inliers and outliers
        inlier_mask_subset = residuals_subset < residual_threshold
        n_inliers_subset = np.sum(inlier_mask_subset)
        if False:
            print(f"n_inliers_subset {n_inliers_subset} "
                  f"from {inlier_mask_subset.shape}")

        # extract inlier data set
        inlier_idxs_subset = list(compress(merged, inlier_mask_subset))
        X_inlier_subset = X[inlier_idxs_subset]
        y_inlier_subset = y[inlier_idxs_subset]

        if False:
            # plain evaluation (basic approach):
            # check that all points in the sample are inliers
            if n_inliers_subset < min_samples:
                continue
            # less inliers -> skip current random sample
            if n_inliers_subset < n_inliers_best:
                self.n_skips_no_inliers_ += 1
                continue
            # score of inlier data set
            score_subset = base_estimator.score(X_inlier_subset,
                                                y_inlier_subset)
            # same number of inliers but worse score -> skip current random
            # sample
            if (n_inliers_subset == n_inliers_best
                    and score_subset <= score_best):
                continue
        else:
            # evaluation for each calibration point
            indScore = 0  # score that considers inliers of each calibration point
            cc = 0
            for sSeg, seg in zip(size_sl, segmList):
                c_seg = range(cc, sSeg + cc)
                # print(seg)
                # sys.exit(0)
                cc += sSeg
                # classify data into inliers and outliers
                nScore = np.sum(inlier_mask_subset[c_seg])
                n_in_subset = nScore
                indScore += poisson.cdf(nScore, 0.3 * sSeg)
            if indScore <= score_best:
                continue
            score_subset = indScore

        # save current random sample as best sample
        n_inliers_best = n_inliers_subset
        score_best = score_subset
        inlier_mask_best = inlier_mask_subset
        X_inlier_best = X_inlier_subset
        y_inlier_best = y_inlier_subset

        max_trials = min(
            max_trials,
            _dynamic_max_trials(n_inliers_best, n_samples,
                                min_samples, self.stop_probability))

        # break if sufficient number of inliers or score is reached
        if n_inliers_best >= self.stop_n_inliers or \
                score_best >= self.stop_score:
            break

    # if none of the iterations met the required criteria
    if inlier_mask_best is None:
        base_estimator.coef_ = -999
        if ((self.n_skips_no_inliers_ + self.n_skips_invalid_data_ +
                self.n_skips_invalid_model_) > self.max_skips):
            raise ValueError(
                "RANSAC skipped more iterations than `max_skips` without"
                " finding a valid consensus set. Iterations were skipped"
                " because each randomly chosen sub-sample failed the"
                " passing criteria. See estimator attributes for"
                " diagnostics (n_skips*).")
        else:
            raise ValueError(
                "RANSAC could not find a valid consensus set. All"
                " `max_trials` iterations were skipped because each"
                " randomly chosen sub-sample failed the passing criteria."
                " See estimator attributes for diagnostics (n_skips*).")
    else:
        if (self.n_skips_no_inliers_ + self.n_skips_invalid_data_ +
                self.n_skips_invalid_model_) > self.max_skips:
            warnings.warn("RANSAC found a valid consensus set but exited"
                          " early due to skipping more iterations than"
                          " `max_skips`. See estimator attributes for"
                          " diagnostics (n_skips*).",
                          ConvergenceWarning)

    # estimate final model using all inliers
    base_estimator.fit(X_inlier_best, y_inlier_best)

    self.estimator_ = base_estimator
    self.inlier_mask_ = inlier_mask_best
    return self
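# A toy reconstruction of the per-segment inlier score used by fitSegm above:
# each segment contributes poisson.cdf(n_inliers_in_segment, 0.3 * segment_size),
# so segments with many inliers relative to a Poisson baseline score near 1.
# Segment sizes and the inlier mask below are invented for illustration.
import numpy as np
from scipy.stats import poisson

segment_sizes = [4, 3, 5]
inlier_mask = np.array([1, 1, 0, 1,  0, 1, 1,  1, 1, 1, 0, 1], dtype=bool)

score, start = 0.0, 0
for size in segment_sizes:
    n_in = inlier_mask[start:start + size].sum()
    score += poisson.cdf(n_in, 0.3 * size)
    start += size
print(round(score, 3))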
def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight, seeds, total_n_estimators, verbose): """Private function used to build a batch of estimators within a job.""" # Retrieve settings sampler = ensemble.sampler n_samples, n_features = X.shape max_features = ensemble._max_features max_samples = ensemble._max_samples bootstrap = ensemble.bootstrap bootstrap_features = ensemble.bootstrap_features support_sample_weight = has_fit_parameter(ensemble.base_estimator_, "sample_weight") if not support_sample_weight and sample_weight is not None: raise ValueError("The base estimator doesn't support sample weight") # Build estimators estimators = [] estimators_features = [] for i in range(n_estimators): if verbose > 1: print("Building estimator %d of %d for this parallel run " "(total %d)..." % (i + 1, n_estimators, total_n_estimators)) random_state = np.random.RandomState(seeds[i]) estimator = ensemble._make_estimator(append=False, random_state=random_state) # Draw random feature, sample indices features, indices = _generate_bagging_indices(random_state, bootstrap_features, bootstrap, n_features, n_samples, max_features, max_samples) # Resample each bag if sampler == 'under': X_res, y_res = RandomUnderSampler( random_state=random_state).fit_sample(X, y) elif sampler == 'over': X_res, y_res = RandomOverSampler( random_state=random_state).fit_sample(X, y) elif sampler == 'smote': X_res, y_res = SMOTE(random_state=random_state, k_neighbors=5).fit_sample(X, y) # elif sampler == 'adasyn': # X_res, y_res = ADASYN(random_state=random_state, n_neighbors=5, # ).fit_sample(X, y) else: X_res, y_res = X, y if bootstrap: estimator.fit((X[indices])[:, features], y[indices]) else: estimator.fit(X_res[:, features], y_res) estimators.append(estimator) estimators_features.append(features) return estimators, estimators_features
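# A short sketch of resampling one bag, as in the sampler == 'under' branch of
# the helper above. The data here is synthetic; recent imbalanced-learn
# releases name the method fit_resample (older ones exposed the fit_sample
# alias used in the helper).
import numpy as np
from imblearn.under_sampling import RandomUnderSampler

rng = np.random.RandomState(0)
X = rng.randn(100, 3)
y = np.r_[np.zeros(90, dtype=int), np.ones(10, dtype=int)]   # 90/10 imbalance

X_res, y_res = RandomUnderSampler(random_state=0).fit_resample(X, y)
print(X_res.shape, np.bincount(y_res))                       # (20, 3) [10 10]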
def fit(self, X, Y, sample_weight=None):
    """ Fit the estimators.
    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples and
        n_features is the number of features.
    Y : array-like, shape = [n_samples, n_outputs]
        Target values.
    sample_weight : array-like, shape = [n_samples] or None
        Sample weights. If None, then samples are equally weighted.
        Note that this is supported only if all underlying estimators
        support sample weights.
    Returns
    -------
    self : object
    """
    X, Y = check_X_y(X, Y, multi_output=True, accept_sparse=True)

    if self.voting not in ('soft', 'hard'):
        raise ValueError("Voting must be 'soft' or 'hard'; got (voting=%r)"
                         % self.voting)
    if (self.weights is not None and
            len(self.weights) != self.n_chains):
        raise ValueError('Number of classifiers and weights must be equal'
                         '; got %d weights, %d chains'
                         % (len(self.weights), self.n_chains))
    if sample_weight is not None:
        if not has_fit_parameter(self.base_estimator, 'sample_weight'):
            raise ValueError("Underlying estimator '%s' does not"
                             " support sample weights."
                             % self.base_estimator.__class__.__name__)

    random_state = check_random_state(self.random_state)

    if self.orders is not None:
        orders = np.asarray(self.orders)
        if orders.shape != (self.n_chains, Y.shape[1]):
            raise ValueError("Argument orders must have shape "
                             "(n_chains, n_outputs); expected {}, but got"
                             " {}.".format((self.n_chains, Y.shape[1]),
                                           orders.shape))
        self.orders_ = self.orders
    else:
        self.orders_ = [random_state.permutation(Y.shape[1])
                        for _ in range(self.n_chains)]

    self.le_ = []
    self.classes_ = []
    for y in Y.T:
        le = LabelEncoder().fit(y)
        self.le_.append(le)
        self.classes_.append(le.classes_)

    self.chains_ = [ClassifierChain(self.base_estimator, order=order,
                                    cv=self.cv)
                    for order in self.orders_]
    self.chains_ = Parallel(n_jobs=self.n_jobs)(
        delayed(_parallel_fit_estimator)(clone(cc), X, Y, sample_weight)
        for cc in self.chains_)
    return self
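# A minimal sketch of how the random chain orders are drawn in the fit()
# above when no explicit `orders` argument is given; n_chains and n_outputs
# are placeholder values.
import numpy as np
from sklearn.utils import check_random_state

n_chains, n_outputs = 3, 4
random_state = check_random_state(0)
orders = [random_state.permutation(n_outputs) for _ in range(n_chains)]
print(orders)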
def test_has_fit_parameter(): assert_false(has_fit_parameter(KNeighborsClassifier, "sample_weight")) assert_true(has_fit_parameter(RandomForestRegressor, "sample_weight")) assert_true(has_fit_parameter(SVR, "sample_weight")) assert_true(has_fit_parameter(SVR(), "sample_weight"))
def _spark_build_estimators(n_estimators, ensemble, X, y, sample_weight, seeds, verbose):
    """Private function used to build a batch of estimators within a job."""
    print("building estimators")
    # Retrieve settings; X, y and sample_weight arrive as Spark broadcast
    # variables, so unwrap their values first
    X = X.value
    y = y.value
    sample_weight = sample_weight.value
    n_samples, n_features = X.shape
    max_samples = ensemble.max_samples
    max_features = ensemble.max_features

    if (not isinstance(max_samples, (numbers.Integral, np.integer)) and
            (0.0 < max_samples <= 1.0)):
        max_samples = int(max_samples * n_samples)

    if (not isinstance(max_features, (numbers.Integral, np.integer)) and
            (0.0 < max_features <= 1.0)):
        max_features = int(max_features * n_features)

    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = has_fit_parameter(ensemble.base_estimator_,
                                              "sample_weight")

    # Build estimators
    estimators = []
    estimators_samples = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("building estimator %d of %d" % (i + 1, n_estimators))

        random_state = check_random_state(seeds[i])
        seed = check_random_state(random_state.randint(MAX_INT))
        estimator = ensemble._make_estimator(append=False)

        try:  # Not all estimators accept a random_state
            estimator.set_params(random_state=seed)
        except ValueError:
            pass

        # Draw features
        if bootstrap_features:
            features = random_state.randint(0, n_features, max_features)
        else:
            features = sample_without_replacement(n_features, max_features,
                                                  random_state=random_state)

        # Draw samples, using sample weights, and then fit
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples,))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                indices = random_state.randint(0, n_samples, max_samples)
                sample_counts = bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts
            else:
                not_indices = sample_without_replacement(
                    n_samples, n_samples - max_samples,
                    random_state=random_state)
                curr_sample_weight[not_indices] = 0

            estimator.fit(X[:, features], y, sample_weight=curr_sample_weight)
            samples = curr_sample_weight > 0.

        # Draw samples, using a mask, and then fit
        else:
            if bootstrap:
                indices = random_state.randint(0, n_samples, max_samples)
            else:
                indices = sample_without_replacement(n_samples, max_samples,
                                                     random_state=random_state)
            sample_counts = bincount(indices, minlength=n_samples)

            estimator.fit((X[indices])[:, features], y[indices])
            samples = sample_counts > 0.

        estimators.append(estimator)
        estimators_samples.append(samples)
        estimators_features.append(features)

    return estimators, estimators_samples, estimators_features
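# A small standalone sketch of the weight-based bootstrap used in the
# support_sample_weight branch above: drawing rows with replacement is
# equivalent to multiplying each row's weight by the number of times it was
# drawn, and out-of-bag rows end up with weight zero. Sizes are arbitrary.
import numpy as np

rng = np.random.RandomState(0)
n_samples = 8
curr_sample_weight = np.ones(n_samples)

indices = rng.randint(0, n_samples, n_samples)             # bootstrap draw
curr_sample_weight *= np.bincount(indices, minlength=n_samples)
print(curr_sample_weight)                                   # draw counts
print(curr_sample_weight > 0.0)                             # in-bag mask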