def test_has_fit_parameter():
    assert not has_fit_parameter(KNeighborsClassifier, "sample_weight")
    assert has_fit_parameter(RandomForestRegressor, "sample_weight")
    assert has_fit_parameter(SVR, "sample_weight")
    assert has_fit_parameter(SVR(), "sample_weight")

    class TestClassWithDeprecatedFitMethod:
        @deprecated("Deprecated for the purpose of testing has_fit_parameter")
        def fit(self, X, y, sample_weight=None):
            pass

    assert has_fit_parameter(TestClassWithDeprecatedFitMethod,
                             "sample_weight"), \
        "has_fit_parameter fails for class with deprecated fit method."
Example #3
    def fit(self, X, y, sample_weight=None):
        """Fit a separate classifier for each output variable."""

        for _, clf in self.classifiers:
            if not hasattr(clf, 'fit'):
                raise ValueError(
                    'Every base classifier should implement a fit method.')

        X, y = check_X_y(X, y, multi_output=True, accept_sparse=True)

        if is_classifier(self):
            check_classification_targets(y)

        if y.ndim == 1:
            raise ValueError(
                'Output y must have at least two dimensions for multi-output classification but has only one.'
            )

        if sample_weight is not None and any([
                not has_fit_parameter(clf, 'sample_weight')
                for _, clf in self.classifiers
        ]):
            raise ValueError(
                'One of the base classifiers does not support sample weights.')

        self.classifiers_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_estimator)(clf, X, y[:, i], sample_weight)
            for i, (_, clf) in zip(range(y.shape[1]), self.classifiers))

        return self
Example #4
    def fit(self, X, y, sample_weight=None):
        """Performs ``self.reweigher_.fit_transform(X, y, sample_weight)`` and
        then ``self.estimator_.fit(X, y, sample_weight)`` using the reweighed
        samples.

        Args:
            X (pandas.DataFrame): Training samples.
            y (array-like): Training labels.
            sample_weight (array-like, optional): Sample weights.

        Returns:
            self
        """
        if not has_fit_parameter(self.estimator, 'sample_weight'):
            raise TypeError(
                "`estimator` (type: {}) does not have fit parameter"
                " `sample_weight`.".format(type(self.estimator)))

        self.reweigher_ = clone(self.reweigher)
        self.estimator_ = clone(self.estimator)

        X, sample_weight = self.reweigher_.fit_transform(
            X, y, sample_weight=sample_weight)
        self.estimator_.fit(X, y, sample_weight=sample_weight)
        return self
Example #5
    def fit(self, X, y, *, sample_weight=None, **kwargs):
        """Build the ensemble classifier from the training set (X, y)."""

        # Check random state
        self.random_state = check_random_state(self.random_state)

        # Convert data (X is required to be 2d and indexable)
        X, y = self._validate_data(X, y, **self.check_x_y_args)
        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float64)
            if np.any(sample_weight < 0):
                raise ValueError("sample_weight cannot contain negative weights")
            sample_weight /= sample_weight.sum()

        # Remap output
        n_samples, self.n_features_ = X.shape
        self.features_ = np.arange(self.n_features_)
        self._n_samples = n_samples
        y = self._validate_y(y)

        # Check parameters
        self._validate_estimator(default=DecisionTreeClassifier())
        
        # If the base estimator does not support sample weights but
        # sample_weight is not None, raise a ValueError
        support_sample_weight = has_fit_parameter(self.base_estimator_,
                                                  "sample_weight")
        if not support_sample_weight and sample_weight is not None:
            raise ValueError("The base estimator doesn't support sample weight")

        self.estimators_, self.estimators_features_ = [], []

        return self._fit(X, y, sample_weight=sample_weight, **kwargs)
Example #6
    def fit(self, X, y, sample_weight=None):
        """ Fit the model to data.
        Fit a separate model for each output variable.

        Parameters
        ----------
        X : (sparse) array-like, shape (n_samples, n_features)
            Data.

        y : (sparse) array-like, shape (n_samples, n_outputs)
            Multi-output targets. An indicator matrix turns on multilabel
            estimation.

        sample_weight : array-like, shape = (n_samples) or \
                        (n_samples, n_outputs) or None
            Sample weights. If None, then samples are equally weighted.
            Only supported if the underlying estimator supports sample
            weights.

        Returns
        -------
        self : object
        """

        if not hasattr(self.estimator, "fit"):
            raise ValueError("The base estimator should implement"
                             " a fit method")

        X, y = check_X_y(X, y, multi_output=True, accept_sparse=True)

        if is_classifier(self):
            check_classification_targets(y)

        if y.ndim == 1:
            raise ValueError("y must have at least two dimensions for "
                             "multi-output estimator but has only one.")

        if (sample_weight is not None
                and not has_fit_parameter(self.estimator, 'sample_weight')):
            raise ValueError("Underlying estimator does not support"
                             " sample weights.")

        if sample_weight is not None:
            sample_weight = np.asarray(sample_weight)
        if sample_weight is None or sample_weight.ndim == 1:
            sample_weight = [sample_weight] * y.shape[1]
        elif sample_weight.ndim == 2:
            sample_weight = sample_weight.T
        else:
            raise ValueError("sample weight must have at most two dimensions "
                             "for multi-output estimator but has more than "
                             "two.")

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_estimator)(
                self.estimator, X, np.ascontiguousarray(y[:, i]),
                sample_weight[i])
            for i in range(y.shape[1]))
        return self
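The guard above recurs throughout these examples: verify that the underlying estimator accepts ``sample_weight`` before forwarding it. A self-contained sketch of that pattern (the helper name is illustrative, not from any library):

from sklearn.utils.validation import has_fit_parameter

def fit_with_optional_weights(estimator, X, y, sample_weight=None):
    # Forward sample_weight only when the estimator's fit signature accepts it
    if sample_weight is not None and not has_fit_parameter(estimator, "sample_weight"):
        raise ValueError("Underlying estimator does not support sample weights.")
    if sample_weight is None:
        estimator.fit(X, y)
    else:
        estimator.fit(X, y, sample_weight=sample_weight)
    return estimator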
Example #7
def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,
                               seeds, total_n_estimators, verbose):
    """Private function used to build a batch of estimators within a job."""
    # Retrieve settings
    n_samples, n_features = X.shape
    max_features = ensemble._max_features
    max_samples = ensemble._max_samples
    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = has_fit_parameter(ensemble.base_estimator_,
                                              "sample_weight")
    if not support_sample_weight and sample_weight is not None:
        raise ValueError("The base estimator doesn't support sample weight")

    # Build estimators
    estimators = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("Building estimator %d of %d for this parallel run "
                  "(total %d)..." % (i + 1, n_estimators, total_n_estimators))

        random_state = np.random.RandomState(seeds[i])
        estimator = ensemble._make_estimator(append=False,
                                             random_state=random_state)

        # Draw random feature, sample indices
        features, indices = _generate_bagging_indices(random_state,
                                                      bootstrap_features,
                                                      bootstrap, n_features,
                                                      n_samples, max_features,
                                                      max_samples)

        # Draw samples, using sample weights, and then fit
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples,))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                sample_counts = np.bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts
            else:
                not_indices_mask = ~indices_to_mask(indices, n_samples)
                curr_sample_weight[not_indices_mask] = 1.E-6

            estimator.fit(X[:, features], y, sample_weight=curr_sample_weight)

        # Draw samples, using a mask, and then fit
        else:
            estimator.fit((X[indices])[:, features], y[indices])

        estimators.append(estimator)
        estimators_features.append(features)

    return estimators, estimators_features
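``indices_to_mask`` above turns the drawn sample indices into a boolean mask so the out-of-bag samples can be down-weighted. A minimal equivalent, assuming the same semantics as ``sklearn.utils.indices_to_mask``:

import numpy as np

def indices_to_mask_sketch(indices, mask_length):
    # Boolean mask with True at every drawn index
    mask = np.zeros(mask_length, dtype=bool)
    mask[indices] = True
    return mask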
Example #8
    def _validate_estimator(self):
        """Check the estimator and set the base_estimator_ attribute."""
        BaseWeightBoosting._validate_estimator(self,
            default=DecisionTreeClassifier(max_depth=1))

        if not has_fit_parameter(self.base_estimator_, "sample_weight"):
            raise ValueError("%s doesn't support sample_weight."
                             % self.base_estimator_.__class__.__name__)
Example #9
    def _validate_estimator(self):
        """Check the estimator and set the base_estimator_ attribute."""
        BaseWeightBoosting._validate_estimator(
            self, default=DecisionTreeClassifier(max_depth=1))

        if not has_fit_parameter(self.base_estimator_, "sample_weight"):
            raise ValueError("%s doesn't support sample_weight." %
                             self.base_estimator_.__class__.__name__)
Example #10
    def fit(self, X, Y, sample_weight=None, **fit_params):
        """Fit the model to data matrix X and targets Y.

        Parameters
        ----------
        X : (sparse) array-like, shape (n_samples, n_features)
            input data.

        Y : (sparse) array-like, shape (n_samples, n_outputs)
            Multi-output targets. An indicator matrix turns on multilabel
            estimation.

        sample_weight : array-like of shape (n_samples,) or None
            Sample weights. If None, then samples are equally weighted.
            Only supported if the underlying classifier supports sample
            weights.

        **fit_params : dict of string -> object
            Parameters passed to the ``estimator.fit`` method of each step.

        Returns
        -------
        self : object
        """
        self._validate_estimators()

        for _, est in self.estimators:
            if not hasattr(est, 'fit'):
                raise AttributeError(
                    'Every base estimator should implement a fit method.')

        X, Y = self._validate_data(X, Y, multi_output=True, accept_sparse=True)

        if is_classifier(self):
            check_classification_targets(Y)

        if Y.ndim == 1:
            raise ValueError(
                'Output Y must have at least two dimensions for multi-output classification but has only one.'
            )

        if sample_weight is not None and any([
                not has_fit_parameter(clf, 'sample_weight')
                for _, clf in self.estimators
        ]):
            raise ValueError(
                'One of the base estimators does not support sample weights.')

        fit_params_validated = _check_fit_params(X, fit_params)

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_estimator)(clf, X, Y[:, i], sample_weight,
                                    **fit_params_validated)
            for i, (_, clf) in zip(range(Y.shape[1]), self.estimators))

        self.classes_ = [est.classes_ for est in self.estimators_]

        return self
Example #11
    def fit(self, X, y, sample_weight=None):
        """ Fit the estimators.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples]
            Target values.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted.
            Note that this is supported only if all underlying estimators
            support sample weights.

        Returns
        -------
        self : object
        """
        if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
            raise NotImplementedError('Multilabel and multi-output'
                                      ' classification is not supported.')

        if self.estimators is None or len(self.estimators) == 0:
            raise AttributeError('Invalid `estimators` attribute, `estimators`'
                                 ' should be a list of (string, estimator)'
                                 ' tuples')

        if sample_weight is not None:
            for name, step in self.estimators:
                if not has_fit_parameter(step, 'sample_weight'):
                    raise ValueError('Underlying estimator \'%s\' does not'
                                     ' support sample weights.' % name)
        names, clfs = zip(*self.estimators)
        self._validate_names(names)

        n_isnone = np.sum([clf is None for _, clf in self.estimators])
        if n_isnone == len(self.estimators):
            raise ValueError('All estimators are None. At least one is '
                             'required to be a classifier!')

        self.le_ = LabelEncoder().fit(y)
        self.classes_ = self.le_.classes_
        self.estimators_ = []

        transformed_y = self.le_.transform(y)

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_parallel_fit_estimator)(
                clone(clf), X, transformed_y, sample_weight=sample_weight)
            for clf in clfs if clf is not None)

        self.named_estimators_ = Bunch()
        for k, e in zip(self.estimators, self.estimators_):
            self.named_estimators_[k[0]] = e
        return self
Example #12
    def _validate_estimator(self):
        """Check the estimator and set the base_estimator_ attribute."""
        super(AdaCC, self)._validate_estimator(
            default=DecisionTreeClassifier(max_depth=1, criterion='entropy'))

        # All weak learners must support sample weights
        if not has_fit_parameter(self.base_estimator_, "sample_weight"):
            raise ValueError("%s doesn't support sample_weight." %
                             self.base_estimator_.__class__.__name__)
Example #13
def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,
                               seeds, total_n_estimators, verbose):

    n_samples, n_features = X.shape
    max_features = ensemble._max_features
    max_samples = ensemble._max_samples
    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = has_fit_parameter(ensemble.base_estimator_,
                                              "sample_weight")
    if not support_sample_weight and sample_weight is not None:
        raise ValueError("The base estimator doesn't support sample weight")

    estimators = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("Building estimator %d of %d for this parallel run "
                  "(total %d)..." % (i + 1, n_estimators, total_n_estimators))

        random_state = np.random.RandomState(seeds[i])
        estimator = ensemble._make_estimator(append=False,
                                             random_state=random_state)

        # Indices of positive (label == 1) and unlabeled samples
        iP = [pair[0] for pair in enumerate(y) if pair[1] == 1]
        iU = [pair[0] for pair in enumerate(y) if pair[1] < 1]
        features, indices = _generate_bagging_indices(random_state,
                                                      bootstrap_features,
                                                      bootstrap, n_features,
                                                      len(iU), max_features,
                                                      max_samples)
        indices = [iU[i] for i in indices] + iP

        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples, ))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                sample_counts = np.bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts
            else:
                not_indices_mask = ~indices_to_mask(indices, n_samples)
                curr_sample_weight[not_indices_mask] = 0

            estimator.fit(X[:, features], y, sample_weight=curr_sample_weight)

        else:
            estimator.fit((X[indices])[:, features], y[indices])

        estimators.append(estimator)
        estimators_features.append(features)

    return estimators, estimators_features
Example #14
    def __init__(self, base_estimator=SVC(), n_iters=10, domain_column='domain', verbose=False):
        assert getattr(base_estimator, 'fit', None) is not None
        assert getattr(base_estimator, 'predict', None) is not None
        assert isinstance(n_iters, int) and n_iters > 0
        assert has_fit_parameter(base_estimator, "sample_weight")

        self.base_estimator = base_estimator
        self.n_iters = n_iters
        self.verbose = verbose
        self.domain_column = domain_column
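Because of the ``has_fit_parameter`` assertion, this constructor accepts weight-aware estimators such as ``SVC`` but rejects estimators whose ``fit`` lacks the parameter. A quick illustration of the check it relies on:

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils.validation import has_fit_parameter

assert has_fit_parameter(SVC(), "sample_weight")
assert not has_fit_parameter(KNeighborsClassifier(), "sample_weight")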
Example #15
    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """Incrementally fit the model to data.
        Fit a separate model for each output variable.

        Parameters
        ----------
        X : (sparse) array-like, shape (n_samples, n_features)
            Data.

        y : (sparse) array-like, shape (n_samples, n_outputs)
            Multi-output targets.

        classes : list of numpy arrays, shape (n_outputs)
            Each array holds the unique classes for one output, as str/int.
            Can be obtained via
            ``[np.unique(y[:, i]) for i in range(y.shape[1])]``, where y is the
            target matrix of the entire dataset.
            This argument is required for the first call to partial_fit
            and can be omitted in the subsequent calls.
            Note that y doesn't need to contain all labels in `classes`.

        sample_weight : array-like, shape = (n_samples) or None
            Sample weights. If None, then samples are equally weighted.
            Only supported if the underlying regressor supports sample
            weights.

        Returns
        -------
        self : object
            Returns self.
        """
        X, y = check_X_y(X, y,
                         multi_output=True,
                         accept_sparse=True)

        if y.ndim == 1:
            raise ValueError("y must have at least two dimensions for "
                             "multi-output regression but has only one.")

        if (sample_weight is not None and
                not has_fit_parameter(self.estimator, 'sample_weight')):
            raise ValueError("Underlying estimator does not support"
                             " sample weights.")

        first_time = not hasattr(self, 'estimators_')

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_partial_fit_estimator)(
                self.estimators_[i] if not first_time else self.estimator,
                X, y[:, i],
                classes[i] if classes is not None else None,
                sample_weight, first_time) for i in range(y.shape[1]))
        return self
Example #16
def check_sample_weights_list(name, estimator_orig):
    # check that estimators will accept a 'sample_weight' parameter of
    # type list in the 'fit' function.
    if has_fit_parameter(estimator_orig, "sample_weight"):
        estimator = clone(estimator_orig)
        rnd = np.random.RandomState(0)
        X = pairwise_estimator_convert_X(rnd.uniform(size=(10, 3)),
                                         estimator_orig)
        y = np.arange(10) % 2
        y = multioutput_estimator_convert_y_2d(estimator, y)
        sample_weight = [3] * 10
        # Test that estimators don't raise any exception
        estimator.fit(X, y, sample_weight=sample_weight)
Example #17
def _careful_parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,
                               seeds, total_n_estimators, verbose):
    """
    Modified from sklearn.ensemble._parallel_build_estimators()
    
    Private function used to build a batch of estimators within a job.
    """
    # Retrieve settings
    n_samples, n_features = X.shape
    max_features = ensemble._max_features
    max_samples = ensemble._max_samples
    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = has_fit_parameter(ensemble.base_estimator_, "sample_weight")
    if not support_sample_weight and sample_weight is not None:
        raise ValueError("The base estimator doesn't support sample weight")

    # Build estimators
    estimators = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("Building estimator %d of %d for this parallel run "
                  "(total %d)..." % (i + 1, n_estimators, total_n_estimators))

        random_state = np.random.RandomState(seeds[i])
        estimator = ensemble._make_estimator(append=False,
                                             random_state=random_state)
        
        # --- updated sampling section ---
        # Draw random feature, sample indices
        features, indices = _generate_bagging_indices(
            random_state, bootstrap_features, bootstrap, n_features,
            n_samples, max_features, max_samples)
        
        while len(np.unique(y[indices])) < 2:
            # Resample until training set is not single-class
            features, indices = _generate_bagging_indices(
                random_state, bootstrap_features, bootstrap, n_features,
                n_samples, max_features, max_samples)
            
        # Don't use sample weights, to be compatible with LinearSVC
        estimator.fit((X[indices])[:, features], y[indices])

        # --- end of modified section ---
        estimators.append(estimator)
        estimators_features.append(features)

    return estimators, estimators_features
Example #18
    def _validate_estimator(self):
        """Check the estimator and set the base_estimator_ attribute."""
        super(AdaCostClassifier, self)._validate_estimator(default=DecisionTreeClassifier(max_depth=1))

        #  SAMME-R requires predict_proba-enabled base estimators
        if self.algorithm == 'SAMME.R':
            if not hasattr(self.base_estimator_, 'predict_proba'):
                raise TypeError(
                    "AdaCostClassifier with algorithm='SAMME.R' requires "
                    "that the weak learner supports the calculation of class "
                    "probabilities with a predict_proba method.\n"
                    "Please change the base estimator or set "
                    "algorithm='SAMME' instead.")
        if not has_fit_parameter(self.base_estimator_, "sample_weight"):
            raise ValueError("%s doesn't support sample_weight."
                             % self.base_estimator_.__class__.__name__)
Example #19
    def _validate_estimator(self):
        """Check the estimator and set the base_estimator_ attribute."""
        super(AdaBoostClassifier, self)._validate_estimator(
            default=DecisionTreeClassifier(max_depth=1))

        #  SAMME-R requires predict_proba-enabled base estimators
        if self.algorithm == 'SAMME.R':
            if not hasattr(self.base_estimator_, 'predict_proba'):
                raise TypeError(
                    "AdaBoostClassifier with algorithm='SAMME.R' requires "
                    "that the weak learner supports the calculation of class "
                    "probabilities with a predict_proba method.\n"
                    "Please change the base estimator or set "
                    "algorithm='SAMME' instead.")
        if not has_fit_parameter(self.base_estimator_, "sample_weight"):
            raise ValueError("%s doesn't support sample_weight."
                             % self.base_estimator_.__class__.__name__)
Example #20
    def fit(self, X, y, sample_weight=None):
        """ Fit the model to data.
        Fit a separate model for each output variable.

        Parameters
        ----------
        X : (sparse) array-like, shape (n_samples, n_features)
            Data.

        y : (sparse) array-like, shape (n_samples, n_outputs)
            Multi-output targets. An indicator matrix turns on multilabel
            estimation.

        sample_weight : array-like, shape = (n_samples) or None
            Sample weights. If None, then samples are equally weighted.
            Only supported if the underlying regressor supports sample
            weights.

        Returns
        -------
        self : object
            Returns self.
        """

        if not hasattr(self.estimator, "fit"):
            raise ValueError("The base estimator should implement a fit method")

        # Skip check_X_y here because it chokes on NaNs
        # X, y = check_X_y(X, y, multi_output=True, accept_sparse=True)
        X, y = np.array(X), np.array(y)

        if y.ndim == 1:
            raise ValueError("y must have at least two dimensions for "
                             "multi target regression but has only one.")

        if (sample_weight is not None and
                not has_fit_parameter(self.estimator, 'sample_weight')):
            raise ValueError("Underlying regressor does not support"
                             " sample weights.")

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(delayed(_fit_estimator)(
            self.estimator, X, y[:, i], sample_weight) for i in range(y.shape[1]))
        return self
Example #21
def compute_cache_classifier_predictions(X, y, sample_weights, estimator, population):
    support_predict_proba = hasattr(estimator, "predict_proba")
    support_sample_weight = has_fit_parameter(estimator, "sample_weight")
    for organizm in population:
        estimator.random_state = organizm.random_state
        if sample_weights is not None and support_sample_weight:
            estimator.fit(
                X[organizm.genome_samples, :][:, organizm.genome_features],
                y[organizm.genome_samples],
                sample_weights[organizm.genome_samples],
            )
        else:
            estimator.fit(X[organizm.genome_samples, :][:, organizm.genome_features], y[organizm.genome_samples])
        if support_predict_proba:
            organizm.cache_predictions = estimator.predict_proba(X[:, organizm.genome_features])
        else:
            predictions = estimator.predict(X[:, organizm.genome_features])
            organizm.cache_predictions = np.zeros((predictions.shape[0], len(estimator.classes_)))
            for i in range(predictions.shape[0]):
                organizm.cache_predictions[i, predictions[i]] += 1
Example #22
    def fit(self, X, y, sample_weight=None):
        """ Fit the model to data.
        Fit a separate model for each output variable.

        Parameters
        ----------
        X : (sparse) array-like, shape (n_samples, n_features)
            Data.

        y : (sparse) array-like, shape (n_samples, n_outputs)
            Multi-output targets. An indicator matrix turns on multilabel
            estimation.

        sample_weight : array-like, shape = (n_samples) or None
            Sample weights. If None, then samples are equally weighted.
            Only supported if the underlying regressor supports sample
            weights.
        Returns
        -------
        self
        """

        if not hasattr(self.estimator, "fit"):
            raise ValueError("The base estimator should implement a fit method")

        X, y = check_X_y(X, y,
                         multi_output=True,
                         accept_sparse=True)

        if y.ndim == 1:
            raise ValueError("y must have at least two dimensions for "
                             "multi target regression but has only one.")

        if (sample_weight is not None and
                not has_fit_parameter(self.estimator, 'sample_weight')):
            raise ValueError("Underlying regressor does not support"
                             " sample weights.")

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(delayed(_fit_estimator)(
            self.estimator, X, y[:, i], sample_weight) for i in range(y.shape[1]))
        return self
Example #23
    def _grow(self, X, y, weights=None):
        """Grow and prune a Linear Tree from the training set (X, y).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training input samples.

        y : array-like of shape (n_samples, )
            The target values (class labels in classification, real numbers in
            regression).

        weights : array-like of shape (n_samples, ), default=None
            Sample weights. If None, then samples are equally weighted.
            Note that if the base estimator does not support sample weighting,
            the sample weights are still used to evaluate the splits.

        Returns
        -------
        self : object
        """
        n_sample, self.n_features_in_ = X.shape
        self.feature_importances_ = np.zeros((self.n_features_in_, ))

        # extract quantiles
        bins = np.linspace(0, 1, self.max_bins)[1:-1]
        bins = np.quantile(X, bins, axis=0, interpolation="midpoint")
        bins = list(bins.T)
        bins = [
            np.unique(X[:, c])
            if c in self._categorical_features else np.unique(q)
            for c, q in enumerate(bins)
        ]

        # check if base_estimator supports fitting with sample_weights
        support_sample_weight = has_fit_parameter(self.base_estimator,
                                                  "sample_weight")

        queue = [""]  # queue of the nodes to evaluate for splitting
        # store the results of each node in dicts
        self._nodes = {}
        self._leaves = {}

        # initialize first fit
        largs = {"classes": None}
        model = deepcopy(self.base_estimator)
        if weights is None or not support_sample_weight:
            model.fit(X[:, self._linear_features], y)
        else:
            model.fit(X[:, self._linear_features], y, sample_weight=weights)

        if hasattr(self, "classes_"):
            largs["classes"] = self.classes_

        loss = CRITERIA[self.criterion](model,
                                        X[:, self._linear_features],
                                        y,
                                        weights=weights,
                                        **largs)

        self._nodes[""] = Node(id=0,
                               n_samples=n_sample,
                               model=model,
                               loss=loss,
                               classes=largs["classes"])

        # in the beginning consider all the samples
        start = np.repeat(True, n_sample)
        mask = start.copy()

        i = 1
        while len(queue) > 0:

            if weights is None:
                split_t, split_col, left_node, right_node = self._split(
                    X[mask], y[mask], bins, support_sample_weight, loss=loss)
            else:
                split_t, split_col, left_node, right_node = self._split(
                    X[mask],
                    y[mask],
                    bins,
                    support_sample_weight,
                    weights[mask],
                    loss=loss,
                )

            # no utility in splitting
            if split_col is None or len(queue[-1]) >= self.max_depth:
                self._leaves[queue[-1]] = self._nodes[queue[-1]]
                del self._nodes[queue[-1]]
                queue.pop()

            else:

                model_left, loss_left, wloss_left, n_left, class_left = left_node
                model_right, loss_right, wloss_right, n_right, class_right = right_node
                self.feature_importances_[
                    split_col] += loss - wloss_left - wloss_right

                self._nodes[queue[-1] + "L"] = Node(
                    id=i,
                    parent=queue[-1],
                    model=model_left,
                    loss=loss_left,
                    w_loss=wloss_left,
                    n_samples=n_left,
                    threshold=self._nodes[queue[-1]].threshold[:] +
                    [(split_col, "L", split_t)],
                )

                self._nodes[queue[-1] + "R"] = Node(
                    id=i + 1,
                    parent=queue[-1],
                    model=model_right,
                    loss=loss_right,
                    w_loss=wloss_right,
                    n_samples=n_right,
                    threshold=self._nodes[queue[-1]].threshold[:] +
                    [(split_col, "R", split_t)],
                )

                if hasattr(self, "classes_"):
                    self._nodes[queue[-1] + "L"].classes = class_left
                    self._nodes[queue[-1] + "R"].classes = class_right

                self._nodes[queue[-1]].children = (queue[-1] + "L",
                                                   queue[-1] + "R")

                i += 2
                q = queue[-1]
                queue.pop()
                queue.extend([q + "R", q + "L"])

            if len(queue) > 0:
                loss = self._nodes[queue[-1]].loss
                mask = _predict_branch(X, self._nodes[queue[-1]].threshold,
                                       start.copy())

        self.node_count = i

        return self
Example #24
    def fit(self, X, y, sample_weight=None):
        random_state = check_random_state(self.random_state)

        # Convert data
        X, y = check_X_y(X, y, ["csr", "csc", "coo"])

        # Remap output
        n_samples, self.n_features_ = X.shape
        y = self._validate_y(y)

        # Check parameters
        self._validate_estimator()

        if isinstance(self.min_samples, (numbers.Integral, np.integer)):
            min_samples = self.min_samples
        else:  # float
            min_samples = int(self.min_samples * X.shape[0])

        if not (0 < min_samples <= X.shape[0]):
            raise ValueError("min_samples must be in (0, n_samples]")

        if isinstance(self.min_features, (numbers.Integral, np.integer)):
            min_features = self.min_features
        else:  # float
            min_features = int(self.min_features * self.n_features_)

        if not (0 < min_features <= self.n_features_):
            raise ValueError("min_features must be in (0, n_features]")

        if self.min_estimators <= 0:
            raise ValueError("min_estimators must be greater than 0")

        support_predict_proba = hasattr(self.base_estimator_, "predict_proba")
        support_sample_weight = has_fit_parameter(self.base_estimator_, "sample_weight")
        if not support_sample_weight and sample_weight is not None:
            raise ValueError("The base estimator doesn't support sample weight")

        self.estimators_ = []

        accuracy_per_generation = np.zeros((self.tmax,), dtype=float)
        _best_accuracy = 0
        _best_organizms = []
        _populations = []
        _contributions = []
        _estimator = self._make_estimator(append=False)
        _offsprings = [Organizm(n_samples, self.n_features_, self.ps, self.pf, random_state) for _ in range(self.N1)]

        def _append_population():
            population = [Organizm(n_samples, self.n_features_, self.ps, self.pf, random_state) for _ in range(self.N0)]
            _populations.append(population)
            _contributions.append(np.arange(self.d1, dtype=float))
            self.compute_population_predictions(X, y, sample_weight, _estimator, population)

        def _remove_population(idx):
            del _populations[idx]
            del _contributions[idx]

        def _compute_cache_accuracy_contribution(fitness_func, population):
            accuracy_without_target = fitness_func()
            for organizm in population:
                organizm.cache_accuracy = fitness_func(organizm)
                organizm.cache_contribution = organizm.cache_accuracy - accuracy_without_target

        def genchoices():
            res = set()
            while len(res) <= self.N1:
                first = random_state.randint(0, self.N0 - 1)
                second = random_state.randint(first + 1, self.N0)
                res.add((first, second))
            return res

        for _ in range(self.min_estimators):
            _append_population()

        for generation in range(self.tmax):
            idx = 0
            print("Generation #{0}".format(generation))
            while idx < len(_populations):
                population = _populations[idx]
                fitness_func = self.get_estimator_fitness_func(
                    [o[0] for i, o in enumerate(_populations) if i != idx], y, sample_weight
                )
                # crossover
                _compute_cache_accuracy_contribution(fitness_func, population)

                choices = genchoices()
                for offspring, (first, second) in zip(_offsprings, choices):
                    offspring.crossover(population[first], population[second])
                    offspring.mutation(self.pm)

                self.compute_population_predictions(X, y, sample_weight, _estimator, _offsprings)
                _compute_cache_accuracy_contribution(fitness_func, _offsprings)
                population.sort(reverse=True, key=lambda x: x.cache_accuracy)
                a = sorted(population[: self.N2] + _offsprings, reverse=True, key=lambda x: x.cache_accuracy)
                dead = population[self.N2 :]
                _populations[idx] = a[: self.N0]
                _offsprings = dead + a[self.N0 :]
                accuracy_per_generation[generation] = max(accuracy_per_generation[generation], a[0].cache_accuracy)
                print(
                    "Estimator #{0} from {1}, accuracy {2}, contribution {3}".format(
                        idx, len(_populations), a[0].cache_accuracy, a[0].cache_contribution
                    )
                )
                _contributions[idx][generation % self.d1] = a[0].cache_contribution

                if len(_populations) > self.min_estimators and _contributions[idx].mean() < self.eps1:
                    print("Estimator #{0} removed, contribution was {1}".format(idx, _contributions[idx].mean()))
                    _remove_population(idx)
                else:
                    idx += 1

            if (
                generation - self.d3 - 1 >= 0
                and (
                    accuracy_per_generation[generation - self.d2 : generation].max()
                    - accuracy_per_generation[: generation - self.d3].max()
                )
                < self.eps3
            ):
                print("Seems that adding new population doesn't helps, stopping...")
                break

            if (
                generation - self.d2 - 1 >= 0
                and (
                    accuracy_per_generation[generation - self.d2 : generation].max()
                    - accuracy_per_generation[: generation - self.d2].max()
                )
                < self.eps2
            ):
                _append_population()
                print("Stagnation, let's add new population")

        self.estimators_ = []
        self.estimators_features_ = []
        self.estimators_weights_ = []

        for population in _populations:
            organizm = population[0]
            estimator = self._make_estimator(append=False)
            estimator.random_state = organizm.random_state
            self.estimators_.append(
                estimator.fit(
                    X[organizm.genome_samples, :][:, organizm.genome_features],
                    y[organizm.genome_samples],
                    sample_weight[organizm.genome_samples] if sample_weight is not None else None,
                )
            )
            self.estimators_features_.append(organizm.genome_features)
            self.estimators_weights_.append(organizm.cache_est_weight)

        return self
Example #25
    def fit(self, X, y, sample_weight=None):
        """Fit all base estimators.

        Parameters
        ----------
        X : 2d numpy array or sparse matrix of shape [n_samples, n_features]
            Training data
        y : 1d numpy array of shape [n_samples]
            Target values.
        sample_weight : 1d numpy array of shape [n_samples]
            Individual weights for each sample.
            Passed to fit method of each estimator.
            Note: will be split automatically for each fold.

        Returns
        -------
        self : object
            Fitted StackingTransformer instance.
        """
        # ---------------------------------------------------------------------
        # Validation
        # ---------------------------------------------------------------------

        # ---------------------------------------------------------------------
        # Check input data
        # ---------------------------------------------------------------------
        # Check X and y
        # ``check_estimator`` does not allow ``force_all_finite=False``
        X, y = check_X_y(X, y,
                         accept_sparse=['csr'],  # allow csr, cast all others to csr
                         force_all_finite=True,  # do not allow  nan and inf
                         multi_output=False)  # allow only one column in y_train

        # Check X and sample_weight
        # X is already checked, but we need it to compare the length of sample_weight
        if sample_weight is not None:
            X, sample_weight = check_X_y(X, sample_weight,
                                         accept_sparse=['csr'],
                                         force_all_finite=True,
                                         multi_output=False)

        # ---------------------------------------------------------------------
        # Check ``estimators``
        # ---------------------------------------------------------------------
        if self.estimators is None:
            if self.regression:
                self.estimators_ = [('dumregr', DummyRegressor(strategy='constant', constant=5.5))]
            else:
                self.estimators_ = [('dumclf', DummyClassifier(strategy='constant', constant=1))]
            # warnings.warn('No estimators were specified. '
            #               'Using single dummy estimator as demo.', UserWarning)
        else:
            if 0 == len(self.estimators):
                raise ValueError('List of estimators is empty')
            else:
                # Clone
                self.estimators_ = [(name, clone(estim)) for name, estim in self.estimators]
                # Check names of estimators
                names, estims = zip(*self.estimators_)
                self._validate_names(names)
                # Check if all estimators support ``sample_weight``
                if sample_weight is not None:
                    for name, estim in self.estimators_:
                        if not has_fit_parameter(estim, 'sample_weight'):
                            raise ValueError('Underlying estimator [%s] does not '
                                             'support sample weights.' % name)

        # ---------------------------------------------------------------------
        # Check other StackingTransformer parameters
        # ---------------------------------------------------------------------

        # ``variant``
        if self.variant not in ['A', 'B']:
            raise ValueError('Parameter ``variant`` must be set properly')

        # ``n_folds``
        if not isinstance(self.n_folds, int):
            raise ValueError('Parameter ``n_folds`` must be integer')
        if not self.n_folds > 1:
            raise ValueError('Parameter ``n_folds`` must not be less than 2')

        # ``verbose``
        if self.verbose not in [0, 1, 2]:
            raise ValueError('Parameter ``verbose`` must be 0, 1, or 2')

        # Additional check for inapplicable parameter combinations
        # If ``regression=True`` we ignore classification-specific
        # parameters and issue user warning
        if self.regression and (self.needs_proba or self.stratified):
            warn_str = ('This is a regression task, hence classification-specific '
                        'parameters set to ``True`` were ignored:')
            if self.needs_proba:
                self.needs_proba = False
                warn_str += ' ``needs_proba``'
            if self.stratified:
                self.stratified = False
                warn_str += ' ``stratified``'
            warnings.warn(warn_str, UserWarning)

        # ---------------------------------------------------------------------
        # Compute attributes (basic properties of data, number of estimators, etc.)
        # ---------------------------------------------------------------------
        self.train_shape_ = X.shape
        self.n_train_examples_ = X.shape[0]
        self.n_features_ = X.shape[1]
        if not self.regression:
            self.n_classes_ = len(np.unique(y))
        else:
            self.n_classes_ = None
        self.n_estimators_ = len(self.estimators_)
        self.train_footprint_ = self._get_footprint(X)

        # ---------------------------------------------------------------------
        # Specify default metric
        # ---------------------------------------------------------------------
        if self.metric is None and self.regression:
            self.metric_ = mean_absolute_error
        elif self.metric is None and not self.regression:
            if self.needs_proba:
                self.metric_ = log_loss
            else:
                self.metric_ = accuracy_score
        else:
            self.metric_ = self.metric
        # ---------------------------------------------------------------------
        # Create report header strings and print report header
        # ---------------------------------------------------------------------
        if self.verbose > 0:
            if self.regression:
                task_str = 'task:         [regression]'
            else:
                task_str = 'task:         [classification]'
                n_classes_str = 'n_classes:    [%d]' % self.n_classes_
            metric_str = 'metric:       [%s]' % self.metric_.__name__
            variant_str = 'variant:      [%s]' % self.variant
            n_estimators_str = 'n_estimators: [%d]' % self.n_estimators_

            print(task_str)
            if not self.regression:
                print(n_classes_str)
            print(metric_str)
            print(variant_str)
            print(n_estimators_str + '\n')
        # ---------------------------------------------------------------------
        # Initialize cross-validation split
        # Stratified can be used only for classification
        # ---------------------------------------------------------------------
        if not self.regression and self.stratified:
            self.kf_ = StratifiedKFold(n_splits=self.n_folds,
                                       shuffle=self.shuffle,
                                       random_state=self.random_state)
            # Save target to be able to create stratified split in ``transform`` method
            # This is more efficient than to save split indices
            self._y_ = y.copy()
        else:
            self.kf_ = KFold(n_splits=self.n_folds,
                             shuffle=self.shuffle,
                             random_state=self.random_state)
            self._y_ = None

        # ---------------------------------------------------------------------
        # Compute implicit number of classes to create appropriate empty arrays.
        # !!! Important. In order to unify array creation, the variable
        # ``n_classes_implicit_`` is always equal to 1, except
        # when we are performing a classification task with ``needs_proba=True``
        # ---------------------------------------------------------------------
        if not self.regression and self.needs_proba:
            self.n_classes_implicit_ = len(np.unique(y))
            self.action_ = 'predict_proba'
        else:
            self.n_classes_implicit_ = 1
            self.action_ = 'predict'

        # ---------------------------------------------------------------------
        # Create empty numpy array for train predictions (OOF)
        # !!! Important. We have to predict implicitly during fit
        # in order to compute CV scores, because the fit method is
        # the most reasonable place to print out CV scores
        # ---------------------------------------------------------------------
        S_train = np.zeros((X.shape[0], self.n_estimators_ * self.n_classes_implicit_))

        # ---------------------------------------------------------------------
        # Prepare (clone) estimators for fitting and storing
        # We need models_A_ for both variant A and variant B
        # We need models_B_ for variant B only (in variant A, attribute models_B_ is None)
        # ---------------------------------------------------------------------

        self.models_A_ = []
        self.models_B_ = None

        for n, est in self.estimators_:
            self.models_A_.append([clone(est) for _ in range(self.n_folds)])

        if self.variant in ['B']:
            self.models_B_ = [clone(est) for n, est in self.estimators_]

        # ---------------------------------------------------------------------
        # Create empty numpy array to store scores for each estimator and each fold
        # ---------------------------------------------------------------------
        self.scores_ = np.zeros((self.n_estimators_, self.n_folds))

        # ---------------------------------------------------------------------
        # Create empty list to store name, mean and std for each estimator
        # ---------------------------------------------------------------------
        self.mean_std_ = []

        # ---------------------------------------------------------------------
        # MAIN FIT PROCEDURE
        # ---------------------------------------------------------------------
        # Loop across estimators
        # ---------------------------------------------------------------------
        for estimator_counter, (name, estimator) in enumerate(self.estimators_):
            if self.verbose > 0:
                estimator_str = 'estimator %2d: [%s: %s]' % (estimator_counter, name, estimator.__class__.__name__)
                print(estimator_str)

            # -----------------------------------------------------------------
            # Loop across folds
            # -----------------------------------------------------------------
            for fold_counter, (tr_index, te_index) in enumerate(self.kf_.split(X, y)):
                # Split data and target
                X_tr = X[tr_index]
                y_tr = y[tr_index]
                X_te = X[te_index]
                y_te = y[te_index]

                # Split sample weights accordingly (if passed)
                if sample_weight is not None:
                    sample_weight_tr = sample_weight[tr_index]
                    # sample_weight_te = sample_weight[te_index]
                else:
                    sample_weight_tr = None
                    # sample_weight_te = None

                # Fit estimator
                _ = self._estimator_action(self.models_A_[estimator_counter][fold_counter],
                                           X_tr, y_tr, None,
                                           sample_weight=sample_weight_tr,
                                           action='fit',
                                           transform=self.transform_target)

                # Predict out-of-fold part of train set
                if 'predict_proba' == self.action_:
                    col_slice_estimator = slice(estimator_counter * self.n_classes_implicit_,
                                                estimator_counter * self.n_classes_implicit_ + self.n_classes_implicit_)
                else:
                    col_slice_estimator = estimator_counter
                S_train[te_index, col_slice_estimator] = self._estimator_action(self.models_A_[estimator_counter][fold_counter],
                                                                                None, None,
                                                                                X_te, action=self.action_,
                                                                                transform=self.transform_pred)
                # Compute score
                score = self.metric_(y_te, S_train[te_index, col_slice_estimator])
                self.scores_[estimator_counter, fold_counter] = score

                # Print fold score
                if self.verbose > 1:
                    fold_str = '    fold %2d:  [%.8f]' % (fold_counter, score)
                    print(fold_str)

            # Compute mean and std and save in dict
            estim_name = self.estimators_[estimator_counter][0]
            estim_mean = np.mean(self.scores_[estimator_counter])
            estim_std = np.std(self.scores_[estimator_counter])
            self.mean_std_.append((estim_name, estim_mean, estim_std))

            if self.verbose > 1:
                sep_str = '    ----'
                print(sep_str)

            # Compute mean + std (and full)
            if self.verbose > 0:
                mean_str = '    MEAN:     [%.8f] + [%.8f]\n' % (estim_mean, estim_std)
                print(mean_str)

            # Fit estimator on full train set
            if self.variant in ['B']:
                if self.verbose > 0:
                    print('    Fitting on full train set...\n')
                _ = self._estimator_action(self.models_B_[estimator_counter],
                                           X, y, None,
                                           sample_weight=sample_weight,
                                           action='fit',
                                           transform=self.transform_target)

        # ---------------------------------------------------------------------
        # ---------------------------------------------------------------------

        # Return fitted StackingTransformer instance
        return self
Example #26
def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,
                               seeds, total_n_estimators, verbose):
    """Private function used to build a batch of estimators within a job."""
    # Retrieve settings
    n_samples, n_features = X.shape
    max_features = ensemble._max_features
    max_samples = ensemble._max_samples
    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features

    # Check if the base_estimator supports sample_weight
    base_estimator_ = ensemble.base_estimator_
    while isinstance(base_estimator_, skPipeline):  # unwrap Pipelines
        base_estimator_ = base_estimator_._final_estimator
    support_sample_weight = has_fit_parameter(base_estimator_, "sample_weight")
    if not support_sample_weight and sample_weight is not None:
        raise ValueError("The base estimator doesn't support sample weight")

    # Build estimators
    estimators = []
    estimators_features = []
    estimators_n_training_samples = []

    for i in range(n_estimators):
        if verbose > 1:
            print("Building estimator %d of %d for this parallel run "
                  "(total %d)..." % (i + 1, n_estimators, total_n_estimators))

        random_state = seeds[i]
        estimator = ensemble._make_estimator(append=False,
                                             random_state=random_state)

        # Draw random feature, sample indices
        features, indices = _generate_bagging_indices(random_state,
                                                      bootstrap_features,
                                                      bootstrap, n_features,
                                                      n_samples, max_features,
                                                      max_samples)

        # Draw samples, using sample weights, and then fit
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples, ))
            else:
                curr_sample_weight = sample_weight.copy()

            estimator.fit((X[indices])[:, features],
                          y[indices],
                          sample_weight=curr_sample_weight[indices])

        else:
            estimator.fit((X[indices])[:, features], y[indices])

        # prefer the estimator's own count of training samples if it exposes one
        n_training_samples = getattr(estimator, 'n_training_samples_', len(indices))

        estimators.append(estimator)
        estimators_features.append(features)
        estimators_n_training_samples.append(n_training_samples)

    return estimators, estimators_features, estimators_n_training_samples
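
A hedged usage sketch of why the loop above unwraps Pipelines first: Pipeline.fit itself exposes no sample_weight argument, only the final estimator's fit does.

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.utils.validation import has_fit_parameter

pipe = Pipeline([('scale', StandardScaler()), ('clf', SVC())])
final = pipe
while isinstance(final, Pipeline):          # handles nested Pipelines too
    final = final._final_estimator

print(has_fit_parameter(pipe, 'sample_weight'))   # False
print(has_fit_parameter(final, 'sample_weight'))  # True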
Example #27
    def partial_fit(self, X, Y, classes=None, sample_weight=None):
        """Incrementally fit the model to data.
        Fit a separate model for each output variable.

        Parameters
        ----------
        X : (sparse) array-like, shape (n_samples, n_features)
            The input data.

        Y : (sparse) array-like, shape (n_samples, n_outputs)
            Multi-output targets.

        classes : list of numpy arrays, shape (n_outputs)
            Each array contains the unique classes for one output.
            Can be obtained via ``[np.unique(Y[:, i]) for i in
            range(Y.shape[1])]``,
            where ``Y`` is the target matrix of the entire dataset.
            This argument is required for the first call to partial_fit
            and can be omitted in the subsequent calls.
            Note that ``Y`` doesn't need to contain all labels in ``classes``.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If ``None``, then samples are equally weighted.
            Only supported if the underlying classifier supports sample
            weights.

        Returns
        -------
        self : object
        """
        self._validate_estimators()

        for _, est in self.estimators:
            if not hasattr(est, 'partial_fit'):
                raise AttributeError(
                    'Every base estimator should implement a partial_fit method.'
                )

        X, Y = self._validate_data(X, Y, multi_output=True, accept_sparse=True)

        if Y.ndim == 1:
            raise ValueError(
                'Output Y must have at least two dimensions for multi-output classification but has only one.'
            )

        if sample_weight is not None and any(
                not has_fit_parameter(clf, 'sample_weight')
                for _, clf in self.estimators):
            raise ValueError(
                'At least one base estimator does not support sample weights.')

        first_time = not hasattr(self, 'estimators_')

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_partial_fit_estimator)
            (self.estimators_[i] if not first_time else clf, X, Y[:, i],
             classes[i] if classes is not None else None, sample_weight,
             first_time)
            for i, (_, clf) in zip(range(Y.shape[1]), self.estimators))

        return self
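
A small sketch (illustrative, not from the source) of building the `classes` argument for the first partial_fit call, exactly as the docstring above describes.

import numpy as np

Y = np.array([[0, 2],
              [1, 3],
              [0, 2]])                       # toy multi-output targets
classes = [np.unique(Y[:, i]) for i in range(Y.shape[1])]
print(classes)                               # [array([0, 1]), array([2, 3])]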
Example #28
    def _check_sample_weight(self):
        if not has_fit_parameter(self.base_estimator_, "sample_weight"):
            raise ValueError("%s doesn't support sample_weight."
                             % self.base_estimator_.__class__.__name__)
Example #29
def _parallel_build_estimators(n_estimators,
                               ensemble,
                               X,
                               y,
                               sample_weight,
                               seeds_features,
                               seeds_samples,
                               seeds_max_features,
                               total_n_estimators,
                               verbose,
                               start_index,
                               draw_max_features=False,
                               circular_features=False):
    """Private function used to build a batch of estimators within a job."""
    # Retrieve settings
    n_samples, n_features = X.shape
    max_features = ensemble._max_features
    max_samples = ensemble._max_samples
    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = has_fit_parameter(ensemble.base_estimator_,
                                              "sample_weight")
    if not support_sample_weight and sample_weight is not None:
        raise ValueError("The base estimator doesn't support sample weight")

    # Build estimators
    estimators = []
    estimators_features = []
    estimators_samples = []
    estimators_splits = []

    for i in range(n_estimators):
        if verbose > 2:
            print(
                "Building estimator %d of %d for this parallel run (total %d)..."
                % (i + 1, n_estimators, total_n_estimators))

        random_state_max_features = np.random.RandomState(
            seeds_max_features[i])
        random_state_features = np.random.RandomState(seeds_features[i])
        random_state = np.random.RandomState(seeds_samples[i])
        estimator = ensemble._make_estimator(append=False,
                                             random_state=random_state)

        # Draw random feature, sample indices
        if circular_features:
            n_features_window = ensemble.window_size
            max_features_window = max_features
        else:
            n_features_window = min(ensemble.window_size,
                                    n_features - start_index[i])
            max_features_window = min(max_features,
                                      n_features - start_index[i])

        features, indices = _generate_bagging_indices(
            random_state_features,
            random_state,
            random_state_max_features,
            bootstrap_features,
            bootstrap,
            n_features_window,
            n_samples,
            max_features_window,
            max_samples,
            draw_max_features=draw_max_features)

        features += start_index[i]

        # wrap any feature index past the end back to the start of the range
        np.mod(features, n_features, out=features)

        # Draw samples, using sample weights, and then fit
        y_binary = random_binarizer(y)
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples, ))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                sample_counts = bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts
            else:
                not_indices_mask = ~indices_to_mask(indices, n_samples)
                curr_sample_weight[not_indices_mask] = 0

            estimator.fit(X[:, features],
                          y_binary,
                          sample_weight=curr_sample_weight)

        # Draw samples, using a mask, and then fit
        else:
            estimator.fit((X[indices])[:, features], y_binary[indices])

        estimators.append(estimator)
        estimators_features.append(features)
        estimators_samples.append(indices)
        estimators_splits.append(y_binary)

    return estimators, estimators_features, estimators_samples, estimators_splits
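
A standalone sketch (assumption, not source code) of the circular wrap-around above: feature indices drawn past the last column are folded back to the start with np.mod.

import numpy as np

n_features, start_index = 10, 7
features = np.arange(5) + start_index        # -> [ 7  8  9 10 11]
np.mod(features, n_features, out=features)   # wrap out-of-range indices
print(features)                              # -> [7 8 9 0 1]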
Example #30
	def fit(self, X, y, sample_weight=None):
		""" Fit the estimators.

		Parameters
		----------
		X : {array-like, sparse matrix}, shape = [n_samples, n_features] or 
			list of {array-like, sparse matrix} with shape = [n_samples, n_features], of length len(self.estimators).
			Training vectors, where n_samples is the number of samples and
			n_features is the number of features.

		y : array-like, shape = [n_samples] or [n_samples,n_labels]
			Target values.

		sample_weight : array-like, shape = [n_samples] or None
			Sample weights. If None, then samples are equally weighted.
			Note that this is supported only if all underlying estimators
			support sample weights.

		Returns
		-------
		self : object
		"""
		if len(X) == 0:
			raise ValueError("X must contain at least one entry; got (X=%r)" % X)
		if self.voting not in ('soft', 'hard'):
			raise ValueError("Voting must be 'soft' or 'hard'; got (voting=%r)"
							 % self.voting)

		if self.estimators is None or len(self.estimators) == 0:
			raise AttributeError('Invalid `estimators` attribute, `estimators`'
								 ' should be a list of (string, estimator)'
								 ' tuples')

		if self.weights and len(self.weights) != len(self.estimators):
			raise ValueError('Number of classifiers and weights must be equal'
							 '; got %d weights, %d estimators'
							 % (len(self.weights), len(self.estimators)))

		if sample_weight is not None:
			for name, step in self.estimators:
				if not has_fit_parameter(step, 'sample_weight'):
					raise ValueError('Underlying estimator \'%s\' does not support'
									 ' sample weights.' % name)
		
		if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
			self.multilabel_ = True
			self.le_ = MultiLabelBinarizer()
			self.le_.fit([range(y.shape[1])])
		else:
			self.multilabel_ = False
			self.le_ = LabelEncoder()
			self.le_.fit(y)
			
		self.classes_ = self.le_.classes_
		self.estimators_ = []

		transformed_y = self.le_.transform(y)
		# Check whether we have X.shape = [n_samples,n_features] or X = [[n_samples,n_features_1],...,[n_samples,n_features_k]]
		measure = X[0]
		if not isinstance(measure, np.ndarray):
			measure = np.array(measure)
		
		self.multiple_features_ = len(measure.shape) == 2
		
		if self.multiple_features_ and len(X) != len(self.estimators):
			raise ValueError("For voters requiring different data, X must be a list of"
							"data arrays, with the same length as the number of voters. Got X of length %s" % len(X))
		
		if self.multiple_features_ and not isinstance(X,list):
			raise ValueError("For voters requiring different data, X must be a list of"
							"data arrays, with the same length as the number of voters. Got %s " % type(X))
		
		if self.multiple_features_:
			self.estimators_ = Parallel(n_jobs=self.n_jobs)(
					delayed(_parallel_fit_estimator)(clone(clf), XX, transformed_y,
						sample_weight)
						for XX, _, clf in zip(X, *zip(*self.estimators)))  # pair each data array with its (name, estimator)
		
		else:
			self.estimators_ = Parallel(n_jobs=self.n_jobs)(
					delayed(_parallel_fit_estimator)(clone(clf), X, transformed_y,
						sample_weight)
						for _, clf in self.estimators)

		return self
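
A hedged sketch of the zip trick above: zip(*self.estimators) splits the (name, estimator) pairs into two tuples, so zip(X, ...) pairs each per-voter data array with its estimator.

import numpy as np

estimators = [('a', 'clf_a'), ('b', 'clf_b')]    # (name, estimator) stand-ins
X = [np.zeros((5, 3)), np.zeros((5, 7))]         # one array per voter

for XX, name, clf in zip(X, *zip(*estimators)):
    print(name, clf, XX.shape)                   # a clf_a (5, 3) / b clf_b (5, 7)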
Example #31
    def fit(self, X, y, sample_weight=None):
        """ Fit the estimators.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples]
            Target values.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted.
            Note that this is supported only if all underlying estimators
            support sample weights.

        Returns
        -------
        self : object
        """
        if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
            raise NotImplementedError('Multilabel and multi-output'
                                      ' classification is not supported.')

        if self.voting not in ('soft', 'hard'):
            raise ValueError("Voting must be 'soft' or 'hard'; got (voting=%r)"
                             % self.voting)

        if self.estimators is None or len(self.estimators) == 0:
            raise AttributeError('Invalid `estimators` attribute, `estimators`'
                                 ' should be a list of (string, estimator)'
                                 ' tuples')

        if self.selectors is None or len(self.selectors) == 0:
            raise AttributeError('Invalid `selectors` attribute, `selectors`'
                                 ' should be a list of (string, np.array)'
                                 ' tuples')

        if len(self.selectors) != len(self.estimators):
            raise ValueError('Number of selectors and estimators must be equal'
                             '; got %d selectors, %d estimators'
                             % (len(self.selectors), len(self.estimators)))

        if self.weights is not None and len(self.weights) != len(self.estimators):
            raise ValueError('Number of classifiers and weights must be equal'
                             '; got %d weights, %d estimators'
                             % (len(self.weights), len(self.estimators)))

        if sample_weight is not None:
            for name, step in self.estimators:
                if not has_fit_parameter(step, 'sample_weight'):
                    raise ValueError('Underlying estimator \'%s\' does not support'
                                     ' sample weights.' % name)

        self.le_ = LabelEncoder()
        self.le_.fit(y)
        self.classes_ = self.le_.classes_
        self.estimators_ = []

        transformed_y = self.le_.transform(y)
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
                delayed(_parallel_fit_estimator)(clone(clf), X[:, self.named_selectors[name]], transformed_y,
                    sample_weight)
                    for name, clf in self.estimators)

        return self
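
A minimal sketch (not from the source) of the per-estimator column selection above: each named selector is an index array applied to X before fitting that estimator.

import numpy as np

X = np.arange(12).reshape(3, 4)
named_selectors = {'first_two': np.array([0, 1]),
                   'last_two': np.array([2, 3])}

for name, cols in named_selectors.items():
    print(name, X[:, cols])                      # the view each estimator is fit on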
Example #32
def _parallel_build_ranking_estimators(n_estimators, ensemble, X, y, Q, sample_weight, seeds, verbose):
    """Private function used to build a batch of estimators within a job.
    Supports queries and query-wise sampling.
    It also breaks the PEP8 line-length constraint."""
    # Retrieve settings
    n_samples, n_features = X.shape
    max_samples = ensemble.max_samples
    max_features = ensemble.max_features
    uQueries = np.unique(Q)

    sample_whole_queries = False
    if hasattr(ensemble, "sample_whole_queries"):
        sample_whole_queries = ensemble.sample_whole_queries

    if not isinstance(max_samples, (numbers.Integral, np.integer)) and (0.0 < max_samples <= 1.0):
        if sample_whole_queries:
            max_samples = int(max_samples * len(uQueries))
        else:
            max_samples = int(max_samples * n_samples)

    if not isinstance(max_features, (numbers.Integral, np.integer)) and (0.0 < max_features <= 1.0):
        max_features = int(max_features * n_features)

    bootstrap = ensemble.bootstrap

    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = has_fit_parameter(ensemble.base_estimator_, "sample_weight")

    # Build estimators
    estimators = []
    estimators_samples = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("building estimator %d of %d" % (i + 1, n_estimators))

        random_state = check_random_state(seeds[i])
        seed = check_random_state(random_state.randint(MAX_INT))
        estimator = ensemble._make_estimator(append=False)

        try:  # Not all estimators accept a random_state
            estimator.set_params(random_state=seed)
        except ValueError:
            pass

        # Draw features
        if bootstrap_features:
            features = random_state.randint(0, n_features, max_features)
        else:
            features = sample_without_replacement(n_features, max_features, random_state=random_state)

        # Draw samples, using sample weights, and then fit
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples,))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                if sample_whole_queries:
                    Qindices = uQueries[random_state.randint(0, len(uQueries), max_samples)]
                    Qindices.sort()
                    indices = reduce(np.append, [np.where(Q == i) for i in Qindices])

                else:
                    indices = random_state.randint(0, n_samples, max_samples)
                sample_counts = bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts

            else:
                if sample_whole_queries:
                    notQindices = uQueries[random_state.randint(0, len(uQueries), len(uQueries) - max_samples)]
                    notQindices.sort()
                    not_indices = reduce(np.append, [np.where(Q == i) for i in notQindices])
                else:
                    not_indices = sample_without_replacement(
                        n_samples, n_samples - max_samples, random_state=random_state
                    )

                curr_sample_weight[not_indices] = 0

            estimator.fit(X[:, features], y, Q=Q, sample_weight=curr_sample_weight)
            samples = curr_sample_weight > 0.0

        # Draw samples, using a mask, and then fit
        else:
            if bootstrap:
                if sample_whole_queries:
                    Qindices = uQueries[random_state.randint(0, len(uQueries), max_samples)]
                    Qindices.sort()
                    indices = reduce(np.append, [np.where(Q == i) for i in Qindices])

                else:
                    indices = random_state.randint(0, n_samples, max_samples)
            else:
                if sample_whole_queries:
                    Qindices = uQueries[
                        sample_without_replacement(len(uQueries), max_samples, random_state=random_state)
                    ]
                    Qindices.sort()
                    indices = reduce(np.append, [np.where(Q == i) for i in Qindices])

                else:
                    indices = sample_without_replacement(n_samples, max_samples, random_state=random_state)

            sample_counts = bincount(indices, minlength=n_samples)

            estimator.fit((X[indices])[:, features], y[indices], Q=Q[indices])
            samples = sample_counts > 0.0

        estimators.append(estimator)
        estimators_samples.append(samples)
        estimators_features.append(features)

    return estimators, estimators_samples, estimators_features
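
A standalone sketch (illustrative only) of the query-wise bootstrap above: whole queries are drawn with replacement, then expanded into the row indices of their samples.

from functools import reduce
import numpy as np

Q = np.array([0, 0, 1, 1, 1, 2])                 # query id per sample
uQueries = np.unique(Q)
rng = np.random.RandomState(0)

Qindices = uQueries[rng.randint(0, len(uQueries), 2)]   # draw 2 queries
Qindices.sort()
indices = reduce(np.append, [np.where(Q == q) for q in Qindices])
print(Qindices, indices)                         # all rows of the drawn queries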
Example #33
    def fit(self, X, y, sample_weight=None):
        """ Fit the estimators.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples]
            Target values.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted.
            Note that this is supported only if all underlying estimators
            support sample weights.

        Returns
        -------
        self : object
        """
        # if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
        #     raise NotImplementedError('Multilabel and multi-output'
        #                               ' classification is not supported.')

        if self.voting not in ('soft', 'hard'):
            raise ValueError(
                "Voting must be 'soft' or 'hard'; got (voting=%r)" %
                self.voting)

        if self.estimators is None or len(self.estimators) == 0:
            raise AttributeError('Invalid `estimators` attribute, `estimators`'
                                 ' should be a list of (string, estimator)'
                                 ' tuples')

        if self.weights and len(self.weights) != len(self.estimators):
            raise ValueError('Number of classifiers and weights must be equal'
                             '; got %d weights, %d estimators' %
                             (len(self.weights), len(self.estimators)))

        if sample_weight is not None:
            for name, step in self.estimators:
                if not has_fit_parameter(step, 'sample_weight'):
                    raise ValueError(
                        'Underlying estimator \'%s\' does not support'
                        ' sample weights.' % name)

        # self.le_ = LabelEncoder()
        # self.le_.fit(y)
        # self.classes_ = self.le_.classes_
        self.estimators_ = []

        # transformed_y = self.le_.transform(y)
        transformed_y = y

        # self.estimators_ = Parallel(n_jobs=self.n_jobs)(
        #         delayed(_parallel_fit_estimator)(clone(clf), X, transformed_y,
        #             sample_weight)
        #             for _, clf in self.estimators)

        for name, clf in self.estimators:
            self.estimators_.append(clone(clf))

        for clf in self.estimators_:
            clf.fit(X, transformed_y)


        return self
Example #34
def ransac_fit_with_weights(self, X, y, sample_weight=None, residual_threshold=None):
    """
    Modified sklearn.linear_model.RANSACRegressor.fit()
    sample_weight is used in sampling base points, fitting the regressor, and calculating score for candidate model
    """
    X = check_array(X, accept_sparse='csr')
    y = check_array(y, ensure_2d=False)
    check_consistent_length(X, y)

    if self.base_estimator is not None:
        base_estimator = clone(self.base_estimator)
    else:
        base_estimator = LinearRegression()

    if self.min_samples is None:
        # assume linear model by default
        min_samples = X.shape[1] + 1
    elif 0 < self.min_samples < 1:
        min_samples = int(np.ceil(self.min_samples * X.shape[0]))
    elif self.min_samples >= 1:
        if self.min_samples % 1 != 0:
            raise ValueError("Absolute number of samples must be an "
                             "integer value.")
        min_samples = self.min_samples
    else:
        raise ValueError("Value for `min_samples` must be scalar and "
                         "positive.")
    if min_samples > X.shape[0]:
        raise ValueError("`min_samples` may not be larger than number "
                         "of samples: n_samples = %d." % (X.shape[0]))

    if self.stop_probability < 0 or self.stop_probability > 1:
        raise ValueError("`stop_probability` must be in range [0, 1].")

    if residual_threshold is None:
        if self.residual_threshold is None:
            # MAD (median absolute deviation)
            residual_threshold = np.median(np.abs(y - np.median(y)))
        else:
            residual_threshold = self.residual_threshold

    if self.loss == "absolute_loss":
        if y.ndim == 1:
            loss_function = lambda y_true, y_pred: np.abs(y_true - y_pred)
        else:
            loss_function = lambda \
                y_true, y_pred: np.sum(np.abs(y_true - y_pred), axis=1)

    elif self.loss == "squared_loss":
        if y.ndim == 1:
            loss_function = lambda y_true, y_pred: (y_true - y_pred) ** 2
        else:
            loss_function = lambda \
                y_true, y_pred: np.sum((y_true - y_pred) ** 2, axis=1)

    elif callable(self.loss):
        loss_function = self.loss

    else:
        raise ValueError(
            "loss should be 'absolute_loss', 'squared_loss' or a callable."
            "Got %s. " % self.loss)


    random_state = check_random_state(self.random_state)

    try:  # Not all estimators accept a random_state
        base_estimator.set_params(random_state=random_state)
    except ValueError:
        pass

    estimator_fit_has_sample_weight = has_fit_parameter(base_estimator,
                                                        "sample_weight")
    estimator_name = type(base_estimator).__name__
    if sample_weight is not None and not estimator_fit_has_sample_weight:
        raise ValueError("%s does not support sample_weight."
                         % estimator_name)
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)

    n_inliers_best = 1
    score_best = -np.inf
    inlier_mask_best = None
    X_inlier_best = None
    y_inlier_best = None
    weight_inlier_best = None
    self.n_skips_no_inliers_ = 0
    self.n_skips_invalid_data_ = 0
    self.n_skips_invalid_model_ = 0

    # number of data samples
    n_samples = X.shape[0]
    sample_idxs = np.arange(n_samples)

    self.n_trials_ = 0
    max_trials = self.max_trials
    while self.n_trials_ < max_trials:
        self.n_trials_ += 1

        if (self.n_skips_no_inliers_ + self.n_skips_invalid_data_ +
                self.n_skips_invalid_model_) > self.max_skips:
            break

        # choose a random sample set; use random_state.choice since it can
        # draw without replacement with probabilities from sample_weight
        if sample_weight is None:
            subset_idxs = sample_without_replacement(n_samples, min_samples,
                                                     random_state=random_state)
        else:
            subset_idxs = random_state.choice(n_samples, min_samples,
                                              replace=False,
                                              p=sample_weight / np.sum(sample_weight))
        X_subset = X[subset_idxs]
        y_subset = y[subset_idxs]

        # check if random sample set is valid
        if (self.is_data_valid is not None
                and not self.is_data_valid(X_subset, y_subset)):
            self.n_skips_invalid_data_ += 1
            continue

        # fit model for current random sample set
        if sample_weight is None:
            base_estimator.fit(X_subset, y_subset)
        else:
            base_estimator.fit(X_subset, y_subset,
                               sample_weight=sample_weight[subset_idxs])

        # check if estimated model is valid
        if (self.is_model_valid is not None and not
                self.is_model_valid(base_estimator, X_subset, y_subset)):
            self.n_skips_invalid_model_ += 1
            continue

        # residuals of all data for current random sample model
        y_pred = base_estimator.predict(X)
        residuals_subset = loss_function(y, y_pred)

        # classify data into inliers and outliers
        inlier_mask_subset = residuals_subset < residual_threshold
        n_inliers_subset = np.sum(inlier_mask_subset)

        # fewer inliers -> skip current random sample
        if n_inliers_subset < n_inliers_best:
            self.n_skips_no_inliers_ += 1
            continue

        # extract inlier data set
        inlier_idxs_subset = sample_idxs[inlier_mask_subset]
        X_inlier_subset = X[inlier_idxs_subset]
        y_inlier_subset = y[inlier_idxs_subset]
        if sample_weight is None:
            weight_inlier_subset = None
        else:
            weight_inlier_subset = sample_weight[inlier_idxs_subset]

        # score of inlier data set
        score_subset = base_estimator.score(X_inlier_subset,
                                            y_inlier_subset,
                                            weight_inlier_subset)

        # same number of inliers but worse score -> skip current random
        # sample
        if (n_inliers_subset == n_inliers_best
                and score_subset < score_best):
            continue

        # save current random sample as best sample
        n_inliers_best = n_inliers_subset
        score_best = score_subset
        inlier_mask_best = inlier_mask_subset
        X_inlier_best = X_inlier_subset
        y_inlier_best = y_inlier_subset
        weight_inlier_best = weight_inlier_subset

        max_trials = min(
            max_trials,
            _dynamic_max_trials(n_inliers_best, n_samples,
                                min_samples, self.stop_probability))

        # break if sufficient number of inliers or score is reached
        if n_inliers_best >= self.stop_n_inliers or \
                        score_best >= self.stop_score:
            break

    # if none of the iterations met the required criteria
    if inlier_mask_best is None:
        if ((self.n_skips_no_inliers_ + self.n_skips_invalid_data_ +
                self.n_skips_invalid_model_) > self.max_skips):
            raise ValueError(
                "RANSAC skipped more iterations than `max_skips` without"
                " finding a valid consensus set. Iterations were skipped"
                " because each randomly chosen sub-sample failed the"
                " passing criteria. See estimator attributes for"
                " diagnostics (n_skips*).")
        else:
            raise ValueError(
                "RANSAC could not find a valid consensus set. All"
                " `max_trials` iterations were skipped because each"
                " randomly chosen sub-sample failed the passing criteria."
                " See estimator attributes for diagnostics (n_skips*).")
    else:
        if (self.n_skips_no_inliers_ + self.n_skips_invalid_data_ +
                self.n_skips_invalid_model_) > self.max_skips:
            warnings.warn("RANSAC found a valid consensus set but exited"
                          " early due to skipping more iterations than"
                          " `max_skips`. See estimator attributes for"
                          " diagnostics (n_skips*).",
                          ConvergenceWarning)

    # estimate final model using all inliers
    base_estimator.fit(X_inlier_best, y_inlier_best, weight_inlier_best)

    self.estimator_ = base_estimator
    self.inlier_mask_ = inlier_mask_best
    return self
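
A hedged sketch of the weighted subsampling used above: drawing the minimal set without replacement, with probabilities proportional to sample_weight, so heavily weighted samples enter candidate models more often.

import numpy as np

rng = np.random.RandomState(0)
n_samples, min_samples = 8, 3
sample_weight = np.array([1., 1., 1., 1., 1., 1., 1., 10.])

subset_idxs = rng.choice(n_samples, min_samples, replace=False,
                         p=sample_weight / sample_weight.sum())
print(subset_idxs)                               # index 7 shows up in most draws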
Example #35
    def fitSegm(self, X, y, segmList, sample_weight=None):

        merged = list(itertools.chain.from_iterable(segmList))

        X = check_array(X, accept_sparse='csr')
        y = check_array(y, ensure_2d=False)
        check_consistent_length(X, y)

        if self.base_estimator is not None:
            base_estimator = clone(self.base_estimator)
        else:
            base_estimator = LinearRegression()

        if self.min_samples is None:
            # assume linear model by default
            min_samples = X.shape[1] + 1 if len(segmList) < 2 else X.shape[1]
        elif 0 < self.min_samples < 1:
            min_samples = np.ceil(self.min_samples * X.shape[0])
        elif self.min_samples >= 1:
            if self.min_samples % 1 != 0:
                raise ValueError("Absolute number of samples must be an "
                                 "integer value.")
            min_samples = self.min_samples
        else:
            raise ValueError("Value for `min_samples` must be scalar and "
                             "positive.")
        if min_samples > X.shape[0]:
            raise ValueError("`min_samples` may not be larger than number "
                             "of samples: n_samples = %d." % (X.shape[0]))

        if self.stop_probability < 0 or self.stop_probability > 1:
            raise ValueError("`stop_probability` must be in range [0, 1].")

        if self.residual_threshold is None:
            # percentile-based spread; `pervar` is assumed to be a module-level
            # percentile constant (50 would recover the MAD)
            residual_threshold = np.percentile(
                np.abs(y - np.percentile(y, pervar)), pervar)
        else:
            residual_threshold = self.residual_threshold

        if self.loss == "absolute_loss":
            if y.ndim == 1:
                loss_function = lambda y_true, y_pred: np.abs(y_true - y_pred)
            else:
                loss_function = lambda \
                    y_true, y_pred: np.sum(np.abs(y_true - y_pred), axis=1)

        elif self.loss == "squared_loss":
            if y.ndim == 1:
                loss_function = lambda y_true, y_pred: (y_true - y_pred)**2
            else:
                loss_function = lambda \
                    y_true, y_pred: np.sum((y_true - y_pred) ** 2, axis=1)

        elif callable(self.loss):
            loss_function = self.loss

        else:
            raise ValueError(
                "loss should be 'absolute_loss', 'squared_loss' or a callable."
                "Got %s. " % self.loss)

        random_state = check_random_state(self.random_state)

        try:  # Not all estimators accept a random_state
            base_estimator.set_params(random_state=random_state)
        except ValueError:
            pass

        estimator_fit_has_sample_weight = has_fit_parameter(
            base_estimator, "sample_weight")
        estimator_name = type(base_estimator).__name__
        if sample_weight is not None and not estimator_fit_has_sample_weight:
            raise ValueError("%s does not support sample_weight."
                             % estimator_name)
        if sample_weight is not None:
            sample_weight = np.asarray(sample_weight)

        n_inliers_best = 1
        score_best = -np.inf
        inlier_mask_best = None
        X_inlier_best = None
        y_inlier_best = None
        aicc_ = None
        self.n_skips_no_inliers_ = 0
        self.n_skips_invalid_data_ = 0
        self.n_skips_invalid_model_ = 0

        # Generate a list of indices for each segment
        size_sl = [len(s) - 1 for s in segmList]
        n_segments = len(size_sl)

        # number of data samples
        n_samples = X.shape[0]
        sample_idxs = np.arange(n_samples)

        n_samples, _ = X.shape

        self.n_trials_ = 0
        max_trials = self.max_trials

        while self.n_trials_ < max_trials:
            self.n_trials_ += 1
            if (self.n_skips_no_inliers_ + self.n_skips_invalid_data_ +
                    self.n_skips_invalid_model_) > self.max_skips:
                break

            # choose random sample set
            # before: subset_idxs = sample_without_replacement(n_samples, min_samples, random_state=random_state)
            # now: draw min_samples segments, then one entry from each
            subset_idx_entries = sample_without_replacement(
                n_segments, min_samples, random_state=random_state)

            # note: stdlib random.randint is inclusive on both ends and
            # ignores random_state
            subset_idxs = np.asarray([segmList[ss][random.randint(0, size_sl[ss])]
                                      for ss in subset_idx_entries])

            X_subset = X[subset_idxs]
            y_subset = y[subset_idxs]

            # check if random sample set is valid
            if (self.is_data_valid is not None
                    and not self.is_data_valid(X_subset, y_subset)):
                self.n_skips_invalid_data_ += 1
                continue

            # fit model for current random sample set
            if sample_weight is None:
                base_estimator.fit(X_subset, y_subset)
            else:
                base_estimator.fit(X_subset,
                                   y_subset,
                                   sample_weight=sample_weight[subset_idxs])

            # check if estimated model is valid
            if (self.is_model_valid is not None and not self.is_model_valid(
                    base_estimator, X_subset, y_subset)):
                self.n_skips_invalid_model_ += 1
                continue

            # check if estimated model is valid (ii)
            y_pred_subset = base_estimator.predict(X_subset)
            residuals_ii = loss_function(y_subset, y_pred_subset)
            inlier_mask_subset_ii = residuals_ii < residual_threshold

            if np.sum(inlier_mask_subset_ii) < X.shape[1]:
                self.n_skips_invalid_model_ += 1
                continue

            ########################## Inlier evaluation

            # residuals of all data for current random sample model
            y_pred = base_estimator.predict(X[merged])
            residuals_subset = loss_function(y[merged], y_pred)

            # classify data into inliers and outliers
            inlier_mask_subset = residuals_subset < residual_threshold
            n_inliers_subset = np.sum(inlier_mask_subset)

            # extract inlier data set
            inlier_idxs_subset = list(compress(merged, inlier_mask_subset))

            X_inlier_subset = X[inlier_idxs_subset]
            y_inlier_subset = y[inlier_idxs_subset]

            if False:  # plain evaluation (basic approach), disabled in favour of the per-segment score below

                #check that the all points in sample are inliers
                if n_inliers_subset < min_samples:
                    continue

                # less inliers -> skip current random sample
                if n_inliers_subset < n_inliers_best:
                    self.n_skips_no_inliers_ += 1
                    continue

                # score of inlier data set
                score_subset = base_estimator.score(X_inlier_subset,
                                                    y_inlier_subset)

                # same number of inliers but worse score -> skip current random
                # sample
                if (n_inliers_subset == n_inliers_best
                        and score_subset <= score_best):
                    continue

            else:  #evaluation for each calibration point

                indScore = 0  # score that considers inliers of each calibration point
                cc = 0
                for sSeg in size_sl:
                    c_seg = range(cc, sSeg + cc)
                    cc += sSeg

                    # count inliers within this segment
                    nScore = np.sum(inlier_mask_subset[c_seg])

                    indScore += poisson.cdf(nScore, 0.3 * sSeg)

                if (indScore <= score_best):
                    continue
                score_subset = indScore

            # save current random sample as best sample
            n_inliers_best = n_inliers_subset
            score_best = score_subset
            inlier_mask_best = inlier_mask_subset
            X_inlier_best = X_inlier_subset
            y_inlier_best = y_inlier_subset

            max_trials = min(
                max_trials,
                _dynamic_max_trials(n_inliers_best, n_samples, min_samples,
                                    self.stop_probability))

            # break if sufficient number of inliers or score is reached
            if n_inliers_best >= self.stop_n_inliers or \
                            score_best >= self.stop_score:
                break

        # if none of the iterations met the required criteria
        if inlier_mask_best is None:
            base_estimator.coef_ = -999
            if ((self.n_skips_no_inliers_ + self.n_skips_invalid_data_ +
                 self.n_skips_invalid_model_) > self.max_skips):
                raise ValueError(
                    "RANSAC skipped more iterations than `max_skips` without"
                    " finding a valid consensus set. Iterations were skipped"
                    " because each randomly chosen sub-sample failed the"
                    " passing criteria. See estimator attributes for"
                    " diagnostics (n_skips*).")
            else:
                raise ValueError(
                    "RANSAC could not find a valid consensus set. All"
                    " `max_trials` iterations were skipped because each"
                    " randomly chosen sub-sample failed the passing criteria."
                    " See estimator attributes for diagnostics (n_skips*).")
        else:
            if (self.n_skips_no_inliers_ + self.n_skips_invalid_data_ +
                    self.n_skips_invalid_model_) > self.max_skips:
                warnings.warn(
                    "RANSAC found a valid consensus set but exited"
                    " early due to skipping more iterations than"
                    " `max_skips`. See estimator attributes for"
                    " diagnostics (n_skips*).", ConvergenceWarning)
            # estimate final model using all inliers
            base_estimator.fit(X_inlier_best, y_inlier_best)
            self.estimator_ = base_estimator
            self.inlier_mask_ = inlier_mask_best
            return self
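
A short sketch (assumption, not from the source) of the per-segment score above: poisson.cdf(k, mu) approaches 1 when a segment's inlier count k exceeds the expected rate (0.3 per candidate point here), and the per-segment scores are summed.

from scipy.stats import poisson

seg_size = 20
mu = 0.3 * seg_size                              # expected inliers per segment
for n_inliers in (2, 6, 15):
    print(n_inliers, poisson.cdf(n_inliers, mu))  # rises from ~0.06 toward 1.0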
Example #37
def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,
                               seeds, total_n_estimators, verbose):
    """Private function used to build a batch of estimators within a job."""
    # Retrieve settings
    sampler = ensemble.sampler
    n_samples, n_features = X.shape
    max_features = ensemble._max_features
    max_samples = ensemble._max_samples
    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = has_fit_parameter(ensemble.base_estimator_,
                                              "sample_weight")
    if not support_sample_weight and sample_weight is not None:
        raise ValueError("The base estimator doesn't support sample weight")

    # Build estimators
    estimators = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("Building estimator %d of %d for this parallel run "
                  "(total %d)..." % (i + 1, n_estimators, total_n_estimators))

        random_state = np.random.RandomState(seeds[i])
        estimator = ensemble._make_estimator(append=False,
                                             random_state=random_state)

        # Draw random feature, sample indices
        features, indices = _generate_bagging_indices(random_state,
                                                      bootstrap_features,
                                                      bootstrap, n_features,
                                                      n_samples, max_features,
                                                      max_samples)

        # Resample each bag
        if sampler == 'under':
            X_res, y_res = RandomUnderSampler(
                random_state=random_state).fit_sample(X, y)
        elif sampler == 'over':
            X_res, y_res = RandomOverSampler(
                random_state=random_state).fit_sample(X, y)
        elif sampler == 'smote':
            X_res, y_res = SMOTE(random_state=random_state,
                                 k_neighbors=5).fit_sample(X, y)


        # elif sampler == 'adasyn':
        #     X_res, y_res = ADASYN(random_state=random_state,
        #                           n_neighbors=5).fit_sample(X, y)
        else:
            X_res, y_res = X, y

        if bootstrap:
            estimator.fit((X[indices])[:, features], y[indices])
        else:
            estimator.fit(X_res[:, features], y_res)

        estimators.append(estimator)
        estimators_features.append(features)

    return estimators, estimators_features
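
A hedged usage sketch of the per-bag resampling above, using imbalanced-learn's current fit_resample name (the snippet uses the older fit_sample alias).

import numpy as np
from imblearn.under_sampling import RandomUnderSampler

X = np.arange(20).reshape(10, 2)
y = np.array([0] * 8 + [1] * 2)                  # imbalanced toy labels

X_res, y_res = RandomUnderSampler(random_state=0).fit_resample(X, y)
print(np.bincount(y_res))                        # -> [2 2], classes balanced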
Example #38
    def fit(self, X, Y, sample_weight=None):
        """ Fit the estimators.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.
        Y : array-like, shape = [n_samples, n_outputs]
            Multi-output target values.
        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted.
            Note that this is supported only if all underlying estimators
            support sample weights.
        Returns
        -------
        self : object
        """
        X, Y = check_X_y(X, Y, multi_output=True, accept_sparse=True)

        if self.voting not in ('soft', 'hard'):
            raise ValueError(
                "Voting must be 'soft' or 'hard'; got (voting=%r)" %
                self.voting)

        if (self.weights is not None and len(self.weights) != self.n_chains):
            raise ValueError("Number of classifiers and weights must be equal"
                             '; got %d weights, %d chains' %
                             (len(self.weights), self.n_chains))

        if sample_weight is not None:
            if not has_fit_parameter(self.base_estimator, 'sample_weight'):
                raise ValueError("Underlying estimator \'%s\' does not"
                                 " support sample weights." %
                                 self.base_estimator.__class__.__name__)

        random_state = check_random_state(self.random_state)

        if self.orders is not None:
            if np.asarray(self.orders).shape != (self.n_chains, Y.shape[1]):
                raise ValueError("Argument orders must have shape " +
                                 "(n_chains, n_outputs); expected {}, " +
                                 "but got {}.".format(
                                     (self.n_chains,
                                      Y.shape[1]), self.orders.shape))
            else:
                self.orders_ = self.orders
        else:
            self.orders_ = [
                random_state.permutation(Y.shape[1])
                for _ in range(self.n_chains)
            ]

        self.le_ = []
        self.classes_ = []
        for y in Y.T:
            le = LabelEncoder().fit(y)
            self.le_.append(le)
            self.classes_.append(le.classes_)

        self.chains_ = [
            ClassifierChain(self.base_estimator, order=order, cv=self.cv)
            for order in self.orders_
        ]
        self.chains_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_parallel_fit_estimator)(clone(cc), X, Y,
                sample_weight) for cc in self.chains_)

        return self
Example #39
def test_has_fit_parameter():
    assert_false(has_fit_parameter(KNeighborsClassifier, "sample_weight"))
    assert_true(has_fit_parameter(RandomForestRegressor, "sample_weight"))
    assert_true(has_fit_parameter(SVR, "sample_weight"))
    assert_true(has_fit_parameter(SVR(), "sample_weight"))
Example #40
def _spark_build_estimators(n_estimators, ensemble, X, y, sample_weight,
                               seeds, verbose):
    """Private function used to build a batch of estimators within a job."""
    print "building estimators"
    # Retrieve settings
    X = X.value
    y = y.value
    ensemble = ensemble
    sample_weight = sample_weight.value

    n_samples, n_features = X.shape
    max_samples = ensemble.max_samples
    max_features = ensemble.max_features

    if (not isinstance(max_samples, (numbers.Integral, np.integer)) and
            (0.0 < max_samples <= 1.0)):
        max_samples = int(max_samples * n_samples)

    if (not isinstance(max_features, (numbers.Integral, np.integer)) and
            (0.0 < max_features <= 1.0)):
        max_features = int(max_features * n_features)

    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = has_fit_parameter(ensemble.base_estimator_,
                                              "sample_weight")

    # Build estimators
    estimators = []
    estimators_samples = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("building estimator %d of %d" % (i + 1, n_estimators))

        random_state = check_random_state(seeds[i])
        seed = check_random_state(random_state.randint(MAX_INT))
        estimator = ensemble._make_estimator(append=False)

        try:  # Not all estimators accept a random_state
            estimator.set_params(random_state=seed)
        except ValueError:
            pass

        # Draw features
        if bootstrap_features:
            features = random_state.randint(0, n_features, max_features)
        else:
            features = sample_without_replacement(n_features,
                                                  max_features,
                                                  random_state=random_state)

        # Draw samples, using sample weights, and then fit
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples,))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                indices = random_state.randint(0, n_samples, max_samples)
                sample_counts = bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts

            else:
                not_indices = sample_without_replacement(
                    n_samples,
                    n_samples - max_samples,
                    random_state=random_state)

                curr_sample_weight[not_indices] = 0

            estimator.fit(X[:, features], y, sample_weight=curr_sample_weight)
            samples = curr_sample_weight > 0.

        # Draw samples, using a mask, and then fit
        else:
            if bootstrap:
                indices = random_state.randint(0, n_samples, max_samples)
            else:
                indices = sample_without_replacement(n_samples,
                                                     max_samples,
                                                     random_state=random_state)

            sample_counts = bincount(indices, minlength=n_samples)

            estimator.fit((X[indices])[:, features], y[indices])
            samples = sample_counts > 0.

        estimators.append(estimator)
        estimators_samples.append(samples)
        estimators_features.append(features)

    return estimators, estimators_samples, estimators_features
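
A standalone sketch (not from the source) of the bootstrap-by-weights trick used throughout these builders: duplicate draws are encoded as integer sample weights via bincount, and out-of-bag rows get weight 0, so the estimator can be fit on all rows of X at once.

import numpy as np

n_samples = 6
rng = np.random.RandomState(0)
indices = rng.randint(0, n_samples, n_samples)   # bootstrap draw
sample_counts = np.bincount(indices, minlength=n_samples)

curr_sample_weight = np.ones(n_samples) * sample_counts
print(curr_sample_weight)                        # weight == times each row was drawn
print(curr_sample_weight > 0)                    # the `samples` (in-bag) mask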