Ejemplo n.º 1
0
    def stack_predict(self, df, holdout, pipes, amount=2):
        """Fit a stacking ensemble from the best pipelines and predict.

        The preprocessing steps of the top pipeline transform both the
        training and holdout data; the final estimators of the ``amount``
        best pipelines act as base learners of the stack.
        """
        X, y = self.split_x_y(df)
        X_test, y_test = self.split_x_y(holdout)

        # Reuse the top pipeline's preprocessing (every step but the final
        # estimator) on train and holdout data.
        preprocessing = Pipeline(self.top_pipeline(pipes).steps[:-1])
        X = preprocessing.fit_transform(X)
        X_test = preprocessing.transform(X_test)

        # Final estimator of each of the `amount` best pipelines.
        estimators = [
            (str(idx), self.top_pipeline(pipes, idx).steps[-1][1])
            for idx in range(amount)
        ]

        # The task is regression when the configured metric is a
        # regression metric.
        regression = self.METRIC in (
            "explained_variance",
            "neg_mean_absolute_error",
            "neg_mean_squared_error",
            "neg_mean_squared_log_error",
            "neg_median_absolute_error",
            "r2",
        )

        stack = StackingTransformer(estimators, regression)
        stack.fit(X, y)

        S_train = stack.transform(X)
        S_test = stack.transform(X_test)

        # The best pipeline's estimator doubles as the meta-learner.
        final_estimator = estimators[0][1]
        final_estimator.fit(S_train, y)

        return final_estimator, y_test, final_estimator.predict(S_test)
Ejemplo n.º 2
0
class ClusterOverSampler(BaseOverSampler):
    """A class that handles clustering-based over-sampling.

    Any combination of over-sampler, clusterer and distributor can
    be used.

    Read more in the :ref:`user guide <user_guide>`.

    Parameters
    ----------
    oversampler : oversampler estimator
        Over-sampler to apply to each selected cluster.

    clusterer : clusterer estimator, default=None
        Clusterer to apply to input space before over-sampling.

        - When ``None``, it corresponds to a clusterer that assigns
          a single cluster to all the samples i.e. no clustering is applied.

        - When clusterer, it applies clustering to the input space. Then
          over-sampling is applied inside each cluster and between clusters.

    distributor : distributor estimator, default=None
        Distributor to distribute the generated samples per cluster label.

        - When ``None`` and a clusterer is provided then it corresponds to the
          density distributor. If clusterer is also ``None`` then the distributor
          does not affect the over-sampling procedure.

        - When distributor object is provided, it is used to distribute the
          generated samples to the clusters.

    raise_error : bool, default=True
        Raise an error when no samples are generated.

        - If ``True``, it raises an error when no filtered clusters are
          identified and therefore no samples are generated.

        - If ``False``, it displays a warning.

    {random_state}

    {n_jobs}

    Attributes
    ----------
    clusterer_ : object
        A fitted clone of the ``clusterer`` parameter or ``None`` when a
        clusterer is not given.

    distributor_ : object
        A fitted clone of the ``distributor`` parameter or a fitted instance of
        the ``BaseDistributor`` when a distributor is not given.

    labels_ : array, shape (n_samples,)
        Labels of each sample.

    neighbors_ : array, (n_neighboring_pairs, 2) or None
        An array that contains all neighboring pairs with each row being
        a unique neighboring pair. It is ``None`` when the clusterer does not
        support this attribute.

    oversampler_ : object
        A fitted clone of the ``oversampler`` parameter.

    random_state_ : object
        An instance of ``RandomState`` class.

    sampling_strategy_ : dict
        Actual sampling strategy.

    Examples
    --------
    >>> from collections import Counter
    >>> from clover.over_sampling import ClusterOverSampler
    >>> from sklearn.datasets import make_classification
    >>> from sklearn.cluster import KMeans
    >>> from imblearn.over_sampling import SMOTE
    >>> X, y = make_classification(random_state=0, n_classes=2, weights=[0.9, 0.1])
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{0: 90, 1: 10}})
    >>> cluster_oversampler = ClusterOverSampler(
    ... oversampler=SMOTE(random_state=5),
    ... clusterer=KMeans(random_state=10))
    >>> X_res, y_res = cluster_oversampler.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    Resampled dataset shape Counter({{0: 90, 1: 90}})
    """

    def __init__(self, oversampler, clusterer=None, distributor=None,
                 raise_error=True, random_state=None, n_jobs=None):
        # Per scikit-learn estimator convention, __init__ only stores the
        # parameters unmodified; all validation is deferred to fit time.
        self.oversampler = oversampler
        self.clusterer = clusterer
        self.distributor = distributor
        self.raise_error = raise_error
        self.random_state = random_state
        self.n_jobs = n_jobs

    def fit(self, X, y):
        """Validate the inputs and the sampler configuration.

        Prefer ``fit_resample``; this method performs checks only.

        Parameters
        ----------
        X : {array-like, dataframe, sparse matrix} of shape \
                (n_samples, n_features)
            Data array.
        y : array-like of shape (n_samples,)
            Target array.

        Returns
        -------
        self : object
            Return the instance itself.
        """
        X_checked, y_checked, _ = self._check_X_y(X, y)
        self._check(X_checked, y_checked)
        return self

    def fit_resample(self, X, y, **fit_params):
        """Resample the dataset.

        Parameters
        ----------
        X : {array-like, dataframe, sparse matrix} of shape \
                (n_samples, n_features)
            Matrix containing the data which have to be sampled.
        y : array-like of shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : {array-like, dataframe, sparse matrix} of shape \
                (n_samples_new, n_features)
            The array containing the resampled data.
        y_resampled : array-like of shape (n_samples_new,)
            The corresponding label of `X_resampled`.
        """
        check_classification_targets(y)
        arrays_transformer = ArraysTransformer(X, y)
        X, y, binarize_y = self._check_X_y(X, y)

        # Validate estimators, then fit clusterer and distributor.
        self._check(X, y)._fit(X, y, **fit_params)

        resampled = self._fit_resample(X, y)

        # Restore the label-indicator format when the input targets were
        # multilabel-binarized.
        y_new = resampled[1]
        if binarize_y:
            y_new = label_binarize(y=y_new, classes=np.unique(y))

        X_out, y_out = arrays_transformer.transform(resampled[0], y_new)
        if len(resampled) == 2:
            return X_out, y_out
        return X_out, y_out, resampled[2]

    def _cluster_sample(self, clusters_data, X, y):
        """Generate artificial data inside clusters or between clusters."""
        # One oversampling job per selected cluster, run in parallel.
        per_cluster = Parallel(n_jobs=self.n_jobs)(
            delayed(_generate_in_cluster)(self.oversampler_, self.transformer_, *args)
            for args in clusters_data
        )
        if not per_cluster:
            # No cluster produced samples.
            return None, None
        return [np.concatenate(parts) for parts in zip(*per_cluster)]

    def _intra_sample(self, X, y):
        """Generate new samples inside the filtered clusters."""
        data = _extract_intra_data(
            X, y,
            self.labels_,
            self.distributor_.intra_distribution_,
            self.sampling_strategy_,
        )
        return self._cluster_sample(data, X, y)

    def _inter_sample(self, X, y):
        """Generate new samples between neighboring clusters."""
        data = _extract_inter_data(
            X, y,
            self.labels_,
            self.distributor_.inter_distribution_,
            self.sampling_strategy_,
            self.random_state_,
        )
        return self._cluster_sample(data, X, y)

    def _check_estimators(self, X, y):
        """Validate the over-sampler, clusterer and distributor."""

        # The oversampler may be a pipeline: fit its preprocessing steps
        # now and keep a clone of its final estimator.
        if isinstance(self.oversampler, Pipeline):
            preprocessing = self.oversampler.steps[:-1]
            if preprocessing:
                self.transformer_ = Pipeline(preprocessing).fit(X)
            self.oversampler_ = clone(self.oversampler.steps[-1][-1])
        else:
            self.oversampler_ = clone(self.oversampler)

        # A distributor without a clusterer is a configuration error.
        if self.clusterer is None:
            if self.distributor is not None:
                raise ValueError(
                    'Distributor was found but clusterer is set to `None`. '
                    'Either set parameter `distributor` to `None` or use a clusterer.'
                )
            # No clustering: a neutral distributor handles the trivial case.
            self.clusterer_ = None
            self.distributor_ = BaseDistributor()
        else:
            self.clusterer_ = clone(self.clusterer)
            if self.distributor is None:
                self.distributor_ = DensityDistributor()
            else:
                self.distributor_ = clone(self.distributor)
        return self

    def _check_sampling_strategy(self, y):
        """Validate and store the actual sampling strategy."""
        strategy = self.oversampler_.sampling_strategy
        self.sampling_strategy_ = check_sampling_strategy(
            strategy, y, self._sampling_type
        )
        return self

    def _check(self, X, y):
        """Run every validation step and set the fitted attributes."""
        self.random_state_ = check_random_state(self.random_state)
        # Default: no transformer; _check_estimators may replace it when the
        # oversampler is a pipeline with preprocessing steps.
        self.transformer_ = None
        return self._check_estimators(X, y)._check_sampling_strategy(y)

    def _fit(self, X, y, **fit_params):
        """Fit the clusterer and distributor."""

        if self.clusterer_ is not None:
            self.clusterer_.fit(X, y, **fit_params)

        # Without a clusterer every sample belongs to a single cluster 0;
        # neighbors are only available when the clusterer exposes them.
        self.labels_ = getattr(self.clusterer_, 'labels_', np.zeros(len(X), dtype=int))
        self.neighbors_ = getattr(self.clusterer_, 'neighbors_', None)

        self.distributor_.fit(X, y, labels=self.labels_, neighbors=self.neighbors_)

        # No filtered clusters means no samples can be generated.
        nothing_intra = not self.distributor_.intra_distribution_
        nothing_inter = not self.distributor_.inter_distribution_
        if nothing_intra and nothing_inter:
            msg = (
                'No samples were generated. Try to modify the parameters '
                'of the clusterer or distributor.'
            )
            if self.raise_error:
                raise ValueError(msg)
            warnings.warn(msg, FitFailedWarning)

        return self

    def _fit_resample(self, X, y, **fit_params):
        """Resample the dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : {ndarray, sparse matrix}, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new,)
            The corresponding label of `X_resampled`
        """

        # Generate samples inside clusters and between neighboring clusters.
        X_intra, y_intra = self._intra_sample(X, y)
        X_inter, y_inter = self._inter_sample(X, y)

        # Record the per-class number of generated samples as the actual
        # sampling strategy.
        counts_intra, counts_inter = Counter(y_intra), Counter(y_inter)
        self.sampling_strategy_ = OrderedDict(
            (label, counts_intra.get(label, 0) + counts_inter.get(label, 0))
            for label in set(counts_intra.keys()).union(counts_inter.keys())
        )

        # Stack the (possibly transformed) original data with the generated
        # samples, skipping parts for which nothing was generated.
        base_X = X if self.transformer_ is None else self.transformer_.transform(X)
        X_parts = [base_X, X_intra, X_inter]
        y_parts = [y, y_intra, y_inter]
        X_resampled = np.vstack([part for part in X_parts if part is not None])
        y_resampled = np.hstack([part for part in y_parts if part is not None])

        return X_resampled, y_resampled