Example #1
0
def test_frienemy_not_all_classifiers_crosses(example_estimate_competence):
    """Compare the pruning mask of a batch of queries with a fixed reference.

    The first three neighbor columns of every row supplied by the fixture
    are handed to ``frienemy_pruning_preprocessed`` in a single call, and
    the returned mask must equal the hard-coded 3 x 3 array.
    """
    _, dsel_target, neighbors, _, dsel_processed, _ = (
        example_estimate_competence)

    # One call covers all query rows, each limited to three neighbors.
    mask = frienemy_pruning_preprocessed(
        neighbors[:, :3], dsel_target, dsel_processed)

    reference = np.array([[1, 1, 0],
                          [0, 1, 0],
                          [1, 1, 1]])
    assert np.array_equal(mask, reference)
Example #2
0
 def _get_DFP_mask(self, neighbors):
     """Return the classifier mask used for dynamic frienemy pruning.

     Parameters
     ----------
     neighbors : array
         Neighbor indices, one row per query sample (only ``shape[0]`` is
         read here when pruning is disabled).

     Returns
     -------
     DFP_mask : array
         The output of ``frienemy_pruning_preprocessed`` when ``self.DFP``
         is truthy; otherwise an all-ones array of shape
         (neighbors.shape[0], self.n_classifiers_).
     """
     if not self.DFP:
         # Pruning disabled: keep every classifier for every query.
         return np.ones((neighbors.shape[0], self.n_classifiers_))

     return frienemy_pruning_preprocessed(neighbors,
                                          self.DSEL_target_,
                                          self.DSEL_processed_)
Example #3
0
def test_DFP_is_used(example_estimate_competence, create_pool_classifiers):
    """Check the pruning mask of the first query over ``safe_k`` neighbors.

    A ``BaseDS`` with ``DFP=True`` is built and fitted, and its
    ``DSEL_processed_`` attribute is replaced by the fixture's matrix. The
    mask is then obtained by calling ``frienemy_pruning_preprocessed``
    directly on the first row of the fixture's neighbors.
    """
    X, y, neighbors, _, dsel_processed, _ = example_estimate_competence
    n_safe = 3

    estimator = BaseDS(create_pool_classifiers, DFP=True, safe_k=n_safe)
    estimator.fit(X, y)
    estimator.DSEL_processed_ = dsel_processed

    # NOTE(review): the assertion below does not read anything back from
    # the fitted estimator -- confirm whether that is intended.
    first_query_neighbors = neighbors[0, :n_safe]
    mask = frienemy_pruning_preprocessed(first_query_neighbors, y,
                                         dsel_processed)
    assert np.array_equal(mask, np.atleast_2d([1, 1, 0]))
Example #4
0
def test_frienemy_no_classifier_crosses(example_estimate_competence):
    """With an all-zero prediction matrix every mask entry must be truthy."""
    # Only the second and third fixture entries are needed here.
    y, neighbors = example_estimate_competence[1:3]

    pool_size = 3
    all_zero_predictions = np.zeros((y.size, pool_size))

    mask = frienemy_pruning_preprocessed(neighbors, y, all_zero_predictions)
    assert mask.all()
Example #5
0
def test_frienemy_safe_region(example_estimate_competence):
    """Ten queries sharing one fixed neighborhood must yield an all-true mask.

    Every query row uses the same seven neighbor indices; the mask returned
    by ``frienemy_pruning_preprocessed`` is expected to be entirely truthy.
    """
    _, y, _, _, dsel_processed, _ = example_estimate_competence

    # Repeat the same neighborhood for each of the ten query rows.
    shared_neighborhood = np.array([0, 1, 2, 6, 7, 8, 14])
    neighbors = np.tile(shared_neighborhood, (10, 1))

    mask = frienemy_pruning_preprocessed(neighbors, y, dsel_processed)
    assert mask.all()
Example #6
0
def test_frienemy_all_classifiers_crosses(example_all_ones):
    """The mask computed from the ``example_all_ones`` data is all truthy."""
    _, y, neighbors, _, dsel_processed, _ = example_all_ones
    mask = frienemy_pruning_preprocessed(neighbors, y, dsel_processed)
    assert mask.all()
Example #7
0
    def predict_proba(self, X):
        """Estimates the posterior probabilities for sample in X.

        Samples on which all base classifiers agree receive the mean of the
        base classifiers' probabilities. The remaining samples are handled
        either by ``self.roc_algorithm_`` (when instance hardness is enabled
        and the sample's hardness is below ``self.IH_rate``) or by
        ``self.predict_proba_with_ds``.

        Parameters
        ----------
        X : array of shape (n_samples, n_features)
            The input data.

        Returns
        -------
        predicted_proba : array of shape (n_samples, n_classes)
                          Probabilities estimates for each sample in X.

        Raises
        ------
        ValueError
            Raised by ``check_array`` when X is not a valid 2-D array.
        """
        # Check if the DS model was trained
        check_is_fitted(self,
                        ["DSEL_processed_", "DSEL_data_", "DSEL_target_"])

        # Validate X exactly as ``predict`` does. X must be 2-D because it
        # is indexed as ``X[ind, :]`` below, so 1-D input is rejected here
        # rather than failing later with an indexing error.
        X = check_array(X)
        self._check_num_features(X)

        # Check if the base classifiers are able to estimate posterior
        # probabilities (implements predict_proba method).
        self._check_predict_proba()

        base_probabilities = self._predict_proba_base(X)
        base_predictions = base_probabilities.argmax(axis=2)

        n_samples = X.shape[0]
        predicted_proba = np.zeros((n_samples, self.n_classes_))

        all_agree_vector = BaseDS._all_classifier_agree(base_predictions)
        ind_all_agree = np.where(all_agree_vector)[0]

        # Where every base classifier agrees, average their probabilities.
        if ind_all_agree.size:
            predicted_proba[ind_all_agree] = base_probabilities[
                ind_all_agree].mean(axis=1)

        ind_disagreement = np.where(~all_agree_vector)[0]

        if ind_disagreement.size:
            X_DS = X[ind_disagreement, :]

            # Always calculating the neighborhood. Passing that to classify
            # later
            # TODO: Check problems with DES Clustering method. Maybe add a
            # check to prevent that here. (or do clustering instead)
            # Then, we estimate the nearest neighbors for all samples that we
            # need to call DS routines
            distances, neighbors = self._get_region_competence(X_DS)

            if self.with_IH:
                # if IH is used, calculate the hardness level associated with
                # each sample
                hardness = hardness_region_competence(neighbors,
                                                      self.DSEL_target_,
                                                      self.safe_k)

                # Split the disagreement samples by hardness. Both index
                # arrays are positions within X_DS (the disagreement
                # subset), not within X.
                easy_samples_mask = hardness < self.IH_rate
                ind_knn_classifier = np.where(easy_samples_mask)[0]
                ind_ds_classifier = np.where(~easy_samples_mask)[0]

                if ind_knn_classifier.size:
                    # Samples with low hardness get their probabilities
                    # from ``roc_algorithm_``. Map the subset positions back
                    # to rows of the original matrix before writing.
                    ind_knn_original_matrix = ind_disagreement[
                        ind_knn_classifier]

                    predicted_proba[ind_knn_original_matrix] = \
                        self.roc_algorithm_.predict_proba(
                            X_DS[ind_knn_classifier])

                    # Remove from the neighbors and distance matrices the
                    # samples that were already handled above, so the rows
                    # left line up with ind_ds_classifier.
                    neighbors = np.delete(neighbors,
                                          ind_knn_classifier,
                                          axis=0)
                    distances = np.delete(distances,
                                          ind_knn_classifier,
                                          axis=0)
            else:
                # IH was not considered. So all samples with disagreement are
                # passed down to the DS algorithm
                ind_ds_classifier = np.arange(ind_disagreement.size)

            if ind_ds_classifier.size:
                # Check if the dynamic frienemy pruning should be used
                if self.DFP:
                    DFP_mask = frienemy_pruning_preprocessed(
                        neighbors, self.DSEL_target_, self.DSEL_processed_)
                else:
                    # No pruning: every classifier stays available.
                    DFP_mask = np.ones(
                        (ind_ds_classifier.size, self.n_classifiers_))

                # Rows of the original matrix handled by the DS technique.
                ind_ds_original_matrix = ind_disagreement[ind_ds_classifier]

                proba_ds = self.predict_proba_with_ds(
                    X[ind_ds_original_matrix],
                    base_predictions[ind_ds_original_matrix],
                    base_probabilities[ind_ds_original_matrix],
                    neighbors=neighbors,
                    distances=distances,
                    DFP_mask=DFP_mask)

                predicted_proba[ind_ds_original_matrix] = proba_ds

        return predicted_proba
Example #8
0
    def predict(self, X):
        """Predict the class label for each sample in X.

        Samples on which every base classifier agrees take that shared
        prediction. Each remaining sample is labelled either by a majority
        vote among the targets of its first ``safe_k`` neighbors (when
        instance hardness is enabled and its hardness is below
        ``self.IH_rate``) or by ``self.classify_with_ds``.

        Parameters
        ----------
        X : array of shape (n_samples, n_features)
            The input data.

        Returns
        -------
        predicted_labels : array of shape (n_samples)
                           Predicted class label for each sample in X.
        """
        # The estimator must have been fitted before predicting.
        check_is_fitted(self,
                        ["DSEL_processed_", "DSEL_data_", "DSEL_target_"])

        # Validate the query data.
        X = check_array(X)
        self._check_num_features(X)

        # Buffer of class indices; each position is written by one of the
        # stages below and translated through ``classes_`` on return.
        label_idx = np.empty(X.shape[0], dtype=np.intp)

        if self.needs_proba:
            base_probabilities = self._predict_proba_base(X)
            base_predictions = base_probabilities.argmax(axis=2)
        else:
            base_probabilities = None
            base_predictions = self._predict_base(X)

        agree_mask = BaseDS._all_classifier_agree(base_predictions)

        # With full agreement every column holds the same value, so the
        # first base classifier's output is used.
        agree_idx = np.nonzero(agree_mask)[0]
        if agree_idx.size:
            label_idx[agree_idx] = base_predictions[agree_idx, 0]

        # Everything else goes through the dynamic selection steps.
        disagree_idx = np.nonzero(~agree_mask)[0]
        if not disagree_idx.size:
            return self.classes_.take(label_idx)

        X_DS = X[disagree_idx, :]

        # Clustering-based methods that do not use IH have no need for the
        # neighbors; otherwise compute the region of competence for all
        # samples handed to the DS routines.
        if not hasattr(self, "clustering_") or self.with_IH:
            distances, neighbors = self._get_region_competence(X_DS)
        else:
            distances = None
            neighbors = None

        if self.with_IH:
            # Hardness level of each disagreement sample.
            hardness = hardness_region_competence(neighbors,
                                                  self.DSEL_target_,
                                                  self.safe_k)

            # Low-hardness rows go to the neighbor vote, the rest to the DS
            # method. Both index arrays are positions within X_DS.
            is_easy = hardness < self.IH_rate
            knn_rows = np.nonzero(is_easy)[0]
            ds_rows = np.nonzero(~is_easy)[0]

            if knn_rows.size:
                # Targets of the first safe_k neighbors of each easy row.
                y_neighbors = self.DSEL_target_[
                    neighbors[knn_rows, :self.safe_k]]

                # knn_rows indexes the disagreement subset, so map it back
                # to rows of the original matrix before writing the vote.
                knn_votes = mode(y_neighbors, axis=1)[0]
                label_idx[disagree_idx[knn_rows]] = knn_votes.reshape(-1)

                # Drop the rows already labelled so the neighbor and
                # distance matrices line up with ds_rows.
                neighbors = np.delete(neighbors, knn_rows, axis=0)
                distances = np.delete(distances, knn_rows, axis=0)
        else:
            # IH disabled: every disagreement sample goes to the DS method.
            ds_rows = np.arange(disagree_idx.size)

        # Nothing left for the DS technique to classify.
        if not ds_rows.size:
            return self.classes_.take(label_idx)

        # Compute the DFP mask when pruning is enabled; otherwise keep all
        # classifiers.
        if self.DFP:
            DFP_mask = frienemy_pruning_preprocessed(
                neighbors, self.DSEL_target_, self.DSEL_processed_)
        else:
            DFP_mask = np.ones((ds_rows.size, self.n_classifiers_))

        # Rows of the original matrix classified by the DS algorithm.
        ds_idx = disagree_idx[ds_rows]

        selected_probabilities = (
            base_probabilities[ds_idx] if self.needs_proba else None)

        label_idx[ds_idx] = self.classify_with_ds(
            X_DS[ds_rows],
            base_predictions[ds_idx],
            selected_probabilities,
            neighbors=neighbors,
            distances=distances,
            DFP_mask=DFP_mask)

        return self.classes_.take(label_idx)