Exemple #1
0
def test_instance_hardness_region(index, expected,
                                  example_estimate_competence):
    """Check the hardness computed for one row of the neighbors matrix.

    ``index`` selects the row and ``expected`` is the reference value; both
    are presumably supplied by a pytest parametrization that is not visible
    here. Elements 1 and 2 of the ``example_estimate_competence`` fixture
    are passed as the labels and the neighbors matrix, with k = 7. The
    result must match ``expected`` within an absolute tolerance of 0.01.
    """
    region_size = 7
    labels, neighbors_matrix = example_estimate_competence[1:3]
    row_neighbors = neighbors_matrix[index, :]
    hardness = hardness_region_competence(row_neighbors, labels,
                                          region_size)
    assert np.isclose(hardness, expected, atol=0.01)
def test_instance_hardness_region_batch():
    """Check the hardness values computed for all rows at once.

    ``neighbors_ex1`` and ``y_dsel_ex1`` are expected to be defined outside
    this function. They are passed to ``hardness_region_competence`` with
    k = 7 and the result must match the three reference values within an
    absolute tolerance of 0.01.
    """
    reference = np.array([0.42, 0.28, 0.28])
    region_size = 7
    hardness = hardness_region_competence(neighbors_ex1, y_dsel_ex1,
                                          region_size)
    assert np.allclose(hardness, reference, atol=0.01)
Exemple #3
0
def test_instance_hardness_region_batch(example_estimate_competence):
    """Check the hardness values computed for all rows at once.

    Elements 1 and 2 of the ``example_estimate_competence`` fixture are
    passed as the labels and the neighbors matrix, with k = 7. The result
    must match the three reference values within an absolute tolerance of
    0.01.
    """
    reference = np.array([0.42, 0.28, 0.28])
    labels, neighbors_matrix = example_estimate_competence[1:3]
    region_size = 7

    hardness = hardness_region_competence(neighbors_matrix, labels,
                                          region_size)
    assert np.allclose(hardness, reference, atol=0.01)
Exemple #4
0
 def _split_easy_samples(self, neighbors):
     """Split the query samples into "hard" and "easy" index arrays.

     The hardness of each sample is obtained from
     ``hardness_region_competence`` using ``self.DSEL_target_`` and
     ``self.safe_k``. A sample counts as easy when its hardness is strictly
     below ``self.IH_rate``.

     Parameters
     ----------
     neighbors : array
         Neighbor indices forwarded unchanged to
         ``hardness_region_competence``.

     Returns
     -------
     ind_ds_classifier : array
         Positions of the samples that are NOT easy (handled by DS).
     ind_knn_classifier : array
         Positions of the easy samples (handled by the KNN).
     """
     is_easy = hardness_region_competence(
         neighbors, self.DSEL_target_, self.safe_k) < self.IH_rate
     # First axis of the non-zero positions of the mask and of its negation.
     knn_positions = np.nonzero(is_easy)[0]
     ds_positions = np.nonzero(~is_easy)[0]
     return ds_positions, knn_positions
def test_instance_hardness_region(index, expected):
    """Check the hardness computed for one row of ``neighbors_ex1``.

    ``index`` selects the row and ``expected`` is the reference value; both
    are presumably supplied by a pytest parametrization that is not visible
    here. ``neighbors_ex1`` and ``y_dsel_ex1`` are expected to be defined
    outside this function. The result must match ``expected`` within an
    absolute tolerance of 0.01.
    """
    region_size = 7
    labels = y_dsel_ex1
    row_neighbors = neighbors_ex1[index, :]
    hardness = hardness_region_competence(row_neighbors, labels,
                                          region_size)
    assert np.isclose(hardness, expected, atol=0.01)
def test_instance_hardness_region_all_same():
    """Check that a fixed set of seven neighbors yields a hardness of 0.0.

    The labels come from ``y_dsel_ex1``, which is expected to be defined
    outside this function. Judging by the test name, the chosen neighbors
    presumably all share the same class label.
    """
    region_size = 7
    labels = y_dsel_ex1
    same_class_neighbors = np.array([0, 1, 2, 6, 7, 8, 13])
    hardness = hardness_region_competence(same_class_neighbors, labels,
                                          region_size)
    assert hardness == 0.0
Exemple #7
0
    def predict_proba(self, X):
        """Compute class-membership probabilities for every row of X.

        Rows on which all base classifiers predict the same label receive
        the base probabilities averaged over axis 1 (presumably the
        classifier axis). When ``with_IH`` is enabled, the remaining rows
        whose hardness is strictly below ``IH_rate`` are scored by
        ``roc_algorithm_.predict_proba``. Every other row is delegated to
        ``predict_proba_with_ds``.

        Parameters
        ----------
        X : array of shape (n_samples, n_features)
            The input data.

        Returns
        -------
        predicted_proba : array of shape (n_samples, n_classes)
            Probability estimates for each sample in X.
        """
        # The DS model must be fitted before it can be queried.
        check_is_fitted(self,
                        ["DSEL_processed_", "DSEL_data_", "DSEL_target_"])

        # Validate the input array.
        X = check_array(X, ensure_2d=False)

        # The base classifiers must be able to estimate posterior
        # probabilities (i.e. implement predict_proba).
        self._check_predict_proba()

        base_probabilities = self._predict_proba_base(X)
        base_predictions = base_probabilities.argmax(axis=2)

        predicted_proba = np.zeros((X.shape[0], self.n_classes_))

        agreement = BaseDS._all_classifier_agree(base_predictions)
        idx_agree = np.where(agreement)[0]

        # Unanimous rows: average the probabilities of the base classifiers.
        if idx_agree.size:
            agreed_probabilities = base_probabilities[idx_agree]
            predicted_proba[idx_agree] = agreed_probabilities.mean(axis=1)

        idx_disagree = np.where(~agreement)[0]

        if not idx_disagree.size:
            # Every row was resolved by unanimity; nothing left to do.
            return predicted_proba

        X_DS = X[idx_disagree, :]

        # The neighborhood is always computed here and handed over to the
        # DS routine afterwards.
        # TODO: Check problems with DES Clustering method. Maybe add a
        # check to prevent that here. (or do clustering instead)
        distances, neighbors = self._get_region_competence(X_DS)

        if self.with_IH:
            # Hardness level of each row that still needs a decision.
            hardness = hardness_region_competence(
                neighbors, self.DSEL_target_, self.safe_k)

            # Low-hardness rows go to the KNN, the others to the DS method.
            # Both index arrays are positions inside X_DS, not inside X.
            is_easy = hardness < self.IH_rate
            idx_knn = np.where(is_easy)[0]
            idx_ds = np.where(~is_easy)[0]

            if idx_knn.size:
                # Map the X_DS positions back to rows of the original X.
                idx_knn_in_X = idx_disagree[idx_knn]

                knn_proba = self.roc_algorithm_.predict_proba(X_DS[idx_knn])
                predicted_proba[idx_knn_in_X] = knn_proba

                # Drop the rows already handled by the KNN so that the
                # matrices line up with the rows passed to the DS method.
                neighbors = np.delete(neighbors, idx_knn, axis=0)
                distances = np.delete(distances, idx_knn, axis=0)
        else:
            # Without IH, every disagreement row goes to the DS method.
            idx_ds = np.arange(idx_disagree.size)

        if not idx_ds.size:
            return predicted_proba

        # Dynamic frienemy pruning mask, or an all-ones mask when the
        # pruning is disabled.
        if self.DFP:
            DFP_mask = frienemy_pruning_preprocessed(
                neighbors, self.DSEL_target_, self.DSEL_processed_)
        else:
            DFP_mask = np.ones((idx_ds.size, self.n_classifiers_))

        idx_ds_in_X = idx_disagree[idx_ds]

        predicted_proba[idx_ds_in_X] = self.predict_proba_with_ds(
            X[idx_ds_in_X],
            base_predictions[idx_ds_in_X],
            base_probabilities[idx_ds_in_X],
            neighbors=neighbors,
            distances=distances,
            DFP_mask=DFP_mask)

        return predicted_proba
Exemple #8
0
    def predict(self, X):
        """Assign a class label to every row of X.

        Rows on which all base classifiers output the same label take that
        label directly. When ``with_IH`` is enabled, disagreement rows whose
        hardness is strictly below ``IH_rate`` are labelled with the most
        frequent ``DSEL_target_`` value among their first ``safe_k``
        neighbors. All remaining rows are delegated to ``classify_with_ds``.

        Parameters
        ----------
        X : array of shape (n_samples, n_features)
            The input data.

        Returns
        -------
        predicted_labels : array of shape (n_samples)
            Predicted class label for each sample in X.
        """
        # The DS model must be fitted before it can be queried.
        check_is_fitted(self,
                        ["DSEL_processed_", "DSEL_data_", "DSEL_target_"])

        # Validate the input array and its number of features.
        X = check_array(X)
        self._check_num_features(X)

        # Holds label indices; they are mapped through self.classes_ on
        # return.
        predicted_labels = np.empty(X.shape[0], dtype=np.intp)

        if self.needs_proba:
            base_probabilities = self._predict_proba_base(X)
            base_predictions = base_probabilities.argmax(axis=2)
        else:
            base_probabilities = None
            base_predictions = self._predict_base(X)

        agreement = BaseDS._all_classifier_agree(base_predictions)

        # Unanimous rows: all predictions are equal, so the output of the
        # first base classifier is used.
        idx_agree = np.where(agreement)[0]
        if idx_agree.size:
            predicted_labels[idx_agree] = base_predictions[idx_agree, 0]

        # The dynamic selection steps only apply to rows with disagreement
        # between the base classifiers.
        idx_disagree = np.where(~agreement)[0]
        if not idx_disagree.size:
            return self.classes_.take(predicted_labels)

        X_DS = X[idx_disagree, :]

        # A clustering-based method that does not use IH has no need for
        # the neighbors; in every other case they are estimated here.
        if not hasattr(self, "clustering_") or self.with_IH:
            distances, neighbors = self._get_region_competence(X_DS)
        else:
            distances = neighbors = None

        if self.with_IH:
            # Hardness level of each row that still needs a decision.
            hardness = hardness_region_competence(
                neighbors, self.DSEL_target_, self.safe_k)

            # Low-hardness rows go to the KNN, the others to the DS method.
            # Both index arrays are positions inside X_DS, not inside X.
            is_easy = hardness < self.IH_rate
            idx_knn = np.where(is_easy)[0]
            idx_ds = np.where(~is_easy)[0]

            if idx_knn.size:
                # Class of each of the first safe_k neighbors.
                neighbor_labels = self.DSEL_target_[
                    neighbors[idx_knn, :self.safe_k]]

                # idx_knn was computed on the disagreement subset, so it
                # has to be mapped back to rows of the original X.
                idx_knn_in_X = idx_disagree[idx_knn]

                knn_labels, _ = mode(neighbor_labels, axis=1)
                predicted_labels[idx_knn_in_X] = knn_labels.reshape(-1)

                # Drop the rows already handled by the KNN so that the
                # matrices line up with the rows passed to the DS method.
                neighbors = np.delete(neighbors, idx_knn, axis=0)
                distances = np.delete(distances, idx_knn, axis=0)
        else:
            # Without IH, every disagreement row goes to the DS method.
            idx_ds = np.arange(idx_disagree.size)

        # Unanimous and low-hardness rows are done; whatever is left is
        # classified by the DS technique.
        if idx_ds.size:
            # Dynamic frienemy pruning mask, or an all-ones mask when the
            # pruning is disabled.
            if self.DFP:
                DFP_mask = frienemy_pruning_preprocessed(
                    neighbors, self.DSEL_target_, self.DSEL_processed_)
            else:
                DFP_mask = np.ones((idx_ds.size, self.n_classifiers_))

            # Rows of the original X handled by the DS algorithm.
            idx_ds_in_X = idx_disagree[idx_ds]

            selected_probabilities = (base_probabilities[idx_ds_in_X]
                                      if self.needs_proba else None)

            predicted_labels[idx_ds_in_X] = self.classify_with_ds(
                X_DS[idx_ds],
                base_predictions[idx_ds_in_X],
                selected_probabilities,
                neighbors=neighbors,
                distances=distances,
                DFP_mask=DFP_mask)

        return self.classes_.take(predicted_labels)
Exemple #9
0
def test_instance_hardness_region_all_same(example_estimate_competence):
    """Check that a fixed set of seven neighbors yields a hardness of 0.0.

    Element 1 of the ``example_estimate_competence`` fixture is passed as
    the labels. Judging by the test name, the chosen neighbors presumably
    all share the same class label.
    """
    region_size = 7
    labels = example_estimate_competence[1]
    same_class_neighbors = np.array([0, 1, 2, 6, 7, 8, 13])
    hardness = hardness_region_competence(same_class_neighbors, labels,
                                          region_size)
    assert hardness == 0.0
Exemple #10
0
    def predict_proba(self, X):
        """Estimates the posterior probabilities for sample in X.

        Rows on which all base classifiers predict the same label receive
        the base probabilities averaged over axis 1 (presumably the
        classifier axis). When ``with_IH`` is enabled, the remaining rows
        whose hardness is at most ``IH_rate`` are scored by
        ``roc_algorithm.predict_proba``. Every other row is delegated to
        ``predict_proba_with_ds``.

        During the call, ``self.neighbors``, ``self.distances`` and
        ``self.DFP_mask`` may be overwritten with query-specific values.
        ``self.neighbors`` and ``self.distances`` are reset to None before
        the method exits, including when one of the dynamic selection steps
        raises, so that stale matrices never leak into a later query.

        Parameters
        ----------
        X : array of shape = [n_samples, n_features]
            The input data.

        Returns
        -------
        predicted_proba : array of shape = [n_samples, n_classes]
                          Probabilities estimates for each sample in X.
        """
        # Check if the DS model was trained
        self._check_is_fitted()

        # Check if X is a valid input
        self._check_input_predict(X)

        # Check if the base classifiers are able to estimate posterior
        # probabilities (implements predict_proba method).
        self._check_predict_proba()

        base_probabilities = self._predict_proba_base(X)
        base_predictions = base_probabilities.argmax(axis=2)

        n_samples = X.shape[0]
        predicted_proba = np.zeros((n_samples, self.n_classes))

        all_agree_vector = DS._all_classifier_agree(base_predictions)
        ind_all_agree = np.where(all_agree_vector)[0]

        if ind_all_agree.size:
            predicted_proba[ind_all_agree] = base_probabilities[
                ind_all_agree].mean(axis=1)

        ind_disagreement = np.where(~all_agree_vector)[0]

        try:
            if ind_disagreement.size:
                X_DS = X[ind_disagreement, :]

                # The neighborhood is only needed up front when either IH
                # or DFP is used.
                if self.with_IH or self.DFP:
                    self.distances, self.neighbors = \
                        self._get_region_competence(X_DS)

                if self.with_IH:
                    # if IH is used, calculate the hardness level associated
                    # with each sample
                    hardness = hardness_region_competence(self.neighbors,
                                                          self.DSEL_target,
                                                          self.safe_k)

                    # Get the index associated with the easy and hard
                    # samples. Samples with low hardness are passed down to
                    # the knn classifier while samples with high hardness
                    # are passed down to the DS methods. The indices below
                    # are positions inside X_DS, not inside X.
                    easy_samples_mask = hardness <= self.IH_rate
                    ind_knn_classifier = np.where(easy_samples_mask)[0]
                    ind_ds_classifier = np.where(~easy_samples_mask)[0]

                    if ind_knn_classifier.size:
                        # Accessing which samples in the original matrix
                        # are associated with the low instance hardness
                        # indices.
                        ind_knn_original_matrix = ind_disagreement[
                            ind_knn_classifier]

                        predicted_proba[ind_knn_original_matrix] = \
                            self.roc_algorithm.predict_proba(
                                X_DS[ind_knn_classifier])

                        # Remove from the neighbors and distance matrices
                        # the samples that were classified using the KNN
                        self.neighbors = np.delete(self.neighbors,
                                                   ind_knn_classifier,
                                                   axis=0)
                        self.distances = np.delete(self.distances,
                                                   ind_knn_classifier,
                                                   axis=0)
                else:
                    # IH was not considered. So all samples with
                    # disagreement are passed down to the DS algorithm
                    ind_ds_classifier = np.arange(ind_disagreement.size)

                if ind_ds_classifier.size:
                    # Check if the dynamic frienemy pruning should be used
                    if self.DFP:
                        self.DFP_mask = self._frienemy_pruning()
                    else:
                        self.DFP_mask = np.ones(
                            (ind_ds_classifier.size, self.n_classifiers))

                    ind_ds_original_matrix = ind_disagreement[
                        ind_ds_classifier]

                    # The neighbors, distances and DFP mask are not passed
                    # explicitly; presumably the DS routine reads them from
                    # the attributes set above.
                    proba_ds = self.predict_proba_with_ds(
                        X[ind_ds_original_matrix],
                        base_predictions[ind_ds_original_matrix],
                        base_probabilities[ind_ds_original_matrix])

                    predicted_proba[ind_ds_original_matrix] = proba_ds
        finally:
            # Reset the neighbors and the distances as they are specific to
            # a given query. Doing it in ``finally`` guarantees the reset
            # also happens when a step above raises.
            self.neighbors = None
            self.distances = None

        return predicted_proba
Exemple #11
0
    def predict(self, X):
        """Predict the class label for each sample in X.

        Rows on which all base classifiers output the same label take that
        label directly. When ``with_IH`` is enabled, disagreement rows whose
        hardness is at most ``IH_rate`` are labelled with the most frequent
        ``DSEL_target`` value among their first ``safe_k`` neighbors. All
        remaining rows are delegated to ``classify_with_ds``.

        During the call, ``self.neighbors``, ``self.distances`` and
        ``self.DFP_mask`` may be overwritten with query-specific values.
        ``self.neighbors`` and ``self.distances`` are reset to None before
        the method exits, including when one of the dynamic selection steps
        raises, so that stale matrices never leak into a later query.

        Parameters
        ----------
        X : array of shape = [n_samples, n_features]
            The input data.

        Returns
        -------
        predicted_labels : array of shape = [n_samples]
                           Predicted class label for each sample in X.
        """
        # Check if the DS model was trained
        self._check_is_fitted()
        # Check if X is a valid input
        self._check_input_predict(X)

        n_samples = X.shape[0]
        # Holds label indices; they are mapped through self.classes on
        # return.
        predicted_labels = np.empty(n_samples, dtype=np.intp)

        if self.needs_proba:
            base_probabilities = self._predict_proba_base(X)
            base_predictions = base_probabilities.argmax(axis=2)
        else:
            base_probabilities = None
            base_predictions = self._predict_base(X)

        all_agree_vector = DS._all_classifier_agree(base_predictions)
        ind_all_agree = np.where(all_agree_vector)[0]

        # Since the predictions are always the same, get the predictions of
        # the first base classifier.
        if ind_all_agree.size:
            predicted_labels[ind_all_agree] = base_predictions[ind_all_agree,
                                                               0]

        # For the samples with disagreement, perform the dynamic selection
        # steps. First step is to collect the samples with disagreement
        # between base classifiers
        ind_disagreement = np.where(~all_agree_vector)[0]

        try:
            if ind_disagreement.size:

                X_DS = X[ind_disagreement, :]

                # Estimate the neighbors at this stage if either the DFP or
                # the IH is used during classification
                if self.DFP or self.with_IH:
                    self.distances, self.neighbors = \
                        self._get_region_competence(X_DS)

                if self.with_IH:
                    # if IH is used, calculate the hardness level associated
                    # with each sample
                    hardness = hardness_region_competence(self.neighbors,
                                                          self.DSEL_target,
                                                          self.safe_k)

                    # Get the index associated with the easy and hard
                    # samples. Samples with low hardness are passed down to
                    # the knn classifier while samples with high hardness
                    # are passed down to the DS methods. The indices below
                    # are positions inside X_DS, not inside X.
                    easy_samples_mask = hardness <= self.IH_rate
                    ind_knn_classifier = np.where(easy_samples_mask)[0]
                    ind_ds_classifier = np.where(~easy_samples_mask)[0]

                    if ind_knn_classifier.size:
                        # all samples with low hardness should be classified
                        # by the knn method here:
                        # First get the class associated with each neighbor
                        y_neighbors = self.DSEL_target[self.neighbors[
                            ind_knn_classifier, :self.safe_k]]

                        # Accessing which samples in the original matrix
                        # are associated with the low instance hardness
                        # indices. This is important since the low hardness
                        # indices ind_knn_classifier was estimated based on
                        # a subset of samples
                        ind_knn_original_matrix = ind_disagreement[
                            ind_knn_classifier]
                        prediction_knn, _ = mode(y_neighbors, axis=1)
                        predicted_labels[
                            ind_knn_original_matrix] = prediction_knn.reshape(
                                -1, )

                        # Remove from the neighbors and distance matrices
                        # the samples that were classified using the KNN
                        self.neighbors = np.delete(self.neighbors,
                                                   ind_knn_classifier,
                                                   axis=0)
                        self.distances = np.delete(self.distances,
                                                   ind_knn_classifier,
                                                   axis=0)
                else:
                    # IH was not considered. So all samples with
                    # disagreement are passed down to the DS algorithm
                    ind_ds_classifier = np.arange(ind_disagreement.size)

                # At this stage the samples which all base classifiers
                # agrees or that are associated with low hardness were
                # already classified. The remaining samples are now passed
                # down to the DS techniques for classification.

                # First check whether there are still samples to be
                # classified.
                if ind_ds_classifier.size:

                    # IF the DFP pruning is considered, calculate the DFP
                    # mask; otherwise use an all-ones mask.
                    if self.DFP:
                        self.DFP_mask = self._frienemy_pruning()
                    else:
                        self.DFP_mask = np.ones(
                            (ind_ds_classifier.size, self.n_classifiers))

                    # Get the real indices of the samples that will be
                    # classified using a DS algorithm.
                    ind_ds_original_matrix = ind_disagreement[
                        ind_ds_classifier]

                    if self.needs_proba:
                        selected_probabilities = base_probabilities[
                            ind_ds_original_matrix]
                    else:
                        selected_probabilities = None

                    # The neighbors, distances and DFP mask are not passed
                    # explicitly; presumably the DS routine reads them from
                    # the attributes set above.
                    pred_ds = self.classify_with_ds(
                        X_DS[ind_ds_classifier],
                        base_predictions[ind_ds_original_matrix],
                        selected_probabilities)
                    predicted_labels[ind_ds_original_matrix] = pred_ds
        finally:
            # The neighbors and the distances are specific to a given query.
            # Resetting them in ``finally`` guarantees the reset also
            # happens when a step above raises.
            self.neighbors = None
            self.distances = None

        return self.classes.take(predicted_labels)