import numpy as np

from deslib.base import BaseDS
from deslib.util.dfp import frienemy_pruning_preprocessed


def test_frienemy_not_all_classifiers_crosses(example_estimate_competence):
    expected = np.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    _, y, neighbors, _, dsel_processed, _ = example_estimate_competence
    # Passing three samples to compute the DFP at the same time.
    result = frienemy_pruning_preprocessed(neighbors[:, :3], y,
                                           dsel_processed)
    assert np.array_equal(result, expected)


def _get_DFP_mask(self, neighbors):
    if self.DFP:
        DFP_mask = frienemy_pruning_preprocessed(neighbors,
                                                 self.DSEL_target_,
                                                 self.DSEL_processed_)
    else:
        DFP_mask = np.ones((neighbors.shape[0], self.n_classifiers_))
    return DFP_mask


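# _get_DFP_mask delegates the actual pruning to frienemy_pruning_preprocessed.
# Below is a minimal sketch of the frienemy criterion that function applies,
# assuming `neighbors` holds DSEL indices of each query's region of
# competence, `y_val` the DSEL labels, and `hit_miss` a binary
# (n_dsel_samples, n_classifiers) matrix marking correct predictions on DSEL.
# This illustrates the idea only; it is not the library's implementation.
def _frienemy_sketch(neighbors, y_val, hit_miss):
    neighbors = np.atleast_2d(neighbors)
    n_samples = neighbors.shape[0]
    n_classifiers = hit_miss.shape[1]
    mask = np.zeros((n_samples, n_classifiers))
    for i in range(n_samples):
        region = neighbors[i]
        region_y = y_val[region]
        if np.unique(region_y).size > 1:
            # Indecision region: keep only the classifiers that correctly
            # classify samples from at least two classes in the region,
            # i.e., that "cross" the region of competence.
            for clf in range(n_classifiers):
                hits = region[hit_miss[region, clf] == 1]
                if np.unique(y_val[hits]).size > 1:
                    mask[i, clf] = 1.0
        else:
            # Safe region (all neighbors share one label): no pruning.
            mask[i, :] = 1.0
    # Fallback: if every classifier was pruned for a query, keep them all.
    mask[np.all(mask == 0, axis=1)] = 1.0
    return mask

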
def test_DFP_is_used(example_estimate_competence, create_pool_classifiers):
    X, y, neighbors, _, dsel_processed, _ = example_estimate_competence
    safe_k = 3
    ds_test = BaseDS(create_pool_classifiers, DFP=True, safe_k=safe_k)
    ds_test.fit(X, y)
    ds_test.DSEL_processed_ = dsel_processed
    DFP_mask = frienemy_pruning_preprocessed(neighbors[0, :safe_k], y,
                                             dsel_processed)
    assert np.array_equal(DFP_mask, np.atleast_2d([1, 1, 0]))


def test_frienemy_no_classifier_crosses(example_estimate_competence):
    _, y, neighbors = example_estimate_competence[0:3]
    n_classifiers = 3
    # With an all-zero hit/miss matrix no classifier crosses any region, so
    # the pruning falls back to keeping every classifier.
    predictions = np.zeros((y.size, n_classifiers))
    mask = frienemy_pruning_preprocessed(neighbors, y, predictions)
    assert mask.all()


def test_frienemy_safe_region(example_estimate_competence):
    _, y, _, _, dsel_processed, _ = example_estimate_competence
    neighbors = np.tile(np.array([0, 1, 2, 6, 7, 8, 14]), (10, 1))
    result = frienemy_pruning_preprocessed(neighbors, y, dsel_processed)
    assert result.all()


def test_frienemy_all_classifiers_crosses(example_all_ones):
    _, y, neighbors, _, dsel_processed, _ = example_all_ones
    result = frienemy_pruning_preprocessed(neighbors, y, dsel_processed)
    assert result.all()


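# A tiny hand-checkable case complementing the fixture-based tests above (the
# data below is made up for illustration, not taken from a fixture). The
# region [0, 1, 2, 3] mixes both classes, so it is an indecision region:
# classifier 0 hits samples of both classes and is kept, while classifier 1
# only hits class-0 samples and is pruned.
def test_frienemy_toy_indecision_region():
    y = np.array([0, 0, 1, 1])
    # Hit/miss matrix on DSEL: rows are samples, columns are classifiers.
    dsel_processed = np.array([[1, 1],
                               [1, 1],
                               [1, 0],
                               [0, 0]])
    neighbors = np.array([[0, 1, 2, 3]])
    result = frienemy_pruning_preprocessed(neighbors, y, dsel_processed)
    assert np.array_equal(result, np.array([[1, 0]]))

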
def predict_proba(self, X):
    """Estimates the posterior probabilities for each sample in X.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        The input data.

    Returns
    -------
    predicted_proba : array of shape (n_samples, n_classes)
        Probability estimates for each sample in X.
    """
    # Check if the DS model was trained.
    check_is_fitted(self,
                    ["DSEL_processed_", "DSEL_data_", "DSEL_target_"])

    # Check if X is a valid input.
    X = check_array(X, ensure_2d=False)

    # Check if the base classifiers are able to estimate posterior
    # probabilities (i.e., implement the predict_proba method).
    self._check_predict_proba()

    base_probabilities = self._predict_proba_base(X)
    base_predictions = base_probabilities.argmax(axis=2)

    n_samples = X.shape[0]
    predicted_proba = np.zeros((n_samples, self.n_classes_))

    all_agree_vector = BaseDS._all_classifier_agree(base_predictions)
    ind_all_agree = np.where(all_agree_vector)[0]

    if ind_all_agree.size:
        predicted_proba[ind_all_agree] = base_probabilities[
            ind_all_agree].mean(axis=1)

    ind_disagreement = np.where(~all_agree_vector)[0]
    if ind_disagreement.size:
        X_DS = X[ind_disagreement, :]

        # Always calculate the neighborhood here and pass it to the
        # classification routines later.
        # TODO: Check problems with DES Clustering method. Maybe add a
        # check to prevent that here (or do clustering instead).

        # Estimate the nearest neighbors for all samples that need to go
        # through the DS routines.
        distances, neighbors = self._get_region_competence(X_DS)

        if self.with_IH:
            # If IH is used, calculate the hardness level associated with
            # each sample.
            hardness = hardness_region_competence(neighbors,
                                                  self.DSEL_target_,
                                                  self.safe_k)

            # Get the indices associated with the easy and hard samples.
            # Samples with low hardness are passed down to the KNN
            # classifier, while samples with high hardness are passed
            # down to the DS methods. So, here we split the samples that
            # are passed to each stage by calculating their indices.
            easy_samples_mask = hardness < self.IH_rate
            ind_knn_classifier = np.where(easy_samples_mask)[0]
            ind_ds_classifier = np.where(~easy_samples_mask)[0]

            if ind_knn_classifier.size:
                # All samples with low hardness are classified by the KNN
                # method here. First, find which samples in the original
                # matrix are associated with the low instance hardness
                # indices.
                ind_knn_original_matrix = ind_disagreement[
                    ind_knn_classifier]
                predicted_proba[ind_knn_original_matrix] = \
                    self.roc_algorithm_.predict_proba(
                        X_DS[ind_knn_classifier])

                # Remove from the neighbors and distance matrices the
                # samples that were classified using the KNN.
                neighbors = np.delete(neighbors, ind_knn_classifier,
                                      axis=0)
                distances = np.delete(distances, ind_knn_classifier,
                                      axis=0)
        else:
            # IH was not considered, so all samples with disagreement are
            # passed down to the DS algorithm.
            ind_ds_classifier = np.arange(ind_disagreement.size)

        if ind_ds_classifier.size:
            # Check if the dynamic frienemy pruning (DFP) should be used.
            if self.DFP:
                DFP_mask = frienemy_pruning_preprocessed(
                    neighbors, self.DSEL_target_, self.DSEL_processed_)
            else:
                DFP_mask = np.ones(
                    (ind_ds_classifier.size, self.n_classifiers_))

            ind_ds_original_matrix = ind_disagreement[ind_ds_classifier]
            proba_ds = self.predict_proba_with_ds(
                X[ind_ds_original_matrix],
                base_predictions[ind_ds_original_matrix],
                base_probabilities[ind_ds_original_matrix],
                neighbors=neighbors,
                distances=distances,
                DFP_mask=DFP_mask)

            predicted_proba[ind_ds_original_matrix] = proba_ds

    return predicted_proba


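# A usage sketch for the probabilistic interface above. KNORAE is one
# concrete DS method shipped with deslib; the synthetic data and split sizes
# are assumptions made for illustration.
def _example_predict_proba_usage():
    from sklearn.datasets import make_classification
    from sklearn.ensemble import BaggingClassifier
    from sklearn.model_selection import train_test_split

    from deslib.des.knora_e import KNORAE

    X, y = make_classification(n_samples=600, random_state=0)
    # Hold out a dynamic selection set (DSEL) in addition to the train set.
    X_train, X_rest, y_train, y_rest = train_test_split(
        X, y, test_size=0.5, random_state=0)
    X_dsel, X_test, y_dsel, _ = train_test_split(
        X_rest, y_rest, test_size=0.5, random_state=0)

    # Default bagging base estimators are decision trees, which implement
    # predict_proba, as required by _check_predict_proba.
    pool = BaggingClassifier(n_estimators=10, random_state=0)
    pool.fit(X_train, y_train)

    knorae = KNORAE(pool, DFP=True)  # DFP=True turns on frienemy pruning
    knorae.fit(X_dsel, y_dsel)
    return knorae.predict_proba(X_test)  # shape (n_test_samples, n_classes)

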
def predict(self, X):
    """Predict the class label for each sample in X.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        The input data.

    Returns
    -------
    predicted_labels : array of shape (n_samples)
        Predicted class label for each sample in X.
    """
    # Check if the DS model was trained.
    check_is_fitted(self,
                    ["DSEL_processed_", "DSEL_data_", "DSEL_target_"])

    # Check if X is a valid input.
    X = check_array(X)
    self._check_num_features(X)

    n_samples = X.shape[0]
    predicted_labels = np.empty(n_samples, dtype=np.intp)

    if self.needs_proba:
        base_probabilities = self._predict_proba_base(X)
        base_predictions = base_probabilities.argmax(axis=2)
    else:
        base_probabilities = None
        base_predictions = self._predict_base(X)

    all_agree_vector = BaseDS._all_classifier_agree(base_predictions)
    ind_all_agree = np.where(all_agree_vector)[0]

    # Since the predictions are always the same, get the predictions of
    # the first base classifier.
    if ind_all_agree.size:
        predicted_labels[ind_all_agree] = base_predictions[
            ind_all_agree, 0]

    # For the samples with disagreement, perform the dynamic selection
    # steps. The first step is to collect the samples with disagreement
    # between the base classifiers.
    ind_disagreement = np.where(~all_agree_vector)[0]
    if ind_disagreement.size:
        X_DS = X[ind_disagreement, :]

        # If the method is based on clustering and does not use IH, there
        # is no need to compute the neighbors.
        if hasattr(self, "clustering_") and not self.with_IH:
            distances = neighbors = None
        else:
            # Estimate the nearest neighbors for all samples that need to
            # go through the DS routines.
            distances, neighbors = self._get_region_competence(X_DS)

        if self.with_IH:
            # If IH is used, calculate the hardness level associated with
            # each sample.
            hardness = hardness_region_competence(neighbors,
                                                  self.DSEL_target_,
                                                  self.safe_k)

            # Get the indices associated with the easy and hard samples.
            # Samples with low hardness are passed down to the KNN
            # classifier, while samples with high hardness are passed
            # down to the DS methods. So, here we split the samples that
            # are passed to each stage by calculating their indices.
            easy_samples_mask = hardness < self.IH_rate
            ind_knn_classifier = np.where(easy_samples_mask)[0]
            ind_ds_classifier = np.where(~easy_samples_mask)[0]

            if ind_knn_classifier.size:
                # All samples with low hardness are classified by the KNN
                # method here. First, get the class associated with each
                # neighbor.
                y_neighbors = self.DSEL_target_[neighbors[
                    ind_knn_classifier, :self.safe_k]]

                # Find which samples in the original matrix are
                # associated with the low instance hardness indices. This
                # is important since ind_knn_classifier was estimated
                # based on a subset of samples.
                ind_knn_original_matrix = ind_disagreement[
                    ind_knn_classifier]
                prediction_knn, _ = mode(y_neighbors, axis=1)
                predicted_labels[
                    ind_knn_original_matrix] = prediction_knn.reshape(-1)

                # Remove from the neighbors and distance matrices the
                # samples that were classified using the KNN.
                neighbors = np.delete(neighbors, ind_knn_classifier,
                                      axis=0)
                distances = np.delete(distances, ind_knn_classifier,
                                      axis=0)
        else:
            # IH was not considered, so all samples with disagreement are
            # passed down to the DS algorithm.
            ind_ds_classifier = np.arange(ind_disagreement.size)

        # At this stage, the samples on which all base classifiers agree
        # or that are associated with low hardness have already been
        # classified. The remaining samples are now passed down to the DS
        # techniques for classification. First, check whether there are
        # still samples to be classified.
        if ind_ds_classifier.size:
            # If the DFP pruning is considered, calculate the DFP mask
            # for all samples in X.
            if self.DFP:
                DFP_mask = frienemy_pruning_preprocessed(
                    neighbors, self.DSEL_target_, self.DSEL_processed_)
            else:
                DFP_mask = np.ones(
                    (ind_ds_classifier.size, self.n_classifiers_))

            # Get the real indices of the samples that will be classified
            # using a DS algorithm.
            ind_ds_original_matrix = ind_disagreement[ind_ds_classifier]

            if self.needs_proba:
                selected_probabilities = base_probabilities[
                    ind_ds_original_matrix]
            else:
                selected_probabilities = None

            pred_ds = self.classify_with_ds(
                X_DS[ind_ds_classifier],
                base_predictions[ind_ds_original_matrix],
                selected_probabilities,
                neighbors=neighbors,
                distances=distances,
                DFP_mask=DFP_mask)
            predicted_labels[ind_ds_original_matrix] = pred_ds

    return self.classes_.take(predicted_labels)
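

# A usage sketch for predict with the instance hardness (IH) routing shown
# above: unanimous samples are labeled directly, easy samples (hardness below
# IH_rate) by plain KNN, and the rest by the DS method. OLA is one concrete
# DS method shipped with deslib; the data and parameter values below are
# assumptions made for illustration.
def _example_predict_usage():
    from sklearn.datasets import make_classification
    from sklearn.ensemble import BaggingClassifier
    from sklearn.model_selection import train_test_split

    from deslib.dcs.ola import OLA

    X, y = make_classification(n_samples=600, random_state=0)
    X_train, X_rest, y_train, y_rest = train_test_split(
        X, y, test_size=0.5, random_state=0)
    X_dsel, X_test, y_dsel, _ = train_test_split(
        X_rest, y_rest, test_size=0.5, random_state=0)

    pool = BaggingClassifier(n_estimators=10, random_state=0)
    pool.fit(X_train, y_train)

    ola = OLA(pool, with_IH=True, IH_rate=0.3, safe_k=5)
    ola.fit(X_dsel, y_dsel)
    return ola.predict(X_test)  # one class label per sample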