def test_instance_hardness_region(index, expected,
                                  example_estimate_competence):
    """Compare the hardness of one neighborhood against a reference value.

    The labels and the neighbors matrix are taken from positions 1 and 2 of
    the ``example_estimate_competence`` fixture. Only the row selected by
    ``index`` is evaluated, with a region size of 7, and the result must
    match ``expected`` within an absolute tolerance of 0.01.
    """
    region_size = 7
    dsel_labels, all_neighbors = example_estimate_competence[1:3]
    query_neighbors = all_neighbors[index, :]

    computed = hardness_region_competence(query_neighbors,
                                          dsel_labels,
                                          region_size)

    assert np.isclose(computed, expected, atol=0.01)
def test_instance_hardness_region_batch():
    """Check the hardness values computed for the whole neighbors matrix.

    Uses the module-level example data (``neighbors_ex1`` and
    ``y_dsel_ex1``) with a region size of 7 and compares the result with the
    reference vector within an absolute tolerance of 0.01.
    """
    reference = np.array([0.42, 0.28, 0.28])
    region_size = 7

    computed = hardness_region_competence(neighbors_ex1,
                                          y_dsel_ex1,
                                          region_size)

    assert np.allclose(computed, reference, atol=0.01)
def test_instance_hardness_region_batch(example_estimate_competence):
    """Check the hardness values computed for the whole neighbors matrix.

    The labels and the neighbors matrix come from positions 1 and 2 of the
    ``example_estimate_competence`` fixture. The result, computed with a
    region size of 7, must match the reference vector within an absolute
    tolerance of 0.01.
    """
    reference = np.array([0.42, 0.28, 0.28])
    region_size = 7
    dsel_labels, all_neighbors = example_estimate_competence[1:3]

    computed = hardness_region_competence(all_neighbors,
                                          dsel_labels,
                                          region_size)

    assert np.allclose(computed, reference, atol=0.01)
def _split_easy_samples(self, neighbors):
    """Split the query samples according to their hardness.

    The hardness of each sample is obtained from
    ``hardness_region_competence`` using ``self.DSEL_target_`` and
    ``self.safe_k``. Samples whose hardness is strictly below
    ``self.IH_rate`` are the "easy" ones (meant for the KNN classifier);
    all the others are left for the DS technique.

    Parameters
    ----------
    neighbors : array
        Neighbors of the query samples, forwarded unchanged to
        ``hardness_region_competence``.

    Returns
    -------
    ind_ds_classifier : array
        Indices of the samples that are NOT easy.

    ind_knn_classifier : array
        Indices of the easy samples.
    """
    ih_scores = hardness_region_competence(neighbors,
                                           self.DSEL_target_,
                                           self.safe_k)

    # Boolean mask: True where the sample is considered easy.
    is_easy = ih_scores < self.IH_rate

    easy_idx = np.nonzero(is_easy)[0]
    hard_idx = np.nonzero(~is_easy)[0]

    return hard_idx, easy_idx
def test_instance_hardness_region(index, expected):
    """Compare the hardness of one neighborhood against a reference value.

    The row ``index`` of the module-level ``neighbors_ex1`` matrix is
    evaluated against the labels in ``y_dsel_ex1`` with a region size of 7.
    The result must match ``expected`` within an absolute tolerance of 0.01.
    """
    region_size = 7
    query_neighbors = neighbors_ex1[index, :]

    computed = hardness_region_competence(query_neighbors,
                                          y_dsel_ex1,
                                          region_size)

    assert np.isclose(computed, expected, atol=0.01)
def test_instance_hardness_region_all_same():
    """Hardness of a fixed neighborhood must be exactly zero.

    The neighborhood is the hard-coded index list below, evaluated against
    the module-level ``y_dsel_ex1`` labels with a region size of 7.
    (Judging by the test name, these neighbors presumably all share the
    same label.)
    """
    region_size = 7
    fixed_neighbors = np.array([0, 1, 2, 6, 7, 8, 13])

    computed = hardness_region_competence(fixed_neighbors,
                                          y_dsel_ex1,
                                          region_size)

    assert computed == 0.0
def predict_proba(self, X):
    """Estimates the posterior probabilities for sample in X.

    Samples on which all base classifiers predict the same class receive
    the mean of the base classifiers' probabilities. The remaining samples
    are handled either by ``self.roc_algorithm_`` (only when ``with_IH`` is
    set and their hardness is below ``IH_rate``) or by
    ``predict_proba_with_ds``.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        The input data.

    Returns
    -------
    predicted_proba : array of shape (n_samples, n_classes)
        Probabilities estimates for each sample in X.
    """
    # Check if the DS model was trained
    check_is_fitted(self,
                    ["DSEL_processed_", "DSEL_data_", "DSEL_target_"])

    # Check if X is a valid input. The validation mirrors predict(): X must
    # be two-dimensional (it is indexed as X[ind, :] below) and must have
    # the expected number of features.
    X = check_array(X)
    self._check_num_features(X)

    # Check if the base classifiers are able to estimate posterior
    # probabilities (implements predict_proba method).
    self._check_predict_proba()

    base_probabilities = self._predict_proba_base(X)
    base_predictions = base_probabilities.argmax(axis=2)

    n_samples = X.shape[0]
    predicted_proba = np.zeros((n_samples, self.n_classes_))

    all_agree_vector = BaseDS._all_classifier_agree(base_predictions)
    ind_all_agree = np.where(all_agree_vector)[0]

    # When every base classifier agrees, average their probabilities.
    if ind_all_agree.size:
        predicted_proba[ind_all_agree] = base_probabilities[
            ind_all_agree].mean(axis=1)

    ind_disagreement = np.where(~all_agree_vector)[0]

    if ind_disagreement.size:

        X_DS = X[ind_disagreement, :]

        # Always calculating the neighborhood. Passing that to classify
        # later
        # TODO: Check problems with DES Clustering method. Maybe add a
        # check to prevent that here. (or do clustering instead)
        # Then, we estimate the nearest neighbors for all samples that we
        # need to call DS routines
        distances, neighbors = self._get_region_competence(X_DS)

        if self.with_IH:
            # if IH is used, calculate the hardness level associated with
            # each sample
            hardness = hardness_region_competence(neighbors,
                                                  self.DSEL_target_,
                                                  self.safe_k)

            # Get the index associated with the easy and hard samples.
            # Samples with low hardness are passed down to the knn
            # classifier while samples with high hardness are passed down
            # to the DS methods. So, here we split the samples that are
            # passed down to each stage by calculating their indices.
            easy_samples_mask = hardness < self.IH_rate
            ind_knn_classifier = np.where(easy_samples_mask)[0]
            ind_ds_classifier = np.where(~easy_samples_mask)[0]

            if ind_knn_classifier.size:
                # All samples with low hardness are classified by the knn
                # method here. ind_knn_classifier indexes the disagreement
                # subset (X_DS), so map it back to positions in the
                # original matrix first.
                ind_knn_original_matrix = ind_disagreement[
                    ind_knn_classifier]

                predicted_proba[ind_knn_original_matrix] = \
                    self.roc_algorithm_.predict_proba(
                        X_DS[ind_knn_classifier])

                # Remove from the neighbors and distance matrices the
                # samples that were classified using the KNN
                neighbors = np.delete(neighbors, ind_knn_classifier,
                                      axis=0)
                distances = np.delete(distances, ind_knn_classifier,
                                      axis=0)
        else:
            # IH was not considered. So all samples with disagreement are
            # passed down to the DS algorithm
            ind_ds_classifier = np.arange(ind_disagreement.size)

        if ind_ds_classifier.size:
            # Check if the dynamic frienemy pruning should be used
            if self.DFP:
                DFP_mask = frienemy_pruning_preprocessed(
                    neighbors,
                    self.DSEL_target_,
                    self.DSEL_processed_)
            else:
                # All-ones mask with one row per remaining sample and one
                # column per base classifier.
                DFP_mask = np.ones(
                    (ind_ds_classifier.size, self.n_classifiers_))

            # Positions, in the original matrix, of the samples handled by
            # the DS technique.
            ind_ds_original_matrix = ind_disagreement[ind_ds_classifier]

            proba_ds = self.predict_proba_with_ds(
                X[ind_ds_original_matrix],
                base_predictions[ind_ds_original_matrix],
                base_probabilities[ind_ds_original_matrix],
                neighbors=neighbors,
                distances=distances,
                DFP_mask=DFP_mask)

            predicted_proba[ind_ds_original_matrix] = proba_ds

    return predicted_proba
def predict(self, X):
    """Predict the class label for each sample in X.

    Samples on which all base classifiers output the same prediction take
    that prediction directly. The remaining samples are handled either by
    taking the most frequent label among their first ``safe_k`` neighbors
    in ``DSEL_target_`` (only when ``with_IH`` is set and their hardness is
    below ``IH_rate``) or by ``classify_with_ds``.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        The input data.

    Returns
    -------
    predicted_labels : array of shape (n_samples)
        Predicted class label for each sample in X.
    """
    # Check if the DS model was trained
    check_is_fitted(self,
                    ["DSEL_processed_", "DSEL_data_", "DSEL_target_"])

    # Check if X is a valid input
    X = check_array(X)
    self._check_num_features(X)

    n_samples = X.shape[0]
    predicted_labels = np.empty(n_samples, dtype=np.intp)

    # Base probabilities are only computed when the method needs them;
    # otherwise only the hard predictions of the base classifiers are used.
    if self.needs_proba:
        base_probabilities = self._predict_proba_base(X)
        base_predictions = base_probabilities.argmax(axis=2)
    else:
        base_probabilities = None
        base_predictions = self._predict_base(X)

    all_agree_vector = BaseDS._all_classifier_agree(base_predictions)
    ind_all_agree = np.where(all_agree_vector)[0]

    # Since the predictions are always the same, get the predictions of the
    # first base classifier.
    if ind_all_agree.size:
        predicted_labels[ind_all_agree] = base_predictions[ind_all_agree, 0]

    # For the samples with disagreement, perform the dynamic selection
    # steps. First step is to collect the samples with disagreement
    # between base classifiers
    ind_disagreement = np.where(~all_agree_vector)[0]
    if ind_disagreement.size:

        X_DS = X[ind_disagreement, :]

        # If the method is based on clustering and does not use IH there
        # is no need to compute the Neighbors
        if hasattr(self, "clustering_") and not self.with_IH:
            distances = neighbors = None
        else:
            # Then, we estimate the nearest neighbors for all samples that
            # we need to call DS routines
            distances, neighbors = self._get_region_competence(X_DS)

        if self.with_IH:
            # if IH is used, calculate the hardness level associated with
            # each sample
            hardness = hardness_region_competence(neighbors,
                                                  self.DSEL_target_,
                                                  self.safe_k)

            # Get the index associated with the easy and hard samples.
            # Samples with low hardness are passed down to the knn
            # classifier while samples with high hardness are passed down
            # to the DS methods. So, here we split the samples that are
            # passed down to each stage by calculating their indices.
            easy_samples_mask = hardness < self.IH_rate
            ind_knn_classifier = np.where(easy_samples_mask)[0]
            ind_ds_classifier = np.where(~easy_samples_mask)[0]

            if ind_knn_classifier.size:
                # all samples with low hardness should be classified by
                # the knn method here:
                # First get the class associated with each neighbor
                y_neighbors = self.DSEL_target_[neighbors[
                    ind_knn_classifier, :self.safe_k]]

                # Accessing which samples in the original matrix are
                # associated with the low instance hardness indices. This
                # is important since the low hardness indices
                # ind_knn_classifier was estimated based on a subset
                # of samples
                ind_knn_original_matrix = ind_disagreement[
                    ind_knn_classifier]
                # Most frequent neighbor label per sample; the reshape
                # flattens the result to one value per sample.
                prediction_knn, _ = mode(y_neighbors, axis=1)
                predicted_labels[
                    ind_knn_original_matrix] = prediction_knn.reshape(
                    -1, )

                # Remove from the neighbors and distance matrices the
                # samples that were classified using the KNN
                neighbors = np.delete(neighbors, ind_knn_classifier,
                                      axis=0)
                distances = np.delete(distances, ind_knn_classifier,
                                      axis=0)
        else:
            # IH was not considered. So all samples with disagreement are
            # passed down to the DS algorithm
            ind_ds_classifier = np.arange(ind_disagreement.size)

        # At this stage the samples which all base classifiers agrees or
        # that are associated with low hardness were already classified.
        # The remaining samples are now passed down to the DS techniques
        # for classification.

        # First check whether there are still samples to be classified.
        if ind_ds_classifier.size:

            # If the DFP pruning is considered, calculate the DFP mask
            # from the neighbors of the remaining samples; otherwise use an
            # all-ones mask with one row per remaining sample and one
            # column per base classifier.
            if self.DFP:
                DFP_mask = frienemy_pruning_preprocessed(
                    neighbors,
                    self.DSEL_target_,
                    self.DSEL_processed_)
            else:
                DFP_mask = np.ones(
                    (ind_ds_classifier.size, self.n_classifiers_))

            # Get the real indices of the samples that will be classified
            # using a DS algorithm.
            ind_ds_original_matrix = ind_disagreement[ind_ds_classifier]

            if self.needs_proba:
                selected_probabilities = base_probabilities[
                    ind_ds_original_matrix]
            else:
                selected_probabilities = None

            # NOTE: X_DS[ind_ds_classifier] selects the same rows as
            # X[ind_ds_original_matrix], since X_DS is X[ind_disagreement].
            pred_ds = self.classify_with_ds(
                X_DS[ind_ds_classifier],
                base_predictions[ind_ds_original_matrix],
                selected_probabilities,
                neighbors=neighbors,
                distances=distances,
                DFP_mask=DFP_mask)
            predicted_labels[ind_ds_original_matrix] = pred_ds

    # predicted_labels is used as positions into self.classes_ to obtain
    # the returned class values.
    return self.classes_.take(predicted_labels)
def test_instance_hardness_region_all_same(example_estimate_competence):
    """Hardness of a fixed neighborhood must be exactly zero.

    The labels come from position 1 of the ``example_estimate_competence``
    fixture and the neighborhood is the hard-coded index list below,
    evaluated with a region size of 7. (Judging by the test name, these
    neighbors presumably all share the same label.)
    """
    region_size = 7
    dsel_labels = example_estimate_competence[1]
    fixed_neighbors = np.array([0, 1, 2, 6, 7, 8, 13])

    computed = hardness_region_competence(fixed_neighbors,
                                          dsel_labels,
                                          region_size)

    assert computed == 0.0
def predict_proba(self, X):
    """Estimates the posterior probabilities for sample in X.

    Samples on which all base classifiers predict the same class receive
    the mean of the base classifiers' probabilities. The remaining samples
    are handled either by ``self.roc_algorithm`` (only when ``with_IH`` is
    set and their hardness is at most ``IH_rate``) or by
    ``predict_proba_with_ds``.

    While the method runs, the neighbors and distances of the query are
    kept in ``self.neighbors`` / ``self.distances``. Both are reset to
    ``None`` before the method exits, including when an exception is
    raised.

    Parameters
    ----------
    X : array of shape = [n_samples, n_features]
        The input data.

    Returns
    -------
    predicted_proba : array of shape = [n_samples, n_classes]
        Probabilities estimates for each sample in X.
    """
    # Check if the DS model was trained
    self._check_is_fitted()

    # Check if X is a valid input
    self._check_input_predict(X)

    # Check if the base classifiers are able to estimate posterior
    # probabilities (implements predict_proba method).
    self._check_predict_proba()

    base_probabilities = self._predict_proba_base(X)
    base_predictions = base_probabilities.argmax(axis=2)

    n_samples = X.shape[0]
    predicted_proba = np.zeros((n_samples, self.n_classes))

    all_agree_vector = DS._all_classifier_agree(base_predictions)
    ind_all_agree = np.where(all_agree_vector)[0]

    # When every base classifier agrees, average their probabilities.
    if ind_all_agree.size:
        predicted_proba[ind_all_agree] = base_probabilities[
            ind_all_agree].mean(axis=1)

    ind_disagreement = np.where(~all_agree_vector)[0]

    try:
        if ind_disagreement.size:

            X_DS = X[ind_disagreement, :]

            # The neighborhood is only needed here when IH or DFP is used.
            if self.with_IH or self.DFP:
                self.distances, self.neighbors = \
                    self._get_region_competence(X_DS)

            if self.with_IH:
                # if IH is used, calculate the hardness level associated
                # with each sample
                hardness = hardness_region_competence(self.neighbors,
                                                      self.DSEL_target,
                                                      self.safe_k)

                # Get the index associated with the easy and hard samples.
                # Samples with low hardness are passed down to the knn
                # classifier while samples with high hardness are passed
                # down to the DS methods. So, here we split the samples
                # that are passed down to each stage by calculating their
                # indices.
                easy_samples_mask = hardness <= self.IH_rate
                ind_knn_classifier = np.where(easy_samples_mask)[0]
                ind_ds_classifier = np.where(~easy_samples_mask)[0]

                if ind_knn_classifier.size:
                    # All samples with low hardness are classified by the
                    # knn method here. ind_knn_classifier indexes the
                    # disagreement subset (X_DS), so map it back to
                    # positions in the original matrix first.
                    ind_knn_original_matrix = ind_disagreement[
                        ind_knn_classifier]

                    predicted_proba[ind_knn_original_matrix] = \
                        self.roc_algorithm.predict_proba(
                            X_DS[ind_knn_classifier])

                    # Remove from the neighbors and distance matrices the
                    # samples that were classified using the KNN
                    self.neighbors = np.delete(self.neighbors,
                                               ind_knn_classifier,
                                               axis=0)
                    self.distances = np.delete(self.distances,
                                               ind_knn_classifier,
                                               axis=0)
            else:
                # IH was not considered. So all samples with disagreement
                # are passed down to the DS algorithm
                ind_ds_classifier = np.arange(ind_disagreement.size)

            if ind_ds_classifier.size:
                # Check if the dynamic frienemy pruning should be used
                if self.DFP:
                    self.DFP_mask = self._frienemy_pruning()
                else:
                    # All-ones mask with one row per remaining sample and
                    # one column per base classifier.
                    self.DFP_mask = np.ones(
                        (ind_ds_classifier.size, self.n_classifiers))

                # Positions, in the original matrix, of the samples
                # handled by the DS technique.
                ind_ds_original_matrix = ind_disagreement[
                    ind_ds_classifier]

                proba_ds = self.predict_proba_with_ds(
                    X[ind_ds_original_matrix],
                    base_predictions[ind_ds_original_matrix],
                    base_probabilities[ind_ds_original_matrix])

                predicted_proba[ind_ds_original_matrix] = proba_ds
    finally:
        # Reset the neighbors and the distances as they are specific to a
        # given query. Doing it in a ``finally`` clause guarantees that a
        # failing DS routine cannot leave stale values on the instance.
        self.neighbors = None
        self.distances = None

    return predicted_proba
def predict(self, X):
    """Predict the class label for each sample in X.

    Samples on which all base classifiers output the same prediction take
    that prediction directly. The remaining samples are handled either by
    taking the most frequent label among their first ``safe_k`` neighbors
    in ``DSEL_target`` (only when ``with_IH`` is set and their hardness is
    at most ``IH_rate``) or by ``classify_with_ds``.

    While the method runs, the neighbors and distances of the query are
    kept in ``self.neighbors`` / ``self.distances``. Both are reset to
    ``None`` before the method exits, including when an exception is
    raised.

    Parameters
    ----------
    X : array of shape = [n_samples, n_features]
        The input data.

    Returns
    -------
    predicted_labels : array of shape = [n_samples]
        Predicted class label for each sample in X.
    """
    # Check if the DS model was trained
    self._check_is_fitted()

    # Check if X is a valid input
    self._check_input_predict(X)

    n_samples = X.shape[0]
    predicted_labels = np.empty(n_samples, dtype=np.intp)

    # Base probabilities are only computed when the method needs them;
    # otherwise only the hard predictions of the base classifiers are used.
    if self.needs_proba:
        base_probabilities = self._predict_proba_base(X)
        base_predictions = base_probabilities.argmax(axis=2)
    else:
        base_probabilities = None
        base_predictions = self._predict_base(X)

    all_agree_vector = DS._all_classifier_agree(base_predictions)
    ind_all_agree = np.where(all_agree_vector)[0]

    # Since the predictions are always the same, get the predictions of the
    # first base classifier.
    if ind_all_agree.size:
        predicted_labels[ind_all_agree] = base_predictions[ind_all_agree, 0]

    # For the samples with disagreement, perform the dynamic selection
    # steps. First step is to collect the samples with disagreement between
    # base classifiers
    ind_disagreement = np.where(~all_agree_vector)[0]

    try:
        if ind_disagreement.size:

            X_DS = X[ind_disagreement, :]

            # Estimate the neighbors at this stage if either the DFP or the
            # IH is used during classification
            if self.DFP or self.with_IH:
                # Then, we estimate the nearest neighbors for all samples
                # that we need to call DS routines
                self.distances, self.neighbors = \
                    self._get_region_competence(X_DS)

            if self.with_IH:
                # if IH is used, calculate the hardness level associated
                # with each sample
                hardness = hardness_region_competence(self.neighbors,
                                                      self.DSEL_target,
                                                      self.safe_k)

                # Get the index associated with the easy and hard samples.
                # Samples with low hardness are passed down to the knn
                # classifier while samples with high hardness are passed
                # down to the DS methods. So, here we split the samples
                # that are passed down to each stage by calculating their
                # indices.
                easy_samples_mask = hardness <= self.IH_rate
                ind_knn_classifier = np.where(easy_samples_mask)[0]
                ind_ds_classifier = np.where(~easy_samples_mask)[0]

                if ind_knn_classifier.size:
                    # all samples with low hardness should be classified by
                    # the knn method here:
                    # First get the class associated with each neighbor
                    y_neighbors = self.DSEL_target[self.neighbors[
                        ind_knn_classifier, :self.safe_k]]

                    # Accessing which samples in the original matrix are
                    # associated with the low instance hardness indices.
                    # This is important since the low hardness indices
                    # ind_knn_classifier was estimated based on a subset of
                    # samples
                    ind_knn_original_matrix = ind_disagreement[
                        ind_knn_classifier]

                    # Most frequent neighbor label per sample; the reshape
                    # flattens the result to one value per sample.
                    prediction_knn, _ = mode(y_neighbors, axis=1)
                    predicted_labels[ind_knn_original_matrix] = \
                        prediction_knn.reshape(-1, )

                    # Remove from the neighbors and distance matrices the
                    # samples that were classified using the KNN
                    self.neighbors = np.delete(self.neighbors,
                                               ind_knn_classifier,
                                               axis=0)
                    self.distances = np.delete(self.distances,
                                               ind_knn_classifier,
                                               axis=0)
            else:
                # IH was not considered. So all samples with disagreement
                # are passed down to the DS algorithm
                ind_ds_classifier = np.arange(ind_disagreement.size)

            # At this stage the samples which all base classifiers agrees
            # or that are associated with low hardness were already
            # classified. The remaining samples are now passed down to the
            # DS techniques for classification.

            # First check whether there are still samples to be classified.
            if ind_ds_classifier.size:

                # If the DFP pruning is considered, calculate the DFP mask;
                # otherwise use an all-ones mask with one row per remaining
                # sample and one column per base classifier.
                if self.DFP:
                    self.DFP_mask = self._frienemy_pruning()
                else:
                    self.DFP_mask = np.ones(
                        (ind_ds_classifier.size, self.n_classifiers))

                # Get the real indices of the samples that will be
                # classified using a DS algorithm.
                ind_ds_original_matrix = ind_disagreement[
                    ind_ds_classifier]

                if self.needs_proba:
                    selected_probabilities = base_probabilities[
                        ind_ds_original_matrix]
                else:
                    selected_probabilities = None

                pred_ds = self.classify_with_ds(
                    X_DS[ind_ds_classifier],
                    base_predictions[ind_ds_original_matrix],
                    selected_probabilities)
                predicted_labels[ind_ds_original_matrix] = pred_ds
    finally:
        # The neighbors and distances are specific to a given query, so
        # they are always cleared. Doing it in a ``finally`` clause
        # guarantees that a failing DS routine cannot leave stale values on
        # the instance.
        self.neighbors = None
        self.distances = None

    # predicted_labels is used as positions into self.classes to obtain the
    # returned class values.
    return self.classes.take(predicted_labels)