def _clean_nn(self, x):
    """
    Cleans the nearest neighborhood of x.

    Every nearest neighbor of x (searched in the DS/AS/RS union) whose class
    is listed in ``self.majority_classes`` and is also among the classes
    returned by ``self._min_cost_classes`` for x is removed from both
    ``self.DS`` and ``self.RS``.

    :param x: Single observation.
    """
    for candidate in self._knn(x, self._ds_as_rs_union()):
        if self._class_of(candidate) not in self.majority_classes:
            continue
        # The union is rebuilt on every check on purpose: DS and RS shrink as
        # neighbors are removed below.
        if self._class_of(candidate) not in self._min_cost_classes(
                x, self._ds_as_rs_union()):
            continue
        self.DS = setdiff(self.DS, np.array([candidate]))
        self.RS = setdiff(self.RS, np.array([candidate]))
def _fit_resample(self, X, y):
    """
    Runs the whole resampling procedure on (X, y).

    The labels are glued to the examples as the last column of ``self.DS``,
    the weak majority examples are determined, ``self.RS`` is subtracted from
    ``self.DS``, and then every class returned by
    ``self._sort_by_cardinality`` is relabeled, cleaned and amplified in turn.
    Finally ``self.AS`` is merged back into ``self.DS``.

    :param X: Numpy array of examples that is the subject of resampling.
    :param y: Numpy array of labels corresponding to examples from X.
    :return: Tuple (resampled X, resampled y) split out of the final
        ``self.DS`` (all columns but the last, and the last column).
    """
    self._initialize_algorithm(X, y)

    # Keep examples and labels together: the label becomes the last column.
    label_column = y.reshape(y.shape[0], 1)
    self.DS = np.append(X, label_column, axis=1)

    self._restart_perspective()
    self._calculate_weak_majority_examples()
    self._restore_perspective()

    self.DS = setdiff(self.DS, self.RS)

    int_classes, min_classes = self._sort_by_cardinality(y)
    processing_order = int_classes + min_classes
    for current_class in processing_order:
        self.relabel(current_class)
        self.clean(current_class)
        self.amplify(current_class)

    self.DS = union(self.DS, self.AS)
    resampled_X = self.DS[:, :-1]
    resampled_y = self.DS[:, -1]
    return resampled_X, resampled_y
def fit_classifier(args):
    """
    Fits one classifier on a resampled bag and derives per-class weights
    from its out-of-bag predictions.

    :param args: Tuple ``(clf, X, y, resampled, maj_int_min)`` where ``clf``
        exposes ``fit`` and ``predict_proba``, ``X``/``y`` are the full
        examples and labels, ``resampled`` is the ``(x_sampled, y_sampled)``
        bag and ``maj_int_min`` is forwarded to ``SOUP``.
    :return: Tuple ``(clf, global_weights)``: the fitted classifier and one
        weight per class (expected out-of-bag count divided by the summed
        predicted probability of that class).

    Note: class counts are looked up with ``range(n_classes)``, so labels are
    assumed to be the integers ``0 .. n_classes - 1``.
    """
    clf, X, y, resampled, maj_int_min = args
    x_sampled, y_sampled = resampled

    # Out-of-bag rows: rows of the labelled data set that are not in the bag.
    out_of_bag = setdiff(
        np.hstack((X, y[:, np.newaxis])),
        np.hstack((x_sampled, y_sampled[:, np.newaxis])))
    x_out, y_out = out_of_bag[:, :-1], out_of_bag[:, -1].astype(int)

    x_resampled, y_resampled = SOUP(maj_int_min=maj_int_min).fit_resample(
        x_sampled, y_sampled)
    clf.fit(x_resampled, y_resampled)

    result = clf.predict_proba(x_out)
    # Small constant keeps the division below away from zero.
    class_sum_prob = np.sum(result, axis=0) + 0.001

    # Count the labels once instead of rebuilding the Counter at every use.
    label_counts = Counter(y)
    n_classes = len(label_counts)
    class_quantities = Counter(y_out)
    expected_sum_prob = np.array(
        [class_quantities[i] for i in range(n_classes)])

    try:
        global_weights = expected_sum_prob / class_sum_prob
    except ValueError:
        # The only way this array division fails is a broadcasting mismatch:
        # predict_proba yields fewer columns than there are classes in y
        # (e.g. a class is missing from the resampled bag). Fall back to
        # neutral weights and report the shapes involved.
        global_weights = np.ones(shape=n_classes)
        print(
            f'Exc {label_counts} {class_quantities} {result.shape} {expected_sum_prob.shape} {class_sum_prob.shape}'
        )
    return clf, global_weights
def test_setdiff():
    """setdiff must return the rows of the first array absent from the second."""
    arr1 = np.array([[1, 2, 3], [4, 5, 6]])
    arr2 = np.array([[1, 2, 3]])

    actual = setdiff(arr1, arr2)

    expected = np.array([[4, 5, 6]])
    # np.array_equal checks the shape as well as the values. The previous
    # `(actual == expected).all()` broadcasts, so an empty (0, 3) result or a
    # 1-D [4, 5, 6] result would have passed silently.
    assert np.array_equal(actual, expected)
def _relabel_nn(self, x):
    """
    Relabels the nearest neighborhood of x.

    A nearest neighbor of x (searched in the DS/AS/RS union) is processed
    when it is contained in ``self.RS``, its class is listed in
    ``self.majority_classes`` and that class is among the classes returned
    by ``self._min_cost_classes`` for x. Such a neighbor is taken out of
    ``self.RS``, receives the label of x (last element) and is added to
    ``self.AS``.

    :param x: An observation.
    """
    for candidate in self._knn(x, self._ds_as_rs_union()):
        if not contains(self.RS, candidate):
            continue
        if self._class_of(candidate) not in self.majority_classes:
            continue
        if self._class_of(candidate) not in self._min_cost_classes(
                x, self._ds_as_rs_union()):
            continue

        # Remove the neighbor under its old label, then store it in AS
        # carrying the label of x.
        self.RS = setdiff(self.RS, np.array([candidate]))
        candidate[-1] = x[-1]
        self.AS = union(self.AS, np.array([candidate]))
def _knn(self, x, DS):
    """
    Returns the nearest neighbors of x among the rows of DS.

    x itself is removed from DS first. The search uses every column except
    the last one (the last column is left out of the distance computation).
    Neighbors are collected in groups of equal distance, closest first, until
    at least ``self.k`` of them are gathered, so the result may hold more
    than k rows when several candidates are tied at the cut-off distance.

    Side effect: the fitted ``NearestNeighbors`` model is stored in
    ``self.neigh_clf``.

    :param x: Single observation (same column layout as the rows of DS).
    :param DS: 2D array of observations to search in.
    :return: Rows of DS (without x) that are the nearest neighbors of x;
        an empty array when DS holds nothing but x.
    """
    DS = setdiff(DS, np.array([x]))
    if DS.shape[0] == 0:
        # Nothing to search: NearestNeighbors rejects n_neighbors=0, so hand
        # back the empty candidate set instead of failing.
        return DS

    # Never ask for more neighbors than there are candidates.
    self.neigh_clf = NearestNeighbors(n_neighbors=min(self.k, DS.shape[0]))
    self.neigh_clf.fit(DS[:, :-1])

    query = [x[:-1]]
    # Distance to the farthest of the k nearest neighbors, queried once.
    kth_distance = self.neigh_clf.kneighbors(
        query, return_distance=True)[0][0][-1]
    # The radius is enlarged by 0.01% so that candidates tied with the k-th
    # neighbor are not lost to floating point rounding.
    within_radius = self.neigh_clf.radius_neighbors(
        query,
        radius=kth_distance + 0.0001 * kth_distance,
        return_distance=True)
    all_distances = within_radius[0][0]
    all_indices = within_radius[1][0]

    indices = []
    # np.unique already returns the distinct distances in ascending order.
    for dist in np.unique(all_distances):
        if len(indices) >= self.k:
            break
        indices += all_indices[all_distances == dist].tolist()
    return DS[indices]