class DCS(object):
    """Base class for dynamic classifier selection backed by a validation set.

    The constructor stores the validation data and fits a nearest-neighbour
    model on it; ``get_neighbors`` then looks up the validation samples
    closest to a query sample.

    Parameters
    ----------
    Xval : array-like
        Validation samples; indexed with the neighbour indices returned by
        the nearest-neighbour model.
    yval : array-like
        Targets of the validation samples.
    K : int, optional (default = 5)
        Number of neighbours used when the default model is created.
    weighted : bool, optional (default = False)
        Stored on the instance; not used inside this class.
    knn : object, optional (default = None)
        Pre-built neighbour model exposing ``fit`` and ``kneighbors``.
        When omitted, a brute-force ``KNeighborsClassifier`` is created.
    """

    @abstractmethod
    def select(self, ensemble, x):
        """Selection hook; this base implementation does nothing."""
        pass

    def __init__(self, Xval, yval, K=5, weighted=False, knn=None):
        self.Xval = Xval
        self.yval = yval
        self.K = K
        self.knn = (KNeighborsClassifier(n_neighbors=K, algorithm='brute')
                    if knn is None else knn)
        # Whichever model is used, it is (re)fitted on the validation data.
        self.knn.fit(Xval, yval)
        self.weighted = weighted

    def get_neighbors(self, x, return_distance=False):
        """Return the validation neighbours of the single query ``x``.

        The result of ``kneighbors`` is unpacked as exactly one row, so
        ``x`` must describe one sample.

        Returns
        -------
        ``(X_nn, y_nn)`` or, when ``return_distance`` is truthy,
        ``(X_nn, y_nn, dists)``.
        """
        found = self.knn.kneighbors(x, return_distance=return_distance)
        if return_distance:
            [dists], [idx] = found
            return self.Xval[idx], self.yval[idx], dists
        [idx] = found
        return self.Xval[idx], self.yval[idx]
class DCS(object):
    """Base class for dynamic classifier selection backed by a validation set.

    The constructor stores the validation data and fits a nearest-neighbour
    model on it; ``get_neighbors`` then looks up the validation samples
    closest to a query sample.

    Parameters
    ----------
    Xval : array-like
        Validation samples; indexed with the neighbour indices returned by
        the nearest-neighbour model.
    yval : array-like
        Targets of the validation samples.
    K : int, optional (default = 5)
        Number of neighbours used when the default model is created.
    weighted : bool, optional (default = False)
        Stored on the instance; not used inside this class.
    knn : object, optional (default = None)
        Pre-built neighbour model exposing ``fit`` and ``kneighbors``.
        When omitted, a brute-force ``KNeighborsClassifier`` is created.
    """

    @abstractmethod
    def select(self, ensemble, x):
        """Selection hook; this base implementation does nothing."""
        pass

    def __init__(self, Xval, yval, K=5, weighted=False, knn=None):
        self.Xval = Xval
        self.yval = yval
        self.K = K

        # Identity check: ``knn == None`` would dispatch to the estimator's
        # own ``__eq__`` and could discard a perfectly valid custom model.
        if knn is None:
            self.knn = KNeighborsClassifier(n_neighbors=K, algorithm='brute')
        else:
            self.knn = knn

        # Whichever model is used, it is (re)fitted on the validation data.
        self.knn.fit(Xval, yval)
        self.weighted = weighted

    def get_neighbors(self, x, return_distance=False):
        """Return the validation neighbours of the single query ``x``.

        The result of ``kneighbors`` is unpacked as exactly one row, so
        ``x`` must describe one sample.

        Returns
        -------
        ``(X_nn, y_nn)`` or, when ``return_distance`` is truthy,
        ``(X_nn, y_nn, dists)``.
        """
        found = self.knn.kneighbors(x, return_distance=return_distance)
        if return_distance:
            [dists], [idx] = found
        else:
            [idx] = found

        X_nn = self.Xval[idx]  # neighbour samples
        y_nn = self.yval[idx]  # neighbour targets

        if return_distance:
            return X_nn, y_nn, dists
        return X_nn, y_nn
def _main_loop(self):
    """Split and rearrange ``self.groups`` until a full pass changes nothing.

    Every group is compared against the representatives (``rep_x`` /
    ``label``) of all groups with a 1-NN search.  Depending on where the
    group's instances land, the group is left alone, split in two, or has
    some instances moved to other groups.  The outer loop stops once a
    whole pass counted every group as stable.

    Returns
    -------
    list
        ``self.groups`` after the loop has converged.
    """
    # NOTE(review): the original indentation of this method was lost; the
    # nesting below (in particular ``update_all`` running after every
    # processed group) is the reading that keeps the representatives in
    # sync with the per-iteration refit -- confirm against upstream.
    exit_count = 0
    knn = KNeighborsClassifier(n_neighbors=1, algorithm='brute')

    while exit_count < len(self.groups):
        index, exit_count = 0, 0

        while index < len(self.groups):
            group = self.groups[index]

            # Representatives are rebuilt every iteration because groups
            # are appended/removed/modified inside this loop.
            reps_x = np.asarray([g.rep_x for g in self.groups])
            reps_y = np.asarray([g.label for g in self.groups])
            knn.fit(reps_x, reps_y)

            nn_idx = knn.kneighbors(group.X, n_neighbors=1,
                                    return_distance=False)
            nn_idx = nn_idx.T[0]
            nn_labels = reps_y[nn_idx]
            mask = nn_idx == index

            if mask.all():
                # Every instance is closest to its own representative.
                exit_count = exit_count + 1

            elif not (group.label in nn_labels):
                # Every instance is closest to a representative of another
                # class: split the group along its first principal axis.
                pca = PCA(n_components=1)
                pca.fit(group.X)
                # ``transform`` needs 2-D input; project the representative
                # and all instances in two batch calls instead of one call
                # per 1-D sample.
                d = pca.transform(reps_x[index].reshape(1, -1))
                dis = pca.transform(group.X)
                mask_split = (dis < d).flatten()

                new_X = group.X[mask_split]
                self.groups.append(_Group(new_X, group.label))
                group.X = group.X[~mask_split]

            elif (nn_labels == group.label).all() and (nn_idx != index).any():
                # Right class, wrong group: hand the stray instances over
                # to the same-class group whose representative is nearest.
                mask_mv = nn_idx != index
                index_mv = np.arange(len(group))[mask_mv]
                X_mv = group.remove_instances(index_mv)
                G_mv = nn_idx[mask_mv]

                for x, g in zip(X_mv, G_mv):
                    self.groups[g].add_instances([x])

            elif (nn_labels != group.label).sum() / float(len(group)) > self.r_mis:
                # Too many instances fall to another class: move them into
                # a new group carrying the same label.
                mask_mv = nn_labels != group.label
                new_X = group.X[mask_mv]
                self.groups.append(_Group(new_X, group.label))
                group.X = group.X[~mask_mv]

            else:
                # Misclassification ratio is within ``r_mis``: accept.
                exit_count = exit_count + 1

            if len(group) == 0:
                # Do not advance: the next group slides into ``index``.
                self.groups.remove(group)
            else:
                index = index + 1

            for g in self.groups:
                g.update_all()

    return self.groups
def index_nearest_neighbor(self, S, X, y):
    """Map every instance to the index of its nearest selected instance.

    Parameters
    ----------
    S : sequence of 0/1 (or bool), one entry per instance
        Selection mask; truthy entries form the 1-NN training set.
    X : array-like, one row per instance
        Instances to look up.
    y : array-like
        Labels, only used to fit the 1-NN model and for its length.

    Returns
    -------
    list
        ``U`` with ``len(y)`` entries, where ``U[i]`` is the position (in
        the full data set) of the selected instance closest to ``X[i]``.
    """
    n_samples = len(y)
    if n_samples == 0:
        # Nothing to look up; also avoids fitting on an empty set.
        return []

    S_mask = np.asarray(S, dtype=bool)
    # Translates positions inside the selected subset back to positions
    # in the full data set.
    real_indexes = np.arange(n_samples)[S_mask]

    # The training subset does not depend on the queried instance, so the
    # model is fitted once and all instances are queried in a single 2-D
    # batch (a 1-D ``X[i]`` is rejected by current scikit-learn).
    classifier = KNeighborsClassifier(n_neighbors=1)
    classifier.fit(X[S_mask], y[S_mask])
    nearest = classifier.kneighbors(X, return_distance=False)

    return list(real_indexes[nearest[:, 0]])
def index_nearest_neighbor(self, S, X, y):
    """Map every instance to the index of its nearest selected instance.

    Parameters
    ----------
    S : sequence of 0/1 (or bool), one entry per instance
        Selection mask; truthy entries form the 1-NN training set.
    X : array-like, one row per instance
        Instances to look up.
    y : array-like
        Labels, only used to fit the 1-NN model and for its length.

    Returns
    -------
    list
        ``U`` with ``len(y)`` entries, where ``U[i]`` is the position (in
        the full data set) of the selected instance closest to ``X[i]``.
    """
    n_samples = len(y)
    if n_samples == 0:
        # Nothing to look up; also avoids fitting on an empty set.
        return []

    S_mask = np.asarray(S, dtype=bool)
    # Translates positions inside the selected subset back to positions
    # in the full data set.
    real_indexes = np.arange(n_samples)[S_mask]

    # The training subset does not depend on the queried instance, so the
    # model is fitted once and all instances are queried in a single 2-D
    # batch (a 1-D ``X[i]`` is rejected by current scikit-learn).
    classifier = KNeighborsClassifier(n_neighbors=1)
    classifier.fit(X[S_mask], y[S_mask])
    nearest = classifier.kneighbors(X, return_distance=False)

    return list(real_indexes[nearest[:, 0]])
class SGP2(SGP):
    """Self-Generating Prototypes 2

    The Self-Generating Prototypes 2 is the second version of the
    Self-Generating Prototypes algorithm.
    It has a higher generalization power, including the procedures
    merge and pruning.

    Parameters
    ----------
    r_min: float, optional (default = 0.0)
        Determine the minimum size of a cluster [0.00, 0.20]

    r_mis: float, optional (default = 0.0)
        Determine the error tolerance before split a group

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminate, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminate]
        Labels of the selected prototypes.

    `reduction_` : float, fraction of the instances that was removed.

    Examples
    --------
    >>> from protopy.generation.sgp import SGP2
    >>> import numpy as np
    >>> X = [np.asarray(range(1,13)) + np.asarray([0.1,0,-0.1,0.1,0,-0.1,0.1,-0.1,0.1,-0.1,0.1,-0.1])]
    >>> X = np.asarray(X).T
    >>> y = np.array([1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 1, 1])
    >>> sgp2 = SGP2()
    >>> sgp2.fit(X, y)
    SGP2(r_min=0.0, r_mis=0.0)
    >>> print sgp2.reduction_
    0.5

    See also
    --------
    protopy.generation.SGP: self-generating prototypes
    protopy.generation.sgp.ASGP: adaptive self-generating prototypes

    References
    ----------
    Hatem A. Fayed, Sherif R Hashem, and Amir F Atiya. Self-generating
    prototypes for pattern classification. Pattern Recognition,
    40(5):1498–1509, 2007.
    """

    def __init__(self, r_min=0.0, r_mis=0.0):
        self.groups = None
        self.r_min = r_min
        self.r_mis = r_mis
        self.n_neighbors = 1
        self.classifier = None

    def reduce_data(self, X, y):
        """Generate the prototypes for ``(X, y)``.

        Builds one group per class, then runs the main loop, the
        generalization step, the merge and the pruning, and finally
        exposes the group representatives as ``X_`` / ``y_``.

        Returns
        -------
        (X_, y_) : the prototypes and their labels.
        """
        X, y = check_X_y(X, y, accept_sparse="csr")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(
                n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        classes = np.unique(y)
        self.classes_ = classes

        # Initial groups: one per class, holding all of its instances.
        self.groups = [_Group(X[y == label], label) for label in classes]

        self._main_loop()
        self._generalization_step()
        self._merge()
        self._pruning()

        self.X_ = np.asarray([g.rep_x for g in self.groups])
        self.y_ = np.asarray([g.label for g in self.groups])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_

    def _merge(self):
        """Merge pairs of same-class groups that point at each other.

        For a group, the second-nearest representative of each of its
        instances is looked up.  When all instances agree on one
        representative of the same class, and that neighbour group's
        instances in turn all agree on the current group, the neighbour
        is absorbed.  Passes are repeated until one completes without a
        merge.

        Returns
        -------
        list
            ``self.groups`` after merging.
        """
        # Iterative form of the original tail recursion: a new pass starts
        # only after a pass that merged something, and never with fewer
        # than two groups.
        while len(self.groups) >= 2:
            merged = False

            # Explicit cursor that mirrors iterating the list while
            # removing from it: a removal ahead of or behind the cursor
            # may skip a group in this pass, which the next pass picks up.
            position = 0
            while position < len(self.groups):
                group = self.groups[position]
                position += 1

                reps_x = np.asarray([g.rep_x for g in self.groups])
                reps_y = np.asarray([g.label for g in self.groups])
                self.classifier.fit(reps_x, reps_y)

                nn2_idx = self.classifier.kneighbors(
                    group.X, n_neighbors=2, return_distance=False)
                nn2_idx = nn2_idx.T[1]

                # could use a threshold
                if len(set(nn2_idx)) == 1 and reps_y[nn2_idx[0]] == group.label:
                    ng_group = self.groups[nn2_idx[0]]
                    ng2_idx = self.classifier.kneighbors(
                        ng_group.X, n_neighbors=2, return_distance=False)
                    ng2_idx = ng2_idx.T[1]

                    if len(set(ng2_idx)) == 1 and self.groups[ng2_idx[0]] == group:
                        group.add_instances(ng_group.X, update=True)
                        self.groups.remove(ng_group)
                        merged = True

            if not merged:
                break

        return self.groups

    def _pruning(self):
        """Drop groups whose instances are covered by the other groups.

        A group is removed when a 1-NN model fitted on the representatives
        of all *other* groups still predicts the group's own label for
        every one of its instances.  Passes repeat while something was
        pruned; pruning stops as soon as a single group is left.

        Returns
        -------
        list
            ``self.groups`` after pruning.
        """
        if len(self.groups) < 2:
            return self.groups

        pruned, fst = False, True
        knn = KNeighborsClassifier(n_neighbors=1, algorithm='brute')

        while pruned or fst:
            index = 0
            pruned, fst = False, False

            while index < len(self.groups):
                group = self.groups[index]

                # Leave the current group's representative out.
                mask = np.ones(len(self.groups), dtype=bool)
                mask[index] = False
                reps_x = np.asarray([g.rep_x for g in self.groups])[mask]
                reps_y = np.asarray([g.label for g in self.groups])[mask]
                labels = knn.fit(reps_x, reps_y).predict(group.X)

                if (labels == group.label).all():
                    # Do not advance: the next group slides into ``index``.
                    self.groups.remove(group)
                    pruned = True
                else:
                    index = index + 1

                if len(self.groups) == 1:
                    # A lone group has nothing to be compared against;
                    # end both loops.
                    index = len(self.groups)
                    pruned = False

        return self.groups
class SSMA(InstanceReductionMixin):
    """Steady State Memetic Algorithm

    The Steady-State Memetic Algorithm is an evolutionary prototype
    selection algorithm.  It uses a memetic algorithm in order to
    perform a local search in the code.

    Parameters
    ----------
    n_neighbors : int, optional (default = 1)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    alpha : float (default = 0.6)
        Parameter that weights accuracy against reduction in the fitness
        function.

    max_loop : int (default = 1000)
        Number of maximum loops performed by the algorithm.

    threshold : int (default = 0)
        Threshold that regulates the substitution condition;

    chromosomes_count: int (default = 10)
        number of chromosomes used to find the optimal solution.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminate, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminate]
        Labels of the selected prototypes.

    `reduction_` : float, fraction of the instances that was removed.

    Examples
    --------
    >>> from protopy.selection.ssma import SSMA
    >>> import numpy as np
    >>> X = np.array([[i] for i in range(100)])
    >>> y = np.asarray(50 * [0] + 50 * [1])
    >>> ssma = SSMA()
    >>> ssma.fit(X, y)
    SSMA(alpha=0.6, chromosomes_count=10, max_loop=1000, threshold=0)
    >>> print ssma.predict([[40],[60]])
    [0 1]
    >>> print ssma.reduction_
    0.98

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    References
    ----------
    Joaquín Derrac, Salvador García, and Francisco Herrera. Stratified prototype
    selection based on a steady-state memetic algorithm: a study of scalability.
    Memetic Computing, 2(3):183–199, 2010.
    """

    def __init__(self, n_neighbors=1, alpha=0.6, max_loop=1000, threshold=0,
                 chromosomes_count=10):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.max_loop = max_loop
        self.threshold = threshold
        self.chromosomes_count = chromosomes_count

        self.evaluations = None
        self.chromosomes = None

        self.best_chromosome_ac = -1
        self.best_chromosome_rd = -1

        self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

    def accuracy(self, chromosome, X, y):
        """Accuracy on ``(X, y)`` of the classifier fitted on the instances
        selected by ``chromosome``."""
        mask = np.asarray(chromosome, dtype=bool)
        cX, cy = X[mask], y[mask]
        self.classifier.fit(cX, cy)
        labels = self.classifier.predict(X)
        accuracy = (labels == y).sum()
        return float(accuracy) / len(y)

    def fitness(self, chromosome, X, y):
        """``alpha * accuracy + (1 - alpha) * reduction`` of ``chromosome``."""
        # TODO add the possibility of use AUC for factor1
        ac = self.accuracy(chromosome, X, y)
        rd = 1.0 - (float(sum(chromosome)) / len(chromosome))
        return self.alpha * ac + (1.0 - self.alpha) * rd

    def fitness_gain(self, gain, n):
        """Fitness increment credited to a removal with accuracy ``gain``
        when ``n`` instances remain selected."""
        return self.alpha * (float(gain) / n) + (1 - self.alpha) * (1.0 / n)

    def update_threshold(self, X, y):
        """Adapt ``self.threshold`` from the best chromosome's progress.

        The threshold goes up by one when the accuracy did not improve
        since the previous call and down by one when the reduction did
        not improve; both reference values are then refreshed.
        """
        best_index = np.argmax(self.evaluations)
        chromosome = self.chromosomes[best_index]

        best_ac = self.accuracy(chromosome, X, y)
        best_rd = 1.0 - float(sum(chromosome)) / len(y)

        if best_ac <= self.best_chromosome_ac:
            self.threshold = self.threshold + 1
        if best_rd <= self.best_chromosome_rd:
            self.threshold = self.threshold - 1

        self.best_chromosome_ac = best_ac
        self.best_chromosome_rd = best_rd

    def index_nearest_neighbor(self, S, X, y):
        """Map every instance to the index of its nearest selected instance.

        Returns a list ``U`` with ``len(y)`` entries where ``U[i]`` is the
        position (in the full data set) of the instance selected by ``S``
        that is closest to ``X[i]``.
        """
        n_samples = len(y)
        if n_samples == 0:
            return []

        S_mask = np.asarray(S, dtype=bool)
        # Translates positions inside the selected subset back to
        # positions in the full data set.
        real_indexes = np.arange(n_samples)[S_mask]

        # The training subset is the same for every query, so fit once and
        # query all instances in a single 2-D batch.
        classifier = KNeighborsClassifier(n_neighbors=1)
        classifier.fit(X[S_mask], y[S_mask])
        nearest = classifier.kneighbors(X, return_distance=False)

        return list(real_indexes[nearest[:, 0]])

    def memetic_looper(self, S, R):
        """Return True while at least two selected genes of ``S`` are not
        listed in ``R`` (the genes already tried)."""
        visited = set(R)
        c = 0
        for i, gene in enumerate(S):
            if gene == 1 and i not in visited:
                c = c + 1
                if c == 2:
                    return True
        return False

    def memetic_select_j(self, S, R):
        """Pick at random one selected gene of ``S`` that is not in ``R``.

        ``np.random.choice`` raises if no such gene exists; callers guard
        with ``memetic_looper``.
        """
        visited = set(R)
        indexs = [i for i, gene in enumerate(S)
                  if i not in visited and gene == 1]
        return np.random.choice(indexs)

    def generate_population(self, X, y):
        """Create the random initial population and evaluate it."""
        self.chromosomes = [[np.random.choice([0, 1])
                             for i in range(len(y))]
                            for c in range(self.chromosomes_count)]
        self.evaluations = [self.fitness(c, X, y) for c in self.chromosomes]

        self.update_threshold(X, y)

    def select_parents(self, X, y):
        """Choose two parents, each the fitter of two randomly sampled
        chromosomes; returns a fresh array holding copies of both."""
        parents = []
        for i in range(2):
            samples = random.sample(self.chromosomes, 2)
            first_is_better = (self.fitness(samples[0], X, y) >
                               self.fitness(samples[1], X, y))
            parents.append(samples[0] if first_is_better else samples[1])
        return np.array(parents, copy=True)

    def crossover(self, parent_1, parent_2):
        """Uniform crossover: a shuffled half/half mask decides which
        parent each gene of the two offspring comes from."""
        size = len(parent_1)
        # Floor division: with true division ``size / 2`` is a float and
        # cannot be used to repeat a list.
        half = size // 2
        mask = [0] * half + [1] * (size - half)
        mask = np.asarray(mask, dtype=bool)
        np.random.shuffle(mask)

        off_1 = parent_1 * mask + parent_2 * ~mask
        off_2 = parent_2 * mask + parent_1 * ~mask

        return np.asarray([off_1, off_2])

    def mutation(self, offspring):
        """Flip each gene in place with probability ``1 / len(offspring)``."""
        for i in range(len(offspring)):
            if np.random.uniform(0, 1) < 1.0 / len(offspring):
                offspring[i] = not offspring[i]
        return offspring

    def memetic_search(self, chromosome, X, y, chromosome_fitness=None):
        """Local search that tries to drop selected instances one by one.

        A removal is kept when the resulting change in correctly
        classified instances (``gain``) reaches ``self.threshold``;
        otherwise it is undone and the gene is remembered in ``R``.

        Returns
        -------
        (chromosome, fitness) : the refined chromosome and its updated
        fitness estimate.  An empty chromosome is returned untouched with
        fitness 0.
        """
        S = np.array(chromosome, copy=True)
        if S.sum() == 0:
            return S, 0

        if chromosome_fitness is None:
            chromosome_fitness = self.fitness(chromosome, X, y)
        fitness_s = chromosome_fitness

        # List of visited genes in S
        R = []
        # let U = {u0, u1, ..., un} list where ui = classifier(si,S)/i
        U = self.index_nearest_neighbor(S, X, y)

        while self.memetic_looper(S, R):
            j = self.memetic_select_j(S, R)
            S[j] = 0
            gain = 0.0
            U_copy = list(U)

            mask = np.asarray(S, dtype=bool)
            X_tra, y_tra = X[mask], y[mask]
            real_idx = np.arange(len(y))[mask]

            # Only instances whose nearest neighbour was ``j`` need a new
            # neighbour; the reduced training set is the same for all of
            # them, so fit once and query them together.
            orphans = [i for i, u in enumerate(U) if u == j]
            if orphans and len(y_tra) > 0:
                self.classifier.fit(X_tra, y_tra)
                nearest = self.classifier.kneighbors(
                    X[orphans], n_neighbors=1, return_distance=False)
                for i, idx in zip(orphans, nearest[:, 0]):
                    U[i] = real_idx[idx]

                    if y[i] == y[U_copy[i]] and y[i] != y[U[i]]:
                        gain = gain - 1.0
                    if y[i] != y[U_copy[i]] and y[i] == y[U[i]]:
                        gain = gain + 1.0

            if gain >= self.threshold:
                n = S.sum()
                g = self.fitness_gain(gain, n)
                fitness_s = fitness_s + g
                R = []
            else:
                # Undo the removal and do not try this gene again until
                # another removal succeeds.
                U = U_copy
                S[j] = 1
                R.append(j)

        return list(S), fitness_s

    def main_loop(self, X, y):
        """Run ``max_loop`` steady-state generations.

        Each generation breeds two offspring, optionally refines them with
        ``memetic_search`` and lets them replace the worst chromosome when
        they are fitter.  The threshold is re-adapted every 10 generations.
        """
        self.generate_population(X, y)
        n, worse_fit_index = 0, -1
        while n < self.max_loop:
            parents = self.select_parents(X, y)
            offspring = self.crossover(parents[0], parents[1])
            offspring[0] = self.mutation(offspring[0])
            offspring[1] = self.mutation(offspring[1])

            # -1 marks an empty offspring, which cannot be evaluated.
            fit_offs = [self.fitness(off, X, y) if sum(off) > 0 else -1
                        for off in offspring]

            if worse_fit_index == -1:
                worse_fit_index = np.argmin(self.evaluations)

            for i in range(len(offspring)):
                # Local-search probability: never for an empty offspring,
                # 0.0625 for one no better than the current worst, always
                # otherwise.  (``elif`` keeps the -1 sentinel from being
                # overwritten by the second test.)
                if fit_offs[i] == -1:
                    p_ls = -1
                elif fit_offs[i] <= self.evaluations[worse_fit_index]:
                    p_ls = 0.0625
                else:
                    p_ls = 1.0

                if np.random.uniform(0, 1) < p_ls:
                    offspring[i], fit_offs[i] = self.memetic_search(
                        offspring[i], X, y, chromosome_fitness=fit_offs[i])

            for i in range(len(offspring)):
                if fit_offs[i] > self.evaluations[worse_fit_index]:
                    self.chromosomes[worse_fit_index] = offspring[i]
                    self.evaluations[worse_fit_index] = fit_offs[i]

                    worse_fit_index = np.argmin(self.evaluations)

            n = n + 1
            if n % 10 == 0:
                self.update_threshold(X, y)

    def reduce_data(self, X, y):
        """Select prototypes from ``(X, y)`` with the best chromosome found.

        Returns
        -------
        (X_, y_) : the selected instances and their labels.
        """
        X, y = check_X_y(X, y, accept_sparse="csr")

        classes = np.unique(y)
        self.classes_ = classes

        self.main_loop(X, y)

        best_index = np.argmax(self.evaluations)
        mask = np.asarray(self.chromosomes[best_index], dtype=bool)
        self.X_ = X[mask]
        self.y_ = y[mask]
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)

        return self.X_, self.y_
class TomekLinks(InstanceReductionMixin):
    """Tomek Links.

    The Tomek Links algorithm removes pairs of instances that form a
    Tomek Link.  This technique removes instances in the decision region.

    Parameters
    ----------
    n_neighbors : int, optional (default = 3)
        Number of neighbors to use by default in the classification (only).
        The Tomek Links uses only n_neighbors=1 in the reduction.

    keep_class : int, optional (default = None)
        Label of the class to not be removed in the tomek links. If None,
        it removes all nodes of the links.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminate, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminate]
        Labels of the selected prototypes.

    `reduction_` : float, fraction of the instances that was removed.

    Examples
    --------
    >>> from protopy.selection.tomek_links import TomekLinks
    >>> import numpy as np
    >>> X = np.array([[0],[1],[2.1],[2.9],[4],[5],[6],[7.1],[7.9],[9]])
    >>> y = np.array([1,1,2,1,2,2,2,1,2,2])
    >>> tl = TomekLinks()
    >>> tl.fit(X, y)
    TomekLinks(keep_class=None)
    >>> print tl.predict([[2.5],[7.5]])
    [1, 2]
    >>> print tl.reduction_
    0.4

    See also
    --------
    protopy.selection.enn.ENN: edited nearest neighbor

    References
    ----------
    I. Tomek, “Two modifications of cnn,” IEEE Transactions on Systems,
    Man and Cybernetics, vol. SMC-6, pp. 769–772, 1976.
    """

    def __init__(self, n_neighbors=3, keep_class=None):
        self.n_neighbors = n_neighbors
        self.classifier = None
        self.keep_class = keep_class

    def reduce_data(self, X, y):
        """Remove the instances of ``(X, y)`` that take part in a Tomek link.

        Two instances are linked when each is the other's nearest
        neighbour and their labels differ.  Instances labelled
        ``keep_class`` are always retained.

        Returns
        -------
        (X_, y_) : the retained instances and their labels.
        """
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(
                n_neighbors=self.n_neighbors, algorithm='brute')
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        # NOTE(review): ``check_arrays`` only exists in old scikit-learn
        # releases; sibling reducers call
        # ``check_X_y(X, y, accept_sparse="csr")`` -- confirm which
        # version this module targets before switching.
        X, y = check_arrays(X, y, sparse_format="csr")

        classes = np.unique(y)
        self.classes_ = classes
        self.classifier.fit(X, y)

        # Column 0 of a 2-NN query on the training data is taken to be the
        # instance itself, column 1 its nearest other instance.
        # NOTE(review): presumably breaks down with duplicated points --
        # verify if such data is expected.
        nn_idx = self.classifier.kneighbors(X, n_neighbors=2,
                                            return_distance=False)
        nn_idx = nn_idx.T[1]

        # Vectorised link test (replaces a per-index ``xrange`` loop).
        mutual = nn_idx[nn_idx] == np.arange(nn_idx.shape[0])
        different_label = y != y[nn_idx]
        mask = ~(mutual & different_label)

        if self.keep_class is not None and self.keep_class in self.classes_:
            mask[y == self.keep_class] = True

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)

        return self.X_, self.y_
class SSMA(InstanceReductionMixin):
    """Steady State Memetic Algorithm

    The Steady-State Memetic Algorithm is an evolutionary prototype
    selection algorithm.  It uses a memetic algorithm in order to
    perform a local search in the code.

    Parameters
    ----------
    n_neighbors : int, optional (default = 1)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    alpha : float (default = 0.6)
        Parameter that weights accuracy against reduction in the fitness
        function.

    max_loop : int (default = 1000)
        Number of maximum loops performed by the algorithm.

    threshold : int (default = 0)
        Threshold that regulates the substitution condition;

    chromosomes_count: int (default = 10)
        number of chromosomes used to find the optimal solution.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminate, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminate]
        Labels of the selected prototypes.

    `reduction_` : float, fraction of the instances that was removed.

    Examples
    --------
    >>> from protopy.selection.ssma import SSMA
    >>> import numpy as np
    >>> X = np.array([[i] for i in range(100)])
    >>> y = np.asarray(50 * [0] + 50 * [1])
    >>> ssma = SSMA()
    >>> ssma.fit(X, y)
    SSMA(alpha=0.6, chromosomes_count=10, max_loop=1000, threshold=0)
    >>> print ssma.predict([[40],[60]])
    [0 1]
    >>> print ssma.reduction_
    0.98

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    References
    ----------
    Joaquín Derrac, Salvador García, and Francisco Herrera. Stratified prototype
    selection based on a steady-state memetic algorithm: a study of scalability.
    Memetic Computing, 2(3):183–199, 2010.
    """

    def __init__(self, n_neighbors=1, alpha=0.6, max_loop=1000, threshold=0,
                 chromosomes_count=10):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.max_loop = max_loop
        self.threshold = threshold
        self.chromosomes_count = chromosomes_count

        self.evaluations = None
        self.chromosomes = None

        self.best_chromosome_ac = -1
        self.best_chromosome_rd = -1

        self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

    def accuracy(self, chromosome, X, y):
        """Accuracy on ``(X, y)`` of the classifier fitted on the instances
        selected by ``chromosome``."""
        mask = np.asarray(chromosome, dtype=bool)
        cX, cy = X[mask], y[mask]
        self.classifier.fit(cX, cy)
        labels = self.classifier.predict(X)
        accuracy = (labels == y).sum()
        return float(accuracy) / len(y)

    def fitness(self, chromosome, X, y):
        """``alpha * accuracy + (1 - alpha) * reduction`` of ``chromosome``."""
        # TODO add the possibility of use AUC for factor1
        ac = self.accuracy(chromosome, X, y)
        rd = 1.0 - (float(sum(chromosome)) / len(chromosome))
        return self.alpha * ac + (1.0 - self.alpha) * rd

    def fitness_gain(self, gain, n):
        """Fitness increment credited to a removal with accuracy ``gain``
        when ``n`` instances remain selected."""
        return self.alpha * (float(gain) / n) + (1 - self.alpha) * (1.0 / n)

    def update_threshold(self, X, y):
        """Adapt ``self.threshold`` from the best chromosome's progress.

        The threshold goes up by one when the accuracy did not improve
        since the previous call and down by one when the reduction did
        not improve; both reference values are then refreshed.
        """
        best_index = np.argmax(self.evaluations)
        chromosome = self.chromosomes[best_index]

        best_ac = self.accuracy(chromosome, X, y)
        best_rd = 1.0 - float(sum(chromosome)) / len(y)

        if best_ac <= self.best_chromosome_ac:
            self.threshold = self.threshold + 1
        if best_rd <= self.best_chromosome_rd:
            self.threshold = self.threshold - 1

        self.best_chromosome_ac = best_ac
        self.best_chromosome_rd = best_rd

    def index_nearest_neighbor(self, S, X, y):
        """Map every instance to the index of its nearest selected instance.

        Returns a list ``U`` with ``len(y)`` entries where ``U[i]`` is the
        position (in the full data set) of the instance selected by ``S``
        that is closest to ``X[i]``.
        """
        n_samples = len(y)
        if n_samples == 0:
            return []

        S_mask = np.asarray(S, dtype=bool)
        # Translates positions inside the selected subset back to
        # positions in the full data set.
        real_indexes = np.arange(n_samples)[S_mask]

        # The training subset is the same for every query, so fit once and
        # query all instances in a single 2-D batch.
        classifier = KNeighborsClassifier(n_neighbors=1)
        classifier.fit(X[S_mask], y[S_mask])
        nearest = classifier.kneighbors(X, return_distance=False)

        return list(real_indexes[nearest[:, 0]])

    def memetic_looper(self, S, R):
        """Return True while at least two selected genes of ``S`` are not
        listed in ``R`` (the genes already tried)."""
        visited = set(R)
        c = 0
        for i, gene in enumerate(S):
            if gene == 1 and i not in visited:
                c = c + 1
                if c == 2:
                    return True
        return False

    def memetic_select_j(self, S, R):
        """Pick at random one selected gene of ``S`` that is not in ``R``.

        ``np.random.choice`` raises if no such gene exists; callers guard
        with ``memetic_looper``.
        """
        visited = set(R)
        indexs = [i for i, gene in enumerate(S)
                  if i not in visited and gene == 1]
        return np.random.choice(indexs)

    def generate_population(self, X, y):
        """Create the random initial population and evaluate it."""
        self.chromosomes = [[np.random.choice([0, 1])
                             for i in range(len(y))]
                            for c in range(self.chromosomes_count)]
        self.evaluations = [self.fitness(c, X, y) for c in self.chromosomes]

        self.update_threshold(X, y)

    def select_parents(self, X, y):
        """Choose two parents, each the fitter of two randomly sampled
        chromosomes; returns a fresh array holding copies of both."""
        parents = []
        for i in range(2):
            samples = random.sample(self.chromosomes, 2)
            first_is_better = (self.fitness(samples[0], X, y) >
                               self.fitness(samples[1], X, y))
            parents.append(samples[0] if first_is_better else samples[1])
        return np.array(parents, copy=True)

    def crossover(self, parent_1, parent_2):
        """Uniform crossover: a shuffled half/half mask decides which
        parent each gene of the two offspring comes from."""
        size = len(parent_1)
        # Floor division: with true division ``size / 2`` is a float and
        # cannot be used to repeat a list.
        half = size // 2
        mask = [0] * half + [1] * (size - half)
        mask = np.asarray(mask, dtype=bool)
        np.random.shuffle(mask)

        off_1 = parent_1 * mask + parent_2 * ~mask
        off_2 = parent_2 * mask + parent_1 * ~mask

        return np.asarray([off_1, off_2])

    def mutation(self, offspring):
        """Flip each gene in place with probability ``1 / len(offspring)``."""
        for i in range(len(offspring)):
            if np.random.uniform(0, 1) < 1.0 / len(offspring):
                offspring[i] = not offspring[i]
        return offspring

    def memetic_search(self, chromosome, X, y, chromosome_fitness=None):
        """Local search that tries to drop selected instances one by one.

        A removal is kept when the resulting change in correctly
        classified instances (``gain``) reaches ``self.threshold``;
        otherwise it is undone and the gene is remembered in ``R``.

        Returns
        -------
        (chromosome, fitness) : the refined chromosome and its updated
        fitness estimate.  An empty chromosome is returned untouched with
        fitness 0.
        """
        S = np.array(chromosome, copy=True)
        if S.sum() == 0:
            return S, 0

        if chromosome_fitness is None:
            chromosome_fitness = self.fitness(chromosome, X, y)
        fitness_s = chromosome_fitness

        # List of visited genes in S
        R = []
        # let U = {u0, u1, ..., un} list where ui = classifier(si,S)/i
        U = self.index_nearest_neighbor(S, X, y)

        while self.memetic_looper(S, R):
            j = self.memetic_select_j(S, R)
            S[j] = 0
            gain = 0.0
            U_copy = list(U)

            mask = np.asarray(S, dtype=bool)
            X_tra, y_tra = X[mask], y[mask]
            real_idx = np.arange(len(y))[mask]

            # Only instances whose nearest neighbour was ``j`` need a new
            # neighbour; the reduced training set is the same for all of
            # them, so fit once and query them together.
            orphans = [i for i, u in enumerate(U) if u == j]
            if orphans and len(y_tra) > 0:
                self.classifier.fit(X_tra, y_tra)
                nearest = self.classifier.kneighbors(
                    X[orphans], n_neighbors=1, return_distance=False)
                for i, idx in zip(orphans, nearest[:, 0]):
                    U[i] = real_idx[idx]

                    if y[i] == y[U_copy[i]] and y[i] != y[U[i]]:
                        gain = gain - 1.0
                    if y[i] != y[U_copy[i]] and y[i] == y[U[i]]:
                        gain = gain + 1.0

            if gain >= self.threshold:
                n = S.sum()
                g = self.fitness_gain(gain, n)
                fitness_s = fitness_s + g
                R = []
            else:
                # Undo the removal and do not try this gene again until
                # another removal succeeds.
                U = U_copy
                S[j] = 1
                R.append(j)

        return list(S), fitness_s

    def main_loop(self, X, y):
        """Run ``max_loop`` steady-state generations.

        Each generation breeds two offspring, optionally refines them with
        ``memetic_search`` and lets them replace the worst chromosome when
        they are fitter.  The threshold is re-adapted every 10 generations.
        """
        self.generate_population(X, y)
        n, worse_fit_index = 0, -1
        while n < self.max_loop:
            parents = self.select_parents(X, y)
            offspring = self.crossover(parents[0], parents[1])
            offspring[0] = self.mutation(offspring[0])
            offspring[1] = self.mutation(offspring[1])

            # -1 marks an empty offspring, which cannot be evaluated.
            fit_offs = [self.fitness(off, X, y) if sum(off) > 0 else -1
                        for off in offspring]

            if worse_fit_index == -1:
                worse_fit_index = np.argmin(self.evaluations)

            for i in range(len(offspring)):
                # Local-search probability: never for an empty offspring,
                # 0.0625 for one no better than the current worst, always
                # otherwise.  (``elif`` keeps the -1 sentinel from being
                # overwritten by the second test.)
                if fit_offs[i] == -1:
                    p_ls = -1
                elif fit_offs[i] <= self.evaluations[worse_fit_index]:
                    p_ls = 0.0625
                else:
                    p_ls = 1.0

                if np.random.uniform(0, 1) < p_ls:
                    offspring[i], fit_offs[i] = self.memetic_search(
                        offspring[i], X, y, chromosome_fitness=fit_offs[i])

            for i in range(len(offspring)):
                if fit_offs[i] > self.evaluations[worse_fit_index]:
                    self.chromosomes[worse_fit_index] = offspring[i]
                    self.evaluations[worse_fit_index] = fit_offs[i]

                    worse_fit_index = np.argmin(self.evaluations)

            n = n + 1
            if n % 10 == 0:
                self.update_threshold(X, y)

    def reduce_data(self, X, y):
        """Select prototypes from ``(X, y)`` with the best chromosome found.

        Returns
        -------
        (X_, y_) : the selected instances and their labels.
        """
        X, y = check_X_y(X, y, accept_sparse="csr")

        classes = np.unique(y)
        self.classes_ = classes

        self.main_loop(X, y)

        best_index = np.argmax(self.evaluations)
        mask = np.asarray(self.chromosomes[best_index], dtype=bool)
        self.X_ = X[mask]
        self.y_ = y[mask]
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)

        return self.X_, self.y_
class TomekLinks(InstanceReductionMixin):
    """Tomek Links.

    The Tomek Links algorithm removes pairs of instances that form a
    Tomek Link.  This technique removes instances in the decision region.

    Parameters
    ----------
    n_neighbors : int, optional (default = 3)
        Number of neighbors to use by default in the classification (only).
        The Tomek Links uses only n_neighbors=1 in the reduction.

    keep_class : int, optional (default = None)
        Label of the class to not be removed in the tomek links. If None,
        it removes all nodes of the links.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminate, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminate]
        Labels of the selected prototypes.

    `reduction_` : float, fraction of the instances that was removed.

    Examples
    --------
    >>> from protopy.selection.tomek_links import TomekLinks
    >>> import numpy as np
    >>> X = np.array([[0],[1],[2.1],[2.9],[4],[5],[6],[7.1],[7.9],[9]])
    >>> y = np.array([1,1,2,1,2,2,2,1,2,2])
    >>> tl = TomekLinks()
    >>> tl.fit(X, y)
    TomekLinks(keep_class=None)
    >>> print tl.predict([[2.5],[7.5]])
    [1, 2]
    >>> print tl.reduction_
    0.4

    See also
    --------
    protopy.selection.enn.ENN: edited nearest neighbor

    References
    ----------
    I. Tomek, “Two modifications of cnn,” IEEE Transactions on Systems,
    Man and Cybernetics, vol. SMC-6, pp. 769–772, 1976.
    """

    def __init__(self, n_neighbors=3, keep_class=None):
        self.n_neighbors = n_neighbors
        self.classifier = None
        self.keep_class = keep_class

    def reduce_data(self, X, y):
        """Remove the instances of ``(X, y)`` that take part in a Tomek link.

        Two instances are linked when each is the other's nearest
        neighbour and their labels differ.  Instances labelled
        ``keep_class`` are always retained.

        Returns
        -------
        (X_, y_) : the retained instances and their labels.
        """
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(
                n_neighbors=self.n_neighbors, algorithm='brute')
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        # NOTE(review): ``check_arrays`` only exists in old scikit-learn
        # releases; sibling reducers call
        # ``check_X_y(X, y, accept_sparse="csr")`` -- confirm which
        # version this module targets before switching.
        X, y = check_arrays(X, y, sparse_format="csr")

        classes = np.unique(y)
        self.classes_ = classes
        self.classifier.fit(X, y)

        # Column 0 of a 2-NN query on the training data is taken to be the
        # instance itself, column 1 its nearest other instance.
        # NOTE(review): presumably breaks down with duplicated points --
        # verify if such data is expected.
        nn_idx = self.classifier.kneighbors(X, n_neighbors=2,
                                            return_distance=False)
        nn_idx = nn_idx.T[1]

        # Vectorised link test (replaces a per-index ``xrange`` loop).
        mutual = nn_idx[nn_idx] == np.arange(nn_idx.shape[0])
        different_label = y != y[nn_idx]
        mask = ~(mutual & different_label)

        if self.keep_class is not None and self.keep_class in self.classes_:
            mask[y == self.keep_class] = True

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)

        return self.X_, self.y_