def plotDecisionBoundry(X, y, y_predicted, modelName):
    X_Train_embedded = TSNE(n_components=2).fit_transform(X)
    print(X_Train_embedded.shape)

    # create meshgrid
    resolution = 1000  # 1000x1000 background pixels
    X2d_xmin, X2d_xmax = np.min(X_Train_embedded[:, 0]), np.max(X_Train_embedded[:, 0])
    X2d_ymin, X2d_ymax = np.min(X_Train_embedded[:, 1]), np.max(X_Train_embedded[:, 1])
    xx, yy = np.meshgrid(np.linspace(X2d_xmin, X2d_xmax, resolution),
                         np.linspace(X2d_ymin, X2d_ymax, resolution))

    # approximate Voronoi tessellation on a resolution x resolution grid using 1-NN
    background_model = KNeighborsClassifier(n_neighbors=1).fit(X_Train_embedded, y_predicted)
    voronoiBackground = background_model.predict(np.c_[xx.ravel(), yy.ravel()])
    voronoiBackground = voronoiBackground.reshape((resolution, resolution))

    # plot
    plt.contourf(xx, yy, voronoiBackground)
    plt.scatter(X_Train_embedded[:, 0], X_Train_embedded[:, 1], c=y.values.flatten())
    plt.title(modelName)
    plt.show()
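# Usage sketch for plotDecisionBoundry (hedged: the iris data, the pandas
# wrapping, and the LogisticRegression model are illustrative assumptions,
# not part of the original source; it also assumes np, plt, TSNE, and
# KNeighborsClassifier are in scope where plotDecisionBoundry is defined.
# With resolution=1000 the 1-NN background predict covers 10^6 grid points,
# so expect a short wait):
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

iris = load_iris()
X, y = iris.data, pd.Series(iris.target)  # y as a Series, since the function calls y.values
y_predicted = LogisticRegression(max_iter=1000).fit(X, y).predict(X)
plotDecisionBoundry(X, y, y_predicted, "LogisticRegression")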
def do(train_data, train_label, test_data, test_label=None, adjust_parameters=True, k=5):
    train_data = np.array(train_data).squeeze()
    train_label = np.array(train_label).squeeze()
    test_data = np.array(test_data).squeeze()
    if test_label is not None:
        test_label = np.array(test_label).squeeze()

    if not adjust_parameters:
        knn = KNeighborsClassifier(n_neighbors=k, n_jobs=8)
        knn.fit(train_data, train_label)
        predicted_label = knn.predict(test_data)
        if test_label is not None:
            acc = accuracy_score(test_label, predicted_label)
            print('acc is', acc)
        return predicted_label
    else:
        # the parameter search requires labelled test data (test_label must not be None)
        max_acc = 0.0
        max_k = 0
        max_predicted = None
        for k in range(1, 11):
            knn = KNeighborsClassifier(n_neighbors=k, n_jobs=8)
            knn.fit(train_data, train_label)
            predicted_label = knn.predict(test_data)
            acc = accuracy_score(test_label, predicted_label)
            if acc > max_acc:
                max_acc = acc
                max_k = k
                max_predicted = predicted_label
            print('k =', k, 'acc is', acc)
        print('max acc is', max_acc, 'responding to k is', max_k)
        return max_predicted, max_k
def reduce_data(self, X, y):
    if self.classifier is None:
        self.classifier = KNeighborsClassifier(
            n_neighbors=self.n_neighbors, algorithm='brute')
    if self.classifier.n_neighbors != self.n_neighbors:
        self.classifier.n_neighbors = self.n_neighbors

    X, y = check_X_y(X, y, accept_sparse="csr")
    classes = np.unique(y)
    self.classes_ = classes

    self.classifier.fit(X, y)
    nn_idx = self.classifier.kneighbors(X, n_neighbors=2, return_distance=False)
    nn_idx = nn_idx.T[1]

    # instances i and j form a Tomek link if they are mutual nearest
    # neighbors with different labels
    mask = [nn_idx[nn_idx[index]] == index and y[index] != y[nn_idx[index]]
            for index in range(nn_idx.shape[0])]
    mask = ~np.asarray(mask)
    if self.keep_class is not None and self.keep_class in self.classes_:
        mask[y == self.keep_class] = True

    self.X_ = np.asarray(X[mask])
    self.y_ = np.asarray(y[mask])
    self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
    return self.X_, self.y_
class DCS(object):

    @abstractmethod
    def select(self, ensemble, x):
        pass

    def __init__(self, Xval, yval, K=5, weighted=False, knn=None):
        self.Xval = Xval
        self.yval = yval
        self.K = K

        if knn is None:
            self.knn = KNeighborsClassifier(n_neighbors=K, algorithm='brute')
        else:
            self.knn = knn
        self.knn.fit(Xval, yval)
        self.weighted = weighted

    def get_neighbors(self, x, return_distance=False):
        # obtain the K nearest neighbors of the test sample in the validation set
        if not return_distance:
            [idx] = self.knn.kneighbors(x, return_distance=return_distance)
        else:
            [dists], [idx] = self.knn.kneighbors(x, return_distance=return_distance)

        X_nn = self.Xval[idx]  # k neighbors
        y_nn = self.yval[idx]  # k neighbors' targets

        if return_distance:
            return X_nn, y_nn, dists
        else:
            return X_nn, y_nn
class RawModel:
    def __init__(self):
        # 2015-05-15 GEL Found that n_components=20 gives a nice balance of
        # speed (substantial improvement), accuracy, and reduced memory usage
        # (25% decrease).
        self.decomposer = TruncatedSVD(n_components=20)

        # 2015-05-15 GEL algorithm='ball_tree' uses less memory on average than
        # algorithm='kd_tree'
        # 2015-05-15 GEL Evaluation of metrics by accuracy (based on 8000 training examples)
        #     euclidean    0.950025
        #     manhattan    0.933533
        #     chebyshev    0.675662
        #     hamming      0.708646
        #     canberra     0.934033
        #     braycurtis   0.940530
        self.model = KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree', metric='euclidean')

    def fit(self, trainExamples):
        X = self.decomposer.fit_transform(
            vstack([reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in trainExamples]))
        Y = [x.Y for x in trainExamples]
        self.model.fit(X, Y)
        return self

    def predict(self, examples):
        X = self.decomposer.transform(
            vstack([reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in examples]))
        return self.model.predict(X)
def predict(self, X, n_neighbors=1):
    """Perform classification on an array of test vectors X.

    The predicted class C for each sample in X is returned.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]

    Returns
    -------
    C : array, shape = [n_samples]

    Notes
    -----
    The default prediction uses KNeighborsClassifier; if the instance
    reduction algorithm is to be performed with another classifier, this
    method should be explicitly overridden and documented.
    """
    X = check_array(X)
    if not hasattr(self, "X_") or self.X_ is None:
        raise AttributeError("Model has not been trained yet.")
    if not hasattr(self, "y_") or self.y_ is None:
        raise AttributeError("Model has not been trained yet.")

    if self.classifier is None:
        self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
    self.classifier.fit(self.X_, self.y_)
    return self.classifier.predict(X)
def reduce_data(self, X, y):
    X, y = check_X_y(X, y, accept_sparse="csr")
    if self.classifier is None:
        self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)

    prots_s = []
    labels_s = []

    classes = np.unique(y)
    self.classes_ = classes

    # seed the prototype set with one random instance per class
    for cur_class in classes:
        mask = y == cur_class
        insts = X[mask]
        prots_s = prots_s + [insts[np.random.randint(0, insts.shape[0])]]
        labels_s = labels_s + [cur_class]

    self.classifier.fit(prots_s, labels_s)
    for sample, label in zip(X, y):
        # scikit-learn expects a 2-D array for predict
        if self.classifier.predict(sample.reshape(1, -1)) != [label]:
            prots_s = prots_s + [sample]
            labels_s = labels_s + [label]
            self.classifier.fit(prots_s, labels_s)

    self.X_ = np.asarray(prots_s)
    self.y_ = np.asarray(labels_s)
    self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
    return self.X_, self.y_
def reduce_data(self, X, y):
    X, y = check_X_y(X, y, accept_sparse="csr")

    if self.classifier is None:
        self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
    if self.classifier.n_neighbors != self.n_neighbors:
        self.classifier.n_neighbors = self.n_neighbors

    classes = np.unique(y)
    self.classes_ = classes

    # load initial groups
    self.groups = []
    for label in classes:
        mask = y == label
        self.groups = self.groups + [_Group(X[mask], label)]

    self._main_loop()
    self._generalization_step()
    self._merge()
    self._pruning()

    self.X_ = np.asarray([g.rep_x for g in self.groups])
    self.y_ = np.asarray([g.label for g in self.groups])
    self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
    return self.X_, self.y_
def KNN_method(X, y):
    # random_state requires shuffle=True in recent scikit-learn versions
    skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
    skf.get_n_splits(X, y)

    for train_index, test_index in skf.split(X, y):
        print("Train:", train_index, "Validation:", test_index)
        trainX, testX = X[train_index], X[test_index]
        trainY, testY = y[train_index], y[test_index]

        # here starts KNN
        # how many neighbours to use in the KNeighborsClassifier
        kvalues = [1, 3, 5, 7, 9, 11, 13, 15, 19, 24, 30, 40, 50, 60, 70, 90]
        dist = ['manhattan', 'euclidean', 'chebyshev']
        results = {}
        for element in dist:
            accuracy_results = []
            for k in kvalues:
                knn = KNeighborsClassifier(n_neighbors=k, metric=element)
                knn.fit(trainX, trainY)
                predictedY = knn.predict(testX)
                accuracy_results.append(accuracy_score(testY, predictedY))
            results[element] = accuracy_results
        print("Results of model preparation for: " + str(results))

        plt.figure()
        multiple_line_chart(plt.gca(), kvalues, results, 'KNN variants', 'n', 'accuracy', percentage=True)
        plt.show()
def _pruning(self):
    if len(self.groups) < 2:
        return self.groups

    pruned, fst = False, True
    knn = KNeighborsClassifier(n_neighbors=1, algorithm='brute')

    while pruned or fst:
        index = 0
        pruned, fst = False, False

        while index < len(self.groups):
            group = self.groups[index]

            mask = np.ones(len(self.groups), dtype=bool)
            mask[index] = False
            reps_x = np.asarray([g.rep_x for g in self.groups])[mask]
            reps_y = np.asarray([g.label for g in self.groups])[mask]
            labels = knn.fit(reps_x, reps_y).predict(group.X)

            if (labels == group.label).all():
                self.groups.remove(group)
                pruned = True
            else:
                index = index + 1

            if len(self.groups) == 1:
                index = len(self.groups)
                pruned = False

    return self.groups
def reduce_data(self, X, y):
    if self.classifier is None:
        self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
    if self.classifier.n_neighbors != self.n_neighbors:
        self.classifier.n_neighbors = self.n_neighbors

    X, y = check_X_y(X, y, accept_sparse="csr")
    classes = np.unique(y)
    self.classes_ = classes

    if self.n_neighbors >= len(X):
        self.X_ = np.array(X)
        self.y_ = np.array(y)
        self.reduction_ = 0.0
        return self.X_, self.y_

    mask = np.zeros(y.size, dtype=bool)
    tmp_m = np.ones(y.size, dtype=bool)
    for i in range(y.size):
        # leave instance i out, fit on the rest, and keep i only if the
        # leave-one-out prediction agrees with its label
        tmp_m[i] = not tmp_m[i]
        self.classifier.fit(X[tmp_m], y[tmp_m])
        sample, label = X[i], y[i]
        if self.classifier.predict(sample.reshape(1, -1)) == [label]:
            mask[i] = not mask[i]
        tmp_m[i] = not tmp_m[i]

    self.X_ = np.asarray(X[mask])
    self.y_ = np.asarray(y[mask])
    self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
    return self.X_, self.y_
def _main_loop(self):
    exit_count = 0
    knn = KNeighborsClassifier(n_neighbors=1, algorithm='brute')
    while exit_count < len(self.groups):
        index, exit_count = 0, 0
        while index < len(self.groups):
            group = self.groups[index]

            reps_x = np.asarray([g.rep_x for g in self.groups])
            reps_y = np.asarray([g.label for g in self.groups])
            knn.fit(reps_x, reps_y)

            nn_idx = knn.kneighbors(group.X, n_neighbors=1, return_distance=False)
            nn_idx = nn_idx.T[0]
            mask = nn_idx == index

            # if all are correctly classified
            if not (False in mask):
                exit_count = exit_count + 1

            # if all are misclassified
            elif not (group.label in reps_y[nn_idx]):
                pca = PCA(n_components=1)
                pca.fit(group.X)
                # maybe use a 'for' instead of creating an array
                # (PCA.transform expects 2-D input, hence the reshapes)
                d = pca.transform(reps_x[index].reshape(1, -1))
                dis = [pca.transform(inst.reshape(1, -1))[0] for inst in group.X]
                mask_split = (dis < d).flatten()

                new_X = group.X[mask_split]
                self.groups.append(_Group(new_X, group.label))
                group.X = group.X[~mask_split]

            elif (reps_y[nn_idx] == group.label).all() and (nn_idx != index).any():
                mask_mv = nn_idx != index
                index_mv = np.asarray(range(len(group)))[mask_mv]
                X_mv = group.remove_instances(index_mv)
                G_mv = nn_idx[mask_mv]

                for x, g in zip(X_mv, G_mv):
                    self.groups[g].add_instances([x])

            elif (reps_y[nn_idx] != group.label).sum() / float(len(group)) > self.r_mis:
                mask_mv = reps_y[nn_idx] != group.label
                new_X = group.X[mask_mv]
                self.groups.append(_Group(new_X, group.label))
                group.X = group.X[~mask_mv]

            else:
                exit_count = exit_count + 1

            if len(group) == 0:
                self.groups.remove(group)
            else:
                index = index + 1

        for g in self.groups:
            g.update_all()

    return self.groups
def evaluate(Xtra, ytra, Xtst, ytst, k=1, positive_label=1):
    knn = KNeighborsClassifier(n_neighbors=k, algorithm='brute')
    knn.fit(Xtra, ytra)

    y_true = ytst
    y_pred = knn.predict(Xtst)

    return evaluate_results(y_true, y_pred, positive_label=positive_label)
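# Usage sketch for evaluate (hedged: evaluate_results is assumed to be defined
# elsewhere in this codebase; the synthetic split below is illustrative):
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, random_state=0)
Xtra, Xtst, ytra, ytst = train_test_split(X, y, test_size=0.3, random_state=0)
results = evaluate(Xtra, ytra, Xtst, ytst, k=3, positive_label=1)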
def __init__(self, csv_path_train, csv_path_test, k):
    '''Constructor'''
    self.csv_path_train = csv_path_train
    self.csv_path_test = csv_path_test
    self.classifier = KNeighborsClassifier(n_neighbors=k, p=2, metric='minkowski')
def knn(X, y, model_path):
    model = KNeighborsClassifier()
    model.fit(X, y)
    print(model)

    # predict on the training data
    expected = y
    predicted = model.predict(X)

    print(metrics.classification_report(expected, predicted))
    print(metrics.confusion_matrix(expected, predicted))
    joblib.dump(model, model_path)
def __init__(self, Xval, yval, K=5, weighted=False, knn=None):
    self.Xval = Xval
    self.yval = yval
    self.K = K

    if knn is None:
        self.knn = KNeighborsClassifier(n_neighbors=K, algorithm='brute')
    else:
        self.knn = knn
    self.knn.fit(Xval, yval)
    self.weighted = weighted
def plot_boundaries_decision(X, y, clf, namefile):
    """
    Plot the decision boundaries of our data.

    X : a numpy array of the data we want to plot
    y : a numpy array of the labels corresponding to our data
    clf : the model used to predict the labels of our data
    namefile : the name of the file in which to save the figure
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.333, random_state=42)

    # the plot of the decision boundary in the 2D representation space of the data
    clf.fit(X_train, y_train)

    # create meshgrid
    resolution = 100  # 100x100 background pixels
    X2d_xmin, X2d_xmax = np.min(X[:, 0]), np.max(X[:, 0])
    X2d_ymin, X2d_ymax = np.min(X[:, 1]), np.max(X[:, 1])
    xx, yy = np.meshgrid(np.linspace(X2d_xmin, X2d_xmax, resolution),
                         np.linspace(X2d_ymin, X2d_ymax, resolution))

    # approximate Voronoi tessellation on a resolution x resolution grid using 1-NN
    background_model = KNeighborsClassifier(n_neighbors=1).fit(X, y)
    voronoiBackground = background_model.predict(np.c_[xx.ravel(), yy.ravel()])
    voronoiBackground = voronoiBackground.reshape((resolution, resolution))

    fig = pyplot.figure()
    fig.set_size_inches(10.5, 8.5)
    ax = fig.add_subplot(211)  # small subplot to show how the legend has moved

    # plot
    ax.contourf(xx, yy, voronoiBackground)
    ax.set_title("Decision boundaries using the dimensionality reduction of multidimensional scaling")
    ax.scatter(X[:, 0], X[:, 1], c=color[y].tolist())

    label = numpy.array(["Apple", "Tomatoes"])
    # Legend
    for ind, s in enumerate(label):
        ax.scatter([], [], label=s, color=color[ind])
    pyplot.legend(scatterpoints=1, frameon=True, labelspacing=0.5,
                  bbox_to_anchor=(1.2, .4), loc='center right')
    pyplot.tight_layout()
    pyplot.savefig(namefile)
    pyplot.show()
def get_best_k(X, y, max_k=30, keep_best_n=10, weights=None):
    # TODO: check X, y. description
    # Set default values
    if max_k is None:
        max_k = len(X)
    if weights is None:
        weights = ['uniform', 'distance']

    # Make weights into a list if it is not already one
    if not isinstance(weights, list):
        weights = [weights]

    # Check if inputs are valid
    check_pandas_dataframe_nd(X, 'X')
    check_numpy_array_pandas_dataframe_series_1d(y, 'y')
    check_list_of_strings(weights, 'weights')
    check_integer(max_k, 'max_k')
    check_larger(max_k, 'max_k', 1)
    check_integer(keep_best_n, 'keep_best_n')
    check_larger(keep_best_n, 'keep_best_n', 1)

    # Change shape of y if necessary
    y = np.array(y)
    y = y.ravel()

    # Split into train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

    # Get value for max_k
    max_k = min(max_k, len(X_test))

    # Set up results list
    best_model = []
    for k in range(1, max_k):
        for weight in weights:
            model = KNeighborsClassifier(n_neighbors=k, weights=weight).fit(X_train, y_train)
            score = model.score(X_test, y_test)
            best_model.append((k, weight, score))
    best_model.sort(key=lambda x: x[2], reverse=True)
    best_model = best_model[0:keep_best_n]
    return best_model
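# Usage sketch for get_best_k (hedged: assumes the check_* validators used
# above are importable from this codebase; the synthetic DataFrame is an
# illustrative stand-in for real features):
import pandas as pd
from sklearn.datasets import make_classification

X_arr, y_arr = make_classification(n_samples=300, n_features=8, random_state=0)
top_models = get_best_k(pd.DataFrame(X_arr), y_arr, max_k=15, keep_best_n=3)
for k, weight, score in top_models:
    print("k=%2d weights=%-8s accuracy=%.3f" % (k, weight, score))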
def __plot_decision_boundaries(X, y, y_pred, resolution: int = 100, embedding=None):
    if embedding is None:
        embedding = TSNE(n_components=2, random_state=160290).fit_transform(X)

    x_min, x_max = safe_bounds(embedding[:, 0])
    y_min, y_max = safe_bounds(embedding[:, 1])
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, resolution),
                         np.linspace(y_min, y_max, resolution))

    # approximate Voronoi tessellation on a resolution x resolution grid using 1-NN
    background_model = KNeighborsClassifier(n_neighbors=1).fit(embedding, y_pred)
    voronoi_bg = background_model.predict(np.c_[xx.ravel(), yy.ravel()])
    voronoi_bg = voronoi_bg.reshape((resolution, resolution))

    mesh = hv.QuadMesh((xx, yy, voronoi_bg)).opts(cmap="viridis")
    points = hv.Scatter(
        {"x": embedding[:, 0], "y": embedding[:, 1], "pred": y_pred, "class": y},
        kdims=["x", "y"],
        vdims=["pred", "class"],
    )
    errors = y_pred != y
    failed_points = hv.Scatter(
        {"x": embedding[errors, 0], "y": embedding[errors, 1]},
        kdims=["x", "y"],
    ).opts(color="red", size=5, alpha=0.9)

    points = points.opts(color="pred", cmap="viridis", line_color="grey",
                         size=10, alpha=0.8, tools=["hover"])
    plot = mesh * points * failed_points
    plot = plot.opts(xaxis=None, yaxis=None, width=500, height=450,
                     title="Decision boundaries on TSNE")
    return plot
def get_result(self):
    # file opener
    tkinter.Tk().withdraw()
    directory = filedialog.askdirectory()
    result = self.read_emails_from_directory(directory)

    train_labels = np.zeros(1430)
    train_labels[715:1430] = 1
    # This equates to 1-715 = HAM and 716-1430 = SPAM
    # If you change result[n] to something else, make sure you change the
    # same result where test_matrix is built below
    train_matrix = self.extract_features(directory, result[0])
    # print(train_matrix)
    # print("body words:", result[0])
    # print("\n\nsubject words:", result[1])
    # print("\n\nbody phrases:", result[2])
    # print("\n\nsubject phrases:", result[3])
    print("body words:", len(result[0]))
    print("subject words:", len(result[1]))
    print("body phrases:", len(result[2]))
    print("subject phrases:", len(result[3]))

    model1 = MultinomialNB()
    model2 = LinearSVC()
    model3 = RandomForestClassifier()
    model4 = KNeighborsClassifier()
    model1.fit(train_matrix, train_labels)
    model2.fit(train_matrix, train_labels)
    model3.fit(train_matrix, train_labels)
    model4.fit(train_matrix, train_labels)

    test_dir = filedialog.askdirectory()
    # Here -----v
    test_matrix = self.extract_features(test_dir, result[0])
    test_labels = np.zeros(600)
    # This equates to 1-300 = HAM and 301-600 = SPAM
    test_labels[300:600] = 1

    result1 = model1.predict(test_matrix)
    result2 = model2.predict(test_matrix)
    result3 = model3.predict(test_matrix)
    result4 = model4.predict(test_matrix)
    print(confusion_matrix(test_labels, result1))
    print(confusion_matrix(test_labels, result2))
    print(confusion_matrix(test_labels, result3))
    print(confusion_matrix(test_labels, result4))
    return result
def knn_builder():
    pip_knn = Pipeline([("selector", SelectKBest(chi2)),
                        ("knn_clf", KNeighborsClassifier())])
    parameters_knn = {'selector__k': [20],
                      "knn_clf__n_neighbors": [1]}
    scorer_knn = make_scorer(accuracy_score)
    searcher_knn = GridSearchCV(pip_knn, parameters_knn, scoring=scorer_knn)
    return searcher_knn
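# Usage sketch for knn_builder (hedged: the digits dataset is an illustrative
# choice, and the scikit-learn imports used by knn_builder are assumed to be
# in scope; chi2 feature selection requires non-negative features, which the
# 0-16 pixel intensities satisfy):
from sklearn.datasets import load_digits

X, y = load_digits(return_X_y=True)
searcher = knn_builder()
searcher.fit(X, y)
print(searcher.best_params_, searcher.best_score_)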
def reduce_data(self, X, y): X, y = check_X_y(X, y, accept_sparse="csr") if self.classifier == None: self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors) if self.classifier.n_neighbors != self.n_neighbors: self.classifier.n_neighbors = self.n_neighbors classes = np.unique(y) self.classes_ = classes minority_class = self.pos_class if self.pos_class == None: minority_class = min(set(y), key = list(y).count) # loading inicial groups self.groups = [] for label in classes: mask = y == label self.groups = self.groups + [_Group(X[mask], label)] self._main_loop() self._generalization_step() min_groups = filter(lambda g: g.label == minority_class, self.groups) self._merge() self._pruning() max_groups = filter(lambda g: g.label != minority_class, self.groups) self.groups = min_groups + max_groups self.X_ = np.asarray([g.rep_x for g in self.groups]) self.y_ = np.asarray([g.label for g in self.groups]) self.reduction_ = 1.0 - float(len(self.y_))/len(y) return self.X_, self.y_
def __init__(self, estimator=KNeighborsClassifier(n_neighbors=10),
             dimensionality_reduction=PCA(n_components=2),
             acceptance_threshold=0.03,
             n_decision_boundary_keypoints=60,
             n_connecting_keypoints=None,
             n_interpolated_keypoints=None,
             n_generated_testpoints_per_keypoint=15,
             linear_iteration_budget=100,
             hypersphere_iteration_budget=300,
             verbose=True):
    if acceptance_threshold == 0:
        raise Warning("A nonzero acceptance threshold is strongly recommended so the optimizer can finish in finite time")
    if linear_iteration_budget < 2 or hypersphere_iteration_budget < 2:
        raise Exception("Invalid iteration budget")

    self.classifier = estimator
    self.dimensionality_reduction = dimensionality_reduction
    self.acceptance_threshold = acceptance_threshold

    if (n_decision_boundary_keypoints and n_connecting_keypoints and n_interpolated_keypoints
            and n_connecting_keypoints + n_interpolated_keypoints != n_decision_boundary_keypoints):
        raise Exception("n_connecting_keypoints and n_interpolated_keypoints must sum to n_decision_boundary_keypoints (set them to None to use calculated suggestions)")

    # integer division keeps the keypoint counts whole
    self.n_connecting_keypoints = (n_connecting_keypoints if n_connecting_keypoints is not None
                                   else n_decision_boundary_keypoints // 3)
    self.n_interpolated_keypoints = (n_interpolated_keypoints if n_interpolated_keypoints is not None
                                     else n_decision_boundary_keypoints * 2 // 3)

    self.linear_iteration_budget = linear_iteration_budget
    self.n_generated_testpoints_per_keypoint = n_generated_testpoints_per_keypoint
    self.hypersphere_iteration_budget = hypersphere_iteration_budget
    self.verbose = verbose

    self.decision_boundary_points = []
    self.decision_boundary_points_2d = []
    self.X_testpoints = []
    self.y_testpoints = []
    self.background = []
    self.steps = 3

    self.hypersphere_max_retry_budget = 20
    self.penalties_enabled = True
    self.random_gap_selection = False
def fit(self, X, y=None):
    self._sklearn_model = SKLModel(**self._hyperparams)
    if y is not None:
        self._sklearn_model.fit(X, y)
    else:
        self._sklearn_model.fit(X)
    return self
def get_gating(dss, tsf_name, use_gating=UseGating.TREE, *args, **kwargs):
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.neighbors import KNeighborsClassifier  # public import path

    component_scale = [1, 0.2]
    # TODO this is specific to coordinate transform to slice just the body frame reaction force
    # input_slice = slice(3, None)
    input_slice = None

    if use_gating is UseGating.MLP:
        gating = gating_function.MLPSelector(dss, *args, **kwargs, name=tsf_name, input_slice=input_slice)
    elif use_gating is UseGating.KDE:
        gating = gating_function.KDESelector(dss, component_scale=component_scale, input_slice=input_slice)
    elif use_gating is UseGating.GMM:
        opts = {'n_components': 10}
        if kwargs is not None:
            opts.update(kwargs)
        gating = gating_function.GMMSelector(dss, gmm_opts=opts, variational=True,
                                             component_scale=component_scale, input_slice=input_slice)
    elif use_gating is UseGating.TREE:
        gating = gating_function.SklearnClassifierSelector(dss, DecisionTreeClassifier(**kwargs), input_slice=input_slice)
    elif use_gating is UseGating.FORCE:
        gating = gating_function.ReactionForceHeuristicSelector(12, slice(3, None))
    elif use_gating is UseGating.MLP_SKLEARN:
        gating = gating_function.SklearnClassifierSelector(dss, MLPClassifier(**kwargs), input_slice=input_slice)
    elif use_gating is UseGating.KNN:
        gating = gating_function.SklearnClassifierSelector(dss, KNeighborsClassifier(n_neighbors=1, **kwargs), input_slice=input_slice)
    else:
        raise RuntimeError("Unrecognized selector option")
    return gating
def index_nearest_neighbor(self, S, X, y):
    classifier = KNeighborsClassifier(n_neighbors=1)

    U = []
    S_mask = np.array(S, dtype=bool, copy=True)
    for i in range(len(y)):
        real_indexes = np.asarray(range(len(y)))[S_mask]
        X_tra, y_tra = X[S_mask], y[S_mask]
        classifier.fit(X_tra, y_tra)
        # scikit-learn expects a 2-D array for kneighbors
        [[index]] = classifier.kneighbors(X[i].reshape(1, -1), return_distance=False)
        U = U + [real_indexes[index]]
    return U
def __init__(self, n_neighbors=1, alpha=0.6, max_loop=1000, threshold=0, chromosomes_count=10):
    self.n_neighbors = n_neighbors
    self.alpha = alpha
    self.max_loop = max_loop
    self.threshold = threshold
    self.chromosomes_count = chromosomes_count

    self.evaluations = None
    self.chromosomes = None

    self.best_chromosome_ac = -1
    self.best_chromosome_rd = -1

    self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
def __init__(self, n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30,
             p=2, metric='minkowski', metric_params=None, n_jobs=None):
    self._hyperparams = {
        'n_neighbors': n_neighbors,
        'weights': weights,
        'algorithm': algorithm,
        'leaf_size': leaf_size,
        'p': p,
        'metric': metric,
        'metric_params': metric_params,
        'n_jobs': n_jobs}
    self._wrapped_model = Op(**self._hyperparams)
def run_knn_multi_level_classifier(train, train_labels):
    k_range = list(range(2, 5))
    k_scores = []
    for k in k_range:
        knn = KNeighborsClassifier(n_neighbors=k)
        scores = cross_val_score(knn, train, train_labels, cv=10, scoring='accuracy')
        k_scores.append(scores.mean())
    return k_scores
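# Usage sketch for run_knn_multi_level_classifier (hedged: synthetic data
# stands in for the caller's training set; cross_val_score is assumed to be
# imported where the function is defined):
from sklearn.datasets import make_classification

train, train_labels = make_classification(n_samples=200, random_state=1)
scores = run_knn_multi_level_classifier(train, train_labels)
for k, s in zip(range(2, 5), scores):
    print("k=%d mean CV accuracy=%.3f" % (k, s))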
def setclassifier(self, estimator=KNeighborsClassifier(n_neighbors=10)):
    """Assign classifier for which decision boundary should be plotted.

    Parameters
    ----------
    estimator : BaseEstimator instance, optional (default=KNeighborsClassifier(n_neighbors=10)).
        Classifier for which the decision boundary should be plotted. Must have
        probability estimates enabled (i.e. estimator.predict_proba must work).
        Make sure it is possible for probability estimates to get close to 0.5
        (more specifically, as close as specified by acceptance_threshold).
    """
    self.classifier = estimator
def compute_cnn(X, y):
    """Condensed nearest neighbor. The CNN removes redundant instances,
    maintaining the samples in the decision boundaries."""
    classifier = KNeighborsClassifier(n_neighbors=3)

    prots_s = []
    labels_s = []

    classes = np.unique(y)
    classes_ = classes

    # seed the prototype set with one random instance per class
    for cur_class in classes:
        mask = y == cur_class
        insts = X[mask]
        prots_s = prots_s + [insts[np.random.randint(0, insts.shape[0])]]
        labels_s = labels_s + [cur_class]

    classifier.fit(prots_s, labels_s)
    for sample, label in zip(X, y):
        # scikit-learn expects a 2-D array for predict
        if classifier.predict(sample.reshape(1, -1)) != [label]:
            prots_s = prots_s + [sample]
            labels_s = labels_s + [label]
            classifier.fit(prots_s, labels_s)

    X_ = np.asarray(prots_s)
    y_ = np.asarray(labels_s)
    reduction_ = 1.0 - float(len(y_)) / len(y)
    print(reduction_)
def build_and_test_model(classifier, X, Y, Z, param):
    accuracies = []
    ari = []
    for train, test in LeaveOneOut().split(X):
        X_train, Y_train = X[train], Y[train]
        X_test, Y_test, Z_test = X[test], Y[test], Z[test]

        predicted = None
        if classifier == "KNN":
            neigh = KNeighborsClassifier(n_neighbors=param).fit(X_train, Y_train)
            predicted = neigh.predict(X_test)
        elif classifier == "RF":
            clf = RandomForestClassifier(n_estimators=param, random_state=0)  # , max_depth=2
            clf.fit(X_train, Y_train)
            predicted = clf.predict(X_test)
        elif classifier == "SVM":
            clf = svm.SVC(gamma='scale')
            clf.fit(X_train, Y_train)
            predicted = clf.predict(X_test).astype(int)
        elif classifier == "NAIVE":
            clf = GaussianNB()
            clf.fit(X_train, Y_train)
            predicted = clf.predict(X_test).astype(int)
        elif classifier == "RANDOM":
            options = list(set(Y_train))
            predicted = [random.choice(options) for _ in range(len(Y_test))]

        accuracies.append(metrics.accuracy_score(Y_test, predicted))
        ari.append(metrics.adjusted_rand_score(Z_test, predicted))

    return np.mean(accuracies), np.std(accuracies), np.mean(ari), np.std(ari)
class PatchedRawModel:
    def __init__(self):
        self.baseModel = RawModel()
        self.model49 = KNeighborsClassifier(n_neighbors=10)
        self.model35 = KNeighborsClassifier(n_neighbors=10)

    def fit(self, trainExamples):
        self.baseModel.fit(trainExamples)

        X49 = vstack([reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in trainExamples if x.Y in [4, 9]])
        Y49 = [x.Y for x in trainExamples if x.Y in [4, 9]]
        self.model49.fit(X49, Y49)

        X35 = vstack([reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in trainExamples if x.Y in [3, 5]])
        Y35 = [x.Y for x in trainExamples if x.Y in [3, 5]]
        self.model35.fit(X35, Y35)

    def predict(self, examples):
        basePredictions = self.baseModel.predict(examples)

        for (x, y, i) in zip(examples, basePredictions, range(0, len(examples))):
            if y in [4, 9]:
                # predict returns a one-element array; take its scalar value
                specializedPrediction = self.model49.predict(reshape(x.X, (1, x.WIDTH * x.HEIGHT)))[0]
                if specializedPrediction != y:
                    basePredictions[i] = specializedPrediction
            elif y in [3, 5]:
                specializedPrediction = self.model35.predict(reshape(x.X, (1, x.WIDTH * x.HEIGHT)))[0]
                if specializedPrediction != y:
                    basePredictions[i] = specializedPrediction

        return basePredictions
def compute_enn(X, y):
    """The edited nearest neighbors removes the instances in the
    boundaries, maintaining redundant samples."""
    classifier = KNeighborsClassifier(n_neighbors=3)

    classes = np.unique(y)
    classes_ = classes

    mask = np.zeros(y.size, dtype=bool)
    classifier.fit(X, y)

    for i in range(y.size):
        sample, label = X[i], y[i]
        # scikit-learn expects a 2-D array for predict
        if classifier.predict(sample.reshape(1, -1)) == [label]:
            mask[i] = not mask[i]

    X_ = np.asarray(X[mask])
    y_ = np.asarray(y[mask])
    reduction_ = 1.0 - float(len(y_)) / len(y)
    print(reduction_)
class ENN(InstanceReductionMixin):
    """Edited Nearest Neighbors.

    The Edited Nearest Neighbors removes the instances in the
    boundaries, maintaining redundant samples.

    Parameters
    ----------
    n_neighbors : int, optional (default = 3)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.enn import ENN
    >>> import numpy as np
    >>> X = np.array([[-1, 0], [-0.8, 1], [-0.8, -1], [-0.5, 0],
    ...               [0.5, 0], [1, 0], [0.8, 1], [0.8, -1]])
    >>> y = np.array([1, 1, 1, 2, 1, 2, 2, 2])
    >>> editednn = ENN()
    >>> editednn.fit(X, y)
    ENN(n_neighbors=3)
    >>> print(editednn.predict([[-0.6, 0.6]]))
    [1]
    >>> print(editednn.reduction_)
    0.75

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    References
    ----------
    Ruiqin Chang, Zheng Pei, and Chao Zhang. A modified editing k-nearest
    neighbor rule. JCP, 6(7):1493-1500, 2011.
    """

    def __init__(self, n_neighbors=3):
        self.n_neighbors = n_neighbors
        self.classifier = None

    def reduce_data(self, X, y):
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        X, y = check_X_y(X, y, accept_sparse="csr")
        classes = np.unique(y)
        self.classes_ = classes

        if self.n_neighbors >= len(X):
            self.X_ = np.array(X)
            self.y_ = np.array(y)
            self.reduction_ = 0.0
            return self.X_, self.y_

        mask = np.zeros(y.size, dtype=bool)
        tmp_m = np.ones(y.size, dtype=bool)
        for i in range(y.size):
            # leave-one-out edit: keep i only if its neighbors agree with its label
            tmp_m[i] = not tmp_m[i]
            self.classifier.fit(X[tmp_m], y[tmp_m])
            sample, label = X[i], y[i]
            if self.classifier.predict(sample.reshape(1, -1)) == [label]:
                mask[i] = not mask[i]
            tmp_m[i] = not tmp_m[i]

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
class SSMA(InstanceReductionMixin):
    """Steady State Memetic Algorithm

    The Steady-State Memetic Algorithm is an evolutionary prototype
    selection algorithm. It uses a memetic algorithm in order to
    perform a local search in the code.

    Parameters
    ----------
    n_neighbors : int, optional (default = 3)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    alpha : float (default = 0.6)
        Parameter that ponderates the fitness function.

    max_loop : int (default = 1000)
        Number of maximum loops performed by the algorithm.

    threshold : int (default = 0)
        Threshold that regulates the substitution condition.

    chromosomes_count : int (default = 10)
        Number of chromosomes used to find the optimal solution.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.ssma import SSMA
    >>> import numpy as np
    >>> X = np.array([[i] for i in range(100)])
    >>> y = np.asarray(50 * [0] + 50 * [1])
    >>> ssma = SSMA()
    >>> ssma.fit(X, y)
    SSMA(alpha=0.6, chromosomes_count=10, max_loop=1000, threshold=0)
    >>> print(ssma.predict([[40], [60]]))
    [0 1]
    >>> print(ssma.reduction_)
    0.98

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    References
    ----------
    Joaquín Derrac, Salvador García, and Francisco Herrera. Stratified prototype
    selection based on a steady-state memetic algorithm: a study of scalability.
    Memetic Computing, 2(3):183-199, 2010.
    """

    def __init__(self, n_neighbors=1, alpha=0.6, max_loop=1000, threshold=0, chromosomes_count=10):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.max_loop = max_loop
        self.threshold = threshold
        self.chromosomes_count = chromosomes_count

        self.evaluations = None
        self.chromosomes = None

        self.best_chromosome_ac = -1
        self.best_chromosome_rd = -1

        self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

    def accuracy(self, chromosome, X, y):
        mask = np.asarray(chromosome, dtype=bool)
        cX, cy = X[mask], y[mask]
        self.classifier.fit(cX, cy)
        labels = self.classifier.predict(X)
        accuracy = (labels == y).sum()
        return float(accuracy) / len(y)

    def fitness(self, chromosome, X, y):
        # TODO add the possibility of using AUC for factor1
        ac = self.accuracy(chromosome, X, y)
        rd = 1.0 - (float(sum(chromosome)) / len(chromosome))
        return self.alpha * ac + (1.0 - self.alpha) * rd

    def fitness_gain(self, gain, n):
        return self.alpha * (float(gain) / n) + (1 - self.alpha) * (1.0 / n)

    def update_threshold(self, X, y):
        best_index = np.argmax(self.evaluations)
        chromosome = self.chromosomes[best_index]

        best_ac = self.accuracy(chromosome, X, y)
        best_rd = 1.0 - float(sum(chromosome)) / len(y)

        if best_ac <= self.best_chromosome_ac:
            self.threshold = self.threshold + 1
        if best_rd <= self.best_chromosome_rd:
            self.threshold = self.threshold - 1

        self.best_chromosome_ac = best_ac
        self.best_chromosome_rd = best_rd

    def index_nearest_neighbor(self, S, X, y):
        classifier = KNeighborsClassifier(n_neighbors=1)

        U = []
        S_mask = np.array(S, dtype=bool, copy=True)
        for i in range(len(y)):
            real_indexes = np.asarray(range(len(y)))[S_mask]
            X_tra, y_tra = X[S_mask], y[S_mask]
            classifier.fit(X_tra, y_tra)
            # scikit-learn expects a 2-D array for kneighbors
            [[index]] = classifier.kneighbors(X[i].reshape(1, -1), return_distance=False)
            U = U + [real_indexes[index]]
        return U

    def memetic_looper(self, S, R):
        c = 0
        for i in range(len(S)):
            if S[i] == 1 and i not in R:
                c = c + 1
                if c == 2:
                    return True
        return False

    def memetic_select_j(self, S, R):
        indexs = []
        for i in range(len(S)):
            if i not in R and S[i] == 1:
                indexs.append(i)
        # if the list is empty this will raise an error
        return np.random.choice(indexs)

    def generate_population(self, X, y):
        self.chromosomes = [[np.random.choice([0, 1]) for i in range(len(y))]
                            for c in range(self.chromosomes_count)]
        self.evaluations = [self.fitness(c, X, y) for c in self.chromosomes]
        self.update_threshold(X, y)

    def select_parents(self, X, y):
        parents = []
        for i in range(2):
            samples = random.sample(self.chromosomes, 2)
            parents = parents + [samples[0] if self.fitness(samples[0], X, y) >
                                 self.fitness(samples[1], X, y) else samples[1]]
        return np.array(parents, copy=True)

    def crossover(self, parent_1, parent_2):
        size = len(parent_1)
        # integer division keeps the mask length exact
        mask = [0] * (size // 2) + [1] * (size - size // 2)
        mask = np.asarray(mask, dtype=bool)
        np.random.shuffle(mask)
        off_1 = parent_1 * mask + parent_2 * ~mask
        off_2 = parent_2 * mask + parent_1 * ~mask
        return np.asarray([off_1, off_2])

    def mutation(self, offspring):
        for i in range(len(offspring)):
            if np.random.uniform(0, 1) < 1.0 / len(offspring):
                offspring[i] = not offspring[i]
        return offspring

    def memetic_search(self, chromosome, X, y, chromosome_fitness=None):
        S = np.array(chromosome, copy=True)
        if S.sum() == 0:
            return S, 0
        if chromosome_fitness is None:
            chromosome_fitness = self.fitness(chromosome, X, y)
        fitness_s = chromosome_fitness

        # list of visited genes in S
        R = []
        # let U = {u0, u1, ..., un} be the list where ui = classifier(si, S)/i
        U = self.index_nearest_neighbor(S, X, y)

        while self.memetic_looper(S, R):
            j = self.memetic_select_j(S, R)
            S[j] = 0
            gain = 0.0
            U_copy = list(U)

            mask = np.asarray(S, dtype=bool)
            X_tra, y_tra = X[mask], y[mask]
            real_idx = np.asarray(range(len(y)))[mask]

            if len(y_tra) > 0:
                for i in range(len(U)):
                    if U[i] == j:
                        self.classifier.fit(X_tra, y_tra)
                        [[idx]] = self.classifier.kneighbors(X[i].reshape(1, -1),
                                                             n_neighbors=1,
                                                             return_distance=False)
                        U[i] = real_idx[idx]

                    if y[i] == y[U_copy[i]] and y[i] != y[U[i]]:
                        gain = gain - 1.0
                    if y[i] != y[U_copy[i]] and y[i] == y[U[i]]:
                        gain = gain + 1.0

            if gain >= self.threshold:
                n = S.sum()
                g = self.fitness_gain(gain, n)
                fitness_s = fitness_s + g
                R = []
            else:
                U = U_copy
                S[j] = 1
                R.append(j)

        return list(S), fitness_s

    def main_loop(self, X, y):
        self.generate_population(X, y)
        n, worse_fit_index = 0, -1

        while n < self.max_loop:
            parents = self.select_parents(X, y)
            offspring = self.crossover(parents[0], parents[1])
            offspring[0] = self.mutation(offspring[0])
            offspring[1] = self.mutation(offspring[1])
            fit_offs = [self.fitness(off, X, y) if sum(off) > 0 else -1 for off in offspring]

            if worse_fit_index == -1:
                worse_fit_index = np.argmin(self.evaluations)

            for i in range(len(offspring)):
                p_ls = 1.0
                if fit_offs[i] == -1:
                    p_ls = -1
                if fit_offs[i] <= self.evaluations[worse_fit_index]:
                    p_ls = 0.0625

                if np.random.uniform(0, 1) < p_ls:
                    offspring[i], fit_offs[i] = self.memetic_search(
                        offspring[i], X, y, chromosome_fitness=fit_offs[i])

            for i in range(len(offspring)):
                if fit_offs[i] > self.evaluations[worse_fit_index]:
                    self.chromosomes[worse_fit_index] = offspring[i]
                    self.evaluations[worse_fit_index] = fit_offs[i]
                    worse_fit_index = np.argmin(self.evaluations)

            n = n + 1
            if n % 10 == 0:
                self.update_threshold(X, y)

    def reduce_data(self, X, y):
        X, y = check_X_y(X, y, accept_sparse="csr")

        classes = np.unique(y)
        self.classes_ = classes

        self.main_loop(X, y)

        best_index = np.argmax(self.evaluations)
        mask = np.asarray(self.chromosomes[best_index], dtype=bool)
        self.X_ = X[mask]
        self.y_ = y[mask]
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
class InstanceReductionMixin(InstanceReductionBase, ClassifierMixin):
    """Mixin class for all instance reduction techniques"""

    def set_classifier(self, classifier):
        """Set the classifier to be used in the instance reduction
        process and classification.

        Parameters
        ----------
        classifier : classifier, following the KNeighborsClassifier
            style (default = KNN)
        """
        self.classifier = classifier

    def reduce_data(self, X, y):
        """Perform the instance reduction procedure on the given training data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training set.

        y : array-like, shape = [n_samples]
            Labels for X.

        Returns
        -------
        X_ : array-like, shape = [indeterminated, n_features]
            Resulting training set.

        y_ : array-like, shape = [indeterminated]
            Labels for X_.
        """
        pass

    def get_prototypes(self):
        return self.X_, self.y_

    def fit(self, X, y, reduce_data=True):
        """
        Fit the InstanceReduction model according to the given training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
            Note that centroid shrinking cannot be used with sparse matrices.
        y : array, shape = [n_samples]
            Target values (integers)
        reduce_data : bool, flag indicating if the reduction would be performed
        """
        self.X = X
        self.y = y
        self.labels = set(y)
        self.prototypes = None
        self.prototypes_labels = None
        self.reduction_ratio = 0.0

        if reduce_data:
            self.reduce_data(X, y)

        return self

    def predict(self, X, n_neighbors=1):
        """Perform classification on an array of test vectors X.

        The predicted class C for each sample in X is returned.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        C : array, shape = [n_samples]

        Notes
        -----
        The default prediction uses KNeighborsClassifier; if the instance
        reduction algorithm is to be performed with another classifier, this
        method should be explicitly overridden and documented.
        """
        X = check_array(X, accept_sparse="csr")
        if not hasattr(self, "X_") or self.X_ is None:
            raise AttributeError("Model has not been trained yet.")
        if not hasattr(self, "y_") or self.y_ is None:
            raise AttributeError("Model has not been trained yet.")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
        self.classifier.fit(self.X_, self.y_)
        return self.classifier.predict(X)

    def predict_proba(self, X):
        """Return probability estimates for the test data X,
        after a given prototype selection algorithm.

        Parameters
        ----------
        X : array, shape = (n_samples, n_features)
            A 2-D array representing the test points.

        Returns
        -------
        p : array of shape = [n_samples, n_classes], or a list of n_outputs
            of such arrays if n_outputs > 1.
            The class probabilities of the input samples. Classes are ordered
            by lexicographic order.
        """
        self.classifier.fit(self.X_, self.y_)
        return self.classifier.predict_proba(X)
class TomekLinks(InstanceReductionMixin):
    """Tomek Links.

    The Tomek Links algorithm removes pairs of instances that form
    a Tomek link. This technique removes instances in the decision
    region.

    Parameters
    ----------
    n_neighbors : int, optional (default = 3)
        Number of neighbors to use by default in the classification (only).
        The Tomek Links uses only n_neighbors=1 in the reduction.

    keep_class : int, optional (default = None)
        Label of the class not to be removed in the Tomek links. If None,
        it removes all nodes of the links.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.tomek_links import TomekLinks
    >>> import numpy as np
    >>> X = np.array([[0],[1],[2.1],[2.9],[4],[5],[6],[7.1],[7.9],[9]])
    >>> y = np.array([1,1,2,1,2,2,2,1,2,2])
    >>> tl = TomekLinks()
    >>> tl.fit(X, y)
    TomekLinks(keep_class=None)
    >>> print(tl.predict([[2.5],[7.5]]))
    [1, 2]
    >>> print(tl.reduction_)
    0.4

    See also
    --------
    protopy.selection.enn.ENN: edited nearest neighbor

    References
    ----------
    I. Tomek, "Two modifications of cnn," IEEE Transactions on Systems,
    Man and Cybernetics, vol. SMC-6, pp. 769-772, 1976.
    """

    def __init__(self, n_neighbors=3, keep_class=None):
        self.n_neighbors = n_neighbors
        self.classifier = None
        self.keep_class = keep_class

    def reduce_data(self, X, y):
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(
                n_neighbors=self.n_neighbors, algorithm='brute')
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        X, y = check_X_y(X, y, accept_sparse="csr")
        classes = np.unique(y)
        self.classes_ = classes

        self.classifier.fit(X, y)
        nn_idx = self.classifier.kneighbors(X, n_neighbors=2, return_distance=False)
        nn_idx = nn_idx.T[1]

        # instances i and j form a Tomek link if they are mutual nearest
        # neighbors with different labels
        mask = [nn_idx[nn_idx[index]] == index and y[index] != y[nn_idx[index]]
                for index in range(nn_idx.shape[0])]
        mask = ~np.asarray(mask)
        if self.keep_class is not None and self.keep_class in self.classes_:
            mask[y == self.keep_class] = True

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
class SGP2(SGP):
    """Self-Generating Prototypes 2

    The Self-Generating Prototypes 2 is the second version of the
    Self-Generating Prototypes algorithm.
    It has a higher generalization power, including the procedures
    merge and pruning.

    Parameters
    ----------
    r_min : float, optional (default = 0.0)
        Determine the minimum size of a cluster [0.00, 0.20]

    r_mis : float, optional (default = 0.0)
        Determine the error tolerance before splitting a group

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.generation.sgp import SGP2
    >>> import numpy as np
    >>> X = [np.asarray(range(1, 13)) + np.asarray([0.1, 0, -0.1, 0.1, 0, -0.1, 0.1, -0.1, 0.1, -0.1, 0.1, -0.1])]
    >>> X = np.asarray(X).T
    >>> y = np.array([1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 1, 1])
    >>> sgp2 = SGP2()
    >>> sgp2.fit(X, y)
    SGP2(r_min=0.0, r_mis=0.0)
    >>> print(sgp2.reduction_)
    0.5

    See also
    --------
    protopy.generation.SGP: self-generating prototypes
    protopy.generation.sgp.ASGP: adaptive self-generating prototypes

    References
    ----------
    Hatem A. Fayed, Sherif R Hashem, and Amir F Atiya. Self-generating
    prototypes for pattern classification. Pattern Recognition,
    40(5):1498-1509, 2007.
    """

    def __init__(self, r_min=0.0, r_mis=0.0):
        self.groups = None
        self.r_min = r_min
        self.r_mis = r_mis
        self.n_neighbors = 1
        self.classifier = None

    def reduce_data(self, X, y):
        X, y = check_X_y(X, y, accept_sparse="csr")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        classes = np.unique(y)
        self.classes_ = classes

        # load initial groups
        self.groups = []
        for label in classes:
            mask = y == label
            self.groups = self.groups + [_Group(X[mask], label)]

        self._main_loop()
        self._generalization_step()
        self._merge()
        self._pruning()

        self.X_ = np.asarray([g.rep_x for g in self.groups])
        self.y_ = np.asarray([g.label for g in self.groups])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_

    def _merge(self):
        if len(self.groups) < 2:
            return self.groups

        merged = False
        for group in self.groups:
            reps_x = np.asarray([g.rep_x for g in self.groups])
            reps_y = np.asarray([g.label for g in self.groups])
            self.classifier.fit(reps_x, reps_y)

            nn2_idx = self.classifier.kneighbors(group.X, n_neighbors=2, return_distance=False)
            nn2_idx = nn2_idx.T[1]

            # could use a threshold
            if len(set(nn2_idx)) == 1 and reps_y[nn2_idx[0]] == group.label:
                ng_group = self.groups[nn2_idx[0]]
                ng2_idx = self.classifier.kneighbors(ng_group.X, n_neighbors=2, return_distance=False)
                ng2_idx = ng2_idx.T[1]

                if len(set(ng2_idx)) == 1 and self.groups[ng2_idx[0]] == group:
                    group.add_instances(ng_group.X, update=True)
                    self.groups.remove(ng_group)
                    merged = True

        if merged:
            self._merge()

        return self.groups

    def _pruning(self):
        if len(self.groups) < 2:
            return self.groups

        pruned, fst = False, True
        knn = KNeighborsClassifier(n_neighbors=1, algorithm='brute')

        while pruned or fst:
            index = 0
            pruned, fst = False, False

            while index < len(self.groups):
                group = self.groups[index]

                mask = np.ones(len(self.groups), dtype=bool)
                mask[index] = False
                reps_x = np.asarray([g.rep_x for g in self.groups])[mask]
                reps_y = np.asarray([g.label for g in self.groups])[mask]
                labels = knn.fit(reps_x, reps_y).predict(group.X)

                if (labels == group.label).all():
                    self.groups.remove(group)
                    pruned = True
                else:
                    index = index + 1

                if len(self.groups) == 1:
                    index = len(self.groups)
                    pruned = False

        return self.groups
class CNN(InstanceReductionMixin):
    """Condensed Nearest Neighbors.

    Each class is represented by a set of prototypes, with test samples
    classified to the class with the nearest prototype.
    The Condensed Nearest Neighbors removes the redundant instances,
    maintaining the samples in the decision boundaries.

    Parameters
    ----------
    n_neighbors : int, optional (default = 1)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    Attributes
    ----------
    `prototypes_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `labels_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.cnn import CNN
    >>> import numpy as np
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> y = np.array([1, 1, 1, 2, 2, 2])
    >>> cnn = CNN()
    >>> cnn.fit(X, y)
    CNN(n_neighbors=1)
    >>> print(cnn.predict([[-0.8, -1]]))
    [1]

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    Notes
    -----
    The Condensed Nearest Neighbor is one of the first prototype selection
    techniques in the literature.

    References
    ----------
    P. E. Hart, The condensed nearest neighbor rule, IEEE Transactions on
    Information Theory 14 (1968) 515-516.
    """

    def __init__(self, n_neighbors=1):
        self.n_neighbors = n_neighbors
        self.classifier = None

    def reduce_data(self, X, y):
        X, y = check_X_y(X, y, accept_sparse="csr")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)

        prots_s = []
        labels_s = []

        classes = np.unique(y)
        self.classes_ = classes

        # seed the prototype set with one random instance per class
        for cur_class in classes:
            mask = y == cur_class
            insts = X[mask]
            prots_s = prots_s + [insts[np.random.randint(0, insts.shape[0])]]
            labels_s = labels_s + [cur_class]

        self.classifier.fit(prots_s, labels_s)
        for sample, label in zip(X, y):
            # scikit-learn expects a 2-D array for predict
            if self.classifier.predict(sample.reshape(1, -1)) != [label]:
                prots_s = prots_s + [sample]
                labels_s = labels_s + [label]
                self.classifier.fit(prots_s, labels_s)

        self.X_ = np.asarray(prots_s)
        self.y_ = np.asarray(labels_s)
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
def nearest_fit(X, y):
    clf = KNeighborsClassifier(n_neighbors=7, weights='distance')
    return clf.fit(X, y)
def knn_score(X, y, neighbors):
    knn5 = KNeighborsClassifier(n_neighbors=neighbors)
    knn5.fit(X, y)
    y_pred = knn5.predict(X)
    print("KNN{} accuracy_score: {}".format(neighbors, metrics.accuracy_score(y, y_pred)))