import numpy as np

# assumes a project-local KDTree module and the mode_with_random_tie_breaking
# helper (sketched below) are in scope

def knn_classification(k, dist_func, X_train, Y_train, X_predict):
    (m_examples, n_dimensions) = X_train.shape
    # use a kd-tree structure for the knn search
    labelled_points = np.append(X_train, Y_train.reshape(m_examples, 1), axis=1)
    t = KDTree.build_tree(labelled_points, n_dimensions)
    # store results in the predictions vector
    Y_predict = np.empty(X_predict.shape[0])
    # record the number of points searched for benchmark/comparison purposes
    total_points_searched = 0
    # perform a knn search for each test point
    for i, x in enumerate(X_predict):
        (labelled_nearest_neighbors, _, search_space_size) = \
            KDTree.knn_search(t, x, k, n_dimensions, dist_func)
        # nearest-neighbor labels are the last column
        nearest_neighbors_labels = np.array(labelled_nearest_neighbors)[:, -1]
        Y_predict[i] = mode_with_random_tie_breaking(nearest_neighbors_labels)
        total_points_searched += search_space_size
    return Y_predict
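# The mode_with_random_tie_breaking helper used above is not defined in this
# section; the following is a minimal sketch of one plausible implementation,
# assuming ties between equally frequent labels are broken uniformly at random.

import random
from collections import Counter

def mode_with_random_tie_breaking(labels):
    # count the occurrences of each label among the k neighbors
    counts = Counter(labels)
    top_count = max(counts.values())
    # collect every label that attains the top count, then pick one at random
    tied_labels = [label for label, count in counts.items() if count == top_count]
    return random.choice(tied_labels)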
class KNeighborsBase(object):

    def __init__(self):
        self.k_neighbors = None
        self.tree = None

    def fit(self, X, y, k_neighbors=3):
        self.k_neighbors = k_neighbors
        self.tree = KDTree()
        self.tree.build_tree(X, y)

    # 1. get the kd tree
    # 2. build a max-heap
    # 3. build a queue
    # 4. the outer loop updates the max-heap
    # 5. the inner loop traverses the kd tree
    # 6. exit the loop once the heap top is the k-th nearest neighbor
    def knn_search(self, Xi):
        tree = self.tree
        heap = MaxHeap(self.k_neighbors, lambda x: x.dist)
        # the root-to-leaf path followed when searching for Xi
        nd = tree.search(Xi, tree.root)
        # initialize the queue
        que = [(tree.root, nd)]
        while que:
            # compute the distance between Xi and the root node
            nd_root, nd_cur = que.pop(0)
            nd_root.dist = tree.get_eu_dist(Xi, nd_root)
            heap.add(nd_root)
            while nd_cur is not nd_root:
                # compute the distance between Xi and the current node
                nd_cur.dist = tree.get_eu_dist(Xi, nd_cur)
                # update the best nodes and distances seen so far
                heap.add(nd_cur)
                # visit the sibling subtree only if the splitting hyperplane
                # is closer than the current worst distance in the heap
                if nd_cur.brother and (not heap or
                        heap.items[0].dist >
                        tree.get_hyper_plane_dist(Xi, nd_cur.father)):
                    _nd = tree.search(Xi, nd_cur.brother)
                    que.append((nd_cur.brother, _nd))
                nd_cur = nd_cur.father
        return heap

    def _predict(self, Xi):
        raise NotImplementedError

    def predict(self, X):
        return [self._predict(Xi) for Xi in X]
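# The MaxHeap used by knn_search above is external to this section; here is a
# minimal sketch of a bounded max-heap exposing the interface the search relies
# on (add, items with the maximum at index 0, and truthiness), built on the
# standard-library heapq min-heap with negated keys. The original
# implementation may differ.

import heapq

class MaxHeap(object):

    def __init__(self, max_size, key):
        self.max_size = max_size
        self.key = key
        # heapq is a min-heap, so store (-key, insertion order, item);
        # the insertion counter breaks ties without comparing items
        self._heap = []
        self._count = 0

    def add(self, item):
        entry = (-self.key(item), self._count, item)
        self._count += 1
        if len(self._heap) < self.max_size:
            heapq.heappush(self._heap, entry)
        elif entry[0] > self._heap[0][0]:
            # the new key is smaller than the current maximum, so it belongs
            # among the k nearest: evict the root and push the new entry
            heapq.heapreplace(self._heap, entry)

    @property
    def items(self):
        # the root holds the largest key, so items[0].dist is the distance
        # to the current worst neighbor kept in the heap
        return [entry[2] for entry in self._heap]

    def __len__(self):
        return len(self._heap)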
def one_vs_all_knn_classification(k, dist_func, X_train, Y_train, X_predict):
    (m_examples, n_dimensions) = X_train.shape
    # use a kd-tree structure for the knn search; index each training point
    # so neighbors can be mapped back to their labels
    train_indices = np.arange(0, m_examples).reshape(m_examples, 1)
    indexed_points = np.append(X_train, train_indices, axis=1)
    t = KDTree.build_tree(indexed_points, n_dimensions)
    # store results in the predictions vector
    Y_predict = np.empty(X_predict.shape[0])
    # perform a knn search for each test point
    for i, x in enumerate(X_predict):
        indexed_nearest_neighbors = \
            KDTree.knn_search(t, x, k, n_dimensions, dist_func)[0]
        # http://en.wikipedia.org/wiki/Multiclass_classification
        # use a one-vs-all strategy to predict the label,
        # supposing that each class has at least one representative
        possible_labels = set(Y_train)
        zero_based_indexed_integer_labels = range(0, len(possible_labels))
        assert possible_labels.issubset(zero_based_indexed_integer_labels), \
            "accept only zero-based indexed, integer labels"
        # the predicted label will be the one from the classifier that gets
        # the most votes, so store the votes in a table
        classifier_votes_tab = {c: 0 for c in zero_based_indexed_integer_labels}
        for c in zero_based_indexed_integer_labels:
            # binary target for the classifier of class c: 1 if the training
            # point belongs to c, 0 otherwise
            Y_c = np.zeros(m_examples)
            Y_c[Y_train == c] = 1
            # neighbor indices are the last column
            nearest_neighbors_indices = np.array(indexed_nearest_neighbors)[:, -1]
            votes = int(sum(Y_c[nearest_neighbors_indices.astype(int)]))
            classifier_votes_tab[c] = votes
        flattened_table = list(Counter(classifier_votes_tab).elements())
        Y_predict[i] = mode_with_random_tie_breaking(flattened_table)
    return Y_predict
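# The Counter(...).elements() call above flattens the vote table into a list
# with each label repeated once per vote, so the shared tie-breaking helper
# can be reused. A small worked example (vote counts made up for illustration):
#
#     >>> from collections import Counter
#     >>> list(Counter({0: 1, 1: 3, 2: 1}).elements())
#     [0, 1, 1, 1, 2]
#
# mode_with_random_tie_breaking then returns 1, the label whose one-vs-all
# classifier collected the most of the k votes.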
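# Usage sketch for the two classifiers above, assuming the project-local
# KDTree module is importable and dist_func takes two points; the data, the
# seed, and the distance function here are made up purely for illustration.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    X_train = rng.normal(size=(100, 2))
    Y_train = (X_train[:, 0] > 0).astype(int)  # two synthetic classes: 0 and 1
    X_test = rng.normal(size=(10, 2))

    def euclidean(a, b):
        return np.linalg.norm(np.asarray(a) - np.asarray(b))

    print(knn_classification(3, euclidean, X_train, Y_train, X_test))
    print(one_vs_all_knn_classification(3, euclidean, X_train, Y_train, X_test))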