Example #1
def knn_classification(k, dist_func, X_train, Y_train, X_predict):
    (m_examples, n_dimensions) = X_train.shape

    # use kd tree structure for knn searching
    labelled_points = np.append(X_train,
                                Y_train.reshape(m_examples, 1),
                                axis=1)
    t = KDTree.build_tree(labelled_points, n_dimensions)

    # store results in the predictions vector
    Y_predict = np.empty(X_predict.shape[0])

    # record the number of points searched for benchmark/comparison purposes
    total_points_searched = 0

    # perform knn search for each test data
    for i, x in enumerate(X_predict):
        (labelled_nearest_neighbors, _, search_space_size) = \
                KDTree.knn_search(t, x, k, n_dimensions, dist_func)

        # nearest neighbor labels are the last column
        nearest_neighbors_labels = np.array(labelled_nearest_neighbors)[:, -1]
        Y_predict[i] = mode_with_random_tie_breaking(nearest_neighbors_labels)
        total_points_searched += search_space_size

    return Y_predict
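The helper mode_with_random_tie_breaking is used but not defined in this example; a minimal sketch of what it could look like, assuming it takes the neighbors' labels and returns the most frequent one with ties broken at random:

import random
from collections import Counter

def mode_with_random_tie_breaking(labels):
    # Hypothetical stand-in for the helper used above: count how often each
    # label occurs among the k neighbors and return the most frequent one,
    # choosing uniformly at random among labels tied for the highest count.
    counts = Counter(labels)
    highest = max(counts.values())
    tied = [label for label, count in counts.items() if count == highest]
    return random.choice(tied)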
Example #2
class KNeighborsBase(object):

    def __init__(self):
        self.k_neighbors = None
        self.tree = None

    def fit(self, X, y, k_neighbors=3):
        self.k_neighbors = k_neighbors
        self.tree = KDTree()
        self.tree.build_tree(X, y)

    # 1. Get the kd-tree
    # 2. Build a max-heap
    # 3. Build a queue
    # 4. The outer loop updates the max-heap
    # 5. The inner loop traverses the kd-tree
    # 6. Exit the loop once the heap top is the k-th nearest neighbor

    def knn_search(self, Xi):
        tree = self.tree
        heap = MaxHeap(self.k_neighbors, lambda x: x.dist)
        # Path from the root node down to a leaf when searching for Xi
        nd = tree.search(Xi, tree.root)
        # Initialize the queue
        que = [(tree.root, nd)]
        while que:
            # Compute the distance between Xi and the subtree root
            nd_root, nd_cur = que.pop(0)
            nd_root.dist = tree.get_eu_dist(Xi, nd_root)
            heap.add(nd_root)
            while nd_cur is not nd_root:
                # Compute the distance between Xi and the current node
                nd_cur.dist = tree.get_eu_dist(Xi, nd_cur)
                # Update the best nodes and distances
                heap.add(nd_cur)
                # search the brother subtree only when the heap is empty or
                # the splitting hyperplane is closer than the current
                # farthest neighbor in the heap
                if nd_cur.brother and (
                        not heap or
                        heap.items[0].dist >
                        tree.get_hyper_plane_dist(Xi, nd_cur.father)):
                    _nd = tree.search(Xi, nd_cur.brother)
                    que.append((nd_cur.brother, _nd))
                nd_cur = nd_cur.father

        return heap

    def _predict(self, Xi):
        raise NotImplementedError

    def predict(self, X):
        return [self._predict(Xi) for Xi in X]
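_predict is left unimplemented so that classifier and regressor variants can share knn_search; a minimal sketch of a majority-vote classifier subclass, assuming (hypothetically) that each node stored in the heap exposes its training label through a label attribute:

from collections import Counter

class KNeighborsClassifier(KNeighborsBase):

    def _predict(self, Xi):
        # Take the k nearest neighbors of Xi and return the majority label.
        # The `label` attribute is an assumption about the KDTree node
        # structure; adapt it to how the tree actually stores target values.
        heap = self.knn_search(Xi)
        labels = [nd.label for nd in heap.items]
        return Counter(labels).most_common(1)[0][0]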
Example #3
def one_vs_all_knn_classification(k, dist_func, X_train, Y_train, X_predict):
    (m_examples, n_dimensions) = X_train.shape

    # use kd tree structure for knn searching
    train_indices = (np.arange(0, m_examples)).reshape(m_examples, 1)
    indexed_points = np.append(X_train, train_indices, axis=1)
    t = KDTree.build_tree(indexed_points, n_dimensions)

    # store results in the predictions vector
    Y_predict = np.empty(X_predict.shape[0])

    # perform knn search for each test data
    for i, x in enumerate(X_predict):
        indexed_nearest_neighbors = \
                KDTree.knn_search(t, x, k, n_dimensions, dist_func)[0]

        # http://en.wikipedia.org/wiki/Multiclass_classification
        # use one-vs-all strategy to predict the label
        # supposing that each class has at least one representative
        possible_labels = set(Y_train)
        zero_based_indexed_integer_labels = range(0, len(possible_labels))
        assert possible_labels.issubset(zero_based_indexed_integer_labels), \
               "accept only zero-based indexed, integer labels"

        # the predicted label will be the one from the classifier that gives
        # the most votes, so store the votes in a table
        classifier_votes_tab = {
            c: 0
            for c in zero_based_indexed_integer_labels
        }

        for c in zero_based_indexed_integer_labels:
            Y_c = np.zeros(m_examples)
            Y_c[Y_train == c] = 1

            # neighbor indices are the last column
            nearest_neighbors_indices = np.array(indexed_nearest_neighbors)[:, -1]
            votes = int(sum(Y_c[nearest_neighbors_indices.astype(int)]))
            classifier_votes_tab[c] = votes

        flattened_table = list(Counter(classifier_votes_tab).elements())
        Y_predict[i] = mode_with_random_tie_breaking(flattened_table)

    return Y_predict
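A possible way to call this function, assuming a plain Euclidean distance as dist_func and that the KDTree used above is importable; the sample data here is made up for illustration:

import numpy as np

def euclidean_dist(a, b):
    # Any metric with this (point, point) -> float signature can be passed
    # as dist_func; this is the usual Euclidean distance.
    return np.sqrt(np.sum((np.asarray(a) - np.asarray(b)) ** 2))

X_train = np.array([[0.0, 0.0], [0.1, 0.2], [5.0, 5.1], [5.2, 4.9]])
Y_train = np.array([0, 0, 1, 1])  # zero-based integer labels, as asserted above
X_predict = np.array([[0.05, 0.1], [5.1, 5.0]])

Y_predict = one_vs_all_knn_classification(3, euclidean_dist,
                                           X_train, Y_train, X_predict)
# with a correct knn_search this should predict class 0 for the first query
# point and class 1 for the second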