def bench_ball_tree(N=2000, D=3, k=15, leaf_size=30):
    """Benchmark ``BallTree`` against ``skBallTree`` on random data.

    Builds both trees on an ``(N, D)`` array of uniform random points, then
    times a ``k``-neighbors query of the data against itself for the
    reference tree and for every ``dualtree`` / ``breadth_first`` combination
    of the new tree.  Timings, call counts and result agreement are printed.

    Parameters
    ----------
    N : int
        Number of random points.
    D : int
        Dimensionality of each point.
    k : int
        Number of neighbors requested in each query.
    leaf_size : int
        ``leaf_size`` passed to both tree constructors.
    """
    print("Ball Tree")
    X = np.random.random((N, D)).astype(DTYPE)

    # --- construction timings ---------------------------------------------
    t0 = time()
    btskl = skBallTree(X, leaf_size=leaf_size)
    t1 = time()
    bt = BallTree(X, leaf_size=leaf_size)
    t2 = time()

    print("Build:")
    print(" sklearn : %.2g sec" % (t1 - t0))
    print(" new : %.2g sec" % (t2 - t1))

    # --- reference query ----------------------------------------------------
    t0 = time()
    Dskl, Iskl = btskl.query(X, k)
    t1 = time()

    dist = [Dskl]
    ind = [Iskl]
    times = [t1 - t0]
    labels = ['sklearn']
    # No call counter is read from the reference tree; -1 is a placeholder.
    counts = [-1]

    # --- new-tree queries, one per traversal strategy -----------------------
    for dualtree in (False, True):
        for breadth_first in (False, True):
            bt.reset_n_calls()
            t0 = time()
            # Named *_new so the ``D`` parameter is not shadowed.
            D_new, I_new = bt.query(X, k, dualtree=dualtree,
                                    breadth_first=breadth_first)
            t1 = time()

            dist.append(D_new)
            ind.append(I_new)
            times.append(t1 - t0)
            counts.append(bt.get_n_calls())

            label = 'dual/' if dualtree else 'single/'
            label += 'breadthfirst' if breadth_first else 'depthfirst'
            labels.append(label)

    print("Query:")
    for lab, t, c in zip(labels, times, counts):
        print(" %s : %.2g sec (%i calls)" % (lab, t, c))

    # A bare ``print`` is a no-op expression under Python 3 / print_function;
    # print an empty string so the blank separator line is actually emitted
    # (this form also behaves correctly as a Python 2 print statement).
    print("")

    # Each result is compared with its predecessor; index 0 wraps around and
    # is compared with the last entry.
    print(" distances match: %s"
          % ', '.join(['%s' % np.allclose(dist[i - 1], dist[i])
                       for i in range(len(dist))]))
    print(" indices match: %s"
          % ', '.join(['%s' % np.allclose(ind[i - 1], ind[i])
                       for i in range(len(ind))]))
def _check_p_distance_vs_KDT(self, p):
    """Compare Minkowski-``p`` neighbor distances of BallTree and cKDTree.

    Both trees are built on ``self.X`` and queried with ``self.X`` for the
    5 nearest neighbors.  Only the distance arrays are compared; the index
    arrays are ignored.
    """
    points = self.X

    ball_tree = BallTree(points, leaf_size=10, metric='minkowski', p=p)
    kd_tree = cKDTree(points, leafsize=10)

    ball_dist, _ = ball_tree.query(points, k=5)
    kd_dist, _ = kd_tree.query(points, k=5, p=p)

    assert_array_almost_equal(ball_dist, kd_dist)
def check_neighbors(dualtree, breadth_first, k, metric, kwargs):
    """Check BallTree neighbor distances against a brute-force search.

    A tree is built on ``X`` with ``leaf_size=1`` and queried with ``Y``
    using the requested traversal options; ``X`` and ``Y`` come from the
    enclosing scope.
    """
    query_options = dict(dualtree=dualtree, breadth_first=breadth_first)

    tree = BallTree(X, leaf_size=1, metric=metric, **kwargs)
    tree_dist, _ = tree.query(Y, k, **query_options)
    brute_dist, _ = brute_force_neighbors(X, Y, k, metric, **kwargs)

    # Indices are deliberately not compared: duplicate distances can make
    # the neighbor order differ, while the distances themselves must agree.
    assert_allclose(tree_dist, brute_dist)
def bench_ball_tree(N=2000, D=3, k=15, leaf_size=30):
    """Benchmark ``BallTree`` against ``skBallTree`` on random data.

    Builds both trees on an ``(N, D)`` array of uniform random points, then
    times a ``k``-neighbors query of the data against itself for the
    reference tree and for every ``dualtree`` / ``breadth_first`` combination
    of the new tree.  Timings, call counts and result agreement are printed.

    Parameters
    ----------
    N : int
        Number of random points.
    D : int
        Dimensionality of each point.
    k : int
        Number of neighbors requested in each query.
    leaf_size : int
        ``leaf_size`` passed to both tree constructors.
    """
    print("Ball Tree")
    X = np.random.random((N, D)).astype(DTYPE)

    # --- construction timings ---------------------------------------------
    t0 = time()
    btskl = skBallTree(X, leaf_size=leaf_size)
    t1 = time()
    bt = BallTree(X, leaf_size=leaf_size)
    t2 = time()

    print("Build:")
    print(" sklearn : %.2g sec" % (t1 - t0))
    print(" new : %.2g sec" % (t2 - t1))

    # --- reference query ----------------------------------------------------
    t0 = time()
    Dskl, Iskl = btskl.query(X, k)
    t1 = time()

    dist = [Dskl]
    ind = [Iskl]
    times = [t1 - t0]
    labels = ['sklearn']
    # No call counter is read from the reference tree; -1 is a placeholder.
    counts = [-1]

    # --- new-tree queries, one per traversal strategy -----------------------
    for dualtree in (False, True):
        for breadth_first in (False, True):
            bt.reset_n_calls()
            t0 = time()
            # Named *_new so the ``D`` parameter is not shadowed.
            D_new, I_new = bt.query(X, k, dualtree=dualtree,
                                    breadth_first=breadth_first)
            t1 = time()

            dist.append(D_new)
            ind.append(I_new)
            times.append(t1 - t0)
            counts.append(bt.get_n_calls())

            label = 'dual/' if dualtree else 'single/'
            label += 'breadthfirst' if breadth_first else 'depthfirst'
            labels.append(label)

    print("Query:")
    for lab, t, c in zip(labels, times, counts):
        print(" %s : %.2g sec (%i calls)" % (lab, t, c))

    # A bare ``print`` is a no-op expression under Python 3 / print_function;
    # print an empty string so the blank separator line is actually emitted
    # (this form also behaves correctly as a Python 2 print statement).
    print("")

    # Each result is compared with its predecessor; index 0 wraps around and
    # is compared with the last entry.
    print(" distances match: %s"
          % ', '.join(['%s' % np.allclose(dist[i - 1], dist[i])
                       for i in range(len(dist))]))
    print(" indices match: %s"
          % ', '.join(['%s' % np.allclose(ind[i - 1], ind[i])
                       for i in range(len(ind))]))
def test_ball_tree_query():
    """BallTree and cKDTree must return the same neighbor distances.

    Uses 100 random 5-dimensional points, queried against themselves for
    k = 2, 4 and 6 with each tree's default settings.
    """
    points = np.random.random(size=(100, 5))

    for n_neighbors in (2, 4, 6):
        ball_tree = BallTree(points)
        kd_tree = cKDTree(points)

        ball_dist, _ = ball_tree.query(points, k=n_neighbors)
        kd_dist, _ = kd_tree.query(points, k=n_neighbors)

        # Only distances are compared; the index arrays are discarded.
        assert_array_almost_equal(ball_dist, kd_dist)
def test_ball_tree_p_distance():
    """Minkowski BallTree distances must match cKDTree for several ``p``.

    100 random 5-dimensional points are queried against themselves for
    their 5 nearest neighbors with p in (1, 2, 3, 4, inf).
    """
    points = np.random.random(size=(100, 5))
    exponents = (1, 2, 3, 4, np.inf)

    for p in exponents:
        ball_tree = BallTree(points, leaf_size=10, metric="minkowski", p=p)
        kd_tree = cKDTree(points, leafsize=10)

        ball_dist, _ = ball_tree.query(points, k=5)
        kd_dist, _ = kd_tree.query(points, k=5, p=p)

        # Index arrays are not compared, only the distances.
        assert_array_almost_equal(ball_dist, kd_dist)
def test_ball_tree_p_distance():
    """Check BallTree Minkowski distances against cKDTree for each ``p``.

    The data set is 100 random points in 5 dimensions; every point is used
    as a query and its 5 nearest neighbors are requested from both trees.
    """
    data = np.random.random(size=(100, 5))

    for p in (1, 2, 3, 4, np.inf):
        tree_ball = BallTree(data, leaf_size=10, metric='minkowski', p=p)
        tree_kd = cKDTree(data, leafsize=10)

        dist_ball, _ = tree_ball.query(data, k=5)
        dist_kd, _ = tree_kd.query(data, k=5, p=p)

        # The neighbor indices are ignored; distances must agree.
        assert_array_almost_equal(dist_ball, dist_kd)
def _check_metrics_bool(self, k, metric, kwargs):
    """Check BallTree k-neighbor distances for ``metric`` on boolean data.

    The tree is built on ``self.Xbool`` and queried with ``self.Ybool``.
    The reference comes from the full ``DistanceMetric.cdist`` matrix,
    keeping the ``k`` smallest entries of each row.
    """
    train, queries = self.Xbool, self.Ybool

    tree = BallTree(train, metric=metric, **kwargs)
    tree_dist, _ = tree.query(queries, k=k)

    # Brute-force reference: sort every row of the distance matrix and keep
    # the first k columns.
    dist_metric = DistanceMetric(metric=metric, **kwargs)
    full = dist_metric.cdist(queries, train)
    nearest = np.argsort(full, 1)[:, :k]
    row_index = np.arange(queries.shape[0])[:, None]
    brute_dist = full[row_index, nearest]

    # Indices are not compared: ties for nearest neighbor are very common
    # here and would make the test fail although the distances are correct.
    assert_array_almost_equal(tree_dist, brute_dist)
def _check_metrics_float(self, k, metric, kwargs):
    """Check BallTree k-neighbor distances for ``metric`` on ``self.X``.

    The tree is built on and queried with ``self.X``.  The reference comes
    from the square ``DistanceMetric.pdist`` matrix, keeping the ``k``
    smallest entries of each row.
    """
    points = self.X

    tree = BallTree(points, metric=metric, **kwargs)
    tree_dist, _ = tree.query(points, k=k)

    # Brute-force reference from the full pairwise distance matrix.
    dist_metric = DistanceMetric(metric=metric, **kwargs)
    full = dist_metric.pdist(points, squareform=True)
    nearest = np.argsort(full, 1)[:, :k]
    row_index = np.arange(points.shape[0])[:, None]
    brute_dist = full[row_index, nearest]

    # Indices are not compared: a tie for nearest neighbor could make them
    # differ, whereas the distances show whether the search succeeded.
    assert_array_almost_equal(tree_dist, brute_dist)
def test_ball_tree_pickle():
    """Yield one pickle round-trip check per protocol (0, 1 and 2).

    A BallTree built on 10 seeded random 3-d points is pickled and restored;
    the restored tree must give the same query results as the original.
    """
    import pickle

    np.random.seed(0)
    points = np.random.random((10, 3))
    original = BallTree(points, leaf_size=1)
    # The two arrays returned by query, kept in the order they are returned.
    ref_first, ref_second = original.query(points)

    def check_pickle_protocol(protocol):
        payload = pickle.dumps(original, protocol=protocol)
        restored = pickle.loads(payload)
        got_first, got_second = restored.query(points)
        assert_allclose(ref_first, got_first)
        assert_allclose(ref_second, got_second)

    for protocol in (0, 1, 2):
        yield check_pickle_protocol, protocol
def kneighbors_graph(X, n_neighbors, weight=None, ball_tree=None,
                     window_size=1):
    """Computes the (weighted) graph of k-Neighbors

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Coordinates of samples. One sample per row.

    n_neighbors : int
        Number of neighbors for each sample.

    weight : None (default), "distance" or "barycenter"
        Weights to apply on graph edges.
        If weight is None then no weighting is applied (1 for each edge).
        If weight equals "distance" the edge weight is the distance
        returned by the ball tree query.
        If weight equals "barycenter" the weights are barycenter weights
        estimated by solving a linear system for each point.

    ball_tree : None or instance of precomputed BallTree
        When None, a BallTree is built from X and window_size.

    window_size : int
        Window size passed to the BallTree constructor (only used when
        ball_tree is None).

    Returns
    -------
    A : sparse matrix, shape = [n_samples, ball_tree.size]
        A is returned as a scipy.sparse LIL (list-of-lists) matrix.
        A[i, j] = weight of the edge that connects i to j.

    Raises
    ------
    ValueError
        If weight is not one of None, "distance" or "barycenter".

    Examples
    --------
    >>> X = [[0], [2], [1]]
    >>> from scikits.learn.neighbors import kneighbors_graph
    >>> A = kneighbors_graph(X, 2)
    >>> A.todense()
    matrix([[ 1.,  0.,  1.],
            [ 0.,  1.,  1.],
            [ 0.,  1.,  1.]])
    """
    from scipy import sparse

    X = np.asanyarray(X)
    n_samples = X.shape[0]
    if ball_tree is None:
        ball_tree = BallTree(X, window_size)

    A = sparse.lil_matrix((n_samples, ball_tree.size))
    dist, ind = ball_tree.query(X, k=n_neighbors)

    # NOTE: string options are compared with ``==``.  The previous ``is``
    # comparison tested object identity and rejected equal strings that were
    # not the very same interned object (e.g. built or read at run time).
    if weight is None:
        for i, li in enumerate(ind):
            if n_neighbors > 1:
                A[i, list(li)] = np.ones(n_neighbors)
            else:
                A[i, li] = 1.0
    elif weight == "distance":
        for i, li in enumerate(ind):
            if n_neighbors > 1:
                A[i, list(li)] = dist[i, :]
            else:
                A[i, li] = dist[i, 0]
    elif weight == "barycenter":
        # XXX : the next loop could be done in parallel
        # by parallelizing groups of indices
        for i, li in enumerate(ind):
            if n_neighbors > 1:
                X_i = ball_tree.data[li]
                A[i, list(li)] = barycenter_weights(X[i], X_i)
            else:
                A[i, li] = 1.0
    else:
        raise ValueError("Unknown weight type")
    return A
# Compare the pure-Python SlowBallTree (single- and dual-tree queries) with
# the compiled BallTree on random data, reporting timings and agreement.
#
# All output uses single-argument ``print(...)`` calls with %-formatting:
# the original Python 2 print statements are syntax errors on Python 3,
# while this form produces the same text on both.

# Draw and report the seed so a failing run can be reproduced.
rseed = np.random.randint(100000)
print("rseed = %i" % rseed)
np.random.seed(rseed)

X = np.random.random((200, 3))
Y = np.random.random((100, 3))

# Pure-Python tree, single-tree query (build time is included).
t0 = time()
SBT = SlowBallTree(X, leaf_size=10)
d1, n1 = SBT.query(Y, 3)
t1 = time()
print("python: %.2g sec" % (t1 - t0))

# Pure-Python tree, dual-tree query (tree rebuilt so timings are comparable).
t0 = time()
SBT = SlowBallTree(X, leaf_size=10)
d1a, n1a = SBT.query_dual(Y, 3)
t1 = time()
print("python dual: %.2g sec" % (t1 - t0))

# Compiled tree (build time is included).
t0 = time()
BT = BallTree(X, leaf_size=10)
d2, n2 = BT.query(Y, 3)
t1 = time()
print("cython: %.2g sec" % (t1 - t0))

print("neighbors match: %s %s" % (np.allclose(n1, n2), np.allclose(n1a, n1)))
print("distances match: %s %s" % (np.allclose(d1, d2), np.allclose(d1a, d1)))
from time import time

import numpy as np

from ball_tree import BallTree

# Benchmark BallTree construction, k-neighbors queries (single and dual tree)
# and radius queries on 10000 random 3-d points.
#
# Output uses single-argument ``print(...)`` calls: the original Python 2
# print statements are syntax errors on Python 3, while this form produces
# the same text on both.

X = np.random.random((10000, 3))

t0 = time()
BT = BallTree(X, 30)
t1 = time()
print("construction: %.2g sec" % (t1 - t0))

for k in [1, 2, 4, 8]:
    for dual in (False, True):
        t0 = time()
        BT.query(X, k, dualtree=dual)
        t1 = time()
        dual_str = ' (dual)' if dual else ''
        print("query %i in [%i, %i]%s: %.3g sec"
              % (k, X.shape[0], X.shape[1], dual_str, t1 - t0))

for r in 0.1, 0.3, 0.5:
    t0 = time()
    # Only the first 1000 points are used as radius-query centers.
    BT.query_radius(X[:1000], r)
    t1 = time()
    print("query r<%.1f in [%i, %i]: %.3g sec"
          % (r, X.shape[0], X.shape[1], t1 - t0))
class Neighbors(BaseEstimator, ClassifierMixin):
    """Classifier implementing k-Nearest Neighbor Algorithm.

    Parameters
    ----------
    n_neighbors : int
        default number of neighbors.

    window_size : int
        Window size passed to BallTree

    Examples
    --------
    >>> samples = [[0.,0.,1.], [1.,0.,0.], [2.,2.,2.], [2.,5.,4.]]
    >>> labels = [0,0,1,1]
    >>> from scikits.learn.neighbors import Neighbors
    >>> neigh = Neighbors(n_neighbors=3)
    >>> neigh.fit(samples, labels)
    Neighbors(n_neighbors=3, window_size=1)
    >>> print neigh.predict([[0,0,0]])
    [ 0.]

    Notes
    -----
    http://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm
    """

    def __init__(self, n_neighbors=5, window_size=1):
        """Internally uses the ball tree datastructure and algorithm for fast
        neighbors lookups on high dimensional datasets.
        """
        self.n_neighbors = n_neighbors
        self.window_size = window_size

    def fit(self, X, Y=()):
        """Index the training data and store its labels.

        Parameters
        ----------
        X : array-like, shape (n, k)
            The data points to be indexed. This array is not copied, and so
            modifying this data will result in bogus results.

        Y : array
            An array representing labels for the data (only arrays of
            integers are supported).

        Returns
        -------
        self : Neighbors
            The fitted estimator.
        """
        # Y must be integer-valued because it is later used as an index.
        # The builtin ``int`` replaces ``np.int``, which was only an alias
        # for it and has been removed from recent NumPy releases.
        self.Y = np.asanyarray(Y, dtype=int)
        self.ball_tree = BallTree(X, self.window_size)
        return self

    def kneighbors(self, data, n_neighbors=None):
        """Finds the K-neighbors of a point.

        Parameters
        ----------
        data : array-like
            The new point or points.

        n_neighbors : int
            Number of neighbors to get (default is the value
            passed to the constructor).

        Returns
        -------
        dist : array
            Array representing the lengths to point.

        ind : array
            Array representing the indices of the nearest points in the
            population matrix.

        Examples
        --------
        In the following example, we construct a Neighbors class from an
        array representing our data set and ask who's the closest point to
        [1,1,1]

        >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]
        >>> labels = [0, 0, 1]
        >>> from scikits.learn.neighbors import Neighbors
        >>> neigh = Neighbors(n_neighbors=1)
        >>> neigh.fit(samples, labels)
        Neighbors(n_neighbors=1, window_size=1)
        >>> print neigh.kneighbors([1., 1., 1.])
        (array(0.5), array(2))

        As you can see, it returns [0.5], and [2], which means that the
        element is at distance 0.5 and is the third element of samples
        (indexes start at 0). You can also query for multiple points:

        >>> print neigh.kneighbors([[0., 1., 0.], [1., 0., 1.]])
        (array([ 0.5       ,  1.11803399]), array([1, 2]))
        """
        if n_neighbors is None:
            n_neighbors = self.n_neighbors
        return self.ball_tree.query(data, k=n_neighbors)

    def predict(self, T, n_neighbors=None):
        """Predict the class labels for the provided data.

        Parameters
        ----------
        T : array
            A 2-D array representing the test point.

        n_neighbors : int
            Number of neighbors to get (default is the value
            passed to the constructor).

        Returns
        -------
        labels : array
            List of class labels (one for each data sample).

        Examples
        --------
        >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]
        >>> labels = [0, 0, 1]
        >>> from scikits.learn.neighbors import Neighbors
        >>> neigh = Neighbors(n_neighbors=1)
        >>> neigh.fit(samples, labels)
        Neighbors(n_neighbors=1, window_size=1)
        >>> print neigh.predict([.2, .1, .2])
        0
        >>> print neigh.predict([[0., -1., 0.], [3., 2., 0.]])
        [0 1]
        """
        T = np.asanyarray(T)
        if n_neighbors is None:
            n_neighbors = self.n_neighbors
        return _predict_from_BallTree(self.ball_tree, self.Y, T, n_neighbors)
def test_pickle(self):
    """Yield one ``_check_pickle`` case per pickle protocol (0, 1 and 2).

    Each case receives the protocol, a BallTree built on ``self.X`` with
    ``leaf_size=1``, and the two arrays that tree returns when queried with
    ``self.X`` (passed in the order returned by ``query``).
    """
    tree = BallTree(self.X, leaf_size=1)
    ref_first, ref_second = tree.query(self.X)

    protocols = (0, 1, 2)
    for protocol in protocols:
        yield (self._check_pickle, protocol, tree, ref_first, ref_second)
def check_neighbors(metric):
    """Check BallTree neighbor distances for ``metric`` against brute force.

    ``X``, ``Y`` and ``k`` are taken from the enclosing scope.  Only the
    distance arrays are compared; the index arrays are discarded.
    """
    tree = BallTree(X, leaf_size=1, metric=metric)
    tree_dist, _ = tree.query(Y, k)
    brute_dist, _ = brute_force_neighbors(X, Y, k, metric)
    assert_allclose(tree_dist, brute_dist)
# Benchmark BallTree against KDTree: construction time, then single- and
# dual-tree k-neighbors queries for several k.
#
# Output uses single-argument ``print(...)`` calls: the original Python 2
# print statements are syntax errors on Python 3, while this form produces
# the same text on both.

t0 = time()
BT = BallTree(X, 30)
t1 = time()
print("BT construction: %.2g sec" % (t1 - t0))

t0 = time()
KDT = KDTree(X, 30)
t1 = time()
print("KDT construction: %.2g sec" % (t1 - t0))

for k in 1, 2, 4, 8:
    print("\nquery %i in [%i, %i]:" % (k, X.shape[0], X.shape[1]))
    print(" single dual")

    t0 = time()
    d1, i1 = BT.query(X_query, k, dualtree=False)
    t1 = time()
    d1, i1 = BT.query(X_query, k, dualtree=True)
    t2 = time()
    print(" BT: %.3g sec %.3g sec" % (t1 - t0, t2 - t1))

    # Restart the clock: reusing the pre-print ``t2`` would charge the time
    # spent printing the BT line to the KDT single-tree measurement.
    t2 = time()
    d2, i2 = KDT.query(X_query, k, dualtree=False)
    t3 = time()
    d2, i2 = KDT.query(X_query, k, dualtree=True)
    t4 = time()
    print(" KDT: %.3g sec %.3g sec" % (t3 - t2, t4 - t3))

    # Compares the dual-tree results of the two trees (the last values
    # bound to d1 and d2).
    print(" (results match: %s)" % np.allclose(d1, d2))

#for r in 0.1, 0.3, 0.5:
#    for tree in (BT, KDT):
#        t0 = time()