def test_ball_tree_KDE(n_samples=100, n_features=3):
    """Yield kernel-density checks of BallTree against a slow reference.

    One check is yielded for every combination of kernel, bandwidth ``h``,
    ``atol``, ``rtol``, ``dualtree`` and ``breadth_first``.  Combinations
    with ``dualtree`` enabled and a non-zero ``rtol`` are skipped.
    """
    np.random.seed(0)
    X = np.random.random((n_samples, n_features))
    Y = np.random.random((n_samples, n_features))
    bt = BallTree(X, leaf_size=10)

    kernels = ['gaussian', 'tophat', 'epanechnikov',
               'exponential', 'linear', 'cosine']

    for kernel in kernels:
        for h in [0.001, 0.01, 0.1]:
            dens_true = compute_kernel_slow(Y, X, kernel, h)

            # ``dens_true`` is bound as a default argument so that each
            # yielded check keeps the reference computed for its own
            # (kernel, h) pair.  A plain closure would see whatever value
            # the variable holds when the check finally runs, which is the
            # last one computed if the runner collects all checks first.
            def check_results(kernel, h, atol, rtol, dualtree, breadth_first,
                              dens_true=dens_true):
                dens = bt.kernel_density(Y, h, atol=atol, rtol=rtol,
                                         kernel=kernel, dualtree=dualtree,
                                         breadth_first=breadth_first)
                assert_allclose(dens, dens_true, atol=atol, rtol=rtol)

            for rtol in [0, 1E-5]:
                for atol in [1E-10, 1E-5, 0.1]:
                    for dualtree in (True, False):
                        if dualtree and rtol > 0:
                            continue
                        for breadth_first in (True, False):
                            yield (check_results, kernel, h, atol, rtol,
                                   dualtree, breadth_first)
def bench_ball_tree(N=2000, D=3, k=15, leaf_size=30):
    """Time tree construction and k-neighbor queries and print a report.

    Builds ``skBallTree`` and ``BallTree`` over the same random ``(N, D)``
    data, runs ``query(X, k)`` on the former and on every
    ``dualtree``/``breadth_first`` combination of the latter, then prints
    timings, call counts and whether consecutive results agree.
    """
    print("Ball Tree")
    X = np.random.random((N, D)).astype(DTYPE)

    t0 = time()
    btskl = skBallTree(X, leaf_size=leaf_size)
    t1 = time()
    bt = BallTree(X, leaf_size=leaf_size)
    t2 = time()

    print("Build:")
    print(" sklearn : %.2g sec" % (t1 - t0))
    print(" new : %.2g sec" % (t2 - t1))

    t0 = time()
    Dskl, Iskl = btskl.query(X, k)
    t1 = time()

    dist = [Dskl]
    ind = [Iskl]
    times = [t1 - t0]
    labels = ['sklearn']
    counts = [-1]  # no call counter is read for the sklearn tree

    for dualtree in (False, True):
        for breadth_first in (False, True):
            bt.reset_n_calls()
            t0 = time()
            # distinct local names so the ``D`` parameter is not overwritten
            dist_new, ind_new = bt.query(X, k, dualtree=dualtree,
                                         breadth_first=breadth_first)
            t1 = time()

            dist.append(dist_new)
            ind.append(ind_new)
            times.append(t1 - t0)
            counts.append(bt.get_n_calls())

            label = 'dual/' if dualtree else 'single/'
            label += 'breadthfirst' if breadth_first else 'depthfirst'
            labels.append(label)

    print("Query:")
    for lab, t, c in zip(labels, times, counts):
        print(" %s : %.2g sec (%i calls)" % (lab, t, c))

    # A bare ``print`` is a no-op expression on Python 3; print an empty
    # string so the blank separator line appears on Python 2 and 3 alike.
    print("")

    # Index i - 1 wraps around at i == 0, so the first entry is compared
    # with the last one.
    print(
        " distances match: %s" % ', '.join([
            '%s' % np.allclose(dist[i - 1], dist[i])
            for i in range(len(dist))
        ]))
    print(
        " indices match: %s" % ', '.join(
            ['%s' % np.allclose(ind[i - 1], ind[i])
             for i in range(len(ind))]))
def _check_p_distance_vs_KDT(self, p):
    """Assert BallTree and cKDTree agree on 5-neighbor distances for ``p``.

    Both trees are built over ``self.X`` and queried with ``self.X``; the
    BallTree uses the 'minkowski' metric with exponent ``p``.
    """
    n_neighbors = 5
    ball_tree = BallTree(self.X, leaf_size=10, metric='minkowski', p=p)
    kd_tree = cKDTree(self.X, leafsize=10)

    ball_dist, _ = ball_tree.query(self.X, k=n_neighbors)
    kd_dist, _ = kd_tree.query(self.X, k=n_neighbors, p=p)

    # only the distances are compared, not the indices
    assert_array_almost_equal(ball_dist, kd_dist)
def test_ball_tree_query():
    """Check BallTree k-neighbor distances against cKDTree for k in 2, 4, 6."""
    points = np.random.random(size=(100, 5))

    for n_neighbors in (2, 4, 6):
        ball_tree = BallTree(points)
        kd_tree = cKDTree(points)

        ball_dist, _ = ball_tree.query(points, k=n_neighbors)
        kd_dist, _ = kd_tree.query(points, k=n_neighbors)

        assert_array_almost_equal(ball_dist, kd_dist)
def test_ball_tree_p_distance():
    """Check Minkowski-p neighbor distances against cKDTree for several p."""
    points = np.random.random(size=(100, 5))
    n_neighbors = 5

    for exponent in (1, 2, 3, 4, np.inf):
        ball_tree = BallTree(points, leaf_size=10, metric='minkowski',
                             p=exponent)
        kd_tree = cKDTree(points, leafsize=10)

        ball_dist, _ = ball_tree.query(points, k=n_neighbors)
        kd_dist, _ = kd_tree.query(points, k=n_neighbors, p=exponent)

        assert_array_almost_equal(ball_dist, kd_dist)
def check_neighbors(dualtree, breadth_first, k, metric, kwargs):
    """Assert BallTree distances for ``Y`` match ``brute_force_neighbors``.

    ``X`` and ``Y`` come from the enclosing scope; ``kwargs`` is forwarded
    to both the tree constructor and the brute-force helper.
    """
    tree = BallTree(X, leaf_size=1, metric=metric, **kwargs)
    tree_dist, _ = tree.query(Y, k, dualtree=dualtree,
                              breadth_first=breadth_first)
    brute_dist, _ = brute_force_neighbors(X, Y, k, metric, **kwargs)

    # Indices are deliberately not compared: with duplicate distances the
    # index order may differ, while the distances themselves do not.
    assert_allclose(tree_dist, brute_dist)
def main():
    """Run ``check`` for both setups, then compare the two implementations.

    Prints the result of comparing ``BallTree`` and ``BruteForce``
    ``radius_neighbors`` output for one query point of a random data set.
    """
    statement = 'nbrs.radius_neighbors(p, radius)'
    check(statement, setup_str_bt)
    check(statement, setup_str_bf)

    n, d = 1000, 3
    X = np.random.rand(n, d)
    p = X[0]
    radius = 0.4

    ball_tree_inds = BallTree(X).radius_neighbors(p, radius)
    brute_force_inds = BruteForce(X).radius_neighbors(p, radius)
    inds_equal = ball_tree_inds == brute_force_inds
    print(inds_equal)
def _check_metrics_float(self, k, metric, kwargs):
    """Assert BallTree k-neighbor distances on ``self.X`` match DistanceMetric.

    The reference is the ``k`` smallest entries of each row of the full
    pairwise distance matrix.
    """
    data = self.X
    tree = BallTree(data, metric=metric, **kwargs)
    tree_dist, _ = tree.query(data, k=k)

    reference_metric = DistanceMetric(metric=metric, **kwargs)
    pairwise = reference_metric.pdist(data, squareform=True)

    nearest_cols = np.argsort(pairwise, 1)[:, :k]
    row_ids = np.arange(data.shape[0])[:, None]
    reference_dist = pairwise[row_ids, nearest_cols]

    # Indices are not checked: a tie for nearest neighbor could make the
    # test fail, whereas the distances show whether the search succeeded.
    assert_array_almost_equal(tree_dist, reference_dist)
def _check_metrics_bool(self, k, metric, kwargs):
    """Assert BallTree distances from ``self.Ybool`` to ``self.Xbool`` match.

    The reference is the ``k`` smallest entries of each row of the
    cross-distance matrix computed by DistanceMetric.
    """
    train, queries = self.Xbool, self.Ybool
    tree = BallTree(train, metric=metric, **kwargs)
    tree_dist, _ = tree.query(queries, k=k)

    reference_metric = DistanceMetric(metric=metric, **kwargs)
    cross = reference_metric.cdist(queries, train)

    nearest_cols = np.argsort(cross, 1)[:, :k]
    row_ids = np.arange(queries.shape[0])[:, None]
    reference_dist = cross[row_ids, nearest_cols]

    # Indices are not checked: ties for nearest neighbor are very common
    # here and would fail the test, while distances are correct either way.
    assert_array_almost_equal(tree_dist, reference_dist)
def test_ball_tree_query_radius_count(n_samples=100, n_features=10):
    """Check ``query_radius(count_only=True)`` against a pairwise count.

    The radius is the mean of the full pairwise distance matrix.
    """
    points = 2 * np.random.random(size=(n_samples, n_features)) - 1

    pairwise = DistanceMetric().pdist(points, squareform=True)
    radius = np.mean(pairwise)

    tree = BallTree(points)
    tree_counts = tree.query_radius(points, radius, count_only=True)
    brute_counts = (pairwise <= radius).sum(1)

    assert_array_almost_equal(tree_counts, brute_counts)
def test_query_radius_count(self):
    """Check ``query_radius(count_only=True)`` on rescaled ``self.X``.

    Counts from the tree are compared with counts taken directly from the
    pairwise distance matrix, using its mean as the radius.
    """
    # center the data
    centered = 2 * self.X - 1

    pairwise = DistanceMetric().pdist(centered, squareform=True)
    radius = np.mean(pairwise)

    tree = BallTree(centered)
    tree_counts = tree.query_radius(centered, radius, count_only=True)
    brute_counts = (pairwise <= radius).sum(1)

    assert_array_almost_equal(tree_counts, brute_counts)
def test_ball_tree_two_point(n_samples=100, n_features=3):
    """Yield two-point correlation checks with and without dual-tree mode.

    Expected counts are the number of (Y, X) pairs whose euclidean
    distance is at most each of ten radii between 0 and 1.
    """
    np.random.seed(0)
    train = np.random.random((n_samples, n_features))
    queries = np.random.random((n_samples, n_features))
    radii = np.linspace(0, 1, 10)

    tree = BallTree(train, leaf_size=10)

    euclidean = DistanceMetric.get_metric("euclidean")
    pairwise = euclidean.pairwise(queries, train)
    expected_counts = [(pairwise <= radius).sum() for radius in radii]

    def check_two_point(r, dualtree):
        observed = tree.two_point_correlation(queries, r=r, dualtree=dualtree)
        assert_allclose(observed, expected_counts)

    for use_dualtree in (True, False):
        yield check_two_point, radii, use_dualtree
def test_query_radius_indices(self, n_queries=20):
    """Check ``query_radius`` indices against a brute-force selection.

    The first ``n_queries`` rows of the rescaled ``self.X`` are used as
    query points, with the mean cross-distance as the radius.  The sorted
    indices returned for each query, concatenated, must equal the column
    indices where the distance matrix is within the radius.
    """
    # center the data
    X = 2 * self.X - 1

    dm = DistanceMetric()
    D = dm.cdist(X[:n_queries], X)
    r = np.mean(D)

    bt = BallTree(X)
    ind = bt.query_radius(X[:n_queries], r, return_distance=False)

    # Every row of ind2 holds the column indices 0..n-1; the boolean mask
    # below keeps those within the radius, in row-major order.
    ind2 = np.zeros(D.shape) + np.arange(D.shape[1])

    # np.concatenate needs a real sequence: on Python 3 ``map`` returns an
    # iterator, which it rejects, so build the list explicitly.
    ind = np.concatenate([np.sort(row) for row in ind])
    ind2 = ind2[D <= r]

    assert_array_almost_equal(ind, ind2)
def test_ball_tree_pickle():
    """Yield checks that a pickled BallTree answers queries identically.

    One check is yielded for each of pickle protocols 0, 1 and 2.
    """
    import pickle

    np.random.seed(0)
    points = np.random.random((10, 3))
    original_tree = BallTree(points, leaf_size=1)
    first_ref, second_ref = original_tree.query(points)

    def check_pickle_protocol(protocol):
        payload = pickle.dumps(original_tree, protocol=protocol)
        restored_tree = pickle.loads(payload)
        first_new, second_new = restored_tree.query(points)
        assert_allclose(first_ref, first_new)
        assert_allclose(second_ref, second_new)

    for protocol in (0, 1, 2):
        yield check_pickle_protocol, protocol
def test_ball_tree_query_radius_distance(n_samples=100, n_features=10):
    """Check distances returned by ``query_radius`` for an origin query.

    For 100 radii between the first and last point's distance to the
    origin, the returned distances must match those recomputed from the
    returned indices.
    """
    points = 2 * np.random.random(size=(n_samples, n_features)) - 1
    origin = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail

    tree = BallTree(points, leaf_size=5)
    radial = np.sqrt(((points - origin)**2).sum(1))

    for radius in np.linspace(radial[0], radial[-1], 100):
        all_ind, all_dist = tree.query_radius(origin, radius + eps,
                                              return_distance=True)
        # a single query point was passed, so take the first result
        found_ind = all_ind[0]
        found_dist = all_dist[0]

        recomputed = np.sqrt(((origin - points[found_ind])**2).sum(1))
        assert_array_almost_equal(recomputed, found_dist)
def test_ball_tree_query_radius(n_samples=100, n_features=10):
    """Check indices returned by ``query_radius`` for an origin query.

    For 100 radii between the first and last point's distance to the
    origin, the returned indices must equal those of all points whose
    distance to the origin is within the radius.
    """
    np.random.seed(0)
    points = 2 * np.random.random(size=(n_samples, n_features)) - 1
    origin = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail

    tree = BallTree(points, leaf_size=5)
    radial = np.sqrt(((points - origin)**2).sum(1))

    for radius in np.linspace(radial[0], radial[-1], 100):
        limit = radius + eps
        found = tree.query_radius(origin, limit)[0]
        expected = np.where(radial <= limit)[0]

        # sort both in place so the comparison is order-independent
        found.sort()
        expected.sort()
        assert_allclose(expected, found)
def test_query_radius_distance(self):
    """Yield ``_check_query_radius_distance`` checks for ten radii.

    The radii span the smallest to the largest reference distance between
    the query point and the rescaled ``self.X``.
    """
    # center the data
    centered = 2 * self.X - 1

    # choose a query point near the origin
    query_pt = 0.01 * centered[:1]

    eps = 1E-15  # roundoff error can cause test to fail

    tree = BallTree(centered, leaf_size=5)

    # compute reference distances, sorted ascending in place
    reference = DistanceMetric().cdist(query_pt, centered)[0]
    reference.sort()

    radii = np.linspace(reference[0], reference[-1], 10)
    for radius in radii:
        yield (self._check_query_radius_distance,
               centered, tree, query_pt, reference, radius, eps)
def test_ball_tree_query_radius_indices(n_samples=100, n_features=10):
    """Check ``query_radius`` indices for the first ten points.

    The radius is the mean cross-distance; for each query the sorted tree
    indices must match the sorted brute-force indices.
    """
    points = 2 * np.random.random(size=(n_samples, n_features)) - 1
    queries = points[:10]

    cross = DistanceMetric().cdist(queries, points)
    radius = np.mean(cross)

    tree = BallTree(points)
    tree_ind = tree.query_radius(queries, radius, return_distance=False)

    for row in range(10):
        found = tree_ind[row]
        expected = np.where(cross[row] <= radius)[0]
        found.sort()
        expected.sort()
        assert_array_almost_equal(found, expected)
def __init__(self, training_data_size_ratio: float, k: int = 5):
    """Shuffle the data set, split it, and build one BallTree per class.

    ``training_data_size_ratio`` is the fraction of rows used for
    training; the remaining rows become the test data.  ``k`` is stored
    on the instance.
    """
    self.k = k

    # Load the rows and shuffle them in place before splitting.
    self.dataset = prepare_data().values
    np.random.shuffle(self.dataset)
    split_at = int(len(self.dataset) * training_data_size_ratio)

    # The labels are the unique values in the last column of the data set.
    self.classes = set(self.dataset[:, -1])
    label_col = self.dataset.shape[1] - 1

    training_rows = self.dataset[:split_at]
    test_rows = self.dataset[split_at:]
    self.training_data = Classifier.create_classes(
        training_rows, label_col, self.classes)
    self.test_data = Classifier.create_classes(
        test_rows, label_col, self.classes)

    # Build a tree for each class in the training set.
    self.training_trees = {
        label: BallTree(samples, euclid_metric)
        for label, samples in self.training_data.items()
    }
def active_select(self):
    """Return the index of the training point with the most nearby queries.

    A BallTree is built over ``self.Q``.  For every row of ``self.Xtrain``
    the per-point state (``dist_list``, ``min_dist``, ``upper_b``) is reset
    and ``self.max_query`` is called to fill it; the length of
    ``self.dist_list`` is that point's count.  The first row with the
    strictly largest count wins; 0 is returned if no count exceeds zero.
    """
    # Ball tree over the query variables.
    query_ids = np.array(range(self.Q.shape[0]))
    query_tree = BallTree(self.Q, self.leaf_size, query_ids)

    best_count = 0  # largest number of in-bound queries seen so far
    best_row = 0    # row index of the data point achieving best_count

    n_points = self.Xtrain.shape[0]
    for row, point in zip(range(n_points), self.Xtrain):
        # Distances of the query points lying within a threshold of the
        # minimum distance to this data point; filled by max_query.
        self.dist_list = []
        self.min_dist = float('inf')
        self.upper_b = self.min_dist + self.delta

        # Updates the per-point state set above.
        self.max_query(point, query_tree, depth=0)

        # Number of queries within the bound for this point.
        n_in_bound = len(self.dist_list)
        if n_in_bound > best_count:
            best_count = n_in_bound
            best_row = row

    return best_row
# NOTE(review): this fragment begins mid-flow and its original indentation
# was lost; the heap trim below presumably belongs to a preceding loop, and
# the nesting shown here is a best guess. Confirm against the full file.

# Drop one entry when the heap holds more than k candidates.
if len(heap) > k:
    heap.pop()

# Plot every candidate kept by the brute-force search.
for candidate in heap:
    # print(candidate)
    x_, y_ = candidate[0]
    plt.plot(x_, y_, 'bo', color='pink')
print(distances)

# Check that every brute-force candidate's second element is in ``s``.
# NOTE(review): ``all`` shadows the builtin of the same name from here on.
all = True
for candidate in heap:
    if not candidate[1] in s:
        all = False
        break
print('All found in the brute force approach? %s' % all)

# Repeat the search with a ball tree built over the same points.
tree = BallTree(points, euclid_metric)
distance_balls = knn(tree, point, k, euclid_metric)
# print(len(distance_balls))
# print(distance_balls)

# Plot the ball-tree candidates and check membership in ``s``; the loop
# stops (and stops plotting) at the first candidate that is not found.
all = True
for candidate in distance_balls:
    x, y = candidate[0]
    plt.plot(x, y, 'bo', color='#00ff00')
    if not candidate[1] in s:
        all = False
        break
print('All found in the ball tree approach? %s' % all)
# traverse(tree, plt)
plt.show()
def test_pickle(self):
    """Yield ``_check_pickle`` checks for pickle protocols 0, 1 and 2.

    Each check receives the tree built over ``self.X`` and the two arrays
    its ``query(self.X)`` call returned.
    """
    reference_tree = BallTree(self.X, leaf_size=1)
    first_ref, second_ref = reference_tree.query(self.X)

    for protocol in (0, 1, 2):
        yield (self._check_pickle, protocol, reference_tree,
               first_ref, second_ref)
from time import time

import numpy as np

from ball_tree import BallTree

# Benchmark script: times BallTree construction, k-neighbor queries (single
# and dual tree) and radius queries on random 3-d data, printing each timing.
#
# Every print is a single-argument call so the script produces the same
# output on Python 2 and Python 3; the original Python 2 print statements
# are a SyntaxError on Python 3.

X = np.random.random((10000, 3))

t0 = time()
BT = BallTree(X, 30)
t1 = time()
print("construction: %.2g sec" % (t1 - t0))

for k in [1, 2, 4, 8]:
    for dual in (False, True):
        t0 = time()
        BT.query(X, k, dualtree=dual)
        t1 = time()
        if dual:
            dual_str = ' (dual)'
        else:
            dual_str = ''
        print("query %i in [%i, %i]%s: %.3g sec"
              % (k, X.shape[0], X.shape[1], dual_str, t1 - t0))

for r in 0.1, 0.3, 0.5:
    t0 = time()
    # only the first 1000 points are used as radius-query centers
    BT.query_radius(X[:1000], r)
    t1 = time()
    print("query r<%.1f in [%i, %i]: %.3g sec"
          % (r, X.shape[0], X.shape[1], t1 - t0))
def test_query_knn(self):
    """Yield ``_check_query_knn`` checks for several k and both tree modes.

    One BallTree and one cKDTree are built over ``self.X`` and shared by
    all yielded checks.
    """
    ball_tree = BallTree(self.X)
    kd_tree = cKDTree(self.X)

    for n_neighbors in (1, 2, 4, 8, 16):
        for use_dualtree in [True, False]:
            yield (self._check_query_knn, ball_tree, kd_tree,
                   n_neighbors, use_dualtree)
def check_neighbors(metric):
    """Assert BallTree distances for ``metric`` match ``brute_force_neighbors``.

    ``X``, ``Y`` and ``k`` come from the enclosing scope.  Only the
    distances are compared.
    """
    tree = BallTree(X, leaf_size=1, metric=metric)
    tree_dist, _ = tree.query(Y, k)
    brute_dist, _ = brute_force_neighbors(X, Y, k, metric)
    assert_allclose(tree_dist, brute_dist)
def bench_KDE(N=1000, D=3, h=0.5, leaf_size=30):
    """Time gaussian kernel density estimation and print a report.

    For each bandwidth in 0.001, 0.01 and 0.1 a brute-force density is
    computed and timed, followed by the four single/dual tree and
    depth/breadth first variants of ``BallTree.kernel_density``.  The
    report lists timings, call counts and whether each tree result is
    close to the brute-force one.

    Note that the ``h`` argument is overwritten by the bandwidth loop, so
    the value passed by the caller is not used.
    """
    X = np.random.random((N, D))
    bt = BallTree(X, leaf_size=leaf_size)
    kernel = 'gaussian'

    # Single-argument print calls give identical output on Python 2 and 3.
    print("Kernel Density:")
    atol = 1E-5
    rtol = 1E-5

    for h in [0.001, 0.01, 0.1]:
        # brute-force reference
        t0 = time()
        dens_true = np.exp(-0.5 * ((X[:, None, :] - X)**2).sum(-1)
                           / h**2).sum(-1)
        dens_true /= h * np.sqrt(2 * np.pi)
        t1 = time()

        # The breadth_first flag of each run matches the label its timing
        # is printed under: depth-first runs first, breadth-first second.
        bt.reset_n_calls()
        t2 = time()
        dens1 = bt.kernel_density(X, h, atol=atol, rtol=rtol, kernel=kernel,
                                  dualtree=False, breadth_first=False)
        t3 = time()
        n1 = bt.get_n_calls()

        bt.reset_n_calls()
        t4 = time()
        dens2 = bt.kernel_density(X, h, atol=atol, rtol=rtol, kernel=kernel,
                                  dualtree=False, breadth_first=True)
        t5 = time()
        n2 = bt.get_n_calls()

        # the dual-tree runs pass atol only, no rtol
        bt.reset_n_calls()
        t6 = time()
        dens3 = bt.kernel_density(X, h, atol=atol, kernel=kernel,
                                  dualtree=True, breadth_first=False)
        t7 = time()
        n3 = bt.get_n_calls()

        bt.reset_n_calls()
        t8 = time()
        dens4 = bt.kernel_density(X, h, atol=atol, kernel=kernel,
                                  dualtree=True, breadth_first=True)
        t9 = time()
        n4 = bt.get_n_calls()

        print(" h = %.3f" % h)
        print(" brute force: %.2g sec (%i calls)" % (t1 - t0, N * N))
        print(" single tree (depth first): %.2g sec (%i calls)"
              % (t3 - t2, n1))
        print(" single tree (breadth first): %.2g sec (%i calls)"
              % (t5 - t4, n2))
        print(" dual tree: (depth first) %.2g sec (%i calls)"
              % (t7 - t6, n3))
        print(" dual tree: (breadth first) %.2g sec (%i calls)"
              % (t9 - t8, n4))

        matches = (np.allclose(dens_true, dens1, atol=atol, rtol=rtol),
                   np.allclose(dens_true, dens2, atol=atol, rtol=rtol),
                   np.allclose(dens_true, dens3, atol=atol),
                   np.allclose(dens_true, dens4, atol=atol))
        # the tuple is wrapped so it is formatted as one value
        print(" distances match: %s" % (matches,))
# Comparison script: runs the same 3-neighbor query through SlowBallTree
# (single and dual variants) and BallTree, printing timings and whether the
# results agree.  The random seed is printed so a run can be reproduced.
#
# Every print is a single-argument call so the output is identical on
# Python 2 and Python 3; the original Python 2 print statements are a
# SyntaxError on Python 3.
rseed = np.random.randint(100000)
print("rseed = %i" % rseed)
np.random.seed(rseed)
X = np.random.random((200, 3))
Y = np.random.random((100, 3))

t0 = time()
SBT = SlowBallTree(X, leaf_size=10)
d1, n1 = SBT.query(Y, 3)
t1 = time()
print("python: %.2g sec" % (t1 - t0))

# the tree is rebuilt so this timing also includes construction
t0 = time()
SBT = SlowBallTree(X, leaf_size=10)
d1a, n1a = SBT.query_dual(Y, 3)
t1 = time()
print("python dual: %.2g sec" % (t1 - t0))

t0 = time()
BT = BallTree(X, leaf_size=10)
d2, n2 = BT.query(Y, 3)
t1 = time()
print("cython: %.2g sec" % (t1 - t0))

print("neighbors match: %s %s" % (np.allclose(n1, n2),
                                  np.allclose(n1a, n1)))
print("distances match: %s %s" % (np.allclose(d1, d2),
                                  np.allclose(d1a, d1)))
def create_ball_tree(self):
    """Build a BallTree over ``self.Xtrain`` and store it in ``self.tree``.

    The tree receives ``self.leaf_size`` and an array of row indices
    0..n-1 as its second and third positional arguments.
    """
    n_train = self.Xtrain.shape[0]
    row_ids = np.array(range(n_train))
    self.tree = BallTree(self.Xtrain, self.leaf_size, row_ids)