def _check_p_distance_vs_KDT(self, p):
    """Check Minkowski-``p`` neighbor distances of BallTree against cKDTree.

    Both trees are built on ``self.X`` and queried with ``self.X`` for the
    five nearest neighbors; only the distance arrays are compared.
    """
    n_neighbors = 5
    ball_tree = BallTree(self.X, leaf_size=10, metric='minkowski', p=p)
    kd_tree = cKDTree(self.X, leafsize=10)

    ball_dist, _ = ball_tree.query(self.X, k=n_neighbors)
    kd_dist, _ = kd_tree.query(self.X, k=n_neighbors, p=p)

    assert_array_almost_equal(ball_dist, kd_dist)
def bench_ball_tree(N=2000, D=3, k=15, leaf_size=30):
    """Benchmark BallTree construction and k-NN queries against skBallTree.

    Parameters
    ----------
    N : int
        Number of random points to generate.
    D : int
        Dimension of each point.
    k : int
        Number of neighbors requested for every query point.
    leaf_size : int
        Leaf size forwarded to both tree constructors.

    Prints the build times, the query time and call count for every
    (dualtree, breadth_first) combination, and whether the results of
    consecutive runs agree.
    """
    print("Ball Tree")
    X = np.random.random((N, D)).astype(DTYPE)

    t0 = time()
    btskl = skBallTree(X, leaf_size=leaf_size)
    t1 = time()
    bt = BallTree(X, leaf_size=leaf_size)
    t2 = time()

    print("Build:")
    print(" sklearn : %.2g sec" % (t1 - t0))
    print(" new : %.2g sec" % (t2 - t1))

    t0 = time()
    Dskl, Iskl = btskl.query(X, k)
    t1 = time()

    dist = [Dskl]
    ind = [Iskl]
    times = [t1 - t0]
    labels = ['sklearn']
    counts = [-1]  # placeholder: no call count is collected for the reference tree

    for dualtree in (False, True):
        for breadth_first in (False, True):
            bt.reset_n_calls()
            t0 = time()
            # Distinct local names so the ``D`` parameter is not shadowed.
            dist_i, ind_i = bt.query(X, k, dualtree=dualtree,
                                     breadth_first=breadth_first)
            t1 = time()

            dist.append(dist_i)
            ind.append(ind_i)
            times.append(t1 - t0)
            counts.append(bt.get_n_calls())

            label = 'dual/' if dualtree else 'single/'
            label += 'breadthfirst' if breadth_first else 'depthfirst'
            labels.append(label)

    print("Query:")
    for lab, t, c in zip(labels, times, counts):
        print(" %s : %.2g sec (%i calls)" % (lab, t, c))

    # A bare ``print`` is a no-op expression on Python 3; print("") emits the
    # intended blank line on both Python 2 and Python 3.
    print("")

    # Index i - 1 wraps around for i == 0, so the first result is compared
    # with the last one and every other result with its predecessor.
    print(" distances match: %s"
          % ', '.join(['%s' % np.allclose(dist[i - 1], dist[i])
                       for i in range(len(dist))]))
    print(" indices match: %s"
          % ', '.join(['%s' % np.allclose(ind[i - 1], ind[i])
                       for i in range(len(ind))]))
def check_neighbors(dualtree, breadth_first, k, metric, kwargs):
    """Compare BallTree k-NN distances with the brute-force reference.

    ``X`` (tree data) and ``Y`` (query points) are taken from the enclosing
    scope. ``kwargs`` holds extra metric arguments and is forwarded to both
    the tree and the brute-force search.
    """
    tree = BallTree(X, leaf_size=1, metric=metric, **kwargs)
    tree_dist, _ = tree.query(Y, k, dualtree=dualtree,
                              breadth_first=breadth_first)
    brute_dist, _ = brute_force_neighbors(X, Y, k, metric, **kwargs)

    # Indices are deliberately not compared: duplicate distances can make
    # them differ, while the distances themselves stay identical.
    assert_allclose(tree_dist, brute_dist)
def test_ball_tree_query():
    """BallTree and cKDTree must report equal k-NN distances for k = 2, 4, 6.

    Both trees are rebuilt on the same random 100 x 5 sample for every k.
    """
    X = np.random.random(size=(100, 5))

    for n_neighbors in (2, 4, 6):
        ball_tree = BallTree(X)
        kd_tree = cKDTree(X)

        ball_dist, _ = ball_tree.query(X, k=n_neighbors)
        kd_dist, _ = kd_tree.query(X, k=n_neighbors)

        assert_array_almost_equal(ball_dist, kd_dist)
def test_ball_tree_p_distance():
    """Minkowski distances from BallTree must match cKDTree for several p.

    The orders tested are 1, 2, 3, 4 and infinity, each on the same random
    100 x 5 sample with five neighbors per point.
    """
    X = np.random.random(size=(100, 5))
    n_neighbors = 5

    for order in (1, 2, 3, 4, np.inf):
        ball_tree = BallTree(X, leaf_size=10, metric="minkowski", p=order)
        kd_tree = cKDTree(X, leafsize=10)

        ball_dist, _ = ball_tree.query(X, k=n_neighbors)
        kd_dist, _ = kd_tree.query(X, k=n_neighbors, p=order)

        assert_array_almost_equal(ball_dist, kd_dist)
def test_ball_tree_p_distance():
    """Minkowski distances from BallTree must match cKDTree for several p.

    The orders tested are 1, 2, 3, 4 and infinity, each on the same random
    100 x 5 sample with five neighbors per point.
    """
    X = np.random.random(size=(100, 5))
    n_neighbors = 5

    for order in (1, 2, 3, 4, np.inf):
        ball_tree = BallTree(X, leaf_size=10, metric='minkowski', p=order)
        kd_tree = cKDTree(X, leafsize=10)

        ball_dist, _ = ball_tree.query(X, k=n_neighbors)
        kd_dist, _ = kd_tree.query(X, k=n_neighbors, p=order)

        assert_array_almost_equal(ball_dist, kd_dist)
def test_ball_tree_query_radius_count(n_samples=100, n_features=10):
    """Radius-query counts must equal counts from the full distance matrix.

    The radius is the mean of all pairwise distances of a random sample
    drawn from [-1, 1).
    """
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1

    metric = DistanceMetric()
    pairwise = metric.pdist(X, squareform=True)
    radius = np.mean(pairwise)

    tree = BallTree(X)
    tree_counts = tree.query_radius(X, radius, count_only=True)
    expected_counts = (pairwise <= radius).sum(1)

    assert_array_almost_equal(tree_counts, expected_counts)
def _check_metrics_float(self, k, metric, kwargs):
    """Check BallTree k-NN distances on ``self.X`` for a given metric.

    The reference distances are the ``k`` smallest entries of every row of
    the pairwise distance matrix computed by ``DistanceMetric``.
    """
    tree = BallTree(self.X, metric=metric, **kwargs)
    tree_dist, _ = tree.query(self.X, k=k)

    pairwise = DistanceMetric(metric=metric, **kwargs).pdist(
        self.X, squareform=True)
    nearest = np.argsort(pairwise, 1)[:, :k]
    row_index = np.arange(self.X.shape[0])[:, None]
    expected_dist = pairwise[row_index, nearest]

    # Indices are not compared: a tie for nearest neighbor could make them
    # differ. The distances show whether the search succeeded.
    assert_array_almost_equal(tree_dist, expected_dist)
def _check_metrics_bool(self, k, metric, kwargs):
    """Check BallTree k-NN distances from ``self.Ybool`` to ``self.Xbool``.

    The reference distances are the ``k`` smallest entries of every row of
    the cross-distance matrix computed by ``DistanceMetric``.
    """
    tree = BallTree(self.Xbool, metric=metric, **kwargs)
    tree_dist, _ = tree.query(self.Ybool, k=k)

    cross = DistanceMetric(metric=metric, **kwargs).cdist(
        self.Ybool, self.Xbool)
    nearest = np.argsort(cross, 1)[:, :k]
    row_index = np.arange(self.Ybool.shape[0])[:, None]
    expected_dist = cross[row_index, nearest]

    # Indices are not compared: ties for nearest neighbor are very common
    # here and would make the check fail, while the distances stay correct.
    assert_array_almost_equal(tree_dist, expected_dist)
def test_query_radius_count(self):
    """Radius-query counts must equal counts from the full distance matrix.

    ``self.X`` is rescaled with ``2 * X - 1`` to center it, and the radius
    is the mean of all pairwise distances.
    """
    centered = 2 * self.X - 1

    metric = DistanceMetric()
    pairwise = metric.pdist(centered, squareform=True)
    radius = np.mean(pairwise)

    tree = BallTree(centered)
    tree_counts = tree.query_radius(centered, radius, count_only=True)
    expected_counts = (pairwise <= radius).sum(1)

    assert_array_almost_equal(tree_counts, expected_counts)
def test_ball_tree_pickle():
    """Yield one check per pickle protocol (0, 1, 2) for a small BallTree.

    Each check round-trips the tree through pickle and verifies that a
    query on the restored tree gives the same two result arrays.
    """
    import pickle

    np.random.seed(0)
    X = np.random.random((10, 3))
    original_tree = BallTree(X, leaf_size=1)
    first_expected, second_expected = original_tree.query(X)

    def check_pickle_protocol(protocol):
        payload = pickle.dumps(original_tree, protocol=protocol)
        restored_tree = pickle.loads(payload)
        first_actual, second_actual = restored_tree.query(X)
        assert_allclose(first_expected, first_actual)
        assert_allclose(second_expected, second_actual)

    for protocol in (0, 1, 2):
        yield check_pickle_protocol, protocol
def test_query_radius_indices(self, n_queries=20):
    """Indices from a radius query must match the distance-matrix reference.

    Parameters
    ----------
    n_queries : int
        Number of leading rows of the centered data used as query points.

    The radius is the mean distance between the query points and all data
    points.
    """
    # center the data
    X = 2 * self.X - 1

    dm = DistanceMetric()
    D = dm.cdist(X[:n_queries], X)
    r = np.mean(D)

    bt = BallTree(X)
    ind = bt.query_radius(X[:n_queries], r, return_distance=False)

    # Every row of ind2 holds the column numbers 0..n-1, so masking with
    # ``D <= r`` yields, row by row, the sorted indices inside the radius.
    ind2 = np.zeros(D.shape) + np.arange(D.shape[1])

    # np.concatenate needs a real sequence: on Python 3 ``map`` returns an
    # iterator and the call raises TypeError, so build a list explicitly.
    ind = np.concatenate([np.sort(neighbors) for neighbors in ind])
    ind2 = ind2[D <= r]

    assert_array_almost_equal(ind, ind2)
def test_ball_tree_KDE(n_samples=100, n_features=3):
    """Yield kernel-density checks for every kernel / bandwidth / tolerance.

    Parameters
    ----------
    n_samples : int
        Number of points in both the training set and the query set.
    n_features : int
        Dimension of the points.

    For each kernel and bandwidth the reference density comes from
    ``compute_kernel_slow``; the tree estimate must agree within the
    requested ``atol`` / ``rtol``.
    """
    np.random.seed(0)
    X = np.random.random((n_samples, n_features))
    Y = np.random.random((n_samples, n_features))
    bt = BallTree(X, leaf_size=10)

    for kernel in ['gaussian', 'tophat', 'epanechnikov',
                   'exponential', 'linear', 'cosine']:
        for h in [0.001, 0.01, 0.1]:
            dens_true = compute_kernel_slow(Y, X, kernel, h)

            # dens_true is bound as a default argument so each yielded check
            # keeps the reference density of its own (kernel, h) pair even if
            # it runs after the generator has moved on (late-binding closure).
            def check_results(kernel, h, atol, rtol, dualtree, breadth_first,
                              dens_true=dens_true):
                dens = bt.kernel_density(Y, h, atol=atol, rtol=rtol,
                                         kernel=kernel, dualtree=dualtree,
                                         breadth_first=breadth_first)
                assert_allclose(dens, dens_true, atol=atol, rtol=rtol)

            for rtol in [0, 1E-5]:
                for atol in [1E-10, 1E-5, 0.1]:
                    for dualtree in (True, False):
                        # The dual-tree search is only exercised with rtol == 0.
                        if dualtree and rtol > 0:
                            continue
                        for breadth_first in (True, False):
                            yield (check_results, kernel, h, atol, rtol,
                                   dualtree, breadth_first)
def test_ball_tree_query_radius_distance(n_samples=100, n_features=10):
    """Distances returned by ``query_radius`` must match direct computation.

    A random sample from [-1, 1) is queried from the origin at 100 radii
    spanning the distances of the first and last sample points.
    """
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1
    origin = np.zeros(n_features, dtype=float)
    eps = 1E-15  # roundoff error can cause test to fail

    tree = BallTree(X, leaf_size=5)
    radii = np.sqrt(((X - origin)**2).sum(1))

    for radius in np.linspace(radii[0], radii[-1], 100):
        found, found_dist = tree.query_radius(origin, radius + eps,
                                              return_distance=True)
        # A single query point was passed, so take the first result.
        found = found[0]
        found_dist = found_dist[0]

        expected_dist = np.sqrt(((origin - X[found])**2).sum(1))
        assert_array_almost_equal(expected_dist, found_dist)
def test_ball_tree_query_radius_distance(n_samples=100, n_features=10):
    """Distances returned by ``query_radius`` must match direct computation.

    A random sample from [-1, 1) is queried from the origin at 100 radii
    spanning the distances of the first and last sample points.
    """
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1
    origin = np.zeros(n_features, dtype=float)
    eps = 1e-15  # roundoff error can cause test to fail

    tree = BallTree(X, leaf_size=5)
    radii = np.sqrt(((X - origin) ** 2).sum(1))

    for radius in np.linspace(radii[0], radii[-1], 100):
        found, found_dist = tree.query_radius(origin, radius + eps,
                                              return_distance=True)
        # A single query point was passed, so take the first result.
        found = found[0]
        found_dist = found_dist[0]

        expected_dist = np.sqrt(((origin - X[found]) ** 2).sum(1))
        assert_array_almost_equal(expected_dist, found_dist)
def test_ball_tree_query_radius(n_samples=100, n_features=10):
    """Indices found by ``query_radius`` must match a direct distance filter.

    A seeded random sample from [-1, 1) is queried from the origin at 100
    radii spanning the distances of the first and last sample points.
    """
    np.random.seed(0)
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1
    origin = np.zeros(n_features, dtype=float)
    eps = 1E-15  # roundoff error can cause test to fail

    tree = BallTree(X, leaf_size=5)
    radii = np.sqrt(((X - origin) ** 2).sum(1))

    for radius in np.linspace(radii[0], radii[-1], 100):
        found = tree.query_radius(origin, radius + eps)[0]
        expected = np.where(radii <= radius + eps)[0]

        # Sort both in place so the comparison ignores result order.
        found.sort()
        expected.sort()

        assert_allclose(expected, found)
def test_ball_tree_query_radius(n_samples=100, n_features=10):
    """Indices found by ``query_radius`` must match a direct distance filter.

    A seeded random sample from [-1, 1) is queried from the origin at 100
    radii spanning the distances of the first and last sample points.
    """
    np.random.seed(0)
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1
    origin = np.zeros(n_features, dtype=float)
    eps = 1E-15  # roundoff error can cause test to fail

    tree = BallTree(X, leaf_size=5)
    radii = np.sqrt(((X - origin)**2).sum(1))

    for radius in np.linspace(radii[0], radii[-1], 100):
        found = tree.query_radius(origin, radius + eps)[0]
        expected = np.where(radii <= radius + eps)[0]

        # Sort both in place so the comparison ignores result order.
        found.sort()
        expected.sort()

        assert_allclose(expected, found)
def test_ball_tree_query_radius_indices(n_samples=100, n_features=10):
    """Radius-query indices must match those read off the distance matrix.

    The first ten points of a random sample are used as queries, and the
    radius is the mean of their distances to all points.
    """
    n_queries = 10
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1

    metric = DistanceMetric()
    cross = metric.cdist(X[:n_queries], X)
    radius = np.mean(cross)

    tree = BallTree(X)
    found = tree.query_radius(X[:n_queries], radius, return_distance=False)

    for row in range(n_queries):
        tree_ind = found[row]
        expected_ind = np.where(cross[row] <= radius)[0]

        # Sort both in place so the comparison ignores result order.
        tree_ind.sort()
        expected_ind.sort()

        assert_array_almost_equal(tree_ind, expected_ind)
def main():
    """Run ``check`` for both setups, then compare the two implementations.

    The comparison queries the neighbors of the first of 1000 random 3-D
    points within radius 0.4 and prints whether BallTree and BruteForce
    return equal results.
    """
    statement = 'nbrs.radius_neighbors(p, radius)'
    for setup in (setup_str_bt, setup_str_bf):
        check(statement, setup)

    n, d = 1000, 3
    X = np.random.rand(n, d)
    p = X[0]
    radius = 0.4

    from_ball_tree = BallTree(X).radius_neighbors(p, radius)
    from_brute_force = BruteForce(X).radius_neighbors(p, radius)
    print(from_ball_tree == from_brute_force)
def bench_ball_tree(N=2000, D=3, k=15, leaf_size=30):
    """Benchmark BallTree construction and k-NN queries against skBallTree.

    Parameters
    ----------
    N : int
        Number of random points to generate.
    D : int
        Dimension of each point.
    k : int
        Number of neighbors requested for every query point.
    leaf_size : int
        Leaf size forwarded to both tree constructors.

    Prints the build times, the query time and call count for every
    (dualtree, breadth_first) combination, and whether the results of
    consecutive runs agree.
    """
    print("Ball Tree")
    X = np.random.random((N, D)).astype(DTYPE)

    t0 = time()
    btskl = skBallTree(X, leaf_size=leaf_size)
    t1 = time()
    bt = BallTree(X, leaf_size=leaf_size)
    t2 = time()

    print("Build:")
    print(" sklearn : %.2g sec" % (t1 - t0))
    print(" new : %.2g sec" % (t2 - t1))

    t0 = time()
    Dskl, Iskl = btskl.query(X, k)
    t1 = time()

    dist = [Dskl]
    ind = [Iskl]
    times = [t1 - t0]
    labels = ['sklearn']
    counts = [-1]  # placeholder: no call count is collected for the reference tree

    for dualtree in (False, True):
        for breadth_first in (False, True):
            bt.reset_n_calls()
            t0 = time()
            # Distinct local names so the ``D`` parameter is not shadowed.
            dist_i, ind_i = bt.query(X, k, dualtree=dualtree,
                                     breadth_first=breadth_first)
            t1 = time()

            dist.append(dist_i)
            ind.append(ind_i)
            times.append(t1 - t0)
            counts.append(bt.get_n_calls())

            label = 'dual/' if dualtree else 'single/'
            label += 'breadthfirst' if breadth_first else 'depthfirst'
            labels.append(label)

    print("Query:")
    for lab, t, c in zip(labels, times, counts):
        print(" %s : %.2g sec (%i calls)" % (lab, t, c))

    # A bare ``print`` is a no-op expression on Python 3; print("") emits the
    # intended blank line on both Python 2 and Python 3.
    print("")

    # Index i - 1 wraps around for i == 0, so the first result is compared
    # with the last one and every other result with its predecessor.
    print(" distances match: %s"
          % ', '.join(['%s' % np.allclose(dist[i - 1], dist[i])
                       for i in range(len(dist))]))
    print(" indices match: %s"
          % ', '.join(['%s' % np.allclose(ind[i - 1], ind[i])
                       for i in range(len(ind))]))
def test_ball_tree_two_point(n_samples=100, n_features=3):
    """Yield two-point-correlation checks for dual-tree and single-tree mode.

    The reference counts are the number of (Y, X) pairs whose euclidean
    distance is at most each of ten radii between 0 and 1.
    """
    np.random.seed(0)
    X = np.random.random((n_samples, n_features))
    Y = np.random.random((n_samples, n_features))
    r = np.linspace(0, 1, 10)
    tree = BallTree(X, leaf_size=10)

    pairwise = DistanceMetric.get_metric("euclidean").pairwise(Y, X)
    counts_true = [(pairwise <= radius).sum() for radius in r]

    def check_two_point(r, dualtree):
        tree_counts = tree.two_point_correlation(Y, r=r, dualtree=dualtree)
        assert_allclose(tree_counts, counts_true)

    for dualtree in (True, False):
        yield check_two_point, r, dualtree
def test_query_radius_distance(self):
    """Yield radius-distance checks for ten radii around a near-origin point.

    The radii span the smallest to the largest reference distance between
    the query point and the centered data.
    """
    # center the data
    centered = 2 * self.X - 1

    # choose a query point near the origin
    query_pt = 0.01 * centered[:1]
    eps = 1E-15  # roundoff error can cause test to fail

    tree = BallTree(centered, leaf_size=5)

    # compute reference distances, sorted ascending in place
    reference = DistanceMetric().cdist(query_pt, centered)[0]
    reference.sort()

    for radius in np.linspace(reference[0], reference[-1], 10):
        yield (self._check_query_radius_distance,
               centered, tree, query_pt, reference, radius, eps)
def __init__(self, training_data_size_ratio: float, k: int = 5):
    """Load and shuffle the dataset, split it, and build one tree per class.

    Parameters
    ----------
    training_data_size_ratio : float
        Fraction of the shuffled rows that goes into the training split;
        the remaining rows form the test split.
    k : int
        Stored on the instance as ``self.k``.
    """
    self.k = k

    # Shuffle in place, then cut the rows into training and testing parts.
    self.dataset = prepare_data().values
    np.random.shuffle(self.dataset)
    split_at = int(len(self.dataset) * training_data_size_ratio)

    # The labels are the unique values of the last dataset column.
    self.classes = set(self.dataset[:, -1])
    label_col = self.dataset.shape[1] - 1

    self.training_data = Classifier.create_classes(
        self.dataset[:split_at], label_col, self.classes)
    self.test_data = Classifier.create_classes(
        self.dataset[split_at:], label_col, self.classes)

    # Build one tree for each class in the training set.
    self.training_trees = {
        label: BallTree(points, euclid_metric)
        for label, points in self.training_data.items()
    }
def active_select(self):
    """Return the row id of the training point with the most nearby queries.

    For every training point, ``self.max_query`` fills ``self.dist_list``;
    the point with the longest list wins, and the first such point is kept
    on ties. Returns 0 when no list is non-empty.
    """
    # Build a ball tree over the query points.
    query_ids = np.array(range(self.Q.shape[0]))
    query_tree = BallTree(self.Q, self.leaf_size, query_ids)

    best_count = 0  # largest number of in-range queries seen so far
    best_row = 0    # row id of the training point that reached best_count

    for row, x in enumerate(self.Xtrain):
        # Per-point search state, reset before every search: the collected
        # distances, the running minimum and the bound derived from it.
        self.dist_list = []
        self.min_dist = float('inf')
        self.upper_b = self.min_dist + self.delta

        self.max_query(x, query_tree, depth=0)

        n_in_range = len(self.dist_list)
        if n_in_range > best_count:
            best_count = n_in_range
            best_row = row

    return best_row
def test_query_knn(self):
    """Yield one k-NN check per (k, dualtree) pair on trees built on self.X."""
    ball_tree = BallTree(self.X)
    kd_tree = cKDTree(self.X)

    for n_neighbors in (1, 2, 4, 8, 16):
        for use_dualtree in [True, False]:
            yield (self._check_query_knn,
                   ball_tree, kd_tree, n_neighbors, use_dualtree)
from time import time

import numpy as np

from ball_tree import BallTree

# Benchmark script: time BallTree construction, k-NN queries (single and
# dual tree) and radius queries on 10000 random 3-D points.
#
# Each print uses a single pre-formatted argument in parentheses, which
# prints the same text on Python 2 and Python 3.

X = np.random.random((10000, 3))

t0 = time()
BT = BallTree(X, 30)
t1 = time()
print("construction: %.2g sec" % (t1 - t0))

for k in [1, 2, 4, 8]:
    for dual in (False, True):
        t0 = time()
        BT.query(X, k, dualtree=dual)
        t1 = time()

        dual_str = ' (dual)' if dual else ''
        print("query %i in [%i, %i]%s: %.3g sec"
              % (k, X.shape[0], X.shape[1], dual_str, t1 - t0))

for r in 0.1, 0.3, 0.5:
    t0 = time()
    BT.query_radius(X[:1000], r)
    t1 = time()
    print("query r<%.1f in [%i, %i]: %.3g sec"
          % (r, X.shape[0], X.shape[1], t1 - t0))
def test_pickle(self):
    """Yield one pickle round-trip check per protocol (0, 1, 2).

    The tree is built once on ``self.X`` and its query results are passed
    to ``self._check_pickle`` as the reference.
    """
    original_tree = BallTree(self.X, leaf_size=1)
    first_expected, second_expected = original_tree.query(self.X)

    for protocol in (0, 1, 2):
        yield (self._check_pickle,
               protocol, original_tree, first_expected, second_expected)
def bench_KDE(N=1000, D=3, h=0.5, leaf_size=30):
    """Benchmark tree-based gaussian kernel density against brute force.

    Parameters
    ----------
    N : int
        Number of random points.
    D : int
        Dimension of each point.
    h : float
        Kept for interface compatibility; the bandwidths actually used
        are 0.001, 0.01 and 0.1.
    leaf_size : int
        Leaf size forwarded to the BallTree constructor.

    For every bandwidth this prints the brute-force time, the time and
    call count of the four tree traversals, and a tuple telling whether
    each traversal matches the brute-force density. The tuple follows the
    order of the printed traversal lines.
    """
    X = np.random.random((N, D))
    bt = BallTree(X, leaf_size=leaf_size)
    kernel = 'gaussian'

    print("Kernel Density:")
    atol = 1E-5
    rtol = 1E-5

    # (report format, dualtree, breadth_first). Each label is paired with the
    # traversal it actually describes; previously the depth-first and
    # breadth-first timings were reported under each other's label.
    runs = (
        (" single tree (depth first): %.2g sec (%i calls)", False, False),
        (" single tree (breadth first): %.2g sec (%i calls)", False, True),
        (" dual tree: (depth first) %.2g sec (%i calls)", True, False),
        (" dual tree: (breadth first) %.2g sec (%i calls)", True, True),
    )

    for h in [0.001, 0.01, 0.1]:
        t0 = time()
        dens_true = np.exp(-0.5 * ((X[:, None, :] - X) ** 2).sum(-1)
                           / h ** 2).sum(-1)
        # NOTE(review): this normalisation does not depend on D - confirm it
        # matches what bt.kernel_density returns for D > 1.
        dens_true /= h * np.sqrt(2 * np.pi)
        t1 = time()

        report = []
        matches = []
        for line_format, dualtree, breadth_first in runs:
            # rtol is only passed (and only checked explicitly) for the
            # single-tree traversals, as in the original benchmark.
            tolerances = {'atol': atol}
            if not dualtree:
                tolerances['rtol'] = rtol

            bt.reset_n_calls()
            start = time()
            dens = bt.kernel_density(X, h, kernel=kernel, dualtree=dualtree,
                                     breadth_first=breadth_first,
                                     **tolerances)
            elapsed = time() - start

            report.append(line_format % (elapsed, bt.get_n_calls()))
            matches.append(np.allclose(dens_true, dens, **tolerances))

        # Single-argument print calls behave the same on Python 2 and 3.
        print(" h = %.3f" % h)
        print(" brute force: %.2g sec (%i calls)" % (t1 - t0, N * N))
        for line in report:
            print(line)
        print(" distances match: %s" % (tuple(matches),))
def bench_KDE(N=1000, D=3, h=0.5, leaf_size=30):
    """Benchmark tree-based gaussian kernel density against brute force.

    Parameters
    ----------
    N : int
        Number of random points.
    D : int
        Dimension of each point.
    h : float
        Kept for interface compatibility; the bandwidths actually used
        are 0.001, 0.01 and 0.1.
    leaf_size : int
        Leaf size forwarded to the BallTree constructor.

    For every bandwidth this prints the brute-force time, the time and
    call count of the four tree traversals, and a tuple telling whether
    each traversal matches the brute-force density. The tuple follows the
    order of the printed traversal lines.
    """
    X = np.random.random((N, D))
    bt = BallTree(X, leaf_size=leaf_size)
    kernel = 'gaussian'

    print("Kernel Density:")
    atol = 1E-5
    rtol = 1E-5

    # (report format, dualtree, breadth_first). Each label is paired with the
    # traversal it actually describes; previously the depth-first and
    # breadth-first timings were reported under each other's label.
    runs = (
        (" single tree (depth first): %.2g sec (%i calls)", False, False),
        (" single tree (breadth first): %.2g sec (%i calls)", False, True),
        (" dual tree: (depth first) %.2g sec (%i calls)", True, False),
        (" dual tree: (breadth first) %.2g sec (%i calls)", True, True),
    )

    for h in [0.001, 0.01, 0.1]:
        t0 = time()
        dens_true = np.exp(-0.5 * ((X[:, None, :] - X)**2).sum(-1)
                           / h**2).sum(-1)
        # NOTE(review): this normalisation does not depend on D - confirm it
        # matches what bt.kernel_density returns for D > 1.
        dens_true /= h * np.sqrt(2 * np.pi)
        t1 = time()

        report = []
        matches = []
        for line_format, dualtree, breadth_first in runs:
            # rtol is only passed (and only checked explicitly) for the
            # single-tree traversals, as in the original benchmark.
            tolerances = {'atol': atol}
            if not dualtree:
                tolerances['rtol'] = rtol

            bt.reset_n_calls()
            start = time()
            dens = bt.kernel_density(X, h, kernel=kernel, dualtree=dualtree,
                                     breadth_first=breadth_first,
                                     **tolerances)
            elapsed = time() - start

            report.append(line_format % (elapsed, bt.get_n_calls()))
            matches.append(np.allclose(dens_true, dens, **tolerances))

        # Single-argument print calls behave the same on Python 2 and 3.
        print(" h = %.3f" % h)
        print(" brute force: %.2g sec (%i calls)" % (t1 - t0, N * N))
        for line in report:
            print(line)
        print(" distances match: %s" % (tuple(matches),))
class min_max_l2distance(learner):
    """Active learner that selects a training point by a min-max L2 rule.

    For every training point the largest euclidean distance to any query
    point in ``self.Q`` is considered; the selected point is the one for
    which that largest distance is smallest. A ball tree over
    ``self.Xtrain`` is used to skip subtrees whose bound cannot beat the
    best value found so far.
    """

    def __init__(self, fp, leaf_size):
        """Initialise the base learner and the search state.

        Parameters
        ----------
        fp :
            Forwarded unchanged to ``learner.__init__``.
        leaf_size :
            Leaf size passed to ``BallTree`` when the tree is built.
        """
        learner.__init__(self, fp)
        self.active_method = 'minmax_l2distance'
        self.leaf_size = leaf_size
        self.max_min_dist = 0
        self.max_min_point = 0
        self.curr_minmax = float('inf')  # best min-max distance found so far
        self.curr_winner = 0             # data point achieving curr_minmax
        self.curr_id = 0                 # row id of that data point
        self.complexity = 0              # distance evaluations in the last search
        self.fcomplexity = self.fsave + 'complex'

    def create_ball_tree(self):
        """Build ``self.tree`` over ``self.Xtrain`` with row ids 0..n-1."""
        idx = np.arange(self.Xtrain.shape[0])
        self.tree = BallTree(self.Xtrain, self.leaf_size, idx)

    def show_ball_tree_n_points(self):
        """Plot the balls of the first four tree levels in a 2 x 2 grid."""
        fig = plt.figure(figsize=(5, 5))
        fig.subplots_adjust(wspace=0.1, hspace=0.15, left=0.1, right=0.9,
                            bottom=0.05, top=0.9)

        for level in range(4):
            # Subplot positions are 1-based while tree depths are 0-based;
            # passing ``level`` directly would request the invalid index 0.
            ax = fig.add_subplot(2, 2, level + 1, xticks=[], yticks=[])
            self.tree.draw_circle(ax, depth=level)
            ax.set_title('level %i' % level)

        # suptitle() adds a title to the entire figure
        fig.suptitle('Ball-tree Example')
        plt.show()

    def load_data(self):
        """Load the data through the base class, then build the ball tree."""
        learner.load_data(self)
        self.create_ball_tree()

    def create_query_ball(self):
        """Return ``(center, radius)`` of a ball enclosing every query point.

        The center is the mean of the rows of ``self.Q`` and the radius is
        the largest distance from any row to that center.
        """
        q_center = np.array(self.Q.mean(0))
        q_radius = 0
        for i in range(self.Q.shape[0]):
            norm_val = LA.norm(self.Q.getrow(i).toarray() - q_center, 2)
            if norm_val > q_radius:
                q_radius = norm_val
        return q_center, q_radius

    def get_bounds(self, q_center, q_radius, BT):
        """Return ``(min_dist, max_dist, maxmin_dist, minmax_dist)`` for a node.

        With ``gap`` the distance between the node center and the query
        center:

        * ``min_dist``    = gap - (node radius + query radius)
        * ``max_dist``    = gap + (node radius + query radius)
        * ``maxmin_dist`` = ``min_dist``
        * ``minmax_dist`` = max(0, gap - node radius)
        """
        gap = LA.norm(BT.loc - q_center)
        reach = BT.radius + q_radius

        min_dist = gap - reach
        max_dist = gap + reach
        maxmin_dist = min_dist
        minmax_dist = max(0, gap - BT.radius)
        return min_dist, max_dist, maxmin_dist, minmax_dist

    def prune_child_level1(self, q_center, q_radius, BT):
        """Decide which child of ``BT`` can be discarded.

        Returns ``(c1_prune, c2_prune)``: ``(1, 0)`` prunes child1,
        ``(0, 1)`` prunes child2 and ``(0, 0)`` keeps both.
        """
        _, _, c1_maxmin_d, c1_minmax_d = self.get_bounds(
            q_center, q_radius, BT.child1)
        _, _, c2_maxmin_d, c2_minmax_d = self.get_bounds(
            q_center, q_radius, BT.child2)

        # The lower bound of child1 is above the upper bound of child2.
        if c1_maxmin_d > c2_minmax_d:
            return 1, 0

        # The lower bound of child2 is above the upper bound of child1.
        if c2_maxmin_d > c1_minmax_d:
            return 0, 1

        # Nothing to prune!
        return 0, 0

    def brute_force_l2_norm(self, X, idx):
        """Exhaustively search the rows of ``X`` for the min-max L2 point.

        Parameters
        ----------
        X :
            Candidate rows (must support ``.shape`` and row iteration).
        idx :
            Row ids matching the rows of ``X``.

        Returns
        -------
        (minmax_eu, minmax_x, minmax_id, complexity)
            The smallest per-row maximum distance to the query points, the
            row and id achieving it, and the number of (row, query) pairs
            covered. Row and id are ``None`` when no row improved on the
            initial infinite bound (for example when ``X`` is empty).
        """
        minmax_eu = float("inf")
        minmax_x = None
        minmax_id = None

        for x, row_id in zip(X, idx):
            # Largest distance from this row to any query point.
            max_eu = float("-inf")
            for q in self.Q:
                eu = LA.norm((x - q).toarray())
                if eu > max_eu:
                    max_eu = eu

            if max_eu < minmax_eu:
                minmax_eu = max_eu
                minmax_x = x
                minmax_id = row_id

        complexity = X.shape[0] * self.Q.shape[0]
        return minmax_eu, minmax_x, minmax_id, complexity

    def _visit_child(self, child, child_no, bound, q_center, q_radius, depth):
        """Report the bound of one child and recurse into it if it can win."""
        print('Current minmax = {0} where child {1} minmax bound = {2}'.format(
            self.curr_minmax, child_no, bound))
        if bound < self.curr_minmax:
            self.minmaxdist(child, q_center, q_radius, depth + 1)

    def minmaxdist(self, BT, q_center, q_radius, depth):
        """Recursively search the tree rooted at ``BT`` for the min-max point.

        Updates ``self.curr_minmax``, ``self.curr_winner``, ``self.curr_id``
        and ``self.complexity`` as better candidates are found.
        """
        # Leaf node
        if BT.child1 is None:
            print('I am in child node at depth %d' % (depth))
            # Ideally most branches are pruned and few leaves are reached.
            _, _, _, minmax_d = self.get_bounds(q_center, q_radius, BT)
            print('Current actual minmax = {0}, Ball minmax = {1}'.format(
                self.curr_minmax, minmax_d))

            if minmax_d < self.curr_minmax:
                # The bound allows an improvement: check every point.
                win_dist, win_x, win_id, curr_complexity = \
                    self.brute_force_l2_norm(BT.data, BT.idx)
                self.complexity += curr_complexity
                if win_dist < self.curr_minmax:
                    self.curr_minmax = win_dist
                    self.curr_winner = win_x
                    self.curr_id = win_id
            return

        # Internal node: compute the bounds for both children.
        _, _, _, c1_minmax_d = self.get_bounds(q_center, q_radius, BT.child1)
        _, _, _, c2_minmax_d = self.get_bounds(q_center, q_radius, BT.child2)

        # Work out what nodes to prune and what to leave.
        c1_prune, c2_prune = self.prune_child_level1(q_center, q_radius, BT)
        print('---- pruning flags after level 1 = ({0},{1})'.format(
            c1_prune, c2_prune))

        if c1_prune == 0 and c2_prune == 1:
            # child2 is pruned
            self._visit_child(BT.child1, 1, c1_minmax_d,
                              q_center, q_radius, depth)
        if c2_prune == 0 and c1_prune == 1:
            # child1 is pruned
            self._visit_child(BT.child2, 2, c2_minmax_d,
                              q_center, q_radius, depth)
        if c1_prune == 0 and c2_prune == 0:
            # Nothing pruned: visit the child with the smaller bound first
            # so the second visit is more likely to be skipped.
            if c1_minmax_d < c2_minmax_d:
                order = ((BT.child1, 1, c1_minmax_d),
                         (BT.child2, 2, c2_minmax_d))
            else:
                order = ((BT.child2, 2, c2_minmax_d),
                         (BT.child1, 1, c1_minmax_d))
            for child, child_no, bound in order:
                self._visit_child(child, child_no, bound,
                                  q_center, q_radius, depth)

    def write_complexity_ratio(self):
        """Append the search cost of the last run to ``self.fcomplexity``."""
        max_complexity = self.Xtrain.shape[0] * self.Q.shape[0]
        with open(self.fcomplexity, 'a') as fp:
            fp.write('search complexity ' + str(self.complexity)
                     + ' out of total ' + str(max_complexity) + ' nodes\n')

    def active_select(self):
        """Run the tree search and return the id of the selected point."""
        # NOTE(review): only self.complexity is reset here; curr_minmax keeps
        # its value from earlier calls - confirm this is intended.
        self.complexity = 0
        q_center, q_radius = self.create_query_ball()
        self.minmaxdist(self.tree, q_center, q_radius, 0)
        return self.curr_id

    def check_recursion(self):
        """Debug helper: count ``self.ck_rec`` down to zero recursively.

        ``self.ck_rec`` is not set in ``__init__`` and must be assigned by
        the caller before this method is used.
        """
        if self.ck_rec == 0:
            print('ck rec %d ' % (self.ck_rec))
            return
        print('ck rec %d' % (self.ck_rec))
        self.ck_rec -= 1
        self.check_recursion()
        return


# Open question: should we return only the nearest points? How should
# repetition be dealt with?
# print the ball at level 4
# whenever getting dist for a leaf draw the circle with red
def fit(self, X, Y=()):
    """Store the labels and build the ball tree over ``X``.

    Parameters
    ----------
    X : array-like
        Data points handed to ``BallTree``.
    Y : array-like of int
        Labels; converted to an integer array because they are used as
        an index later on.

    Returns
    -------
    self
    """
    # ``np.int`` was only an alias of the builtin ``int`` and has been
    # removed from NumPy; the builtin gives the same dtype.
    self.Y = np.asanyarray(Y, dtype=int)
    self.ball_tree = BallTree(X, self.window_size)
    return self
class Neighbors(BaseEstimator, ClassifierMixin):
    """Classifier implementing k-Nearest Neighbor Algorithm.

    Parameters
    ----------
    n_neighbors : int
        Default number of neighbors.

    window_size : int
        Window size passed to BallTree.

    Examples
    --------
    >>> samples = [[0.,0.,1.], [1.,0.,0.], [2.,2.,2.], [2.,5.,4.]]
    >>> labels = [0,0,1,1]
    >>> from scikits.learn.neighbors import Neighbors
    >>> neigh = Neighbors(n_neighbors=3)
    >>> neigh.fit(samples, labels)
    Neighbors(n_neighbors=3, window_size=1)
    >>> print(neigh.predict([[0,0,0]]))
    [ 0.]

    Notes
    -----
    http://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm
    """

    def __init__(self, n_neighbors=5, window_size=1):
        """Internally uses the ball tree datastructure and algorithm for fast
        neighbors lookups on high dimensional datasets.
        """
        self.n_neighbors = n_neighbors
        self.window_size = window_size

    def fit(self, X, Y=()):
        """Store the labels and build the ball tree over ``X``.

        Parameters
        ----------
        X : array-like, shape (n, k)
            The data points to be indexed. This array is not copied, and
            so modifying this data will result in bogus results.

        Y : array-like of int
            Labels for the data (only integers are supported).

        Returns
        -------
        self
        """
        # Y must be an integer array because it is used as an index later.
        # ``np.int`` was only an alias of the builtin ``int`` and has been
        # removed from NumPy; the builtin gives the same dtype.
        self.Y = np.asanyarray(Y, dtype=int)
        self.ball_tree = BallTree(X, self.window_size)
        return self

    def kneighbors(self, data, n_neighbors=None):
        """Finds the K-neighbors of a point.

        Parameters
        ----------
        data : array-like
            The new point, or several points.

        n_neighbors : int
            Number of neighbors to get (default is the value
            passed to the constructor).

        Returns
        -------
        dist : array
            Array representing the lengths to point.

        ind : array
            Array representing the indices of the nearest points in the
            population matrix.

        Examples
        --------
        In the following example, we construct a Neighbors class from an
        array representing our data set and ask who's the closest point to
        [1,1,1]

        >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]
        >>> labels = [0, 0, 1]
        >>> from scikits.learn.neighbors import Neighbors
        >>> neigh = Neighbors(n_neighbors=1)
        >>> neigh.fit(samples, labels)
        Neighbors(n_neighbors=1, window_size=1)
        >>> print(neigh.kneighbors([1., 1., 1.]))
        (array(0.5), array(2))

        As you can see, it returns [0.5], and [2], which means that the
        element is at distance 0.5 and is the third element of samples
        (indexes start at 0). You can also query for multiple points:

        >>> print(neigh.kneighbors([[0., 1., 0.], [1., 0., 1.]]))
        (array([ 0.5 , 1.11803399]), array([1, 2]))
        """
        if n_neighbors is None:
            n_neighbors = self.n_neighbors
        return self.ball_tree.query(data, k=n_neighbors)

    def predict(self, T, n_neighbors=None):
        """Predict the class labels for the provided data.

        Parameters
        ----------
        T : array
            A 2-D array representing the test point.

        n_neighbors : int
            Number of neighbors to get (default is the value
            passed to the constructor).

        Returns
        -------
        labels : array
            List of class labels (one for each data sample).

        Examples
        --------
        >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]
        >>> labels = [0, 0, 1]
        >>> from scikits.learn.neighbors import Neighbors
        >>> neigh = Neighbors(n_neighbors=1)
        >>> neigh.fit(samples, labels)
        Neighbors(n_neighbors=1, window_size=1)
        >>> print(neigh.predict([.2, .1, .2]))
        0
        >>> print(neigh.predict([[0., -1., 0.], [3., 2., 0.]]))
        [0 1]
        """
        T = np.asanyarray(T)
        if n_neighbors is None:
            n_neighbors = self.n_neighbors
        return _predict_from_BallTree(self.ball_tree, self.Y, T, n_neighbors)
def check_neighbors(metric):
    """Compare BallTree k-NN distances with brute force for one metric.

    ``X``, ``Y`` and ``k`` are taken from the enclosing scope. Only the
    distances are compared.
    """
    tree = BallTree(X, leaf_size=1, metric=metric)
    tree_dist, _ = tree.query(Y, k)
    brute_dist, _ = brute_force_neighbors(X, Y, k, metric)
    assert_allclose(tree_dist, brute_dist)
def kneighbors_graph(X, n_neighbors, weight=None, ball_tree=None,
                     window_size=1):
    """Computes the (weighted) graph of k-Neighbors

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Coordinates of samples. One sample per row.

    n_neighbors : int
        Number of neighbors for each sample.

    weight : None (default)
        Weights to apply on graph edges.
        If weight is None then no weighting is applied (1 for each edge).
        If weight equals "distance" the edge weight is the euclidean
        distance.
        If weight equals "barycenter" the weights are barycenter weights
        estimated by solving a linear system for each point.

    ball_tree : None or instance of precomputed BallTree

    window_size : int
        Window size passed to the BallTree (only used when ``ball_tree``
        is None).

    Returns
    -------
    A : sparse matrix, shape = [n_samples, n_samples]
        A is returned as a LInked List (LIL) sparse matrix.
        A[i,j] = weight of edge that connects i to j

    Raises
    ------
    ValueError
        If ``weight`` is not None, "distance" or "barycenter".

    Examples
    --------
    >>> X = [[0], [2], [1]]
    >>> from scikits.learn.neighbors import kneighbors_graph
    >>> A = kneighbors_graph(X, 2)
    >>> A.todense()
    matrix([[ 1., 0., 1.],
            [ 0., 1., 1.],
            [ 0., 1., 1.]])
    """
    from scipy import sparse

    X = np.asanyarray(X)
    n_samples = X.shape[0]
    if ball_tree is None:
        ball_tree = BallTree(X, window_size)
    A = sparse.lil_matrix((n_samples, ball_tree.size))
    dist, ind = ball_tree.query(X, k=n_neighbors)

    # Strings are compared with ``==``: identity (``is``) only succeeds when
    # the caller happens to pass the very same string object.
    if weight is None:
        for i, li in enumerate(ind):
            if n_neighbors > 1:
                A[i, list(li)] = np.ones(n_neighbors)
            else:
                A[i, li] = 1.0
    elif weight == "distance":
        for i, li in enumerate(ind):
            if n_neighbors > 1:
                A[i, list(li)] = dist[i, :]
            else:
                A[i, li] = dist[i, 0]
    elif weight == "barycenter":
        # XXX : the next loop could be done in parallel
        # by parallelizing groups of indices
        for i, li in enumerate(ind):
            if n_neighbors > 1:
                X_i = ball_tree.data[li]
                A[i, list(li)] = barycenter_weights(X[i], X_i)
            else:
                A[i, li] = 1.0
    else:
        raise ValueError("Unknown weight type")

    return A
# Compare the pure-Python SlowBallTree (single and dual query) with the
# BallTree implementation on random data, printing timings and whether the
# results agree. The seed is printed so a run can be reproduced.
#
# Each print uses a single pre-formatted argument in parentheses, which
# prints the same text on Python 2 and Python 3.
rseed = np.random.randint(100000)
print("rseed = %i" % rseed)
np.random.seed(rseed)

X = np.random.random((200, 3))
Y = np.random.random((100, 3))

t0 = time()
SBT = SlowBallTree(X, leaf_size=10)
d1, n1 = SBT.query(Y, 3)
t1 = time()
print("python: %.2g sec" % (t1 - t0))

t0 = time()
SBT = SlowBallTree(X, leaf_size=10)
d1a, n1a = SBT.query_dual(Y, 3)
t1 = time()
print("python dual: %.2g sec" % (t1 - t0))

t0 = time()
BT = BallTree(X, leaf_size=10)
d2, n2 = BT.query(Y, 3)
t1 = time()
print("cython: %.2g sec" % (t1 - t0))

print("neighbors match: %s %s" % (np.allclose(n1, n2), np.allclose(n1a, n1)))
print("distances match: %s %s" % (np.allclose(d1, d2), np.allclose(d1a, d1)))
def create_ball_tree(self):
    """Build ``self.tree`` over ``self.Xtrain`` with row ids 0..n-1."""
    n_rows = self.Xtrain.shape[0]
    row_ids = np.array(range(n_rows))
    self.tree = BallTree(self.Xtrain, self.leaf_size, row_ids)
if len(heap) > k: heap.pop() for candidate in heap: # print(candidate) x_, y_ = candidate[0] plt.plot(x_, y_, 'bo', color='pink') print(distances) all = True for candidate in heap: if not candidate[1] in s: all = False break print('All found in the brute force approach? %s' % all) tree = BallTree(points, euclid_metric) distance_balls = knn(tree, point, k, euclid_metric) # print(len(distance_balls)) # print(distance_balls) all = True for candidate in distance_balls: x, y = candidate[0] plt.plot(x, y, 'bo', color='#00ff00') if not candidate[1] in s: all = False break print('All found in the ball tree approach? %s' % all) # traverse(tree, plt) plt.show()
from time import time import numpy as np from ball_tree import BallTree, KDTree from sklearn import neighbors X = np.random.random((20000, 3)) X_query = np.random.random((20000, 3)) t0 = time() BT = BallTree(X, 30) t1 = time() print "BT construction: %.2g sec" % (t1 - t0) t0 = time() KDT = KDTree(X, 30) t1 = time() print "KDT construction: %.2g sec" % (t1 - t0) for k in 1, 2, 4, 8: print "\nquery %i in [%i, %i]:" % (k, X.shape[0], X.shape[1]) print " single dual" t0 = time() d1, i1 = BT.query(X_query, k, dualtree=False) t1 = time() d1, i1 = BT.query(X_query, k, dualtree=True) t2 = time() print " BT: %.3g sec %.3g sec" % (t1 - t0, t2 - t1) d2, i2 = KDT.query(X_query, k, dualtree=False) t3 = time() d2, i2 = KDT.query(X_query, k, dualtree=True)