예제 #1
0
def test_ball_tree_KDE(n_samples=100, n_features=3):
    """Yield kernel-density checks of ``BallTree`` against a slow reference.

    Two random point sets ``X`` (tree data) and ``Y`` (query points) are
    drawn with a fixed seed.  For every kernel name and bandwidth ``h`` the
    reference density is computed with ``compute_kernel_slow`` and one check
    is yielded per combination of ``rtol``, ``atol``, ``dualtree`` and
    ``breadth_first``.  Combinations with ``dualtree`` and a non-zero
    ``rtol`` are skipped.

    Parameters
    ----------
    n_samples : int
        Number of rows drawn for both ``X`` and ``Y``.
    n_features : int
        Number of columns drawn for both ``X`` and ``Y``.

    Yields
    ------
    tuple
        ``(check_results, kernel, h, atol, rtol, dualtree, breadth_first)``.
    """
    np.random.seed(0)
    X = np.random.random((n_samples, n_features))
    Y = np.random.random((n_samples, n_features))
    bt = BallTree(X, leaf_size=10)

    for kernel in [
            'gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear',
            'cosine'
    ]:
        for h in [0.001, 0.01, 0.1]:
            dens_true = compute_kernel_slow(Y, X, kernel, h)

            # Bind the reference density as a default argument.  A plain
            # closure would look `dens_true` up only when the check finally
            # runs, which is too late for a runner that collects every
            # yielded check before executing any of them: all checks would
            # then be compared against the last (kernel, h) reference.
            def check_results(kernel, h, atol, rtol, dualtree, breadth_first,
                              dens_true=dens_true):
                dens = bt.kernel_density(Y,
                                         h,
                                         atol=atol,
                                         rtol=rtol,
                                         kernel=kernel,
                                         dualtree=dualtree,
                                         breadth_first=breadth_first)
                assert_allclose(dens, dens_true, atol=atol, rtol=rtol)

            for rtol in [0, 1E-5]:
                for atol in [1E-10, 1E-5, 0.1]:
                    for dualtree in (True, False):
                        # dual-tree runs are only exercised with rtol == 0
                        if dualtree and rtol > 0:
                            continue
                        for breadth_first in (True, False):
                            yield (check_results, kernel, h, atol, rtol,
                                   dualtree, breadth_first)
예제 #2
0
def bench_ball_tree(N=2000, D=3, k=15, leaf_size=30):
    """Time ``BallTree`` construction and k-NN queries against ``skBallTree``.

    A random ``(N, D)`` data set is built, both trees are constructed on it,
    and the data set is queried against itself for ``k`` neighbours.  The
    ``BallTree`` query is repeated for every combination of ``dualtree`` and
    ``breadth_first``.  Timings, call counts and agreement flags are printed.

    Parameters
    ----------
    N : int
        Number of random points.
    D : int
        Dimensionality of each point.
    k : int
        Number of neighbours requested from every query.
    leaf_size : int
        Leaf size passed to both tree constructors.
    """
    print("Ball Tree")
    X = np.random.random((N, D)).astype(DTYPE)

    t0 = time()
    btskl = skBallTree(X, leaf_size=leaf_size)
    t1 = time()
    bt = BallTree(X, leaf_size=leaf_size)
    t2 = time()

    print("Build:")
    print("  sklearn : %.2g sec" % (t1 - t0))
    print("  new     : %.2g sec" % (t2 - t1))

    t0 = time()
    Dskl, Iskl = btskl.query(X, k)
    t1 = time()

    # Index 0 of every list is the skBallTree baseline; it has no call
    # counter, hence the -1 placeholder.
    dist = [Dskl]
    ind = [Iskl]
    times = [t1 - t0]
    labels = ['sklearn']
    counts = [-1]

    for dualtree in (False, True):
        for breadth_first in (False, True):
            bt.reset_n_calls()
            t0 = time()
            D, I = bt.query(X,
                            k,
                            dualtree=dualtree,
                            breadth_first=breadth_first)
            t1 = time()
            dist.append(D)
            ind.append(I)
            times.append(t1 - t0)
            counts.append(bt.get_n_calls())

            label = 'dual/' if dualtree else 'single/'
            label += 'breadthfirst' if breadth_first else 'depthfirst'
            labels.append(label)

    print("Query:")
    for lab, t, c in zip(labels, times, counts):
        print("  %s : %.2g sec (%i calls)" % (lab, t, c))
    # Blank separator line.  A bare `print` is a no-op expression when print
    # is a function; an explicit empty string prints a blank line under both
    # the statement and the function form.
    print("")
    # Each result is compared with its predecessor; for i == 0 the index
    # i - 1 wraps around to the last result.
    print(
        " distances match: %s" % ', '.join([
            '%s' % np.allclose(dist[i - 1], dist[i]) for i in range(len(dist))
        ]))
    print(
        " indices match: %s" % ', '.join(
            ['%s' % np.allclose(ind[i - 1], ind[i]) for i in range(len(ind))]))
예제 #3
0
    def _check_p_distance_vs_KDT(self, p):
        """Compare Minkowski-``p`` neighbour distances of BallTree and cKDTree.

        Both trees are built on ``self.X`` and queried with ``self.X`` for
        the 5 nearest neighbours; only the returned distances are compared.
        """
        points = self.X
        ball_tree = BallTree(points, leaf_size=10, metric='minkowski', p=p)
        kd_tree = cKDTree(points, leafsize=10)

        ball_dist, _ = ball_tree.query(points, k=5)
        kd_dist, _ = kd_tree.query(points, k=5, p=p)

        assert_array_almost_equal(ball_dist, kd_dist)
예제 #4
0
def test_ball_tree_query():
    """Check BallTree k-NN distances against cKDTree for several ``k``.

    A random ``(100, 5)`` data set is queried against itself for 2, 4 and 6
    neighbours; fresh trees are built for every ``k`` and only the
    distances are compared.
    """
    X = np.random.random(size=(100, 5))

    for n_neighbors in (2, 4, 6):
        ball_tree = BallTree(X)
        kd_tree = cKDTree(X)

        ball_dist, _ = ball_tree.query(X, k=n_neighbors)
        kd_dist, _ = kd_tree.query(X, k=n_neighbors)

        assert_array_almost_equal(ball_dist, kd_dist)
예제 #5
0
def test_ball_tree_p_distance():
    """Check Minkowski BallTree distances against cKDTree for several ``p``.

    A random ``(100, 5)`` data set is queried against itself for 5
    neighbours with ``p`` in 1, 2, 3, 4 and infinity; only the distances
    are compared.
    """
    X = np.random.random(size=(100, 5))

    for order in (1, 2, 3, 4, np.inf):
        ball_tree = BallTree(X, leaf_size=10, metric='minkowski', p=order)
        kd_tree = cKDTree(X, leafsize=10)

        ball_dist, _ = ball_tree.query(X, k=5)
        kd_dist, _ = kd_tree.query(X, k=5, p=order)

        assert_array_almost_equal(ball_dist, kd_dist)
예제 #6
0
    def check_neighbors(dualtree, breadth_first, k, metric, kwargs):
        """Compare BallTree neighbour distances with a brute-force search.

        The tree is built on the enclosing scope's ``X`` and queried with
        its ``Y``; ``kwargs`` is forwarded to both the tree constructor and
        ``brute_force_neighbors``.
        """
        tree = BallTree(X, leaf_size=1, metric=metric, **kwargs)
        tree_dist, _ = tree.query(Y,
                                  k,
                                  dualtree=dualtree,
                                  breadth_first=breadth_first)
        brute_dist, _ = brute_force_neighbors(X, Y, k, metric, **kwargs)

        # Indices are deliberately left unchecked: duplicate distances can
        # legitimately produce different index orderings, whereas the
        # distances themselves are unaffected by ties.
        assert_allclose(tree_dist, brute_dist)
예제 #7
0
def main():
    """Run the radius-neighbour checks and print a tree/brute-force comparison.

    ``check`` is invoked once with each setup string, then a random
    ``(1000, 3)`` data set is searched around its first point with both
    ``BallTree`` and ``BruteForce`` and the equality of the results is
    printed.
    """
    statement = 'nbrs.radius_neighbors(p, radius)'
    check(statement, setup_str_bt)
    check(statement, setup_str_bf)

    n_points, n_dims = 1000, 3
    data = np.random.rand(n_points, n_dims)
    query_point = data[0]
    search_radius = 0.4
    from_tree = BallTree(data).radius_neighbors(query_point, search_radius)
    from_brute = BruteForce(data).radius_neighbors(query_point, search_radius)
    print(from_tree == from_brute)
예제 #8
0
    def _check_metrics_float(self, k, metric, kwargs):
        """Check BallTree ``k``-NN distances against a full distance matrix.

        ``self.X`` is queried against itself; the reference distances are
        the ``k`` smallest entries of each row of the pairwise matrix
        produced by ``DistanceMetric`` for the same metric and ``kwargs``.
        """
        points = self.X
        tree = BallTree(points, metric=metric, **kwargs)
        tree_dist, _ = tree.query(points, k=k)

        metric_obj = DistanceMetric(metric=metric, **kwargs)
        pairwise = metric_obj.pdist(points, squareform=True)

        nearest_cols = np.argsort(pairwise, 1)[:, :k]
        row_ids = np.arange(points.shape[0])[:, None]
        reference_dist = pairwise[row_ids, nearest_cols]

        # Indices are not compared: a tie for nearest neighbour could make
        # them differ, while the distances still show whether the search
        # succeeded.
        assert_array_almost_equal(tree_dist, reference_dist)
예제 #9
0
    def _check_metrics_bool(self, k, metric, kwargs):
        """Check BallTree ``k``-NN distances for the boolean fixtures.

        The tree is built on ``self.Xbool`` and queried with ``self.Ybool``;
        the reference distances are the ``k`` smallest entries of each row
        of the cross-distance matrix from ``DistanceMetric``.
        """
        queries = self.Ybool
        tree = BallTree(self.Xbool, metric=metric, **kwargs)
        tree_dist, _ = tree.query(queries, k=k)

        metric_obj = DistanceMetric(metric=metric, **kwargs)
        cross = metric_obj.cdist(queries, self.Xbool)

        nearest_cols = np.argsort(cross, 1)[:, :k]
        row_ids = np.arange(queries.shape[0])[:, None]
        reference_dist = cross[row_ids, nearest_cols]

        # Indices are not compared: ties for nearest neighbour are very
        # common here and would make the test fail, while the distances
        # are correct either way.
        assert_array_almost_equal(tree_dist, reference_dist)
예제 #10
0
def test_ball_tree_query_radius_count(n_samples=100, n_features=10):
    """Check ``query_radius(count_only=True)`` against a direct count.

    Random data in ``[-1, 1)`` is queried against itself with a radius equal
    to the mean pairwise distance; the counts must match the number of
    matrix entries per row that do not exceed that radius.
    """
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1

    pairwise = DistanceMetric().pdist(X, squareform=True)
    radius = np.mean(pairwise)

    tree = BallTree(X)
    tree_counts = tree.query_radius(X, radius, count_only=True)

    within = pairwise <= radius
    expected_counts = within.sum(1)

    assert_array_almost_equal(tree_counts, expected_counts)
예제 #11
0
    def test_query_radius_count(self):
        """Check ``query_radius(count_only=True)`` against a direct count.

        ``self.X`` is rescaled with ``2 * X - 1`` and queried against itself
        with a radius equal to the mean pairwise distance.
        """
        # center the data
        centered = 2 * self.X - 1

        pairwise = DistanceMetric().pdist(centered, squareform=True)
        radius = np.mean(pairwise)

        tree = BallTree(centered)
        tree_counts = tree.query_radius(centered, radius, count_only=True)

        within = pairwise <= radius
        expected_counts = within.sum(1)

        assert_array_almost_equal(tree_counts, expected_counts)
예제 #12
0
def test_ball_tree_two_point(n_samples=100, n_features=3):
    """Yield two-point-correlation checks for single- and dual-tree modes.

    The expected count for each of 10 radii in ``[0, 1]`` is the number of
    (Y, X) pairs whose euclidean distance does not exceed that radius.
    """
    np.random.seed(0)
    X = np.random.random((n_samples, n_features))
    Y = np.random.random((n_samples, n_features))
    radii = np.linspace(0, 1, 10)
    tree = BallTree(X, leaf_size=10)

    euclidean = DistanceMetric.get_metric("euclidean")
    pair_dist = euclidean.pairwise(Y, X)
    expected = [(pair_dist <= radius).sum() for radius in radii]

    def check_two_point(r, dualtree):
        observed = tree.two_point_correlation(Y, r=r, dualtree=dualtree)
        assert_allclose(observed, expected)

    for use_dualtree in (True, False):
        yield check_two_point, radii, use_dualtree
예제 #13
0
    def test_query_radius_indices(self, n_queries=20):
        """Check the indices returned by ``query_radius`` against brute force.

        ``self.X`` is rescaled with ``2 * X - 1``; its first ``n_queries``
        rows are used as query points with a radius equal to the mean
        query-to-data distance.  The per-query index arrays are sorted and
        concatenated, then compared with the column indices of every entry
        of the distance matrix that does not exceed the radius.

        Parameters
        ----------
        n_queries : int
            Number of leading rows of the rescaled data used as queries.
        """
        # center the data
        X = 2 * self.X - 1

        dm = DistanceMetric()
        D = dm.cdist(X[:n_queries], X)
        r = np.mean(D)

        bt = BallTree(X)
        ind = bt.query_radius(X[:n_queries], r, return_distance=False)
        # Every row holds 0..n-1, so boolean masking below yields the column
        # index of each selected entry, in row-major order.
        ind2 = np.zeros(D.shape) + np.arange(D.shape[1])

        # np.concatenate requires a real sequence; on Python 3 `map` returns
        # a lazy iterator which it rejects, so build the list explicitly.
        ind = np.concatenate([np.sort(row) for row in ind])
        ind2 = ind2[D <= r]

        assert_array_almost_equal(ind, ind2)
예제 #14
0
def test_ball_tree_pickle():
    """Yield pickle round-trip checks for protocols 0, 1 and 2.

    A tree built on seeded random data is pickled and restored; both
    values returned by ``query`` must be unchanged after the round trip.
    """
    import pickle
    np.random.seed(0)
    points = np.random.random((10, 3))
    source_tree = BallTree(points, leaf_size=1)
    first_before, second_before = source_tree.query(points)

    def check_pickle_protocol(protocol):
        payload = pickle.dumps(source_tree, protocol=protocol)
        restored_tree = pickle.loads(payload)
        first_after, second_after = restored_tree.query(points)
        assert_allclose(first_before, first_after)
        assert_allclose(second_before, second_after)

    for proto in (0, 1, 2):
        yield check_pickle_protocol, proto
예제 #15
0
def test_ball_tree_query_radius_distance(n_samples=100, n_features=10):
    """Check distances returned by ``query_radius`` for a query at the origin.

    For 100 radii between the norms of the first and last data rows, the
    distances reported by the tree must equal the euclidean distances
    recomputed from the returned indices.
    """
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    tree = BallTree(X, leaf_size=5)
    norms = np.sqrt(((X - query_pt)**2).sum(1))

    for radius in np.linspace(norms[0], norms[-1], 100):
        all_ind, all_dist = tree.query_radius(query_pt,
                                              radius + eps,
                                              return_distance=True)

        # a single query point was given, so only the first entry is used
        neighbors = all_ind[0]
        reported = all_dist[0]

        offsets = query_pt - X[neighbors]
        recomputed = np.sqrt((offsets**2).sum(1))

        assert_array_almost_equal(recomputed, reported)
예제 #16
0
def test_ball_tree_query_radius(n_samples=100, n_features=10):
    """Check indices returned by ``query_radius`` for a query at the origin.

    For 100 radii between the norms of the first and last data rows, the
    sorted indices from the tree must equal the sorted indices of all rows
    whose norm does not exceed the (slightly padded) radius.
    """
    np.random.seed(0)
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    tree = BallTree(X, leaf_size=5)
    norms = np.sqrt(((X - query_pt)**2).sum(1))

    for radius in np.linspace(norms[0], norms[-1], 100):
        padded = radius + eps
        found = tree.query_radius(query_pt, padded)[0]
        expected = np.where(norms <= padded)[0]

        found.sort()
        expected.sort()

        assert_allclose(expected, found)
예제 #17
0
    def test_query_radius_distance(self):
        """Yield radius-query distance checks over 10 radii.

        ``self.X`` is rescaled with ``2 * X - 1`` and the query point is the
        first rescaled row multiplied by 0.01.  The radii span the smallest
        to the largest reference distance from that query point.
        """
        # center the data
        centered = 2 * self.X - 1

        # choose a query point near the origin
        query_pt = 0.01 * centered[:1]

        eps = 1E-15  # roundoff error can cause test to fail
        tree = BallTree(centered, leaf_size=5)

        # compute reference distances
        reference = DistanceMetric().cdist(query_pt, centered)[0]
        reference.sort()

        smallest, largest = reference[0], reference[-1]
        for radius in np.linspace(smallest, largest, 10):
            yield (self._check_query_radius_distance, centered, tree,
                   query_pt, reference, radius, eps)
예제 #18
0
def test_ball_tree_query_radius_indices(n_samples=100, n_features=10):
    """Check per-query indices from ``query_radius`` against brute force.

    The first 10 rows of random data in ``[-1, 1)`` are used as queries with
    a radius equal to the mean query-to-data distance.
    """
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1

    cross = DistanceMetric().cdist(X[:10], X)
    radius = np.mean(cross)

    tree = BallTree(X)
    found = tree.query_radius(X[:10], radius, return_distance=False)

    for row in range(10):
        from_tree = found[row]
        from_matrix = np.where(cross[row] <= radius)[0]

        from_tree.sort()
        from_matrix.sort()

        assert_array_almost_equal(from_tree, from_matrix)
예제 #19
0
    def __init__(self, training_data_size_ratio: float, k: int = 5):
        """Load and shuffle the data set, split it, and build per-class trees.

        Args:
            training_data_size_ratio: fraction of the shuffled rows placed
                in the training split; the remaining rows form the test
                split.
            k: stored on the instance as ``self.k``.
        """
        self.k = k
        self.dataset = prepare_data().values
        np.random.shuffle(self.dataset)
        split_at = int(len(self.dataset) * training_data_size_ratio)

        # The labels are the distinct values of the last column.
        self.classes = set(self.dataset[:, -1])
        label_col = self.dataset.shape[1] - 1

        train_rows = self.dataset[:split_at]
        test_rows = self.dataset[split_at:]
        self.training_data = Classifier.create_classes(train_rows, label_col,
                                                       self.classes)
        self.test_data = Classifier.create_classes(test_rows, label_col,
                                                   self.classes)

        # One ball tree per class found in the training split.
        self.training_trees = {
            label: BallTree(rows, euclid_metric)
            for label, rows in self.training_data.items()
        }
 def active_select(self):
     """Return the index of the training point with the longest distance list.

     A ball tree is built over ``self.Q``.  For each row of ``self.Xtrain``
     the per-point state (``dist_list``, ``min_dist``, ``upper_b``) is reset
     and ``self.max_query`` is run against the tree; the row whose resulting
     ``self.dist_list`` is longest wins.  Ties keep the earliest row, and 0
     is returned when no row yields a non-empty list.
     """
     # ball tree over the query variables
     query_ids = np.array(range(self.Q.shape[0]))
     Qtree = BallTree(self.Q, self.leaf_size, query_ids)

     best_count = 0  # largest dist_list length seen over all data points
     best_row = 0  # row index of the data point that achieved best_count

     for row, x in enumerate(self.Xtrain):
         # Reset the per-point state that max_query is expected to update
         # (per the original comments: distances of query points lying
         # within a threshold of the minimum distance).
         self.dist_list = []
         self.min_dist = float('inf')
         self.upper_b = self.min_dist + self.delta
         self.max_query(x, Qtree, depth=0)

         # number of queries recorded for this data point
         n_recorded = len(self.dist_list)
         if n_recorded > best_count:
             best_count, best_row = n_recorded, row
     return best_row
예제 #21
0
        if len(heap) > k:
            heap.pop()
    for candidate in heap:
        # print(candidate)
        x_, y_ = candidate[0]
        plt.plot(x_, y_, 'bo', color='pink')

    print(distances)
    all = True
    for candidate in heap:
        if not candidate[1] in s:
            all = False
            break
    print('All found in the brute force approach? %s' % all)

    tree = BallTree(points, euclid_metric)
    distance_balls = knn(tree, point, k, euclid_metric)
    # print(len(distance_balls))
    # print(distance_balls)

    all = True
    for candidate in distance_balls:
        x, y = candidate[0]
        plt.plot(x, y, 'bo', color='#00ff00')
        if not candidate[1] in s:
            all = False
            break
    print('All found in the ball tree approach?   %s' % all)

    # traverse(tree, plt)
    plt.show()
예제 #22
0
 def test_pickle(self):
     """Yield one pickle check per protocol (0, 1, 2) for a tree on ``self.X``."""
     tree = BallTree(self.X, leaf_size=1)
     first_result, second_result = tree.query(self.X)
     for proto in (0, 1, 2):
         yield (self._check_pickle, proto, tree, first_result, second_result)
예제 #23
0
from time import time
import numpy as np
from ball_tree import BallTree

# Benchmark script: time BallTree construction, k-NN queries (single and
# dual tree) and radius queries on 10000 random 3-D points.
X = np.random.random((10000, 3))

t0 = time()
BT = BallTree(X, 30)
t1 = time()
# Single-argument print(...) calls produce the same output whether print is
# a statement or a function.
print("construction: %.2g sec" % (t1 - t0))

for k in [1, 2, 4, 8]:
    for dual in (False, True):
        t0 = time()
        BT.query(X, k, dualtree=dual)
        t1 = time()
        dual_str = ' (dual)' if dual else ''
        print("query %i in [%i, %i]%s: %.3g sec" % (k, X.shape[0], X.shape[1],
                                                    dual_str, t1 - t0))

# Radius queries use only the first 1000 points as query points.
for r in 0.1, 0.3, 0.5:
    t0 = time()
    BT.query_radius(X[:1000], r)
    t1 = time()
    print("query r<%.1f in [%i, %i]: %.3g sec" % (r, X.shape[0], X.shape[1],
                                                  t1 - t0))
예제 #24
0
 def test_query_knn(self):
     """Yield k-NN checks comparing a BallTree and a cKDTree built on ``self.X``.

     One check is yielded for every ``k`` in 1, 2, 4, 8, 16 combined with
     dual-tree mode on and off.
     """
     ball_tree = BallTree(self.X)
     kd_tree = cKDTree(self.X)
     for n_neighbors in (1, 2, 4, 8, 16):
         for use_dualtree in [True, False]:
             yield (self._check_query_knn, ball_tree, kd_tree, n_neighbors,
                    use_dualtree)
예제 #25
0
 def check_neighbors(metric):
     """Compare BallTree distances with brute force for ``metric``.

     ``X``, ``Y`` and ``k`` come from the enclosing scope; only the
     distances are compared.
     """
     tree = BallTree(X, leaf_size=1, metric=metric)
     tree_dist, _ = tree.query(Y, k)
     brute_dist, _ = brute_force_neighbors(X, Y, k, metric)
     assert_allclose(tree_dist, brute_dist)
예제 #26
0
def bench_KDE(N=1000, D=3, h=0.5, leaf_size=30):
    """Benchmark ``BallTree.kernel_density`` against a brute-force gaussian sum.

    For each bandwidth in a fixed list the density of a random ``(N, D)``
    data set evaluated at its own points is computed by brute force and by
    the tree in four modes (single/dual tree, breadth/depth first).
    Timings, call counts and agreement with the brute-force result are
    printed.

    Parameters
    ----------
    N : int
        Number of random points.
    D : int
        Dimensionality of each point.
    h : float
        Kept for interface compatibility.  The bandwidths actually
        benchmarked are the fixed values iterated in the body, which
        override this argument.
    leaf_size : int
        Leaf size passed to the tree constructor.
    """
    X = np.random.random((N, D))
    bt = BallTree(X, leaf_size=leaf_size)
    kernel = 'gaussian'

    # Single-argument print(...) calls produce the same output whether print
    # is a statement or a function.
    print("Kernel Density:")
    atol = 1E-5
    rtol = 1E-5

    # One entry per tree run: (report format, kernel_density keywords,
    # np.allclose keywords).  Each label states the `breadth_first` value
    # actually passed to that run.  The dual-tree runs pass no rtol, and
    # their agreement check uses only atol.
    runs = [
        ("   single tree (breadth first): %.2g sec (%i calls)",
         dict(rtol=rtol, dualtree=False, breadth_first=True),
         dict(atol=atol, rtol=rtol)),
        ("   single tree (depth first): %.2g sec (%i calls)",
         dict(rtol=rtol, dualtree=False, breadth_first=False),
         dict(atol=atol, rtol=rtol)),
        ("   dual tree: (breadth first) %.2g sec (%i calls)",
         dict(dualtree=True, breadth_first=True),
         dict(atol=atol)),
        ("   dual tree: (depth first) %.2g sec (%i calls)",
         dict(dualtree=True, breadth_first=False),
         dict(atol=atol)),
    ]

    for h in [0.001, 0.01, 0.1]:
        t0 = time()
        dens_true = np.exp(-0.5 * ((X[:, None, :] - X)**2).sum(-1) /
                           h**2).sum(-1)
        # NOTE(review): the normalisation is h * sqrt(2 * pi) regardless of
        # D -- confirm this matches what kernel_density returns.
        dens_true /= h * np.sqrt(2 * np.pi)
        t1 = time()

        reports = []
        matches = []
        for fmt, query_kwargs, close_kwargs in runs:
            bt.reset_n_calls()
            start = time()
            dens = bt.kernel_density(X,
                                     h,
                                     atol=atol,
                                     kernel=kernel,
                                     **query_kwargs)
            stop = time()
            reports.append(fmt % (stop - start, bt.get_n_calls()))
            matches.append(np.allclose(dens_true, dens, **close_kwargs))

        print(" h = %.3f" % h)
        print("   brute force: %.2g sec (%i calls)" % (t1 - t0, N * N))
        for line in reports:
            print(line)
        print("   distances match: %s" % (tuple(matches),))
예제 #27
0
    rseed = np.random.randint(100000)
    print "rseed = %i" % rseed
    np.random.seed(rseed)
    X = np.random.random((200, 3))
    Y = np.random.random((100, 3))

    t0 = time()
    SBT = SlowBallTree(X, leaf_size=10)
    d1, n1 = SBT.query(Y, 3)
    t1 = time()

    print "python: %.2g sec" % (t1 - t0)

    t0 = time()
    SBT = SlowBallTree(X, leaf_size=10)
    d1a, n1a = SBT.query_dual(Y, 3)
    t1 = time()

    print "python dual: %.2g sec" % (t1 - t0)

    t0 = time()
    BT = BallTree(X, leaf_size=10)
    d2, n2 = BT.query(Y, 3)
    t1 = time()

    print "cython: %.2g sec" % (t1 - t0)

    print "neighbors match:", np.allclose(n1, n2), np.allclose(n1a, n1)
    print "distances match:", np.allclose(d1, d2), np.allclose(d1a, d1)
예제 #28
0
 def create_ball_tree(self):  # done
     """Build ``self.tree`` over ``self.Xtrain`` with one integer id per row."""
     row_ids = np.array(range(self.Xtrain.shape[0]))
     self.tree = BallTree(self.Xtrain, self.leaf_size, row_ids)