Example #1
def bench_ball_tree(N=2000, D=3, k=15, leaf_size=30):
    print("Ball Tree")
    X = np.random.random((N, D)).astype(DTYPE)

    t0 = time()
    btskl = skBallTree(X, leaf_size=leaf_size)
    t1 = time()
    bt = BallTree(X, leaf_size=leaf_size)
    t2 = time()

    print("Build:")
    print("  sklearn : %.2g sec" % (t1 - t0))
    print("  new     : %.2g sec" % (t2 - t1))

    t0 = time()
    Dskl, Iskl = btskl.query(X, k)
    t1 = time()

    dist = [Dskl]
    ind = [Iskl]
    times = [t1 - t0]
    labels = ['sklearn']
    counts = [-1]

    for dualtree in (False, True):
        for breadth_first in (False, True):
            bt.reset_n_calls()
            t0 = time()
            D, I = bt.query(X,
                            k,
                            dualtree=dualtree,
                            breadth_first=breadth_first)
            t1 = time()
            dist.append(D)
            ind.append(I)
            times.append(t1 - t0)
            counts.append(bt.get_n_calls())

            if dualtree:
                label = 'dual/'
            else:
                label = 'single/'

            if breadth_first:
                label += 'breadthfirst'
            else:
                label += 'depthfirst'
            labels.append(label)

    print("Query:")
    for lab, t, c in zip(labels, times, counts):
        print("  %s : %.2g sec (%i calls)" % (lab, t, c))
    print()
    print(
        " distances match: %s" % ', '.join([
            '%s' % np.allclose(dist[i - 1], dist[i]) for i in range(len(dist))
        ]))
    print(
        " indices match: %s" % ', '.join(
            ['%s' % np.allclose(ind[i - 1], ind[i]) for i in range(len(ind))]))
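The benchmark above does not show its own setup. A minimal sketch of the imports and constants it appears to assume, if bench_ball_tree lives in the same file (the names skBallTree, DTYPE and the local ball_tree module are assumptions, not part of the original listing):

from time import time

import numpy as np
from sklearn.neighbors import BallTree as skBallTree  # reference implementation being compared against
from ball_tree import BallTree  # hypothetical module holding the rewritten tree

DTYPE = np.float64  # dtype the benchmark casts its random inputs to

if __name__ == '__main__':
    bench_ball_tree(N=2000, D=3, k=15, leaf_size=30)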
Example #2
    def _check_p_distance_vs_KDT(self, p):
        bt = BallTree(self.X, leaf_size=10, metric='minkowski', p=p)
        kdt = cKDTree(self.X, leafsize=10)

        dist_bt, ind_bt = bt.query(self.X, k=5)
        dist_kd, ind_kd = kdt.query(self.X, k=5, p=p)

        assert_array_almost_equal(dist_bt, dist_kd)
Example #3
    def _check_p_distance_vs_KDT(self, p):
        bt = BallTree(self.X, leaf_size=10, metric='minkowski', p=p)
        kdt = cKDTree(self.X, leafsize=10)

        dist_bt, ind_bt = bt.query(self.X, k=5)
        dist_kd, ind_kd = kdt.query(self.X, k=5, p=p)

        assert_array_almost_equal(dist_bt, dist_kd)
Example #4
    def check_neighbors(dualtree, breadth_first, k, metric, kwargs):
        bt = BallTree(X, leaf_size=1, metric=metric, **kwargs)
        dist1, ind1 = bt.query(Y, k, dualtree=dualtree,
                               breadth_first=breadth_first)
        dist2, ind2 = brute_force_neighbors(X, Y, k, metric, **kwargs)

        # don't check indices here: if there are any duplicate distances,
        # the indices may not match.  Distances should not have this problem.
        assert_allclose(dist1, dist2)
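The test above relies on a brute_force_neighbors helper that the listing never shows. A hedged sketch of what such a helper could look like, written against the released sklearn DistanceMetric API (the dev-branch tests in this listing use a slightly different DistanceMetric interface):

import numpy as np
from sklearn.neighbors import DistanceMetric

def brute_force_neighbors(X, Y, k, metric, **kwargs):
    # Full (n_queries, n_samples) distance matrix between Y and X.
    D = DistanceMetric.get_metric(metric, **kwargs).pairwise(Y, X)
    ind = np.argsort(D, axis=1)[:, :k]             # indices of the k nearest points
    dist = D[np.arange(Y.shape[0])[:, None], ind]  # the matching distances
    return dist, ind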
Example #5
def bench_ball_tree(N=2000, D=3, k=15, leaf_size=30):
    print("Ball Tree")
    X = np.random.random((N, D)).astype(DTYPE)

    t0 = time()
    btskl = skBallTree(X, leaf_size=leaf_size)
    t1 = time()
    bt = BallTree(X, leaf_size=leaf_size)
    t2 = time()

    print("Build:")
    print("  sklearn : %.2g sec" % (t1 - t0))
    print("  new     : %.2g sec" % (t2 - t1))

    t0 = time()
    Dskl, Iskl = btskl.query(X, k)
    t1 = time()

    dist = [Dskl]
    ind = [Iskl]
    times = [t1 - t0]
    labels = ['sklearn']
    counts = [-1]

    for dualtree in (False, True):
        for breadth_first in (False, True):
            bt.reset_n_calls()
            t0 = time()
            D, I = bt.query(X, k, dualtree=dualtree,
                            breadth_first=breadth_first)
            t1 = time()
            dist.append(D)
            ind.append(I)
            times.append(t1 - t0)
            counts.append(bt.get_n_calls())

            if dualtree:
                label = 'dual/'
            else:
                label = 'single/'

            if breadth_first:
                label += 'breadthfirst'
            else:
                label += 'depthfirst'
            labels.append(label)

    print("Query:")
    for lab, t, c in zip(labels, times, counts):
        print("  %s : %.2g sec (%i calls)" % (lab, t, c))
    print()
    print(" distances match: %s"
          % ', '.join(['%s' % np.allclose(dist[i - 1], dist[i])
                       for i in range(len(dist))]))
    print(" indices match: %s"
          % ', '.join(['%s' % np.allclose(ind[i - 1], ind[i])
                       for i in range(len(ind))]))
Example #6
    def check_neighbors(dualtree, breadth_first, k, metric, kwargs):
        bt = BallTree(X, leaf_size=1, metric=metric, **kwargs)
        dist1, ind1 = bt.query(Y,
                               k,
                               dualtree=dualtree,
                               breadth_first=breadth_first)
        dist2, ind2 = brute_force_neighbors(X, Y, k, metric, **kwargs)

        # don't check indices here: if there are any duplicate distances,
        # the indices may not match.  Distances should not have this problem.
        assert_allclose(dist1, dist2)
Example #7
def test_ball_tree_query():
    X = np.random.random(size=(100, 5))

    for k in (2, 4, 6):
        bt = BallTree(X)
        kdt = cKDTree(X)

        dist_bt, ind_bt = bt.query(X, k=k)
        dist_kd, ind_kd = kdt.query(X, k=k)

        assert_array_almost_equal(dist_bt, dist_kd)
Example #8
def test_ball_tree_query():
    X = np.random.random(size=(100, 5))

    for k in (2, 4, 6):
        bt = BallTree(X)
        kdt = cKDTree(X)

        dist_bt, ind_bt = bt.query(X, k=k)
        dist_kd, ind_kd = kdt.query(X, k=k)

        assert_array_almost_equal(dist_bt, dist_kd)
Example #9
def test_ball_tree_p_distance():
    X = np.random.random(size=(100, 5))

    for p in (1, 2, 3, 4, np.inf):
        bt = BallTree(X, leaf_size=10, metric="minkowski", p=p)
        kdt = cKDTree(X, leafsize=10)

        dist_bt, ind_bt = bt.query(X, k=5)
        dist_kd, ind_kd = kdt.query(X, k=5, p=p)

        assert_array_almost_equal(dist_bt, dist_kd)
Example #10
def test_ball_tree_p_distance():
    X = np.random.random(size=(100, 5))

    for p in (1, 2, 3, 4, np.inf):
        bt = BallTree(X, leaf_size=10, metric='minkowski', p=p)
        kdt = cKDTree(X, leafsize=10)

        dist_bt, ind_bt = bt.query(X, k=5)
        dist_kd, ind_kd = kdt.query(X, k=5, p=p)

        assert_array_almost_equal(dist_bt, dist_kd)
Example #11
    def _check_metrics_bool(self, k, metric, kwargs):
        bt = BallTree(self.Xbool, metric=metric, **kwargs)
        dist_bt, ind_bt = bt.query(self.Ybool, k=k)

        dm = DistanceMetric(metric=metric, **kwargs)
        D = dm.cdist(self.Ybool, self.Xbool)

        ind_dm = np.argsort(D, 1)[:, :k]
        dist_dm = D[np.arange(self.Ybool.shape[0])[:, None], ind_dm]

        # we don't check the indices here because there are very often
        # ties for nearest neighbors, which cause the test to fail.
        # Distances will be correct in either case.
        assert_array_almost_equal(dist_bt, dist_dm)
Example #12
    def _check_metrics_float(self, k, metric, kwargs):
        bt = BallTree(self.X, metric=metric, **kwargs)
        dist_bt, ind_bt = bt.query(self.X, k=k)

        dm = DistanceMetric(metric=metric, **kwargs)
        D = dm.pdist(self.X, squareform=True)

        ind_dm = np.argsort(D, 1)[:, :k]
        dist_dm = D[np.arange(self.X.shape[0])[:, None], ind_dm]

        # we don't check the indices here because if there is a tie for
        # nearest neighbor, then the test may fail.  Distances will reflect
        # whether the search was successful
        assert_array_almost_equal(dist_bt, dist_dm)
Example #13
    def _check_metrics_bool(self, k, metric, kwargs):
        bt = BallTree(self.Xbool, metric=metric, **kwargs)
        dist_bt, ind_bt = bt.query(self.Ybool, k=k)

        dm = DistanceMetric(metric=metric, **kwargs)
        D = dm.cdist(self.Ybool, self.Xbool)

        ind_dm = np.argsort(D, 1)[:, :k]
        dist_dm = D[np.arange(self.Ybool.shape[0])[:, None], ind_dm]
        
        # we don't check the indices here because there are very often
        # ties for nearest neighbors, which cause the test to fail.
        # Distances will be correct in either case.
        assert_array_almost_equal(dist_bt, dist_dm)
Example #14
    def _check_metrics_float(self, k, metric, kwargs):
        bt = BallTree(self.X, metric=metric, **kwargs)
        dist_bt, ind_bt = bt.query(self.X, k=k)

        dm = DistanceMetric(metric=metric, **kwargs)
        D = dm.pdist(self.X, squareform=True)

        ind_dm = np.argsort(D, 1)[:, :k]
        dist_dm = D[np.arange(self.X.shape[0])[:, None], ind_dm]

        # we don't check the indices here because if there is a tie for
        # nearest neighbor, then the test may fail.  Distances will reflect
        # whether the search was successful
        assert_array_almost_equal(dist_bt, dist_dm)
Example #15
def test_ball_tree_pickle():
    import pickle
    np.random.seed(0)
    X = np.random.random((10, 3))
    bt1 = BallTree(X, leaf_size=1)
    dist1, ind1 = bt1.query(X)  # query returns (distances, indices)

    def check_pickle_protocol(protocol):
        s = pickle.dumps(bt1, protocol=protocol)
        bt2 = pickle.loads(s)
        dist2, ind2 = bt2.query(X)
        assert_allclose(dist1, dist2)
        assert_allclose(ind1, ind2)

    for protocol in (0, 1, 2):
        yield check_pickle_protocol, protocol
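The pickle test above uses nose-style yield checks, which modern test runners no longer collect. A rough pytest equivalent of the same round-trip check (a sketch, not part of the original suite; the BallTree import is assumed):

import pickle

import numpy as np
import pytest
from numpy.testing import assert_allclose
from sklearn.neighbors import BallTree  # assumed import

@pytest.mark.parametrize("protocol", [0, 1, 2])
def test_ball_tree_pickle_protocol(protocol):
    np.random.seed(0)
    X = np.random.random((10, 3))
    bt1 = BallTree(X, leaf_size=1)
    dist1, ind1 = bt1.query(X)  # query returns (distances, indices)
    bt2 = pickle.loads(pickle.dumps(bt1, protocol=protocol))
    dist2, ind2 = bt2.query(X)
    assert_allclose(dist1, dist2)
    assert_allclose(ind1, ind2)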
Example #16
def test_ball_tree_pickle():
    import pickle
    np.random.seed(0)
    X = np.random.random((10, 3))
    bt1 = BallTree(X, leaf_size=1)
    dist1, ind1 = bt1.query(X)  # query returns (distances, indices)

    def check_pickle_protocol(protocol):
        s = pickle.dumps(bt1, protocol=protocol)
        bt2 = pickle.loads(s)
        dist2, ind2 = bt2.query(X)
        assert_allclose(dist1, dist2)
        assert_allclose(ind1, ind2)

    for protocol in (0, 1, 2):
        yield check_pickle_protocol, protocol
Example #17
def kneighbors_graph(X, n_neighbors, weight=None, ball_tree=None,
                     window_size=1):
    """Computes the (weighted) graph of k-Neighbors

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Coordinates of samples. One sample per row.

    n_neighbors : int
        Number of neighbors for each sample.

    weight : None, "distance" or "barycenter" (default=None)
        Weights to apply to the graph edges. If weight is None,
        no weighting is applied (each edge gets weight 1).
        If weight equals "distance", the edge weight is the
        euclidean distance between the points. If weight equals
        "barycenter", the weights are barycenter weights estimated
        by solving a linear system for each point.

    ball_tree : None or instance of precomputed BallTree

    window_size : int
        Window size passed to the BallTree

    Returns
    -------
    A : sparse matrix, shape = [n_samples, n_samples]
        A is returned as a linked-list (LIL) sparse matrix.
        A[i, j] is the weight of the edge connecting i to j.

    Examples
    --------
    >>> X = [[0], [2], [1]]
    >>> from scikits.learn.neighbors import kneighbors_graph
    >>> A = kneighbors_graph(X, 2)
    >>> A.todense()
    matrix([[ 1.,  0.,  1.],
            [ 0.,  1.,  1.],
            [ 0.,  1.,  1.]])
    """
    from scipy import sparse
    X = np.asanyarray(X)
    n_samples = X.shape[0]
    if ball_tree is None:
        ball_tree = BallTree(X, window_size)
    A = sparse.lil_matrix((n_samples, ball_tree.size))
    dist, ind = ball_tree.query(X, k=n_neighbors)
    if weight is None:
        for i, li in enumerate(ind):
            if n_neighbors > 1:
                A[i, list(li)] = np.ones(n_neighbors)
            else:
                A[i, li] = 1.0
    elif weight is "distance":
        for i, li in enumerate(ind):
            if n_neighbors > 1:
                A[i, list(li)] = dist[i, :]
            else:
                A[i, li] = dist[i, 0]
    elif weight is "barycenter":
        # XXX : the next loop could be done in parallel
        # by parallelizing groups of indices
        for i, li in enumerate(ind):
            if n_neighbors > 1:
                X_i = ball_tree.data[li]
                A[i, list(li)] = barycenter_weights(X[i], X_i)
            else:
                A[i, li] = 1.0
    else:
        raise ValueError("Unknown weight type")
    return A
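The "barycenter" branch above calls a barycenter_weights helper that is not shown. A sketch of the usual reconstruction-weight computation the docstring describes (solve one small regularized linear system per point so the weights sum to one); the body below is an assumption, not the scikits.learn source:

import numpy as np

def barycenter_weights(x, X_neighbors, reg=1e-3):
    # Weights w with sum(w) == 1 that best reconstruct x from its neighbors
    # (the LLE-style barycenter weights the docstring refers to).
    Z = np.asarray(X_neighbors, dtype=float) - np.asarray(x, dtype=float)
    G = np.dot(Z, Z.T)                    # local Gram matrix of the centered neighbors
    G += reg * np.eye(G.shape[0])         # small ridge term for degenerate neighborhoods
    w = np.linalg.solve(G, np.ones(G.shape[0]))
    return w / w.sum()                    # enforce the sum-to-one constraint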
Example #18
    rseed = np.random.randint(100000)
    print "rseed = %i" % rseed
    np.random.seed(rseed)
    X = np.random.random((200, 3))
    Y = np.random.random((100, 3))

    t0 = time()
    SBT = SlowBallTree(X, leaf_size=10)
    d1, n1 = SBT.query(Y, 3)
    t1 = time()

    print "python: %.2g sec" % (t1 - t0)

    t0 = time()
    SBT = SlowBallTree(X, leaf_size=10)
    d1a, n1a = SBT.query_dual(Y, 3)
    t1 = time()

    print "python dual: %.2g sec" % (t1 - t0)

    t0 = time()
    BT = BallTree(X, leaf_size=10)
    d2, n2 = BT.query(Y, 3)
    t1 = time()

    print "cython: %.2g sec" % (t1 - t0)

    print "neighbors match:", np.allclose(n1, n2), np.allclose(n1a, n1)
    print "distances match:", np.allclose(d1, d2), np.allclose(d1a, d1)
Example #19
from time import time
import numpy as np
from ball_tree import BallTree

X = np.random.random((10000, 3))

t0 = time()
BT = BallTree(X, 30)
t1 = time()
print "construction: %.2g sec" % (t1 - t0)

for k in [1, 2, 4, 8]:
    for dual in (False, True):
        t0 = time()
        BT.query(X, k, dualtree=dual)
        t1 = time()
        if dual:
            dual_str = ' (dual)'
        else:
            dual_str = ''
        print "query %i in [%i, %i]%s: %.3g sec" % (k, X.shape[0], X.shape[1],
                                                    dual_str, t1 - t0)

for r in 0.1, 0.3, 0.5:
    t0 = time()
    BT.query_radius(X[:1000], r)
    t1 = time()
    print "query r<%.1f in [%i, %i]: %.3g sec" % (r, X.shape[0], X.shape[1],
                                                  t1 - t0)
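The radius queries above are only timed, never inspected. A short follow-on sketch, assuming the sklearn-style query_radius return value (an object array holding one array of neighbor indices per query point):

ind = BT.query_radius(X[:1000], 0.3)
counts = np.array([len(neighbors) for neighbors in ind])
print("r=0.3: %.1f neighbors per point on average" % counts.mean())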
Example #20
class Neighbors(BaseEstimator, ClassifierMixin):
    """Classifier implementing k-Nearest Neighbor Algorithm.

    Parameters
    ----------
    data : array-like, shape (n, k)
        The data points to be indexed. This array is not copied, and so
        modifying this data will result in bogus results.
    labels : array
        An array representing labels for the data (only arrays of
        integers are supported).
    n_neighbors : int
        default number of neighbors.
    window_size : int
        Window size passed to BallTree

    Examples
    --------
    >>> samples = [[0.,0.,1.], [1.,0.,0.], [2.,2.,2.], [2.,5.,4.]]
    >>> labels = [0,0,1,1]
    >>> from scikits.learn.neighbors import Neighbors
    >>> neigh = Neighbors(n_neighbors=3)
    >>> neigh.fit(samples, labels)
    Neighbors(n_neighbors=3, window_size=1)
    >>> print(neigh.predict([[0,0,0]]))
    [ 0.]

    Notes
    -----
    http://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm
    """

    def __init__(self, n_neighbors=5, window_size=1):
        """Internally uses the ball tree datastructure and algorithm for fast
        neighbors lookups on high dimensional datasets.
        """
        self.n_neighbors = n_neighbors
        self.window_size = window_size

    def fit(self, X, Y=()):
        # Y must be an array of integers, because we will later use it as an index
        self.Y = np.asanyarray(Y, dtype=int)
        self.ball_tree = BallTree(X, self.window_size)
        return self

    def kneighbors(self, data, n_neighbors=None):
        """Finds the K-neighbors of a point.

        Parameters
        ----------
        data : array-like
            The query point or points.
        n_neighbors : int
            Number of neighbors to get (default is the value
            passed to the constructor).

        Returns
        -------
        dist : array
            Array representing the distances to the query point(s).
        ind : array
            Array representing the indices of the nearest points in the
            population matrix.

        Examples
        --------
        In the following example, we construct a Neighbors instance from an
        array representing our data set and ask which point is closest to
        [1, 1, 1]:

        >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]
        >>> labels = [0, 0, 1]
        >>> from scikits.learn.neighbors import Neighbors
        >>> neigh = Neighbors(n_neighbors=1)
        >>> neigh.fit(samples, labels)
        Neighbors(n_neighbors=1, window_size=1)
        >>> print(neigh.kneighbors([1., 1., 1.]))
        (array(0.5), array(2))

        As you can see, it returns [0.5] and [2], which means that the nearest
        element is at distance 0.5 and is the third element of samples
        (indices start at 0). You can also query for multiple points:

        >>> print(neigh.kneighbors([[0., 1., 0.], [1., 0., 1.]]))
        (array([ 0.5       ,  1.11803399]), array([1, 2]))

        """
        if n_neighbors is None:
            n_neighbors = self.n_neighbors
        return self.ball_tree.query(data, k=n_neighbors)

    def predict(self, T, n_neighbors=None):
        """Predict the class labels for the provided data.

        Parameters
        ----------
        T : array
            A 2-D array representing the test points.
        n_neighbors : int
            Number of neighbors to get (default is the value
            passed to the constructor).

        Returns
        -------
        labels: array
            List of class labels (one for each data sample).

        Examples
        --------
        >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]
        >>> labels = [0, 0, 1]
        >>> from scikits.learn.neighbors import Neighbors
        >>> neigh = Neighbors(n_neighbors=1)
        >>> neigh.fit(samples, labels)
        Neighbors(n_neighbors=1, window_size=1)
        >>> print(neigh.predict([.2, .1, .2]))
        0
        >>> print(neigh.predict([[0., -1., 0.], [3., 2., 0.]]))
        [0 1]
        """
        T = np.asanyarray(T)
        if n_neighbors is None:
            n_neighbors = self.n_neighbors
        return _predict_from_BallTree(self.ball_tree, self.Y, T, n_neighbors)
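predict above delegates to a _predict_from_BallTree helper that the listing does not show. A plausible sketch of such a helper, doing a majority vote over the labels of the k nearest training points (an assumption about its behavior, not the scikits.learn source):

import numpy as np
from scipy import stats

def _predict_from_BallTree(ball_tree, Y, T, n_neighbors):
    # Query the tree for each test point, then take a majority vote among
    # the labels of its n_neighbors nearest training points.
    dist, ind = ball_tree.query(T, k=n_neighbors)
    labels, _ = stats.mode(Y[ind], axis=1)  # most common label per query row
    return np.ravel(labels)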
Example #21
from time import time
import numpy as np
from ball_tree import BallTree

X = np.random.random((10000, 3))

t0 = time()
BT = BallTree(X, 30)
t1 = time()
print "construction: %.2g sec" % (t1 - t0)

for k in [1, 2, 4, 8]:
    for dual in (False, True):
        t0 = time()
        BT.query(X, k, dualtree=dual)
        t1 = time()
        if dual:
            dual_str = ' (dual)'
        else:
            dual_str = ''
        print "query %i in [%i, %i]%s: %.3g sec" % (k, X.shape[0],
                                                    X.shape[1],
                                                    dual_str,
                                                    t1 - t0)
    
for r in 0.1, 0.3, 0.5:
    t0 = time()
    BT.query_radius(X[:1000], r)
    t1 = time()
    print "query r<%.1f in [%i, %i]: %.3g sec" % (r, X.shape[0], X.shape[1],
                                                  t1 - t0)
Example #22
    def test_pickle(self):
        bt1 = BallTree(self.X, leaf_size=1)
        ind1, dist1 = bt1.query(self.X)
        for protocol in (0, 1, 2):
            yield (self._check_pickle, protocol, bt1, ind1, dist1)
Example #23
    def test_pickle(self):
        bt1 = BallTree(self.X, leaf_size=1)
        ind1, dist1 = bt1.query(self.X)
        for protocol in (0, 1, 2):
            yield (self._check_pickle, protocol, bt1, ind1, dist1)
Example #24
    def check_neighbors(metric):
        bt = BallTree(X, leaf_size=1, metric=metric)
        dist1, ind1 = bt.query(Y, k)
        dist2, ind2 = brute_force_neighbors(X, Y, k, metric)
        assert_allclose(dist1, dist2)
Example #25
    rseed = np.random.randint(100000)
    print "rseed = %i" % rseed
    np.random.seed(rseed)
    X = np.random.random((200, 3))
    Y = np.random.random((100, 3))

    t0 = time()
    SBT = SlowBallTree(X, leaf_size=10)
    d1, n1 = SBT.query(Y, 3)
    t1 = time()

    print "python: %.2g sec" % (t1 - t0)

    t0 = time()
    SBT = SlowBallTree(X, leaf_size=10)
    d1a, n1a = SBT.query_dual(Y, 3)
    t1 = time()

    print "python dual: %.2g sec" % (t1 - t0)

    t0 = time()
    BT = BallTree(X, leaf_size=10)
    d2, n2 = BT.query(Y, 3)
    t1 = time()

    print "cython: %.2g sec" % (t1 - t0)

    print "neighbors match:", np.allclose(n1, n2), np.allclose(n1a, n1)
    print "distances match:", np.allclose(d1, d2), np.allclose(d1a, d1)
Example #26
    def check_neighbors(metric):
        bt = BallTree(X, leaf_size=1, metric=metric)
        dist1, ind1 = bt.query(Y, k)
        dist2, ind2 = brute_force_neighbors(X, Y, k, metric)
        assert_allclose(dist1, dist2)
Example #27
t0 = time()
BT = BallTree(X, 30)
t1 = time()
print "BT construction: %.2g sec" % (t1 - t0)

t0 = time()
KDT = KDTree(X, 30)
t1 = time()
print "KDT construction: %.2g sec" % (t1 - t0)

for k in 1, 2, 4, 8:
    print "\nquery %i in [%i, %i]:" % (k, X.shape[0], X.shape[1])
    print "      single     dual"
    t0 = time()
    d1, i1 = BT.query(X_query, k, dualtree=False)
    t1 = time()
    d1, i1 = BT.query(X_query, k, dualtree=True)
    t2 = time()
    print "  BT: %.3g sec   %.3g sec" % (t1 - t0, t2 - t1)

    d2, i2 = KDT.query(X_query, k, dualtree=False)
    t3 = time()
    d2, i2 = KDT.query(X_query, k, dualtree=True)
    t4 = time()
    print "  KDT: %.3g sec   %.3g sec" % (t3 - t2, t4 - t3)
    print "       (results match: %s)" % np.allclose(d1, d2)

#for r in 0.1, 0.3, 0.5:
#    for tree in (BT, KDT):
#        t0 = time()