Beispiel #1
0
    def build_index(self, X):
        """Build and return an LSH forest index for ``X``.

        Parameters
        ----------
        X : array-like
            Data handed unchanged to ``LSH_forest.build_index``.

        Returns
        -------
        LSH_forest
            A forest constructed with ``number_of_trees=10`` whose
            ``build_index`` has been called on ``X``.
        """
        # The two dimensions of X.shape used to be copied into locals that
        # were never read afterwards; those dead assignments are removed.
        lsh_forest = LSH_forest(number_of_trees=10)
        lsh_forest.build_index(X)

        return lsh_forest
 def build_index(self, X):
     """Build and return an LSH forest index for ``X``.

     Parameters
     ----------
     X : array-like
         Data handed unchanged to ``LSH_forest.build_index``.

     Returns
     -------
     LSH_forest
         A forest constructed with ``number_of_trees=10`` whose
         ``build_index`` has been called on ``X``.
     """
     # The two dimensions of X.shape used to be copied into locals that
     # were never read afterwards; those dead assignments are removed.
     lsh_forest = LSH_forest(number_of_trees=10)
     lsh_forest.build_index(X)

     return lsh_forest
def precision_test_candidates_LSH_F_random(X, limits=(10, 100, 1000), iterations=100):
    """Measure LSH forest precision on random queries at several candidate limits.

    An ``LSH_forest`` with 10 trees is built on ``X``.  For every iteration a
    query vector is drawn with ``np.random.random`` and queried once with
    ``c=n`` (the number of rows of ``X``); the first 10 neighbors of that
    query serve as the reference set.  The same vector is then queried once
    per entry of ``limits`` and the overlap of its first 10 neighbors with
    the reference set is recorded, together with the wall-clock time and the
    candidate count returned by the query.  Running averages are printed
    after every iteration.

    Parameters
    ----------
    X : array with a two-element ``shape``
        ``X.shape[0]`` is used as the candidate count of the reference query
        and ``X.shape[1]`` as the length of each random query vector.
    limits : iterable of numbers
        Values passed as ``c`` to ``query_num_candidates`` (presumably the
        number of candidates to examine -- confirm against ``LSH_forest``).
        Must be re-iterable, since it is traversed twice per iteration.
    iterations : int
        Number of random query vectors to evaluate.

    Returns
    -------
    tuple of (dict, dict, dict)
        ``prec_sum``, ``time_sum`` and ``candidate_sum``, each keyed by limit
        and holding the *sum* (not the average) over all iterations of the
        hit rate, the elapsed seconds and the candidate count respectively.
        All three are empty when ``iterations`` is 0.
    """
    f = X.shape[1]
    n = X.shape[0]

    t = LSH_forest(number_of_trees=10)
    t.build_index(X)

    k = 10
    prec_sum = {}
    time_sum = {}
    candidate_sum = {}

    # ``range`` and single-argument ``print(...)`` behave the same on
    # Python 2 and Python 3, unlike ``xrange`` and the print statement.
    for i in range(iterations):
        query_vector = np.random.random(size=f)
        print('finding nbs for a random vector')

        # Reference answer: query with c equal to the number of rows of X.
        neighbors, candidates = t.query_num_candidates(query_vector, c=n)
        closest = set(neighbors[:k])
        for limit in limits:
            t0 = time.time()
            neighbors, candidates = t.query_num_candidates(query_vector, c=limit)
            T = time.time() - t0
            toplist = neighbors[:k]

            found = len(closest.intersection(toplist))
            hitrate = 1.0 * found / k
            prec_sum[limit] = prec_sum.get(limit, 0.0) + hitrate
            time_sum[limit] = time_sum.get(limit, 0.0) + T
            candidate_sum[limit] = candidate_sum.get(limit, 0) + candidates

        done = i + 1
        for limit in limits:
            # float() forces true division: with integer candidate counts
            # Python 2 would otherwise truncate the printed average.  The
            # sums stored in candidate_sum keep their original type.
            print('limit: %.8f precision: %6.2f%% avg time: %.6fs avg_candiates: %.1f' % (
                limit,
                100.0 * prec_sum[limit] / done,
                time_sum[limit] / done,
                float(candidate_sum[limit]) / done))

    return prec_sum, time_sum, candidate_sum
Beispiel #4
0
def precision_test_candidates_LSH_F_random(X,
                                           limits=(10, 100, 1000),
                                           iterations=100):
    """Measure LSH forest precision on random queries at several candidate limits.

    An ``LSH_forest`` with 10 trees is built on ``X``.  For every iteration a
    query vector is drawn with ``np.random.random`` and queried once with
    ``c=n`` (the number of rows of ``X``); the first 10 neighbors of that
    query serve as the reference set.  The same vector is then queried once
    per entry of ``limits`` and the overlap of its first 10 neighbors with
    the reference set is recorded, together with the wall-clock time and the
    candidate count returned by the query.  Running averages are printed
    after every iteration.

    Parameters
    ----------
    X : array with a two-element ``shape``
        ``X.shape[0]`` is used as the candidate count of the reference query
        and ``X.shape[1]`` as the length of each random query vector.
    limits : iterable of numbers
        Values passed as ``c`` to ``query_num_candidates`` (presumably the
        number of candidates to examine -- confirm against ``LSH_forest``).
        Must be re-iterable, since it is traversed twice per iteration.
    iterations : int
        Number of random query vectors to evaluate.

    Returns
    -------
    tuple of (dict, dict, dict)
        ``prec_sum``, ``time_sum`` and ``candidate_sum``, each keyed by limit
        and holding the *sum* (not the average) over all iterations of the
        hit rate, the elapsed seconds and the candidate count respectively.
        All three are empty when ``iterations`` is 0.
    """
    f = X.shape[1]
    n = X.shape[0]

    t = LSH_forest(number_of_trees=10)
    t.build_index(X)

    k = 10
    prec_sum = {}
    time_sum = {}
    candidate_sum = {}

    # ``range`` and single-argument ``print(...)`` behave the same on
    # Python 2 and Python 3, unlike ``xrange`` and the print statement.
    for i in range(iterations):
        query_vector = np.random.random(size=f)
        print('finding nbs for a random vector')

        # Reference answer: query with c equal to the number of rows of X.
        neighbors, candidates = t.query_num_candidates(query_vector, c=n)
        closest = set(neighbors[:k])
        for limit in limits:
            t0 = time.time()
            neighbors, candidates = t.query_num_candidates(query_vector,
                                                           c=limit)
            T = time.time() - t0
            toplist = neighbors[:k]

            found = len(closest.intersection(toplist))
            hitrate = 1.0 * found / k
            prec_sum[limit] = prec_sum.get(limit, 0.0) + hitrate
            time_sum[limit] = time_sum.get(limit, 0.0) + T
            candidate_sum[limit] = candidate_sum.get(limit, 0) + candidates

        done = i + 1
        for limit in limits:
            # float() forces true division: with integer candidate counts
            # Python 2 would otherwise truncate the printed average.  The
            # sums stored in candidate_sum keep their original type.
            print('limit: %.8f precision: %6.2f%% avg time: %.6fs avg_candiates: %.1f' % (
                limit,
                100.0 * prec_sum[limit] / done,
                time_sum[limit] / done,
                float(candidate_sum[limit]) / done))

    return prec_sum, time_sum, candidate_sum
import numpy as np

from lsh_forest import LSH_forest
import matplotlib.pyplot as plt

from sklearn.metrics import euclidean_distances

# Demo script: index random 2-D points with a single-tree LSH forest, query
# it with one of those points, and scatter-plot the returned candidates
# together with the query point.
"""
Create a dummy 2 dimensional data set for the visualization.
Create LSH forest with a single tree and build index with the dummy data.
"""
# `samples` rows of 2 values each, produced by np.random.rand.
samples = 10000
dummy_x = np.random.rand(samples,2)
lshf = LSH_forest(number_of_trees=1)
lshf.build_index(dummy_x)

# Get candidate neighbors for a query.
# The query is a randomly chosen row of the indexed data itself.
point = dummy_x[np.random.randint(0,samples)]
# Alternative kept for reference: query with a fresh random vector instead
# of an indexed point.
#point = np.random.rand(1,2)[0] #Use this if a random vector is required
# `neighbors` is unpacked but not used below; only `candidates` is plotted.
# NOTE(review): the meaning of m=20 is defined by LSH_forest.query_candidates
# and is not visible here -- confirm.
neighbors, candidates = lshf.query_candidates(point, m=20)

# Plot candidate distribution with the query.
# `candidates` is used as a row index into dummy_x; column 0 becomes x and
# column 1 becomes y.
# NOTE(review): `[candidates]` wraps the index in an extra list, so x and y
# probably carry an extra leading axis of length 1 -- confirm the type
# returned by query_candidates before simplifying this.
x = dummy_x[[candidates],0]
y = dummy_x[[candidates],1]
# Candidates are drawn in green, the query point in red with a larger marker.
plt.scatter(x, y, s=10, c='g')
plt.scatter(point[0], point[1], s=20, c='r')
plt.ylabel('Y')
plt.xlabel('X')
plt.title("Candidates distribution")

plt.show()