def build_index(self, X):
    """Build and return a 10-tree LSH forest indexed on the rows of X.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Data points to index.

    Returns
    -------
    LSH_forest
        A freshly constructed forest with its index built over X.
    """
    # NOTE(review): `self` is unused -- this reads like a module-level helper
    # that was pasted into a class; confirm against the enclosing scope.
    # (The original also computed X.shape[0] / X.shape[1] into locals that
    # were never read; those dead assignments are removed here.)
    lsh_forest = LSH_forest(number_of_trees=10)
    lsh_forest.build_index(X)
    return lsh_forest
def precision_test_candidates_LSH_F_random(X, limits = [10, 100, 1000], iterations = 100): f = X.shape[1] n = X.shape[0] t = LSH_forest(number_of_trees=10) t.build_index(X) limits = limits k = 10 prec_sum = {} prec_n = iterations time_sum = {} candidate_sum = {} for i in xrange(prec_n): query_vector = np.random.random(size=f) print 'finding nbs for a random vector' neighbors, candidates = t.query_num_candidates(query_vector, c=n) closest = set(neighbors[:k]) for limit in limits: t0 = time.time() neighbors, candidates= t.query_num_candidates(query_vector, c=limit) T = time.time() - t0 toplist = neighbors[:k] found = len(closest.intersection(toplist)) hitrate = 1.0 * found / k prec_sum[limit] = prec_sum.get(limit, 0.0) + hitrate time_sum[limit] = time_sum.get(limit, 0.0) + T candidate_sum[limit] = candidate_sum.get(limit, 0) + candidates for limit in limits: print 'limit: %.8f precision: %6.2f%% avg time: %.6fs avg_candiates: %.1f' % (limit, 100.0 * prec_sum[limit] / (i + 1), time_sum[limit] / (i + 1), candidate_sum[limit]/ (i+1)) return prec_sum, time_sum, candidate_sum
def precision_test_candidates_LSH_F_random(X, limits=[10, 100, 1000], iterations=100): f = X.shape[1] n = X.shape[0] t = LSH_forest(number_of_trees=10) t.build_index(X) limits = limits k = 10 prec_sum = {} prec_n = iterations time_sum = {} candidate_sum = {} for i in xrange(prec_n): query_vector = np.random.random(size=f) print 'finding nbs for a random vector' neighbors, candidates = t.query_num_candidates(query_vector, c=n) closest = set(neighbors[:k]) for limit in limits: t0 = time.time() neighbors, candidates = t.query_num_candidates(query_vector, c=limit) T = time.time() - t0 toplist = neighbors[:k] found = len(closest.intersection(toplist)) hitrate = 1.0 * found / k prec_sum[limit] = prec_sum.get(limit, 0.0) + hitrate time_sum[limit] = time_sum.get(limit, 0.0) + T candidate_sum[limit] = candidate_sum.get(limit, 0) + candidates for limit in limits: print 'limit: %.8f precision: %6.2f%% avg time: %.6fs avg_candiates: %.1f' % ( limit, 100.0 * prec_sum[limit] / (i + 1), time_sum[limit] / (i + 1), candidate_sum[limit] / (i + 1)) return prec_sum, time_sum, candidate_sum
import numpy as np
from lsh_forest import LSH_forest
import matplotlib.pyplot as plt
from sklearn.metrics import euclidean_distances

"""
Create a dummy 2 dimensional data set for the visualization.
Create LSH forest with a single tree and build index with the dummy data.
"""
samples = 10000
dummy_x = np.random.rand(samples, 2)
lshf = LSH_forest(number_of_trees=1)
lshf.build_index(dummy_x)

# Get candidate neighbors for a query point drawn from the indexed data.
point = dummy_x[np.random.randint(0, samples)]
# point = np.random.rand(1, 2)[0]  # Use this if a random vector is required
neighbors, candidates = lshf.query_candidates(point, m=20)

# Plot the candidate distribution together with the query point.
# Fix: the original double-bracket indexing dummy_x[[candidates], 0]
# wrapped the index list in an extra list, producing shape (1, k) instead
# of (k,); scatter happens to flatten it, but plain fancy indexing is the
# correct form.
x = dummy_x[candidates, 0]
y = dummy_x[candidates, 1]
plt.scatter(x, y, s=10, c='g')
plt.scatter(point[0], point[1], s=20, c='r')
plt.ylabel('Y')
plt.xlabel('X')
plt.title("Candidates distribution")
plt.show()