def test_serialization_mnist():
    """Check that a forest restored from a pickle still answers queries well.

    For each (forest size, threshold) pair the forest is fitted, round-tripped
    through ``pickle`` and queried with every test point.  The mean overlap
    between its top-10 answer and the exact top-10 (ranked by dot product
    against the row-normalised training matrix) must reach the threshold.
    """
    top_k = 10
    cases = ((1, 0.05), (5, 0.3), (10, 0.5), (50, 0.9))

    X_train, X_test = _get_mnist_data()

    for forest_size, min_precision in cases:
        forest = RPForest(leaf_size=10, no_trees=forest_size)
        forest.fit(X_train)

        # Round-trip through pickle; everything below uses the restored copy.
        forest = pickle.loads(pickle.dumps(forest))

        # Normalises X_train in place.  This runs after fit() on every pass,
        # so from the second pass on fit() receives already-normalised rows.
        X_train /= np.linalg.norm(X_train, axis=1)[:, np.newaxis]

        total = 0.0
        for query in X_test:
            exact = np.argsort(-np.dot(X_train, query))[:top_k]
            approx = forest.query(query, top_k)[:top_k]
            # Every returned id must be a valid row of the training matrix.
            assert (approx < X_train.shape[0]).all()
            total += len(set(approx) & set(exact)) / 10.0

        mean_precision = total / X_test.shape[0]
        assert mean_precision >= min_precision
def test_find_self():
    """Check that every training point is its own nearest neighbour.

    The same checks run twice per forest size: once on the freshly fitted
    forest and once on a copy restored from a pickle.
    """
    X_train, X_test = _get_mnist_data()

    def check_points_find_themselves(forest):
        # Leaf code -> set of point ids stored in that leaf.
        members_by_leaf = {
            leaf: set(ids) for leaf, ids in forest.get_leaf_nodes()
        }
        for idx, point in enumerate(X_train):
            neighbours = forest.query(point, 10)[:10]
            assert neighbours[0] == idx
            # The point must sit in every leaf it is encoded to.
            for leaf in forest.encode(point):
                assert idx in members_by_leaf[leaf]

    # The second tuple element is unpacked but not used by this test.
    for no_trees, expected_precision in ((1, 0.05), (5, 0.3), (10, 0.5), (50, 0.9)):
        tree = RPForest(leaf_size=10, no_trees=no_trees)
        tree.fit(X_train)
        check_points_find_themselves(tree)

        tree = pickle.loads(pickle.dumps(tree))
        check_points_find_themselves(tree)
def test_candidates_mnist():
    """Check ``get_candidates`` output validity and precision per forest size.

    For every test point the full candidate list (requested with a very large
    limit) must contain no duplicates, no ``-1`` entry and only valid row ids.
    The mean overlap of the first 10 candidates with the exact top-10 must
    reach the threshold paired with the forest size.
    """
    top_k = 10
    cases = ((1, 0.05), (5, 0.12), (10, 0.2), (50, 0.5), (80, 0.6))

    X_train, X_test = _get_mnist_data()

    for forest_size, min_precision in cases:
        forest = RPForest(leaf_size=10, no_trees=forest_size)
        forest.fit(X_train)

        # Normalises X_train in place; runs after fit() on every pass.
        X_train /= np.linalg.norm(X_train, axis=1)[:, np.newaxis]

        total = 0.0
        for query in X_test:
            exact = np.argsort(-np.dot(X_train, query))[:top_k]

            everything = forest.get_candidates(query, 100000)
            assert len(everything) == len(set(everything))
            assert -1 not in everything
            assert (everything < X_train.shape[0]).all()

            approx = forest.get_candidates(query, top_k)[:top_k]
            assert (approx < X_train.shape[0]).all()

            total += len(set(approx) & set(exact)) / 10.0

        mean_precision = total / X_test.shape[0]
        assert mean_precision >= min_precision
def test_sample_training():
    """Check precision when the trees are built from a quarter of the data.

    The forest is fitted on the first quarter of the training rows, cleared,
    and then every training row is indexed individually.  Mean top-10 overlap
    with the exact neighbours must reach the threshold paired with each
    forest size.
    """
    X_train, X_test = _get_mnist_data()

    for no_trees, expected_precision in ((1, 0.05), (5, 0.3), (10, 0.5), (50, 0.9)):

        tree = RPForest(leaf_size=10, no_trees=no_trees)

        # Fit on quarter of data.  Floor division keeps the slice bound an
        # integer; true division yields a float on Python 3, which cannot be
        # used as a slice index.
        X_sample = X_train[:X_train.shape[0] // 4]
        tree.fit(X_sample)

        # Clear and index everything
        tree.clear()
        for i, x in enumerate(X_train):
            tree.index(i, x)
        # Point the forest at the full matrix so the indexed ids refer to it.
        tree._X = X_train

        precision = 0.0
        # Normalises X_train in place (tree._X refers to the same array).
        X_train /= np.linalg.norm(X_train, axis=1)[:, np.newaxis]

        for x_test in X_test:
            true_nns = np.argsort(-np.dot(X_train, x_test))[:10]
            nns = tree.query(x_test, 10)[:10]

            precision += len(set(nns) & set(true_nns)) / 10.0

        precision /= X_test.shape[0]

        assert precision >= expected_precision
def lvnn(fp, nt=3, k=5, iter=5, leaves=50):
    """Build a k-neighbour table from an RPForest and refine it iteratively.

    Args:
        fp: 2-D array of points, one row per point (indexed as ``fp[i, :]``).
        nt: Number of trees passed to ``RPForest`` as ``no_trees``.
        k: Number of neighbours kept per point.
        iter: Number of refinement passes (the name shadows the builtin but
            is part of the public signature).
        leaves: Leaf size passed to ``RPForest`` as ``leaf_size``.

    Returns:
        Array of shape ``(fp.shape[0] * k, 3)`` whose rows are
        ``(point index, neighbour index, stored score)``.
    """
    n_points = fp.shape[0]

    # nn[i, j] holds (neighbour id, score); everything starts at -1.
    nn = np.zeros((n_points, k, 2)) - 1
    print(' start Tree build')
    model = RPForest(leaf_size=leaves, no_trees=nt)
    model.fit(fp)
    for i in range(n_points):
        nn[i, :, 0] = model.query(fp[i, ], k)

    t = 0
    while t < iter:
        t += 1
        # NOTE(review): this is an alias, not a snapshot, so rows rewritten
        # earlier in a pass are visible to later rows of the same pass.
        # Confirm whether ``nn.copy()`` was intended.
        old_nn = nn
        for i in range(n_points):
            h = set()
            for j in range(k):
                # ``nn`` is a float array; ids must be cast back to int
                # before they can be used as array indices.
                ji = int(old_nn[i, j, 0])
                for l in range(k):
                    li = int(old_nn[ji, l, 0])
                    d = -np.linalg.norm(fp[i, :] - fp[li, :])
                    h.update([(li, d)])
            # Tuples are ordered by neighbour id first, then by score.
            nn[i, :, :] = np.array(nsmallest(k, h))

    # Flatten to (row, col, value) triples; row i * k + j comes from nn[i, j].
    csr = np.zeros((n_points * k, 3))
    csr[:, 0] = np.repeat(np.arange(n_points), k)
    csr[:, 1:] = nn.reshape(n_points * k, 2)
    return csr
def test_max_size():
    """Check that every leaf of a fitted forest holds fewer than 10 points."""
    size_limit = 10

    X_train, X_test = _get_mnist_data()

    forest = RPForest(leaf_size=size_limit, no_trees=10)
    forest.fit(X_train)

    # get_leaf_nodes() yields (leaf code, point ids); only the ids matter here.
    for _code, members in forest.get_leaf_nodes():
        assert len(members) < size_limit
class RPForest(BaseANN):
    """Benchmark wrapper that delegates to ``rpforest.RPForest``."""

    def __init__(self, leaf_size, n_trees):
        """Create the underlying forest and a descriptive ``name``.

        The library is imported here rather than at module level, so it is
        only needed when this wrapper is instantiated.
        """
        from rpforest import RPForest as _Forest

        self.name = 'RPForest(leaf_size=%d, n_trees=%d)' % (leaf_size, n_trees)
        self._model = _Forest(leaf_size=leaf_size, no_trees=n_trees)

    def fit(self, X):
        """Fit the wrapped forest on ``X``."""
        self._model.fit(X)

    def query(self, v, n):
        """Return the wrapped forest's answer for query ``v`` and count ``n``."""
        answer = self._model.query(v, n)
        return answer
def test_multiple_fit_calls():
    """Check that calling ``fit`` again does not change the number of trees."""
    tree_count = 10

    X_train, X_test = _get_mnist_data()
    forest = RPForest(leaf_size=10, no_trees=tree_count)

    # Fit twice on the same data; the tree count must be the same each time.
    for _ in range(2):
        forest.fit(X_train)
        assert len(forest.trees) == tree_count
def test_encoding_mnist():
    """Check that ``encode`` is repeatable, before and after pickling.

    For each forest size, every training point is encoded twice and the two
    results must be equal; the check is repeated on a forest restored from a
    pickle.
    """
    X_train, X_test = _get_mnist_data()

    def check_encoding_is_repeatable(forest):
        for point in X_train:
            first = forest.encode(point)
            second = forest.encode(point)
            assert first == second

    # The second tuple element is unpacked but not used by this test.
    for no_trees, expected_precision in ((1, 0.05), (5, 0.3), (10, 0.5), (50, 0.9)):
        tree = RPForest(leaf_size=10, no_trees=no_trees)
        tree.fit(X_train)
        check_encoding_is_repeatable(tree)

        tree = pickle.loads(pickle.dumps(tree))
        check_encoding_is_repeatable(tree)
def get_algos(m, save_index):
    """Build every benchmark configuration for distance metric ``m``.

    Args:
        m: Metric name forwarded to each wrapper.  ``'euclidean'`` and
            ``'angular'`` additionally enable metric-specific entries.
        save_index: Forwarded unchanged to the ``KGraph`` and
            ``NmslibReuseIndex`` constructors.

    Returns:
        dict mapping a library label to a list of configured instances.
    """
    leaf_sizes = [10, 20, 40, 100, 200, 400, 1000]
    recalls = [
        0.99, 0.97, 0.95, 0.9, 0.85, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1
    ]
    kgraph_Ps = [1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

    algos = {
        'lshf': [
            LSHF(m, a, b)
            for a, b in [(5, 10), (5, 20), (10, 20), (10, 50), (20, 100)]
        ],
        'flann': [
            FLANN(m, target)
            for target in [0.2, 0.5, 0.7, 0.8, 0.9, 0.95, 0.97, 0.98, 0.99,
                           0.995]
        ],
        'panns': [
            PANNS(m, a, b)
            for a, b in [(5, 20), (10, 10), (10, 50), (10, 100), (20, 100),
                         (40, 100)]
        ],
        'annoy': [
            Annoy(m, n_trees, search_k)
            for n_trees in [100, 200, 400]
            for search_k in [100, 200, 400, 1000, 2000, 4000, 10000, 20000,
                             40000, 100000, 200000, 400000]
        ],
        # Disabled NearPy settings: (10, 100), (12, 100), (14, 100),
        # (16, 50), (16, 70), (16, 90), (16, 120), (16, 150).
        'nearpy': [
            NearPy(m, a, b)
            for a, b in [(10, 5), (10, 10), (10, 20), (10, 40),
                         (12, 5), (12, 10), (12, 20), (12, 40),
                         (14, 5), (14, 10), (14, 20), (14, 40),
                         (16, 5), (16, 10), (16, 15), (16, 20), (16, 25),
                         (16, 30), (16, 40)]
        ],
        'bruteforce': [BruteForce(m)],
        'bruteforce-blas': [BruteForceBLAS(m)],
        'ball': [BallTree(m, leaf) for leaf in leaf_sizes],
        'kd': [KDTree(m, leaf) for leaf in leaf_sizes],

        # START: Non-Metric Space Library (nmslib) entries
        'bruteforce0(nmslib)': [NmslibNewIndex(m, 'seq_search', ['copyMem=0'])],
        # copyMem=1 is no longer needed: the new Python wrapper already
        # re-creates data points, so 'bruteforce1(nmslib)' stays disabled.
        'BallTree(nmslib)': [],
        'hnsw(nmslib)': [],
        'SW-graph(nmslib)': [],

        'faiss': [
            Faiss(m, l, p)
            for l in [5, 10, 20, 50, 100, 200, 400, 800, 1600]
            for p in [1, 2, 3, 4, 5, 8, 10, 20, 50, 100, 200]
            if l >= p
        ],
    }

    algos['BallTree(nmslib)'].extend(
        NmslibNewIndex(m, 'vptree', ['tuneK=10', 'desiredRecall=%f' % r])
        for r in recalls)

    def add_hnsw(grid, ef_construction):
        # One entry per (M, post, ef); each gets its own parameter lists.
        for M, post, efs in grid:
            for ef in efs:
                algos['hnsw(nmslib)'].append(
                    NmslibReuseIndex(
                        m, 'hnsw',
                        ['M=%d' % M, 'post=%d' % post,
                         'efConstruction=%d' % ef_construction],
                        save_index, ['ef=%d' % ef]))

    def add_sw_graph(grid, ef_construction):
        # One entry per (NN, efSearch); each gets its own parameter lists.
        for NN, efs in grid:
            for ef in efs:
                algos['SW-graph(nmslib)'].append(
                    NmslibReuseIndex(
                        m, 'sw-graph',
                        ['NN=%d' % NN,
                         'efConstruction=%d' % ef_construction,
                         'initIndexAttempts=1'],
                        save_index,
                        ['efSearch=%d' % ef, 'initSearchAttempts=1']))

    if m == 'euclidean':
        # kgraph: one preset dict shared by every instance.
        kgraph_preset = {'reverse': -1}
        algos['kgraph'] = [
            KGraph(m, P, kgraph_preset, save_index) for P in kgraph_Ps
        ]

        # nmslib algorithms
        add_hnsw([
            (32, 2, [20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160, 200,
                     300, 400]),
            (20, 2, [2, 5, 10, 15, 20, 30, 40, 50, 70, 80, 120, 200, 400]),
            (12, 0, [1, 2, 5, 10, 15, 20, 30, 40, 50, 70, 80, 120]),
            (4, 0, [1, 2, 5, 10, 20, 30, 50, 70, 90, 120]),
            (8, 0, [1, 2, 5, 10, 20, 30, 50, 70, 90, 120, 160]),
        ], 400)

        # Only works for euclidean distance
        algos['MP-lsh(lshkit)'] = [
            NmslibNewIndex(m, 'lsh_multiprobe', [
                'desiredRecall=%f' % r, 'H=1200001', 'T=10', 'L=50',
                'tuneK=10'
            ])
            for r in recalls
        ]

        add_sw_graph([
            (10, [800, 400, 200, 100, 50, 30, 20, 15, 10]),
            (5, [30, 25, 20, 15, 10, 5, 4, 3, 2, 1]),
        ], 400)
        # END: Non-Metric Space Library (nmslib) entries

    if m == 'angular':
        # kgraph: one preset dict shared by every instance.
        kgraph_preset = {'reverse': -1, 'K': 200, 'L': 300, 'S': 20}
        algos['kgraph'] = [
            KGraph(m, P, kgraph_preset, save_index) for P in kgraph_Ps
        ]

        # nmslib algorithms
        add_hnsw([
            (48, 2, [50, 70, 90, 120, 160, 200, 400, 600, 700, 800, 1000,
                     1400, 1600, 2000]),
            (32, 2, [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160,
                     200, 300, 400, 600, 700, 800, 1000, 1200, 1400, 1600,
                     2000]),
            (20, 0, [2, 5, 10, 15, 20, 30, 40, 50, 70, 80]),
            (12, 0, [1, 2, 5, 10, 15, 20, 30, 40, 50, 70, 80]),
        ], 800)

        add_sw_graph([
            (30, [700, 650, 550, 450, 350, 275, 200, 150, 120, 80, 50, 30]),
            (15, [80, 50, 30, 20]),
            (3, [120, 80, 60, 40, 20, 10, 8, 4, 2]),
        ], 800)
        # END: Non-Metric Space Library (nmslib) entries

        # RPForest only works for cosine
        algos['rpforest'] = [
            RPForest(leaf_size, n_trees)
            for n_trees in [3, 5, 10, 20, 40, 100, 200, 400]
            for leaf_size in [3, 5, 10, 20, 40, 100, 200, 400]
        ]

        # Start at 1 and grow by 10% (rounded up) until 1400 is reached.
        steps = [1]
        while steps[-1] < 1400:
            steps.append(int(math.ceil(steps[-1] * 1.1)))
        algos['falconn'] = [FALCONN(m, 16, l, l) for l in steps]

    return algos
def get_algos(m):
    """Build every benchmark configuration for distance metric ``m``.

    Args:
        m: Metric name forwarded to each wrapper.  ``'euclidean'`` adds the
            multiprobe-LSH entries; ``'angular'`` adds RPForest and FALCONN.

    Returns:
        dict mapping a library label to a list of configured instances.
    """
    leaf_sizes = [10, 20, 40, 100, 200, 400, 1000]

    algos = {
        'lshf': [
            LSHF(m, a, b)
            for a, b in [(5, 10), (5, 20), (10, 20), (10, 50), (20, 100)]
        ],
        'flann': [
            FLANN(m, target)
            for target in [0.2, 0.5, 0.7, 0.8, 0.9, 0.95, 0.97, 0.98, 0.99,
                           0.995]
        ],
        'panns': [
            PANNS(m, a, b)
            for a, b in [(5, 20), (10, 10), (10, 50), (10, 100), (20, 100),
                         (40, 100)]
        ],
        'annoy': [
            Annoy(m, n_trees, search_k)
            for n_trees in [100, 200, 400]
            for search_k in [100, 200, 400, 1000, 2000, 4000, 10000, 20000,
                             40000, 100000, 200000, 400000]
        ],
        # Disabled NearPy settings: (10, 100), (12, 100), (14, 100),
        # (16, 50), (16, 70), (16, 90), (16, 120), (16, 150).
        'nearpy': [
            NearPy(m, a, b)
            for a, b in [(10, 5), (10, 10), (10, 20), (10, 40),
                         (12, 5), (12, 10), (12, 20), (12, 40),
                         (14, 5), (14, 10), (14, 20), (14, 40),
                         (16, 5), (16, 10), (16, 15), (16, 20), (16, 25),
                         (16, 30), (16, 40)]
        ],
        'kgraph': [
            KGraph(m, size)
            for size in [20, 50, 100, 200, 500, 1000, 2000, 4000, 10000]
        ],
        'bruteforce': [BruteForce(m)],
        'ball': [BallTree(m, leaf) for leaf in leaf_sizes],
        'kd': [KDTree(m, leaf) for leaf in leaf_sizes],

        # START: Non-Metric Space Library (nmslib) entries
        'bruteforce0(nmslib)': [Nmslib(m, 'seq_search', ['copyMem=0'])],
        'bruteforce1(nmslib)': [Nmslib(m, 'seq_search', ['copyMem=1'])],

        # Recall values stay as text so the parameter strings keep their
        # exact spelling (e.g. '0.90').
        'BallTree(nmslib)': [
            Nmslib(m, 'vptree', ['tuneK=10', 'desiredRecall=' + recall])
            for recall in ['0.99', '0.95', '0.90', '0.85', '0.8', '0.7',
                           '0.6', '0.5', '0.4', '0.3', '0.2', '0.1']
        ],

        'SW-graph(nmslib)': [
            Nmslib(m, 'small_world_rand', [
                'NN=%d' % NN,
                'initIndexAttempts=%d' % index_attempts,
                'initSearchAttempts=%d' % search_attempts,
            ])
            for NN, index_attempts, search_attempts in [
                (20, 4, 48), (20, 4, 32), (20, 4, 16), (20, 4, 8),
                (20, 4, 4), (20, 4, 2), (17, 4, 2), (14, 4, 2),
                (11, 5, 2), (8, 5, 2), (5, 5, 2), (3, 5, 2),
            ]
        ],
    }

    if m == 'euclidean':
        # Only works for euclidean distance
        algos['MP-lsh(lshkit)'] = [
            Nmslib(m, 'lsh_multiprobe', [
                'desiredRecall=' + recall, 'H=1200001', 'T=10', 'L=50',
                'tuneK=10'
            ])
            for recall in ['0.99', '0.97', '0.95', '0.90', '0.85', '0.80',
                           '0.7', '0.6', '0.5', '0.4', '0.3', '0.2', '0.1']
        ]
    # END: Non-Metric Space Library (nmslib) entries

    if m == 'angular':
        # RPForest only works for cosine
        algos['rpforest'] = [
            RPForest(leaf_size, n_trees)
            for n_trees in [3, 5, 10, 20, 40, 100, 200, 400]
            for leaf_size in [3, 5, 10, 20, 40, 100, 200, 400]
        ]

        # Start at 1 and grow by 10% (rounded up) until 1400 is reached.
        steps = [1]
        while steps[-1] < 1400:
            steps.append(int(math.ceil(steps[-1] * 1.1)))
        algos['falconn'] = [FALCONN(m, 16, l, l) for l in steps]

    return algos
sep="") fq.close() print('time query:', end_query - start_query) print('accuracy:', accuracy / len(xq) / k) quit() a = [350] b = [350] for leaf_size in a: for no_trees in b: fq = open('fq_RPForest.txt', 'a') if X.dtype != np.double: X = np.array(X).astype(np.double) t = RPForest(leaf_size, no_trees) t.fit(X) start_query = time.time() accuracy = 0 for i in range(len(xq)): v = xq[i] if v.dtype != np.double: v = np.array(v).astype(np.double) ans = t.query(v, k) for x in ans: if x in gt[i]: accuracy += 1 end_query = time.time() print(leaf_size, no_trees) print(round(accuracy / len(xq) / k, 4),
for k in xrange(hash_counts): redis_storage.store_hash_configuration(lshash[k]) ## RPFOREST TEST from rpforest import RPForest leaf_size = 5 n_trees = 20 name = 'RPForest(leaf_size=%d, n_trees=%d)' % (leaf_size, n_trees) model = RPForest(leaf_size=leaf_size, no_trees=n_trees) #fitting features = features.copy(order='C') #something related to Cython error model.fit(features) model.clear() #indexing for i, x in enumerate(features): t = Timer() with t: model.index(dict_feat[i], x.tolist()) #querying for i in range(features.shape[0]): t = Timer() with t: results = model.get_candidates(features[i]) print 'queried', dict_feat[i], 'results', results
def __init__(self, leaf_size, n_trees):
    """Create the wrapped ``rpforest.RPForest`` and a descriptive ``name``.

    The library is imported here rather than at module level, so it is only
    needed when this wrapper is instantiated.
    """
    from rpforest import RPForest as _Forest

    self.name = 'RPForest(leaf_size=%d, n_trees=%d)' % (leaf_size, n_trees)
    self._model = _Forest(leaf_size=leaf_size, no_trees=n_trees)
def get_algos(m):
    """Build every benchmark configuration for distance metric ``m``.

    Args:
        m: Metric name forwarded to each wrapper.  ``'euclidean'`` and
            ``'angular'`` additionally enable metric-specific entries.

    Returns:
        dict mapping a library label to a list of configured instances.
    """
    leaf_sizes = [10, 20, 40, 100, 200, 400, 1000]
    kgraph_Ps = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

    algos = {
        'lshf': [
            LSHF(m, a, b)
            for a, b in [(5, 10), (5, 20), (10, 20), (10, 50), (20, 100)]
        ],
        'flann': [
            FLANN(m, target)
            for target in [0.2, 0.5, 0.7, 0.8, 0.9, 0.95, 0.97, 0.98, 0.99,
                           0.995]
        ],
        'panns': [
            PANNS(m, a, b)
            for a, b in [(5, 20), (10, 10), (10, 50), (10, 100), (20, 100),
                         (40, 100)]
        ],
        'annoy': [
            Annoy(m, n_trees, search_k)
            for n_trees in [100, 200, 400]
            for search_k in [100, 200, 400, 1000, 2000, 4000, 10000, 20000,
                             40000, 100000, 200000, 400000]
        ],
        # Disabled NearPy settings: (10, 100), (12, 100), (14, 100),
        # (16, 50), (16, 70), (16, 90), (16, 120), (16, 150).
        'nearpy': [
            NearPy(m, a, b)
            for a, b in [(10, 5), (10, 10), (10, 20), (10, 40),
                         (12, 5), (12, 10), (12, 20), (12, 40),
                         (14, 5), (14, 10), (14, 20), (14, 40),
                         (16, 5), (16, 10), (16, 15), (16, 20), (16, 25),
                         (16, 30), (16, 40)]
        ],
        'bruteforce': [BruteForce(m)],
        'ball': [BallTree(m, leaf) for leaf in leaf_sizes],
        'kd': [KDTree(m, leaf) for leaf in leaf_sizes],

        # START: Non-Metric Space Library (nmslib) entries
        'bruteforce0(nmslib)': [NmslibNewIndex(m, 'seq_search', ['copyMem=0'])],
        # copyMem=1 is no longer needed: the new Python wrapper already
        # re-creates data points, so 'bruteforce1(nmslib)' stays disabled.

        # Recall values stay as text so the parameter strings keep their
        # exact spelling (e.g. '0.90').
        'BallTree(nmslib)': [
            NmslibNewIndex(m, 'vptree', ['tuneK=10', 'desiredRecall=' + recall])
            for recall in ['0.99', '0.95', '0.90', '0.85', '0.8', '0.7',
                           '0.6', '0.5', '0.4', '0.3', '0.2', '0.1']
        ],
        'hnsw(nmslib)': [],
        'SW-graph(nmslib)': [],
    }

    def add_hnsw(grid, ef_construction, search_method):
        # One entry per (M, ef); each gets its own parameter lists.
        for M, efs in grid:
            for ef in efs:
                algos['hnsw(nmslib)'].append(
                    NmslibReuseIndex(
                        m, 'hnsw',
                        ['M=' + str(M),
                         'efConstruction=%d' % ef_construction],
                        ['ef=' + str(ef),
                         'searchMethod=%d' % search_method]))

    def sw_graph_entries(grid, ef_construction):
        # One entry per (NN, efSearch), in grid order.
        return [
            NmslibReuseIndex(
                m, 'sw-graph',
                ['NN=%d' % NN, 'efConstruction=%d' % ef_construction,
                 'initIndexAttempts=1'],
                ['efSearch=%d' % ef, 'initSearchAttempts=1'])
            for NN, efs in grid
            for ef in efs
        ]

    if m == 'euclidean':
        # kgraph: one preset dict shared by every instance.
        kgraph_preset = {'reverse': -1}
        algos['kgraph'] = [KGraph(m, P, kgraph_preset) for P in kgraph_Ps]

        # nmslib algorithms
        # Only works for euclidean distance
        add_hnsw([
            [32, [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160,
                  200, 300, 400]],
            [4, [1, 2, 5, 10, 20, 30, 50, 70, 90, 120]],
            [8, [1, 2, 5, 10, 20, 30, 50, 70, 90, 120, 160]],
            [20, [2, 5, 10, 15, 20, 30, 40, 50, 70, 80, 120, 200, 400]],
            [12, [1, 2, 5, 10, 15, 20, 30, 40, 50, 70, 80, 120]],
        ], 400, 3)

        algos['MP-lsh(lshkit)'] = [
            NmslibNewIndex(m, 'lsh_multiprobe', [
                'desiredRecall=' + recall, 'H=1200001', 'T=10', 'L=50',
                'tuneK=10'
            ])
            for recall in ['0.99', '0.97', '0.95', '0.90', '0.85', '0.80',
                           '0.7', '0.6', '0.5', '0.4', '0.3', '0.2', '0.1']
        ]

        algos['SW-graph(nmslib)'] = sw_graph_entries([
            (10, [800, 400, 200, 100, 50, 30, 20, 15, 10]),
            (5, [30, 25, 20, 15, 10, 5, 4, 3, 2, 1]),
        ], 400)
        # END: Non-Metric Space Library (nmslib) entries

    if m == 'angular':
        # kgraph: one preset dict shared by every instance.
        kgraph_preset = {'reverse': -1, 'K': 200, 'L': 300, 'S': 20}
        algos['kgraph'] = [KGraph(m, P, kgraph_preset) for P in kgraph_Ps]

        # nmslib algorithms
        add_hnsw([
            [32, [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160,
                  200, 300, 400, 600, 700, 800, 1000, 1200, 1400, 1600,
                  2000]],
            [64, [10, 30, 50, 70, 90, 120, 160, 200, 400, 600, 700, 800,
                  1000, 1400, 1600, 2000]],
            [96, [10, 30, 50, 70, 90, 120, 160, 200, 400, 700, 1000, 1400,
                  1600, 2000]],
            [20, [2, 5, 10, 15, 20, 30, 40, 50, 70, 80]],
            [12, [1, 2, 5, 10, 15, 20, 30, 40, 50, 70, 80]],
        ], 1600, 4)

        # RPForest only works for cosine
        algos['rpforest'] = [
            RPForest(leaf_size, n_trees)
            for n_trees in [3, 5, 10, 20, 40, 100, 200, 400]
            for leaf_size in [3, 5, 10, 20, 40, 100, 200, 400]
        ]

        # Start at 1 and grow by 10% (rounded up) until 1400 is reached.
        steps = [1]
        while steps[-1] < 1400:
            steps.append(int(math.ceil(steps[-1] * 1.1)))
        algos['falconn'] = [FALCONN(m, 16, l, l) for l in steps]

        # START: Non-Metric Space Library (nmslib) entries
        algos['SW-graph(nmslib)'] = sw_graph_entries([
            (30, [700, 650, 550, 450, 350, 275, 200, 150, 120, 80, 50, 30]),
            (15, [80, 50, 30, 20]),
            (3, [120, 80, 60, 40, 20, 10, 8, 4, 2]),
        ], 1600)
        # END: Non-Metric Space Library (nmslib) entries

    return algos
def _get_random_projection_forest(self, leaf_size=20, no_trees=10):
    """Fit and return an ``RPForest`` over the PCA-transformed features.

    Side effect: stores the result of ``self.pca.transform(self.feat)`` on
    ``self.embed_feat`` before fitting.

    Args:
        leaf_size: Forwarded to ``RPForest`` as ``leaf_size``.
        no_trees: Forwarded to ``RPForest`` as ``no_trees``.

    Returns:
        The fitted ``RPForest`` instance.
    """
    embedded = self.pca.transform(self.feat)
    self.embed_feat = embedded

    forest = RPForest(leaf_size=leaf_size, no_trees=no_trees)
    forest.fit(embedded)
    return forest