def test_sample_training():
    """Check query precision when the forest is fitted on a sample only.

    For each forest size the trees are built from the first quarter of the
    training set, the index is cleared, and every training point is then
    indexed into the existing trees. The mean overlap between the forest's
    top-10 answers and the brute-force top-10 (largest dot products) must
    reach the threshold paired with that forest size.
    """
    X_train, X_test = _get_mnist_data()

    for no_trees, expected_precision in ((1, 0.05), (5, 0.3), (10, 0.5), (50, 0.9)):
        tree = RPForest(leaf_size=10, no_trees=no_trees)

        # Fit on a quarter of the data. Floor division keeps the slice bound
        # an integer: true division produces a float on Python 3, which is
        # not a valid slice index.
        X_sample = X_train[:X_train.shape[0] // 4]
        tree.fit(X_sample)

        # Clear and index everything
        tree.clear()
        for i, x in enumerate(X_train):
            tree.index(i, x)
        tree._X = X_train

        precision = 0.0
        # Rows are normalised in place, so the object bound to tree._X above
        # is normalised too (assuming X_train is a NumPy array).
        X_train /= np.linalg.norm(X_train, axis=1)[:, np.newaxis]

        for x_test in X_test:
            true_nns = np.argsort(-np.dot(X_train, x_test))[:10]
            nns = tree.query(x_test, 10)[:10]

            precision += len(set(nns) & set(true_nns)) / 10.0

        precision /= X_test.shape[0]

        assert precision >= expected_precision
def test_candidates_mnist():
    """Exercise ``get_candidates`` on MNIST for several forest sizes.

    For every test point the full candidate list (requested with a limit of
    100000) must contain no duplicates, no ``-1`` entries and only indices
    below the number of training rows. The first ten candidates of a
    10-candidate request are compared with the brute-force top ten by dot
    product, and the mean overlap must reach the per-forest threshold.
    """
    X_train, X_test = _get_mnist_data()
    thresholds = ((1, 0.05), (5, 0.12), (10, 0.2), (50, 0.5), (80, 0.6))

    for no_trees, expected_precision in thresholds:
        forest = RPForest(leaf_size=10, no_trees=no_trees)
        forest.fit(X_train)

        # In-place row normalisation, performed after fitting as before.
        X_train /= np.linalg.norm(X_train, axis=1)[:, np.newaxis]

        hits = 0.0
        for query in X_test:
            exact = np.argsort(-np.dot(X_train, query))[:10]

            everything = forest.get_candidates(query, 100000)
            assert len(everything) == len(set(everything))
            assert -1 not in everything
            assert (everything < X_train.shape[0]).all()

            approx = forest.get_candidates(query, 10)[:10]
            assert (approx < X_train.shape[0]).all()

            hits += len(set(approx) & set(exact)) / 10.0

        precision = hits / X_test.shape[0]
        assert precision >= expected_precision
def test_serialization_mnist():
    """Check that a pickled-and-restored forest still answers queries well.

    Each forest is fitted, round-tripped through ``pickle``, and then queried
    for every test point. Returned indices must be below the number of
    training rows, and the mean overlap with the brute-force top ten by dot
    product must reach the threshold paired with the forest size.
    """
    X_train, X_test = _get_mnist_data()
    thresholds = ((1, 0.05), (5, 0.3), (10, 0.5), (50, 0.9))

    for no_trees, expected_precision in thresholds:
        fitted = RPForest(leaf_size=10, no_trees=no_trees)
        fitted.fit(X_train)

        # Serialize and deserialize; only the restored copy is queried.
        payload = pickle.dumps(fitted)
        restored = pickle.loads(payload)

        # In-place row normalisation, performed after fitting as before.
        X_train /= np.linalg.norm(X_train, axis=1)[:, np.newaxis]

        hits = 0.0
        for query in X_test:
            exact = np.argsort(-np.dot(X_train, query))[:10]
            approx = restored.query(query, 10)[:10]
            assert (approx < X_train.shape[0]).all()

            hits += len(set(approx) & set(exact)) / 10.0

        precision = hits / X_test.shape[0]
        assert precision >= expected_precision
def lvnn(fp, nt=3, k=5, iter=5, leaves=50):
    """Build a k-neighbour table for the rows of ``fp`` and return it as triplets.

    An ``RPForest`` is fitted on ``fp`` and queried once per row to seed the
    table. The table is then refined ``iter`` times: for every row, the
    neighbours of its current neighbours are collected as candidates, each
    paired with the negated Euclidean distance to the row, and ``k`` of those
    pairs are kept via ``nsmallest``.

    Args:
        fp: 2-D array with one point per row (indexed as ``fp[i, :]``).
        nt: passed to ``RPForest`` as ``no_trees``.
        k: number of neighbours stored per row.
        iter: number of refinement passes.
        leaves: passed to ``RPForest`` as ``leaf_size``.

    Returns:
        Float array of shape ``(fp.shape[0] * k, 3)``. Each row is
        ``(row index, neighbour index, stored value)``; the stored value is
        the negated distance after at least one refinement pass and ``-1``
        when no pass has run.
    """
    n_points = fp.shape[0]

    # nn[i, j, 0] is the j-th neighbour index of row i, nn[i, j, 1] the value
    # stored with it. Every entry starts at -1.
    nn = np.zeros((n_points, k, 2)) - 1

    print(' start Tree build')
    model = RPForest(leaf_size=leaves, no_trees=nt)
    model.fit(fp)
    for i in range(n_points):
        nn[i, :, 0] = model.query(fp[i, ], k)

    passes_done = 0
    while passes_done < iter:
        passes_done += 1

        # Work from a real snapshot: a plain assignment would only alias the
        # table, so rows rewritten earlier in the pass would leak into the
        # candidates of later rows.
        old_nn = nn.copy()
        # The table is a float array; NumPy only accepts integer indices.
        old_ids = old_nn[:, :, 0].astype(int)

        for i in range(n_points):
            # Neighbours of the current neighbours, de-duplicated so each
            # distance is computed once.
            candidates = set(old_ids[old_ids[i]].ravel().tolist())
            h = {(li, -np.linalg.norm(fp[i, :] - fp[li, :])) for li in candidates}
            # NOTE(review): tuples compare on the neighbour index first, so
            # this keeps the k lowest-index candidates rather than ranking
            # by distance -- confirm the intended selection rule.
            nn[i, :, :] = np.array(nsmallest(k, h))

    # Flatten the table into (row, neighbour, value) triplets, row-major.
    csr = np.zeros((n_points * k, 3))
    csr[:, 0] = np.repeat(np.arange(n_points), k)
    csr[:, 1] = nn[:, :, 0].ravel()
    csr[:, 2] = nn[:, :, 1].ravel()
    return csr
def test_find_self():
    """Every training point must be its own nearest neighbour.

    For each forest size, querying with a training point must return that
    point's index first, and every leaf code produced by ``encode`` for the
    point must name a leaf that contains the point. The same checks are
    repeated on a pickled-and-restored copy of the forest.
    """
    X_train, _ = _get_mnist_data()

    def check_points_find_themselves(forest):
        # Map each leaf code to the set of indices stored in that leaf.
        leaves = {code: set(members) for code, members in forest.get_leaf_nodes()}

        for idx, point in enumerate(X_train):
            neighbours = forest.query(point, 10)[:10]
            assert neighbours[0] == idx

            for code in forest.encode(point):
                assert idx in leaves[code]

    for no_trees, _threshold in ((1, 0.05), (5, 0.3), (10, 0.5), (50, 0.9)):
        forest = RPForest(leaf_size=10, no_trees=no_trees)
        forest.fit(X_train)
        check_points_find_themselves(forest)

        restored = pickle.loads(pickle.dumps(forest))
        check_points_find_themselves(restored)
def test_max_size():
    """Every leaf of a fitted forest must hold fewer than ten indices."""
    limit = 10

    X_train, _ = _get_mnist_data()

    forest = RPForest(leaf_size=limit, no_trees=10)
    forest.fit(X_train)

    for _code, members in forest.get_leaf_nodes():
        assert len(members) < limit
class RPForest(BaseANN):
    """BaseANN subclass that delegates fitting and querying to ``rpforest``."""

    def __init__(self, leaf_size, n_trees):
        """Create the underlying forest and record a descriptive name.

        Args:
            leaf_size: forwarded to the library as ``leaf_size``.
            n_trees: forwarded to the library as ``no_trees``.
        """
        # Imported here, under an alias, because the library class has the
        # same name as this wrapper.
        from rpforest import RPForest as _LibraryForest

        self.name = 'RPForest(leaf_size=%d, n_trees=%d)' % (leaf_size, n_trees)
        self._model = _LibraryForest(leaf_size=leaf_size, no_trees=n_trees)

    def fit(self, X):
        """Fit the underlying forest on ``X``."""
        self._model.fit(X)

    def query(self, v, n):
        """Return what the underlying forest's ``query(v, n)`` returns."""
        found = self._model.query(v, n)
        return found
def test_multiple_fit_calls():
    """Fitting twice must leave the forest with ten trees each time."""
    X_train, _ = _get_mnist_data()
    forest = RPForest(leaf_size=10, no_trees=10)

    # First pass is the initial fit, second pass is the refit.
    for _attempt in range(2):
        forest.fit(X_train)
        assert len(forest.trees) == 10
def test_encoding_mnist():
    """Encoding a point twice must give equal results.

    Checked for every training point and several forest sizes, both on the
    freshly fitted forest and on a pickled-and-restored copy.
    """
    X_train, _ = _get_mnist_data()

    def check_encoding_is_repeatable(forest):
        for point in X_train:
            first = forest.encode(point)
            second = forest.encode(point)
            assert first == second

    for no_trees, _threshold in ((1, 0.05), (5, 0.3), (10, 0.5), (50, 0.9)):
        forest = RPForest(leaf_size=10, no_trees=no_trees)
        forest.fit(X_train)
        check_encoding_is_repeatable(forest)

        restored = pickle.loads(pickle.dumps(forest))
        check_encoding_is_repeatable(restored)
def _get_random_projection_forest(self, leaf_size=20, no_trees=10):
    """Fit and return an RPForest on the PCA-transformed features.

    The result of ``self.pca.transform(self.feat)`` is stored on
    ``self.embed_feat`` before the forest is fitted on it.

    Args:
        leaf_size: forwarded to ``RPForest`` as ``leaf_size``.
        no_trees: forwarded to ``RPForest`` as ``no_trees``.

    Returns:
        The fitted ``RPForest`` instance.
    """
    projected = self.pca.transform(self.feat)
    self.embed_feat = projected

    forest = RPForest(no_trees=no_trees, leaf_size=leaf_size)
    forest.fit(self.embed_feat)

    return forest
## RPFOREST TEST from rpforest import RPForest leaf_size = 5 n_trees = 20 name = 'RPForest(leaf_size=%d, n_trees=%d)' % (leaf_size, n_trees) model = RPForest(leaf_size=leaf_size, no_trees=n_trees) #fitting features = features.copy(order='C') #something related to Cython error model.fit(features) model.clear() #indexing for i, x in enumerate(features): t = Timer() with t: model.index(dict_feat[i], x.tolist()) #querying for i in range(features.shape[0]): t = Timer() with t: results = model.get_candidates(features[i]) print 'queried', dict_feat[i], 'results', results
fq.close() print('time query:', end_query - start_query) print('accuracy:', accuracy / len(xq) / k) quit() a = [350] b = [350] for leaf_size in a: for no_trees in b: fq = open('fq_RPForest.txt', 'a') if X.dtype != np.double: X = np.array(X).astype(np.double) t = RPForest(leaf_size, no_trees) t.fit(X) start_query = time.time() accuracy = 0 for i in range(len(xq)): v = xq[i] if v.dtype != np.double: v = np.array(v).astype(np.double) ans = t.query(v, k) for x in ans: if x in gt[i]: accuracy += 1 end_query = time.time() print(leaf_size, no_trees) print(round(accuracy / len(xq) / k, 4), ": ",