def test_celeba_embedding(self):
    PATHS_JSON = os.getenv('PATHS_JSON', abspath(join(__file__, '..', '..', 'data', 'paths_celeba.json')))
    EMBEDDING_JSON = os.getenv('EMBEDDING_JSON', abspath(join(__file__, '..', '..', 'data', 'embeddings_celeba.json')))
    INDEX_FILENAME = os.getenv('INDEX_FILENAME', abspath(join(__file__, '..', '..', 'data', 'index_celeba.ann')))
    TEST_CASES_FILENAME = os.getenv('TEST_CASES_FILENAME', abspath(join(__file__, '..', '..', 'data', 'index_celeba_test_cases.json')))

    with open(PATHS_JSON, 'r') as fp:
        print('Loading paths')
        paths = np.array(json.load(fp))
    with open(EMBEDDING_JSON, 'r') as fp:
        print('Loading embeddings')
        embeddings = json.load(fp)
    with open(TEST_CASES_FILENAME, 'r') as fp:
        print('Loading test cases')
        test_cases = json.load(fp)

    # Load the prebuilt Annoy index; `load` returns a bool, not an index object.
    annoy = AnnoyIndex(len(embeddings[0]))
    annoy.load(INDEX_FILENAME)

    print('Building NSW index')
    nsw_index = PyNSW('l2')
    print('Creating nodes')
    nodes = [create_node(path, vector) for path, vector in zip(paths, embeddings)]
    print('Inserting nodes')
    for node in tqdm(nodes):
        nsw_index.nn_insert(node, 5, 1000)

    n, k_annoy, k_nsw = 0, 0, 0
    print('Calculating accuracy on CelebA')
    for tc in test_cases:
        vector = embeddings[int(tc['embedding_index'])]
        closest_paths_real = tc['closest_paths_real']
        closest_paths_annoy = paths[annoy.get_nns_by_vector(vector, 10, 1000)]
        closest_paths_nsw = [neighbor[1] for neighbor in nsw_index.nn_search(create_node('kek', vector), 5, 10)]
        assert len(closest_paths_real) == 10
        assert len(closest_paths_annoy) == 10
        assert len(closest_paths_nsw) == 10
        # Accumulate recall@10 against the exact neighbors.
        n += 10
        k_annoy += len(set(closest_paths_annoy).intersection(closest_paths_real))
        k_nsw += len(set(closest_paths_nsw).intersection(closest_paths_real))
    print('Annoy accuracy on CelebA embeddings: {:.3f}%'.format(100.0 * k_annoy / n))
    print('NSW accuracy on CelebA embeddings: {:.3f}%'.format(100.0 * k_nsw / n))
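def recall_at_k(retrieved, real):
    # Hypothetical helper, not part of this repo: a minimal sketch of the
    # recall computation the test above performs inline -- the fraction of
    # true neighbors the index recovered. E.g. recall_at_k(closest_paths_nsw,
    # closest_paths_real) == 0.9 means 9 of the 10 exact neighbors were found.
    return len(set(retrieved).intersection(real)) / float(len(real))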
def _get_index(self, dataset):
    data_dir = os.getenv('ACCURACY_TEST_DATA_PATH', abspath(join(dirname(dirname(abspath(__file__))), 'data', 'test')))
    vectors_fn = join(data_dir, dataset + '.hdf5')
    if not exists(vectors_fn):
        url = 'http://vectors.erikbern.com/%s.hdf5' % dataset
        print('downloading', url, '->', vectors_fn)
        urlretrieve(url, vectors_fn)

    index_fn = dataset + '.annoy'
    dataset_f = h5py.File(vectors_fn, 'r')
    distance = dataset_f.attrs['distance']
    f = dataset_f['train'].shape[1]
    annoy = AnnoyIndex(f, distance)

    # The NSW index is built for comparison but only the Annoy index is returned.
    print('building nsw index')
    nsw = PyNSW('l2')
    for i in trange(dataset_f['train'].shape[0]):
        v = dataset_f['train'][i]
        nsw.nn_insert(PyNode(str(i), v), 1, 100)

    if not os.path.exists(index_fn):
        print('adding items', distance, f)
        for i, v in enumerate(dataset_f['train']):
            annoy.add_item(i, v)
        print('building annoy index')
        annoy.build(10)
        annoy.save(index_fn)
    else:
        annoy.load(index_fn)
    return annoy, dataset_f
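def _test_accuracy_sketch(self, dataset):
    # A hedged usage sketch for `_get_index` (the method name is hypothetical):
    # measure Annoy recall on a downloaded dataset, assuming the standard
    # ann-benchmarks HDF5 layout with 'test' and 'neighbors' datasets -- an
    # assumption about the files served by vectors.erikbern.com, not something
    # verified against this repo.
    annoy, dataset_f = self._get_index(dataset)
    n, k = 0, 0
    for i, v in enumerate(dataset_f['test']):
        approx = annoy.get_nns_by_vector(list(v), 10)
        exact = dataset_f['neighbors'][i][:10]
        n += 10
        k += len(set(approx).intersection(exact))
    print('Annoy accuracy on {}: {:.3f}%'.format(dataset, 100.0 * k / n))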
def test_save_load(self):
    print('test_save_load')
    nsw = self._create_index()
    index_path = tempfile.NamedTemporaryFile(delete=False).name
    nsw.save(index_path)

    empty_nsw = PyNSW('l2')
    empty_nsw.load(index_path)

    # Compare original and loaded index across several search iteration counts
    NUM_ITERS = 1
    for num_iter in range(1, NUM_ITERS + 1):
        for node in self.nodes:
            self.assertEqual(
                nsw.nn_search(node, num_iter, 3, random_seed=1334),
                empty_nsw.nn_search(node, num_iter, 3, random_seed=1334))
def create_index(index_path):
    index = PyNSW('l2')
    if os.path.exists(index_path):
        index.load(index_path)
    else:
        with open(PATHS_JSON, 'r') as fp:
            print('Loading paths')
            paths = json.load(fp)
        with open(EMBEDDING_JSON, 'r') as fp:
            print('Loading embeddings')
            embeddings = json.load(fp)
        print('Creating nodes')
        nodes = [create_node(path, vector) for path, vector in zip(paths, embeddings)]
        print('Inserting nodes')
        for idx, node in enumerate(nodes):
            if idx % 500 == 0:
                print('{} nodes inserted'.format(idx))
            index.nn_insert(node, 3, 1000)
        index.save(index_path)
    return index
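# Example usage, as a sketch: PATHS_JSON and EMBEDDING_JSON are expected to be
# module-level constants as in the other scripts here, and the path and query
# below are placeholders:
#
#     index = create_index('/tmp/index_celeba_nsw')
#     neighbors = index.nn_search(create_node('query', embeddings[0]), 5, 10)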
def test_fail(self):
    fake_dist_type = 'l3'
    with self.assertRaises(TypeError) as context:
        PyNSW(fake_dist_type)
    # `exception.message` is Python 2 only; str() works on both.
    self.assertEqual('Unknown distance type: {}'.format(fake_dist_type),
                     str(context.exception))
def test_l2(self):
    nsw = PyNSW('l2')
    self.assertEqual('l2', nsw.dist_type)
def test_l1(self):
    nsw = PyNSW('l1')
    self.assertEqual('l1', nsw.dist_type)
import json
from os import getenv
from os.path import abspath, join

from tqdm import tqdm

from python.index import create_node, PyNSW

PATHS_JSON = getenv('PATHS_JSON', abspath(join(__file__, '..', '..', 'data', 'paths.json')))
EMBEDDING_JSON = getenv('EMBEDDING_JSON', abspath(join(__file__, '..', '..', 'data', 'embeddings.json')))

if __name__ == '__main__':
    with open(PATHS_JSON) as fp:
        paths = json.load(fp)
    with open(EMBEDDING_JSON) as fp:
        embeddings = json.load(fp)

    nodes = [create_node(path, vector) for path, vector in zip(paths, embeddings)]

    nsw = PyNSW('l2')
    for node in tqdm(nodes):
        nsw.nn_insert(node, 1, 100)

    # Query with a vector that is already in the index; its own path should
    # come back among the nearest neighbors.
    random_vector = embeddings[100]
    print(paths[100])
    print(random_vector)
    neighbors = nsw.nn_search(create_node('kek', random_vector), 5, 3)
    print(neighbors)
import json
import os

from tqdm import tqdm

from python.index import create_node, PyNSW

NSW_INDEX_FILENAME = os.getenv(
    'NSW_INDEX_FILENAME',
    os.path.abspath(os.path.join(__file__, '..', '..', 'data', 'index_celeba_nsw')))
PATHS_JSON = os.getenv(
    'PATHS_JSON',
    os.path.abspath(os.path.join(__file__, '..', '..', 'data', 'paths_celeba.json')))
EMBEDDING_JSON = os.getenv(
    'EMBEDDING_JSON',
    os.path.abspath(os.path.join(__file__, '..', '..', 'data', 'embeddings_celeba.json')))

index = PyNSW('l2')

with open(PATHS_JSON, 'r') as fp:
    print('Loading paths')
    paths = json.load(fp)
with open(EMBEDDING_JSON, 'r') as fp:
    print('Loading embeddings')
    embeddings = json.load(fp)

print('Creating nodes')
nodes = [create_node(path, vector) for path, vector in zip(paths, embeddings)]

print('Inserting nodes')
for node in tqdm(nodes):
    index.nn_insert(node, 3, 10)

index.save(NSW_INDEX_FILENAME)
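# A minimal companion sketch (not a file in this repo): load the index saved
# above and query it. `query_vector` is a placeholder -- real queries would
# come from the embeddings file -- and the nn_search arguments mirror the
# other scripts (num_iters=5, k=10).
import os

from python.index import create_node, PyNSW

NSW_INDEX_FILENAME = os.getenv(
    'NSW_INDEX_FILENAME',
    os.path.abspath(os.path.join(__file__, '..', '..', 'data', 'index_celeba_nsw')))

index = PyNSW('l2')
index.load(NSW_INDEX_FILENAME)

query_vector = [0.0] * 128  # placeholder; dimensionality must match the index
print(index.nn_search(create_node('query', query_vector), 5, 10))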
def _create_index(self, num_neighbors=(NUM_NODES - 1), num_iters=1):
    nsw = PyNSW('l2')
    # Connect each node to `num_neighbors` other nodes, seeding each insert
    # for deterministic graph construction.
    for i, node in enumerate(self.nodes):
        nsw.nn_insert(node, num_iters, num_neighbors, random_seed=1334 + i)
    return nsw
def _create_index(self, num_neighbors=(NUM_NODES - 1), num_iters=1):
    nsw = PyNSW('l2')
    # Connect each node to `num_neighbors` other nodes
    for node in self.nodes:
        nsw.nn_insert(node, num_iters, num_neighbors)
    return nsw
import json
from os import getenv
from os.path import abspath, dirname, join

from python.index import PyNode, PyNSW

DATA_DIR = join(dirname(dirname(abspath(__file__))), 'data')
PATHS_JSON = getenv('PATHS_JSON', abspath(join(DATA_DIR, 'paths.json')))
EMBEDDING_JSON = getenv('EMBEDDING_JSON', abspath(join(DATA_DIR, 'embeddings.json')))

nsw = PyNSW('l2')

with open(PATHS_JSON, 'r') as fp:
    PATHS = json.load(fp)
with open(EMBEDDING_JSON, 'r') as fp:
    EMBEDDINGS = json.load(fp)

nodes = [PyNode(path, vector) for path, vector in zip(PATHS, EMBEDDINGS)]
for node in nodes:
    # nn_insert takes (node, num_iters, num_neighbors); pass them explicitly,
    # matching the other scripts in this repo.
    nsw.nn_insert(node, 1, 100)

random_vector = EMBEDDINGS[100]
print(random_vector)
neighbors = nsw.nn_search(PyNode('1', random_vector), 5, 3)
print(neighbors)