def test_celeba_embedding(self):
    """Report top-10 retrieval accuracy of Annoy and NSW on CelebA embeddings.

    Paths, embeddings and the reference test cases are read from JSON files,
    and the Annoy index is loaded from disk; every location can be overridden
    through the environment variables PATHS_JSON, EMBEDDING_JSON,
    INDEX_FILENAME and TEST_CASES_FILENAME. The NSW index is built in memory
    from the loaded embeddings. For each test case the ten paths returned by
    each index are intersected with the reference ``closest_paths_real`` and
    the overall hit percentages are printed.
    """
    data_dir = os.path.abspath(os.path.join(__file__, '..', '..', 'data'))
    paths_json = os.getenv(
        'PATHS_JSON', os.path.join(data_dir, 'paths_celeba.json'))
    embedding_json = os.getenv(
        'EMBEDDING_JSON', os.path.join(data_dir, 'embeddings_celeba.json'))
    index_filename = os.getenv(
        'INDEX_FILENAME', os.path.join(data_dir, 'index_celeba.ann'))
    test_cases_filename = os.getenv(
        'TEST_CASES_FILENAME',
        os.path.join(data_dir, 'index_celeba_test_cases.json'))

    with open(paths_json, 'r') as fp:
        print('Loading paths')
        # numpy array so that a list of indices can select several paths
        paths = np.array(json.load(fp))
    with open(embedding_json, 'r') as fp:
        print('Loading embeddings')
        embeddings = json.load(fp)
    with open(test_cases_filename, 'r') as fp:
        print('Loading test_cases')
        test_cases = json.load(fp)

    annoy = AnnoyIndex(len(embeddings[0]))
    annoy.load(index_filename)

    print('building nsw index')
    nsw_index = PyNSW('l2')
    print('Creating nodes')
    nodes = [create_node(path, vector)
             for path, vector in zip(paths, embeddings)]
    print('Inserting nodes')
    for node in tqdm(nodes):
        nsw_index.nn_insert(node, 5, 1000)

    total, hits_annoy, hits_nsw = 0, 0, 0
    print('Calculating accuracy on CelebA')
    for case in test_cases:
        vector = embeddings[int(case['embedding_index'])]
        closest_paths_real = case['closest_paths_real']
        closest_paths_annoy = paths[annoy.get_nns_by_vector(vector, 10, 1000)]
        # The loop variable must not be called `n`: on Python 2 a list
        # comprehension variable leaks and would overwrite the running total.
        closest_paths_nsw = [
            neighbor[1]
            for neighbor in nsw_index.nn_search(
                create_node('kek', vector), 5, 10)
        ]

        self.assertEqual(10, len(closest_paths_real))
        self.assertEqual(10, len(closest_paths_annoy))
        self.assertEqual(10, len(closest_paths_nsw))

        total += 10
        hits_annoy += len(
            set(closest_paths_annoy).intersection(closest_paths_real))
        hits_nsw += len(
            set(closest_paths_nsw).intersection(closest_paths_real))

    # Without test cases the percentages below would divide by zero.
    self.assertGreater(total, 0)
    print('Annoy accuracy on CelebA embeddings: {:.3f}%'.format(
        100.0 * hits_annoy / total))
    print('NSW accuracy on CelebA embeddings: {:.3f}%'.format(
        100.0 * hits_nsw / total))
class PyDistanceTests(unittest.TestCase):
    """Checks the L1 and L2 distance callables on a fixed pair of nodes."""

    # The two fixtures differ only in their last coordinate (3 versus 5).
    node1 = create_node('kek', [1, 2, 3])
    node2 = create_node('lol', [1, 2, 5])

    def test_l1(self):
        """PyDistance_l1 applied to the two fixtures must equal 2."""
        metric = PyDistance_l1()
        distance = metric(self.node1, self.node2)
        self.assertEqual(2, distance)

    def test_l2(self):
        """PyDistance_l2 applied to the two fixtures must equal 2."""
        metric = PyDistance_l2()
        distance = metric(self.node1, self.node2)
        self.assertEqual(2, distance)
def get_closest_vector():
    """Answer a request with the file name of the nearest indexed node.

    The JSON request body must be an object with a 'vector' entry. That
    vector is wrapped in a node and used to query ``index`` (taken from the
    enclosing scope) for its single nearest neighbour.

    Returns:
        A JSON ``Response``. Status 200 carries
        ``{'closest_celeb_filename': ...}``; status 400 carries an ``error``
        message when the body is missing, is not valid JSON, or has no
        'vector' entry.
    """
    content = request.get_json(silent=True)
    # With silent=True a missing or malformed body yields None instead of an
    # exception, so it has to be checked before subscripting.
    if not isinstance(content, dict) or 'vector' not in content:
        error = {
            'error': "request body must be a JSON object with a 'vector' field"
        }
        return Response(response=jsonpickle.encode(error),
                        status=400,
                        mimetype="application/json")

    vector = content['vector']
    # Element 1 of a search result is used as the file name.
    closest_celeb_filename = index.nn_search(
        create_node('1', vector), 1, 1)[0][1]

    response = {'closest_celeb_filename': closest_celeb_filename}
    response_pickled = jsonpickle.encode(response)
    return Response(response=response_pickled,
                    status=200,
                    mimetype="application/json")
class PyNSWTests(unittest.TestCase):
    """Search-quality tests for PyNSW on one-dimensional nodes.

    Node ``i`` is named ``str(i)`` and has the single coordinate ``i**2``.
    """

    nodes = [create_node(str(i), [i**2]) for i in range(NUM_NODES)]

    def _create_index(self, num_neighbors=(NUM_NODES - 1), num_iters=1):
        """Build an 'l2' PyNSW index containing every fixture node.

        Args:
            num_neighbors: neighbour count passed to each ``nn_insert`` call.
            num_iters: iteration count passed to each ``nn_insert`` call.

        Returns:
            The populated PyNSW index.
        """
        nsw = PyNSW('l2')
        # connect each node to `num_neighbors` other nodes
        for node in self.nodes:
            nsw.nn_insert(node, num_iters, num_neighbors)
        return nsw

    def _top3_accuracy(self, nsw, max_iters=3):
        """Measure how often a node appears among its own top-3 results.

        Args:
            nsw: the index to query.
            max_iters: searches are run with 1..max_iters iterations.

        Returns:
            A numpy array whose entry ``k - 1`` is the fraction of fixture
            nodes found in their own top-3 when searching with ``k``
            iterations.
        """
        accuracies = np.zeros(max_iters)
        for num_iter in range(1, max_iters + 1):
            hits = 0.0
            for node in self.nodes:
                neighbors = nsw.nn_search(node, num_iter, 3)
                hits += (node.file_path in map(itemgetter(1), neighbors))
            accuracies[num_iter - 1] = hits / NUM_NODES
        return accuracies

    def test_search_full(self):
        """In a fully connected index every node is its own nearest result."""
        nsw = self._create_index()
        # check that the node is closest to itself
        for node in self.nodes:
            neighbors = nsw.nn_search(node, 1, 1)
            self.assertEqual(node.file_path, neighbors[0][1])

    def test_search_half(self):
        """With NUM_NODES // 2 neighbours top-3 accuracy stays at least 0.8."""
        # Floor division keeps the neighbour count an integer on Python 3.
        nsw = self._create_index(num_neighbors=(NUM_NODES // 2))
        accuracies = self._top3_accuracy(nsw)
        self.assertTrue(np.all(accuracies >= 0.8))

    def test_search_quarter(self):
        """With NUM_NODES // 4 neighbours accuracy reaches 0.5 / 0.7 / 0.9."""
        # Floor division keeps the neighbour count an integer on Python 3.
        nsw = self._create_index(num_neighbors=(NUM_NODES // 4))
        accuracies = self._top3_accuracy(nsw)
        self.assertTrue(np.all(accuracies >= np.array([0.5, 0.7, 0.9])))
def create_index(index_path):
    """Load the NSW index stored at ``index_path``, or build and save it.

    If ``index_path`` exists, ``load`` is called on a fresh 'l2' PyNSW index.
    Otherwise the paths and embeddings are read from the JSON files named by
    PATHS_JSON and EMBEDDING_JSON, one node per (path, embedding) pair is
    inserted, and the index is saved to ``index_path``.

    Args:
        index_path: file system location of the serialized index.

    Returns:
        The loaded or freshly built PyNSW index.
    """
    index = PyNSW('l2')
    if os.path.exists(index_path):
        index.load(index_path)
    else:
        with open(PATHS_JSON, 'r') as fp:
            print('Loading paths')
            paths = json.load(fp)
        with open(EMBEDDING_JSON, 'r') as fp:
            print('Loading embeddings')
            embeddings = json.load(fp)

        print('Creating nodes')
        nodes = [create_node(path, vector)
                 for path, vector in zip(paths, embeddings)]

        print('Inserting nodes')
        for idx, node in enumerate(nodes):
            # Progress report every 500 nodes, printed before the insert so
            # that `idx` equals the number of nodes already in the index.
            if idx % 500 == 0:
                print('{} nodes inserted'.format(idx))
            index.nn_insert(node, 3, 1000)
        index.save(index_path)
    # Hand the index back; without this the caller would receive None.
    return index
def test_init_list(self):
    """A node built from a list exposes float coordinates and its path."""
    expected_path = 'kek'
    expected_coord = [1., 2., 3.]
    created = create_node('kek', [1, 2, 3])
    self.assertEqual(expected_coord, created.coord)
    self.assertEqual(expected_path, created.file_path)
def test_set_new_values(self):
    """Assigning to ``file_path`` is reflected when it is read back."""
    replacement = 'lol'
    created = create_node('kek', [1, 2, 3])
    created.file_path = replacement
    self.assertEqual(replacement, created.file_path)
def test_init_numpy(self):
    """A node built from a numpy array exposes a float list and its path."""
    expected_path = 'kek'
    expected_coord = [1., 2., 3.]
    source = np.array([1, 2, 3])
    created = create_node('kek', source)
    self.assertEqual(expected_coord, created.coord)
    self.assertEqual(expected_path, created.file_path)
from tqdm import tqdm

from python.index import create_node, PyNSW

# Input locations; each can be overridden through the environment variable
# of the same name.
PATHS_JSON = getenv(
    'PATHS_JSON',
    abspath(join(__file__, '..', '..', 'data', 'paths.json')))
EMBEDDING_JSON = getenv(
    'EMBEDDING_JSON',
    abspath(join(__file__, '..', '..', 'data', 'embeddings.json')))

if __name__ == '__main__':
    # Demo: build an NSW index over all embeddings, then query it with the
    # embedding stored at position 100.
    with open(PATHS_JSON) as fp:
        paths = json.load(fp)
    with open(EMBEDDING_JSON) as fp:
        embeddings = json.load(fp)

    nodes = []
    for path, vector in zip(paths, embeddings):
        nodes.append(create_node(path, vector))

    nsw = PyNSW('l2')
    for node in tqdm(nodes):
        nsw.nn_insert(node, 1, 100)

    query_position = 100
    query_vector = embeddings[query_position]
    print(paths[query_position])
    print(query_vector)

    query_node = create_node('kek', query_vector)
    print(nsw.nn_search(query_node, 5, 3))
from python.index import create_node, PyNSW
from tqdm import tqdm

# File locations; each can be overridden through the environment variable of
# the same name.
NSW_INDEX_FILENAME = os.getenv(
    'NSW_INDEX_FILENAME',
    os.path.abspath(
        os.path.join(__file__, '..', '..', 'data', 'index_celeba_nsw')))
PATHS_JSON = os.getenv(
    'PATHS_JSON',
    os.path.abspath(
        os.path.join(__file__, '..', '..', 'data', 'paths_celeba.json')))
EMBEDDING_JSON = os.getenv(
    'EMBEDDING_JSON',
    os.path.abspath(
        os.path.join(__file__, '..', '..', 'data', 'embeddings_celeba.json')))

# Build an NSW index over every (path, embedding) pair and save it.
index = PyNSW('l2')

with open(PATHS_JSON, 'r') as fp:
    print('Loading paths')
    paths = json.load(fp)
with open(EMBEDDING_JSON, 'r') as fp:
    print('Loading embeddings')
    embeddings = json.load(fp)

print('Creating nodes')
nodes = []
for path, vector in zip(paths, embeddings):
    nodes.append(create_node(path, vector))

print('Inserting nodes')
for node in tqdm(nodes):
    index.nn_insert(node, 3, 10)

index.save(NSW_INDEX_FILENAME)
def setUpClass(cls):
    """Create the shared fixture nodes and store them on the class.

    Node ``i`` is named ``'node_<i>'`` and has the single coordinate
    ``i**2``, for ``i`` in ``range(NUM_NODES)``.
    """
    fixture = []
    for i in range(NUM_NODES):
        label = str('node_{}'.format(i))
        fixture.append(create_node(label, [i**2]))
    cls.nodes = fixture