def test_celeba_embedding(self):
        """Compare Annoy and NSW top-10 overlap against ground truth on CelebA.

        Input locations default to files under ``../data`` relative to this
        file and can be overridden with the ``PATHS_JSON``, ``EMBEDDING_JSON``,
        ``INDEX_FILENAME`` and ``TEST_CASES_FILENAME`` environment variables.

        For every test case the 10 paths returned by each index are
        intersected with the case's ``closest_paths_real`` list, and the
        overall hit percentage of both indexes is printed.
        """
        data_dir = os.path.abspath(os.path.join(__file__, '..', '..', 'data'))

        PATHS_JSON = os.getenv(
            'PATHS_JSON', os.path.join(data_dir, 'paths_celeba.json'))
        EMBEDDING_JSON = os.getenv(
            'EMBEDDING_JSON', os.path.join(data_dir, 'embeddings_celeba.json'))
        INDEX_FILENAME = os.getenv(
            'INDEX_FILENAME', os.path.join(data_dir, 'index_celeba.ann'))
        TEST_CASES_FILENAME = os.getenv(
            'TEST_CASES_FILENAME',
            os.path.join(data_dir, 'index_celeba_test_cases.json'))

        with open(PATHS_JSON, 'r') as fp:
            print('Loading paths')
            # numpy array so a list of indices from Annoy can select paths
            paths = np.array(json.load(fp))
        with open(EMBEDDING_JSON, 'r') as fp:
            print('Loading embeddings')
            embeddings = json.load(fp)

        with open(TEST_CASES_FILENAME, 'r') as fp:
            print('Loading test_cases')
            test_cases = json.load(fp)

        # The Annoy index is prebuilt on disk; only its dimensionality is
        # derived from the embeddings here.
        annoy = AnnoyIndex(len(embeddings[0]))
        annoy.load(INDEX_FILENAME)

        print('building nsw index')
        nsw_index = PyNSW('l2')
        print('Creating nodes')
        nodes = [create_node(path, vector) for path, vector in zip(paths, embeddings)]
        print('Inserting nodes')
        for node in tqdm(nodes):
            nsw_index.nn_insert(node, 5, 1000)

        n, k_annoy, k_nsw = 0, 0, 0

        print('Calculating accuracy on CelebA')

        for tk in test_cases:
            vector = embeddings[int(tk['embedding_index'])]

            closest_paths_real = tk['closest_paths_real']

            closest_paths_annoy = paths[annoy.get_nns_by_vector(vector, 10, 1000)]

            # The loop variable must not be called `n`: on Python 2 a list
            # comprehension leaks its variable and would clobber the counter.
            closest_paths_nsw = [
                neighbor[1]
                for neighbor in nsw_index.nn_search(create_node('kek', vector), 5, 10)
            ]

            assert len(closest_paths_real) == 10
            assert len(closest_paths_annoy) == 10
            assert len(closest_paths_nsw) == 10

            n += 10
            k_annoy += len(set(closest_paths_annoy).intersection(closest_paths_real))
            k_nsw += len(set(closest_paths_nsw).intersection(closest_paths_real))

        # Fail with a clear message instead of a ZeroDivisionError below.
        assert n > 0, 'no test cases loaded from {}'.format(TEST_CASES_FILENAME)

        print('Annoy accuracy on CelebA embeddings: {:.3f}%'.format(100.0 * k_annoy / n))
        print('NSW accuracy on CelebA embeddings: {:.3f}%'.format(100.0 * k_nsw / n))
Esempio n. 2
0
class PyDistanceTests(unittest.TestCase):
    """Checks the l1 and l2 distance callables on two fixed nodes.

    The fixture nodes share their first two coordinates and differ by 2 in
    the third one, so both tests expect a distance of 2.
    """

    node1 = create_node('kek', [1, 2, 3])
    node2 = create_node('lol', [1, 2, 5])

    def test_l1(self):
        """PyDistance_l1 applied to the fixture nodes must yield 2."""
        metric = PyDistance_l1()
        distance = metric(self.node1, self.node2)
        self.assertEqual(2, distance)

    def test_l2(self):
        """PyDistance_l2 applied to the fixture nodes must yield 2."""
        metric = PyDistance_l2()
        distance = metric(self.node1, self.node2)
        self.assertEqual(2, distance)
    def get_closest_vector():
        """Return the file name of the indexed node nearest to a posted vector.

        Reads a JSON body carrying a ``vector`` field, queries the
        module-level ``index`` for a single nearest neighbour and answers
        with ``{"closest_celeb_filename": ...}`` as JSON (status 200).

        A body that is not a JSON object, or that lacks ``vector``, is
        answered with status 400 rather than failing with an unhandled
        exception.
        """
        # silent=True makes get_json return None on a missing/invalid body
        # instead of raising, so the result has to be validated here.
        content = request.get_json(silent=True)
        if not isinstance(content, dict) or 'vector' not in content:
            error_pickled = jsonpickle.encode(
                {'error': "request body must be a JSON object with a 'vector' field"})
            return Response(response=error_pickled, status=400, mimetype="application/json")

        vector = content['vector']

        # Each search hit is indexed with [1] to obtain the file name.
        neighbors = index.nn_search(create_node('1', vector), 1, 1)
        closest_celeb_filename = neighbors[0][1]
        response = {'closest_celeb_filename': closest_celeb_filename}
        response_pickled = jsonpickle.encode(response)
        return Response(response=response_pickled, status=200, mimetype="application/json")
Esempio n. 4
0
class PyNSWTests(unittest.TestCase):
    """Search-quality tests for PyNSW on a small one-dimensional data set.

    The fixture consists of ``NUM_NODES`` nodes whose single coordinate is
    ``i**2`` and whose file path is ``str(i)``.
    """

    nodes = [create_node(str(i), [i**2]) for i in range(NUM_NODES)]

    def _create_index(self, num_neighbors=(NUM_NODES - 1), num_iters=1):
        """Build an 'l2' PyNSW index containing every fixture node.

        Each node is inserted with ``nn_insert(node, num_iters,
        num_neighbors)``; by default every node may be connected to all
        other nodes.
        """
        nsw = PyNSW('l2')
        # connect each node to `num_neighbors` other nodes
        for node in self.nodes:
            nsw.nn_insert(node, num_iters, num_neighbors)
        return nsw

    def _top3_accuracy(self, nsw, max_iters=3):
        """Return top-3 self-retrieval accuracy for 1..max_iters iterations.

        Entry ``i`` of the returned array is the fraction of fixture nodes
        whose own file path appears among the 3 results of
        ``nn_search(node, i + 1, 3)``.
        """
        count = np.zeros(max_iters)
        for num_iter in range(1, max_iters + 1):
            hits = 0.0
            for node in self.nodes:
                neighbors = nsw.nn_search(node, num_iter, 3)
                hits += (node.file_path in map(itemgetter(1), neighbors))
            count[num_iter - 1] = hits / NUM_NODES
        return count

    def test_search_full(self):
        """With a fully connected index every node is its own nearest hit."""
        nsw = self._create_index()
        # check that the node is closest to itself
        for node in self.nodes:
            neighbors = nsw.nn_search(node, 1, 1)
            self.assertEqual(node.file_path, neighbors[0][1])

    def test_search_half(self):
        """Half as many neighbours: top-3 accuracy must stay at >= 0.8."""
        # Floor division keeps the neighbour count an int on Python 3,
        # where `/` would produce a float.
        nsw = self._create_index(num_neighbors=(NUM_NODES // 2))

        # count top3 accuracy depending on number of iterations
        count = self._top3_accuracy(nsw, 3)

        self.assertTrue(np.all(count >= 0.8))

    def test_search_quarter(self):
        """Quarter as many neighbours: accuracy must reach 0.5 / 0.7 / 0.9
        for 1 / 2 / 3 search iterations respectively."""
        # Floor division keeps the neighbour count an int on Python 3.
        nsw = self._create_index(num_neighbors=(NUM_NODES // 4))

        # count top3 accuracy depending on number of iterations
        count = self._top3_accuracy(nsw, 3)

        self.assertTrue(np.all(count >= np.array([0.5, 0.7, 0.9])))
def create_index(index_path):
    """Load the NSW index stored at ``index_path`` or build and save it.

    If ``index_path`` exists, the index is loaded from it. Otherwise the
    paths and embeddings are read from the module-level ``PATHS_JSON`` and
    ``EMBEDDING_JSON`` files, one node per (path, embedding) pair is
    inserted, and the result is saved to ``index_path``.

    Returns:
        The ready-to-query ``PyNSW`` index. (Previously the index was built
        but never returned, so callers received ``None``.)
    """
    index = PyNSW('l2')
    if os.path.exists(index_path):
        index.load(index_path)
    else:
        with open(PATHS_JSON, 'r') as fp:
            print('Loading paths')
            paths = json.load(fp)
        with open(EMBEDDING_JSON, 'r') as fp:
            print('Loading embeddings')
            embeddings = json.load(fp)
        print('Creating nodes')
        nodes = [create_node(path, vector) for path, vector in zip(paths, embeddings)]
        print('Inserting nodes')
        for idx, node in enumerate(nodes):
            # progress report every 500 insertions
            if idx % 500 == 0:
                print('{} nodes inserted'.format(idx))
            index.nn_insert(node, 3, 1000)
        index.save(index_path)
    return index
Esempio n. 6
0
 def test_init_list(self):
     """A node built from a plain list exposes float coords and its path."""
     label = 'kek'
     coords = [1, 2, 3]
     node = create_node(label, coords)
     self.assertEqual([1., 2., 3.], node.coord)
     self.assertEqual('kek', node.file_path)
Esempio n. 7
0
    def test_set_new_values(self):
        """Assigning to ``file_path`` must be reflected when reading it back."""
        replacement = 'lol'
        node = create_node('kek', [1, 2, 3])

        node.file_path = replacement
        self.assertEqual(replacement, node.file_path)
Esempio n. 8
0
 def test_init_numpy(self):
     """A node built from a numpy array exposes list coords and its path."""
     label = 'kek'
     coords = np.array([1, 2, 3])
     node = create_node(label, coords)
     self.assertEqual([1., 2., 3.], node.coord)
     self.assertEqual('kek', node.file_path)
Esempio n. 9
0
from tqdm import tqdm

from python.index import create_node, PyNSW

# Input files: taken from the environment when set, otherwise resolved to
# ../data relative to this file.
PATHS_JSON = getenv(
    'PATHS_JSON',
    abspath(join(__file__, '..', '..', 'data', 'paths.json')),
)
EMBEDDING_JSON = getenv(
    'EMBEDDING_JSON',
    abspath(join(__file__, '..', '..', 'data', 'embeddings.json')),
)

if __name__ == '__main__':
    # Demo: build an NSW index over all embeddings, then query it with one
    # of the stored vectors and print what comes back.
    with open(PATHS_JSON) as fp:
        paths = json.load(fp)
    with open(EMBEDDING_JSON) as fp:
        embeddings = json.load(fp)

    nodes = [create_node(*pair) for pair in zip(paths, embeddings)]

    nsw = PyNSW('l2')
    for node in tqdm(nodes):
        nsw.nn_insert(node, 1, 100)

    # Entry 100 serves as the query; print its path and vector first.
    query_position = 100
    random_vector = embeddings[query_position]
    print(paths[query_position])
    print(random_vector)

    query_node = create_node('kek', random_vector)
    neighbors = nsw.nn_search(query_node, 5, 3)
    print(neighbors)
Esempio n. 10
0
from python.index import create_node, PyNSW
from tqdm import tqdm

# Output location of the serialized NSW index (environment override first,
# otherwise ../data relative to this file).
NSW_INDEX_FILENAME = os.getenv(
    'NSW_INDEX_FILENAME',
    os.path.abspath(os.path.join(__file__, '..', '..', 'data', 'index_celeba_nsw')),
)

# Input files, resolved the same way.
PATHS_JSON = os.getenv(
    'PATHS_JSON',
    os.path.abspath(os.path.join(__file__, '..', '..', 'data', 'paths_celeba.json')),
)
EMBEDDING_JSON = os.getenv(
    'EMBEDDING_JSON',
    os.path.abspath(os.path.join(__file__, '..', '..', 'data', 'embeddings_celeba.json')),
)

# Build an 'l2' NSW index from the stored paths/embeddings and persist it.
index = PyNSW('l2')

with open(PATHS_JSON, 'r') as fp:
    print('Loading paths')
    paths = json.load(fp)

with open(EMBEDDING_JSON, 'r') as fp:
    print('Loading embeddings')
    embeddings = json.load(fp)

print('Creating nodes')
# one node per (path, embedding) pair
nodes = [create_node(*pair) for pair in zip(paths, embeddings)]

print('Inserting nodes')
for node in tqdm(nodes):
    index.nn_insert(node, 3, 10)

index.save(NSW_INDEX_FILENAME)
Esempio n. 11
0
 def setUpClass(cls):
     """Create the shared fixture: NUM_NODES nodes named 'node_<i>' whose
     single coordinate is i**2, stored on the class as ``nodes``."""
     fixture = []
     for i in range(NUM_NODES):
         label = str('node_{}'.format(i))
         fixture.append(create_node(label, [i**2]))
     cls.nodes = fixture