Example #1
0
    def test_update_index_with_hash_index(self):
        # Similar test to `test_update_index_add_new_descriptors` but with a
        # linear hash index.
        descr_index = MemoryDescriptorIndex()
        hash_kvs = MemoryKeyValueStore()
        linear_hi = LinearHashIndex()  # simplest hash index, heap-sorts.
        index = LSHNearestNeighborIndex(DummyHashFunctor(),
                                        descr_index, hash_kvs, linear_hi)

        descriptors1 = [
            DescriptorMemoryElement('t', 0),
            DescriptorMemoryElement('t', 1),
            DescriptorMemoryElement('t', 2),
            DescriptorMemoryElement('t', 3),
            DescriptorMemoryElement('t', 4),
        ]
        descriptors2 = [
            DescriptorMemoryElement('t', 5),
            DescriptorMemoryElement('t', 6),
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for d in descriptors1 + descriptors2:
            d.set_vector(np.ones(1, float) * d.uuid())

        # Build initial index.
        index.build_index(descriptors1)
        # Initial hash index should only encode hashes for first batch of
        # descriptors.
        self.assertSetEqual(linear_hi.index, {0, 1, 2, 3, 4})

        # Update index and check that components have new data.
        index.update_index(descriptors2)
        # Now the hash index should include all descriptor hashes.
        self.assertSetEqual(linear_hi.index, {0, 1, 2, 3, 4, 5, 6})
Example #2
0
    def test_configuration(self):
        c = LSHNearestNeighborIndex.get_default_config()

        # Check that default is in JSON format and is decoded to the same
        # result.
        self.assertEqual(json.loads(json.dumps(c)), c)

        # Make a simple configuration
        # - ItqFunctor should always be available since it has no dependencies.
        c['lsh_functor']['type'] = 'ItqFunctor'
        c['descriptor_index']['type'] = 'MemoryDescriptorIndex'
        c['hash2uuids_kvstore']['type'] = 'MemoryKeyValueStore'
        c['hash_index']['type'] = 'LinearHashIndex'
        index = LSHNearestNeighborIndex.from_config(c)

        self.assertIsInstance(index.lsh_functor, ItqFunctor)
        self.assertIsInstance(index.descriptor_index, MemoryDescriptorIndex)
        self.assertIsInstance(index.hash_index, LinearHashIndex)
        self.assertIsInstance(index.hash2uuids_kvstore, MemoryKeyValueStore)

        # Can convert instance config to JSON
        self.assertEqual(
            json.loads(json.dumps(index.get_config())),
            index.get_config()
        )
Example #3
0
    def test_update_index_no_existing_index(self):
        # Test that calling update_index with no existing index acts like
        # building the index fresh.  This test is basically the same as
        # test_build_index_fresh_build but using update_index instead.
        descr_index = MemoryDescriptorIndex()
        hash_kvs = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(DummyHashFunctor(),
                                        descr_index, hash_kvs)

        descriptors = [
            DescriptorMemoryElement('t', 0),
            DescriptorMemoryElement('t', 1),
            DescriptorMemoryElement('t', 2),
            DescriptorMemoryElement('t', 3),
            DescriptorMemoryElement('t', 4),
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for d in descriptors:
            d.set_vector(np.ones(1, float) * d.uuid())
        index.update_index(descriptors)

        # Make sure descriptors are now in attached index and in key-value-store
        self.assertEqual(descr_index.count(), 5)
        for d in descriptors:
            self.assertIn(d, descr_index)
        # Dummy hash function bins sum of descriptor vectors.
        self.assertEqual(hash_kvs.count(), 5)
        for i in range(5):
            self.assertSetEqual(hash_kvs.get(i), {i})
Example #4
0
    def test_remove_from_index(self):
        # Test that removing by UIDs does the correct thing.

        # Descriptors are 1 dim, value == index.
        descriptors = [
            DescriptorMemoryElement('t', 0),
            DescriptorMemoryElement('t', 1),
            DescriptorMemoryElement('t', 2),
            DescriptorMemoryElement('t', 3),
            DescriptorMemoryElement('t', 4),
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for d in descriptors:
            d.set_vector(np.ones(1, float) * d.uuid())
        d_set = MemoryDescriptorIndex()
        hash_kvs = MemoryKeyValueStore()
        idx = LSHNearestNeighborIndex(DummyHashFunctor(), d_set, hash_kvs)
        idx.build_index(descriptors)

        # Attempt removing 1 uid.
        idx.remove_from_index([3])
        self.assertEqual(idx.descriptor_index._table, {
            0: descriptors[0],
            1: descriptors[1],
            2: descriptors[2],
            4: descriptors[4],
        })
        self.assertEqual(idx.hash2uuids_kvstore._table, {
            0: {0},
            1: {1},
            2: {2},
            4: {4},
        })
Example #5
0
    def test_build_index_fresh_build(self):
        descr_index = MemoryDescriptorIndex()
        hash_kvs = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(DummyHashFunctor(),
                                        descr_index, hash_kvs)

        descriptors = [
            DescriptorMemoryElement('t', 0),
            DescriptorMemoryElement('t', 1),
            DescriptorMemoryElement('t', 2),
            DescriptorMemoryElement('t', 3),
            DescriptorMemoryElement('t', 4),
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for i, d in enumerate(descriptors):
            d.set_vector(np.ones(1, float) * i)
        index.build_index(descriptors)

        # Make sure descriptors are now in attached index and in key-value-store
        self.assertEqual(descr_index.count(), 5)
        for d in descriptors:
            self.assertIn(d, descr_index)
        # Dummy hash function bins sum of descriptor vectors.
        self.assertEqual(hash_kvs.count(), 5)
        for i in range(5):
            self.assertSetEqual(hash_kvs.get(i), {i})
Example #6
0
    def test_update_index_no_existing_index(self):
        # Test that calling update_index with no existing index acts like
        # building the index fresh.  This test is basically the same as
        # test_build_index_fresh_build but using update_index instead.
        descr_set = MemoryDescriptorSet()
        hash_kvs = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set,
                                        hash_kvs)

        descriptors = [
            DescriptorMemoryElement('t', 0),
            DescriptorMemoryElement('t', 1),
            DescriptorMemoryElement('t', 2),
            DescriptorMemoryElement('t', 3),
            DescriptorMemoryElement('t', 4),
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for d in descriptors:
            d.set_vector(np.ones(1, float) * d.uuid())
        index.update_index(descriptors)

        # Make sure descriptors are now in attached index and in key-value-store
        self.assertEqual(descr_set.count(), 5)
        for d in descriptors:
            self.assertIn(d, descr_set)
        # Dummy hash function bins sum of descriptor vectors.
        self.assertEqual(hash_kvs.count(), 5)
        for i in range(5):
            self.assertSetEqual(hash_kvs.get(i), {i})
Example #7
0
    def test_build_index_fresh_build(self):
        descr_set = MemoryDescriptorSet()
        hash_kvs = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set,
                                        hash_kvs)

        descriptors = [
            DescriptorMemoryElement('t', 0),
            DescriptorMemoryElement('t', 1),
            DescriptorMemoryElement('t', 2),
            DescriptorMemoryElement('t', 3),
            DescriptorMemoryElement('t', 4),
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for i, d in enumerate(descriptors):
            d.set_vector(np.ones(1, float) * i)
        index.build_index(descriptors)

        # Make sure descriptors are now in attached index and in
        # key-value-store.
        self.assertEqual(descr_set.count(), 5)
        for d in descriptors:
            self.assertIn(d, descr_set)
        # Dummy hash function bins sum of descriptor vectors.
        self.assertEqual(hash_kvs.count(), 5)
        for i in range(5):
            self.assertSetEqual(hash_kvs.get(i), {i})
Example #8
0
    def _known_ordered_euclidean(self, hash_ftor, hash_idx,
                                 ftor_train_hook=lambda d: None):
        # make vectors to return in a known euclidean distance order
        i = 1000
        test_descriptors = []
        for j in range(i):
            d = DescriptorMemoryElement('ordered', j)
            d.set_vector(np.array([j, j*2], float))
            test_descriptors.append(d)
        random.shuffle(test_descriptors)

        ftor_train_hook(test_descriptors)

        di = MemoryDescriptorIndex()
        kvstore = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(hash_ftor, di, kvstore,
                                        hash_index=hash_idx,
                                        distance_method='euclidean')
        index.build_index(test_descriptors)

        # Since descriptors were built in increasing distance from (0,0),
        # returned descriptors for a query of [0,0] should be in index order.
        q = DescriptorMemoryElement('query', i)
        q.set_vector(np.array([0, 0], float))
        # top result should have UUID == 0 (nearest to query)
        r, dists = index.nn(q, 5)
        self.assertEqual(r[0].uuid(), 0)
        self.assertEqual(r[1].uuid(), 1)
        self.assertEqual(r[2].uuid(), 2)
        self.assertEqual(r[3].uuid(), 3)
        self.assertEqual(r[4].uuid(), 4)
        # global search should be in complete order
        r, dists = index.nn(q, i)
        for j, d, dist in zip(range(i), r, dists):
            self.assertEqual(d.uuid(), j)
Example #9
0
    def test_configuration_none_HI(self):
        c = LSHNearestNeighborIndex.get_default_config()

        # Check that default is in JSON format and is decoded to the same
        # result.
        self.assertEqual(json.loads(json.dumps(c)), c)

        # Make a simple configuration
        c['lsh_functor'][
            'type'] = 'smqtk.algorithms.nn_index.lsh.functors.itq.ItqFunctor'
        c['descriptor_set'][
            'type'] = 'smqtk.representation.descriptor_set.memory.MemoryDescriptorSet'
        c['hash2uuids_kvstore'][
            'type'] = 'smqtk.representation.key_value.memory.MemoryKeyValueStore'
        c['hash_index']['type'] = None
        index = LSHNearestNeighborIndex.from_config(c)

        self.assertIsInstance(index.lsh_functor, ItqFunctor)
        self.assertIsInstance(index.descriptor_set, MemoryDescriptorSet)
        self.assertIsNone(index.hash_index)
        self.assertIsInstance(index.hash2uuids_kvstore, MemoryKeyValueStore)

        # Can convert instance config to JSON
        self.assertEqual(json.loads(json.dumps(index.get_config())),
                         index.get_config())
Example #10
0
    def test_configuration(self):
        c = LSHNearestNeighborIndex.get_default_config()

        # Check that default is in JSON format and is decoded to the same
        # result.
        self.assertEqual(json.loads(json.dumps(c)), c)

        # Make a simple configuration
        # - ItqFunctor should always be available since it has no dependencies.
        c['lsh_functor']['type'] = 'ItqFunctor'
        c['descriptor_index']['type'] = 'MemoryDescriptorIndex'
        c['hash2uuids_kvstore']['type'] = 'MemoryKeyValueStore'
        c['hash_index']['type'] = 'LinearHashIndex'
        index = LSHNearestNeighborIndex.from_config(c)

        self.assertIsInstance(index.lsh_functor, ItqFunctor)
        self.assertIsInstance(index.descriptor_index, MemoryDescriptorIndex)
        self.assertIsInstance(index.hash_index, LinearHashIndex)
        self.assertIsInstance(index.hash2uuids_kvstore, MemoryKeyValueStore)

        # Can convert instance config to JSON
        self.assertEqual(
            json.loads(json.dumps(index.get_config())),
            index.get_config()
        )
Example #11
0
    def test_remove_from_index_invalid_uid(self):
        # Test that attempting to remove a single invalid UID causes a key
        # error and does not affect index.

        # Descriptors are 1 dim, value == index.
        descriptors = [
            DescriptorMemoryElement('t', 0),
            DescriptorMemoryElement('t', 1),
            DescriptorMemoryElement('t', 2),
            DescriptorMemoryElement('t', 3),
            DescriptorMemoryElement('t', 4),
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for d in descriptors:
            d.set_vector(np.ones(1, float) * d.uuid())
        # uid -> descriptor
        expected_dset_table = {
            0: descriptors[0],
            1: descriptors[1],
            2: descriptors[2],
            3: descriptors[3],
            4: descriptors[4],
        }
        # hash int -> set[uid]
        expected_kvs_table = {
            0: {0},
            1: {1},
            2: {2},
            3: {3},
            4: {4},
        }

        d_set = MemoryDescriptorIndex()
        hash_kvs = MemoryKeyValueStore()
        idx = LSHNearestNeighborIndex(DummyHashFunctor(), d_set, hash_kvs)
        idx.build_index(descriptors)
        # Assert we have the correct expected values
        self.assertEqual(idx.descriptor_index._table, expected_dset_table)
        self.assertEqual(idx.hash2uuids_kvstore._table, expected_kvs_table)

        # Attempt to remove descriptor with a UID we did not build with.
        self.assertRaisesRegexp(
            KeyError, '5',
            idx.remove_from_index, [5]
        )
        # Index should not have been modified.
        self.assertEqual(idx.descriptor_index._table, expected_dset_table)
        self.assertEqual(idx.hash2uuids_kvstore._table, expected_kvs_table)

        # Attempt to remove multiple UIDs, one valid and one invalid
        self.assertRaisesRegexp(
            KeyError, '5',
            idx.remove_from_index, [2, 5]
        )
        # Index should not have been modified.
        self.assertEqual(idx.descriptor_index._table, expected_dset_table)
        self.assertEqual(idx.hash2uuids_kvstore._table, expected_kvs_table)
Example #12
0
    def test_remove_from_index_invalid_uid(self):
        # Test that attempting to remove a single invalid UID causes a key
        # error and does not affect index.

        # Descriptors are 1 dim, value == index.
        descriptors = [
            DescriptorMemoryElement('t', 0),
            DescriptorMemoryElement('t', 1),
            DescriptorMemoryElement('t', 2),
            DescriptorMemoryElement('t', 3),
            DescriptorMemoryElement('t', 4),
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for d in descriptors:
            d.set_vector(np.ones(1, float) * d.uuid())
        # uid -> descriptor
        expected_dset_table = {
            0: descriptors[0],
            1: descriptors[1],
            2: descriptors[2],
            3: descriptors[3],
            4: descriptors[4],
        }
        # hash int -> set[uid]
        expected_kvs_table = {
            0: {0},
            1: {1},
            2: {2},
            3: {3},
            4: {4},
        }

        d_set = MemoryDescriptorIndex()
        hash_kvs = MemoryKeyValueStore()
        idx = LSHNearestNeighborIndex(DummyHashFunctor(), d_set, hash_kvs)
        idx.build_index(descriptors)
        # Assert we have the correct expected values
        self.assertEqual(idx.descriptor_index._table, expected_dset_table)
        self.assertEqual(idx.hash2uuids_kvstore._table, expected_kvs_table)

        # Attempt to remove descriptor with a UID we did not build with.
        self.assertRaisesRegexp(
            KeyError, '5',
            idx.remove_from_index, [5]
        )
        # Index should not have been modified.
        self.assertEqual(idx.descriptor_index._table, expected_dset_table)
        self.assertEqual(idx.hash2uuids_kvstore._table, expected_kvs_table)

        # Attempt to remove multiple UIDs, one valid and one invalid
        self.assertRaisesRegexp(
            KeyError, '5',
            idx.remove_from_index, [2, 5]
        )
        # Index should not have been modified.
        self.assertEqual(idx.descriptor_index._table, expected_dset_table)
        self.assertEqual(idx.hash2uuids_kvstore._table, expected_kvs_table)
Example #13
0
    def test_remove_from_index_shared_hashes_partial(self):
        """
        Test that only some hashes are removed from the hash index, but not
        others when those hashes still refer to other descriptors.
        """
        # Simulate initial state with some descriptor hashed to one value and
        # other descriptors hashed to another.

        # Vectors of length 1 for easy dummy hashing prediction.
        descriptors = [
            DescriptorMemoryElement('t', 0).set_vector([0]),
            DescriptorMemoryElement('t', 1).set_vector([1]),
            DescriptorMemoryElement('t', 2).set_vector([2]),
            DescriptorMemoryElement('t', 3).set_vector([3]),
            DescriptorMemoryElement('t', 4).set_vector([4]),
        ]

        # Dummy hash function to do the simulated thing
        hash_func = DummyHashFunctor()
        hash_func.get_hash = mock.Mock(
            # Vectors of even sum hash to 0, odd to 1.
            side_effect=lambda vec: [vec.sum() % 2]
        )

        d_set = MemoryDescriptorIndex()
        d_set._table = {
            0: descriptors[0],
            1: descriptors[1],
            2: descriptors[2],
            3: descriptors[3],
            4: descriptors[4],
        }

        hash2uid_kvs = MemoryKeyValueStore()
        hash2uid_kvs._table = {
            0: {0, 2, 4},
            1: {1, 3},
        }

        idx = LSHNearestNeighborIndex(hash_func, d_set, hash2uid_kvs)
        idx.hash_index = mock.Mock(spec=HashIndex)

        idx.remove_from_index([1, 2, 3])
        # Check that only one hash vector was passed to hash_index's removal
        # method (deque of hash-code vectors).
        idx.hash_index.remove_from_index.assert_called_once_with(
            collections.deque([
                [1],
            ])
        )
        self.assertDictEqual(d_set._table, {
            0: descriptors[0],
            4: descriptors[4],
        })
        self.assertDictEqual(hash2uid_kvs._table, {0: {0, 4}})
Example #14
0
    def test_remove_from_index_shared_hashes_partial(self):
        """
        Test that only some hashes are removed from the hash index, but not
        others when those hashes still refer to other descriptors.
        """
        # Simulate initial state with some descriptor hashed to one value and
        # other descriptors hashed to another.

        # Vectors of length 1 for easy dummy hashing prediction.
        descriptors = [
            DescriptorMemoryElement('t', 0).set_vector([0]),
            DescriptorMemoryElement('t', 1).set_vector([1]),
            DescriptorMemoryElement('t', 2).set_vector([2]),
            DescriptorMemoryElement('t', 3).set_vector([3]),
            DescriptorMemoryElement('t', 4).set_vector([4]),
        ]

        # Dummy hash function to do the simulated thing
        hash_func = DummyHashFunctor()
        hash_func.get_hash = mock.Mock(
            # Vectors of even sum hash to 0, odd to 1.
            side_effect=lambda vec: [vec.sum() % 2]
        )

        d_set = MemoryDescriptorIndex()
        d_set._table = {
            0: descriptors[0],
            1: descriptors[1],
            2: descriptors[2],
            3: descriptors[3],
            4: descriptors[4],
        }

        hash2uid_kvs = MemoryKeyValueStore()
        hash2uid_kvs._table = {
            0: {0, 2, 4},
            1: {1, 3},
        }

        idx = LSHNearestNeighborIndex(hash_func, d_set, hash2uid_kvs)
        idx.hash_index = mock.Mock(spec=HashIndex)

        idx.remove_from_index([1, 2, 3])
        # Check that only one hash vector was passed to hash_index's removal
        # method (deque of hash-code vectors).
        idx.hash_index.remove_from_index.assert_called_once_with(
            collections.deque([
                [1],
            ])
        )
        self.assertDictEqual(d_set._table, {
            0: descriptors[0],
            4: descriptors[4],
        })
        self.assertDictEqual(hash2uid_kvs._table, {0: {0, 4}})
Example #15
0
    def test_count_empty_hash2uid(self):
        """
        Test that an empty hash-to-uid mapping results in a 0 return regardless
        of descriptor-set state.
        """
        descr_set = MemoryDescriptorSet()
        hash_kvs = MemoryKeyValueStore()
        self.assertEqual(descr_set.count(), 0)
        self.assertEqual(hash_kvs.count(), 0)

        lsh = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set, hash_kvs)
        self.assertEqual(lsh.count(), 0)

        # Additions to the descriptor-set should not impact LSH index "size"
        lsh.descriptor_set.add_descriptor(DescriptorMemoryElement('t', 0))
        self.assertEqual(lsh.descriptor_set.count(), 1)
        self.assertEqual(lsh.hash2uuids_kvstore.count(), 0)
        self.assertEqual(lsh.count(), 0)

        lsh.descriptor_set.add_descriptor(DescriptorMemoryElement('t', 1))
        self.assertEqual(lsh.descriptor_set.count(), 2)
        self.assertEqual(lsh.hash2uuids_kvstore.count(), 0)
        self.assertEqual(lsh.count(), 0)

        lsh.hash2uuids_kvstore.add(0, {0})
        self.assertEqual(lsh.descriptor_set.count(), 2)
        self.assertEqual(lsh.count(), 1)

        lsh.hash2uuids_kvstore.add(0, {0, 1})
        self.assertEqual(lsh.descriptor_set.count(), 2)
        self.assertEqual(lsh.count(), 2)

        lsh.hash2uuids_kvstore.add(0, {0, 1, 2})
        self.assertEqual(lsh.descriptor_set.count(), 2)
        self.assertEqual(lsh.count(), 3)
Example #16
0
    def _random_euclidean(self,
                          hash_ftor,
                          hash_idx,
                          ftor_train_hook=lambda d: None):
        # :param hash_ftor: Hash function class for generating hash codes for
        #   descriptors.
        # :param hash_idx: Hash index instance to use in local LSH algo
        #   instance.
        # :param ftor_train_hook: Function for training functor if necessary.

        # make random descriptors
        i = 1000
        dim = 256
        td = []
        numpy.random.seed(self.RANDOM_SEED)
        for j in range(i):
            d = DescriptorMemoryElement('random', j)
            d.set_vector(numpy.random.rand(dim))
            td.append(d)

        ftor_train_hook(td)

        di = MemoryDescriptorIndex()
        kvstore = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(hash_ftor,
                                        di,
                                        kvstore,
                                        hash_index=hash_idx,
                                        distance_method='euclidean')
        index.build_index(td)

        # test query from build set -- should return same descriptor when k=1
        q = td[255]
        r, dists = index.nn(q, 1)
        ntools.assert_equal(r[0], q)

        # test query very near a build vector
        td_q = td[0]
        q = DescriptorMemoryElement('query', i)
        v = td_q.vector().copy()
        v_min = max(v.min(), 0.1)
        v[0] += v_min
        v[dim - 1] -= v_min
        q.set_vector(v)
        r, dists = index.nn(q, 1)
        ntools.assert_false(numpy.array_equal(q.vector(), td_q.vector()))
        ntools.assert_equal(r[0], td_q)

        # random query
        q = DescriptorMemoryElement('query', i + 1)
        q.set_vector(numpy.random.rand(dim))

        # for any query of size k, results should at least be in distance order
        r, dists = index.nn(q, 10)
        for j in range(1, len(dists)):
            ntools.assert_greater(dists[j], dists[j - 1])
        r, dists = index.nn(q, i)
        for j in range(1, len(dists)):
            ntools.assert_greater(dists[j], dists[j - 1])
Example #17
0
    def test_update_index_existing_descriptors_frozenset(self):
        """
        Same as ``test_update_index_similar_descriptors`` but testing that
        we can update the index when seeded with structures with existing
        values.
        """
        # Similar Descriptors to build and update on (different instances)
        descriptors1 = [
            DescriptorMemoryElement('t', 0).set_vector([0]),
            DescriptorMemoryElement('t', 1).set_vector([1]),
            DescriptorMemoryElement('t', 2).set_vector([2]),
            DescriptorMemoryElement('t', 3).set_vector([3]),
            DescriptorMemoryElement('t', 4).set_vector([4]),
        ]
        descriptors2 = [
            DescriptorMemoryElement('t', 5).set_vector([0]),
            DescriptorMemoryElement('t', 6).set_vector([1]),
            DescriptorMemoryElement('t', 7).set_vector([2]),
            DescriptorMemoryElement('t', 8).set_vector([3]),
            DescriptorMemoryElement('t', 9).set_vector([4]),
        ]

        descr_index = MemoryDescriptorIndex()
        descr_index.add_many_descriptors(descriptors1)

        hash_kvs = MemoryKeyValueStore()
        hash_kvs.add(0, frozenset({0}))
        hash_kvs.add(1, frozenset({1}))
        hash_kvs.add(2, frozenset({2}))
        hash_kvs.add(3, frozenset({3}))
        hash_kvs.add(4, frozenset({4}))

        index = LSHNearestNeighborIndex(DummyHashFunctor(),
                                        descr_index, hash_kvs)
        index.update_index(descriptors2)

        assert descr_index.count() == 10
        # Above descriptors should be considered "in" the descriptor set now.
        for d in descriptors1:
            assert d in descr_index
        for d in descriptors2:
            assert d in descr_index
        # Known hashes of the above descriptors should be in the KVS
        assert set(hash_kvs.keys()) == {0, 1, 2, 3, 4}
        assert hash_kvs.get(0) == {0, 5}
        assert hash_kvs.get(1) == {1, 6}
        assert hash_kvs.get(2) == {2, 7}
        assert hash_kvs.get(3) == {3, 8}
        assert hash_kvs.get(4) == {4, 9}
Example #18
0
    def test_update_index_existing_descriptors_frozenset(self):
        """
        Same as ``test_update_index_similar_descriptors`` but testing that
        we can update the index when seeded with structures with existing
        values.
        """
        # Similar Descriptors to build and update on (different instances)
        descriptors1 = [
            DescriptorMemoryElement('t', 0).set_vector([0]),
            DescriptorMemoryElement('t', 1).set_vector([1]),
            DescriptorMemoryElement('t', 2).set_vector([2]),
            DescriptorMemoryElement('t', 3).set_vector([3]),
            DescriptorMemoryElement('t', 4).set_vector([4]),
        ]
        descriptors2 = [
            DescriptorMemoryElement('t', 5).set_vector([0]),
            DescriptorMemoryElement('t', 6).set_vector([1]),
            DescriptorMemoryElement('t', 7).set_vector([2]),
            DescriptorMemoryElement('t', 8).set_vector([3]),
            DescriptorMemoryElement('t', 9).set_vector([4]),
        ]

        descr_set = MemoryDescriptorSet()
        descr_set.add_many_descriptors(descriptors1)

        hash_kvs = MemoryKeyValueStore()
        hash_kvs.add(0, frozenset({0}))
        hash_kvs.add(1, frozenset({1}))
        hash_kvs.add(2, frozenset({2}))
        hash_kvs.add(3, frozenset({3}))
        hash_kvs.add(4, frozenset({4}))

        index = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set,
                                        hash_kvs)
        index.update_index(descriptors2)

        assert descr_set.count() == 10
        # Above descriptors should be considered "in" the descriptor set now.
        for d in descriptors1:
            assert d in descr_set
        for d in descriptors2:
            assert d in descr_set
        # Known hashes of the above descriptors should be in the KVS
        assert set(hash_kvs.keys()) == {0, 1, 2, 3, 4}
        assert hash_kvs.get(0) == {0, 5}
        assert hash_kvs.get(1) == {1, 6}
        assert hash_kvs.get(2) == {2, 7}
        assert hash_kvs.get(3) == {3, 8}
        assert hash_kvs.get(4) == {4, 9}
Example #19
0
 def test_get_dist_func_euclidean(self):
     f = LSHNearestNeighborIndex._get_dist_func('euclidean')
     self.assertIsInstance(f, types.FunctionType)
     self.assertAlmostEqual(
         f(np.array([0, 0]), np.array([0, 1])),
         1.0
     )
Example #20
0
    def nearestNeighborIndex(item, user, descriptorSet):
        """
        Get the nearest neighbor index from a given item and descriptor set.

        :param item: Item to find the nn index from, usually the item that the
            user is performing the nearest neighbors search on.
        :param user: The owner of the .smqtk folder.
        :param descriptorSet: The relevant descriptor set.
        """
        folder = ModelImporter.model('folder')

        _GirderDataElement = functools.partial(GirderDataElement,
                                               api_root=getApiUrl(),
                                               token=getCurrentToken()['_id'])

        smqtkFolder = folder.createFolder(folder.load(item['folderId'], user=user), '.smqtk',
                                          reuseExisting=True)

        try:
            meanVecFileId = localSmqtkFileIdFromName(smqtkFolder, 'mean_vec.npy')
            rotationFileId = localSmqtkFileIdFromName(smqtkFolder, 'rotation.npy')
            hash2uuidsFileId = localSmqtkFileIdFromName(smqtkFolder, 'hash2uuids.pickle')
        except Exception:
            logger.warn('SMQTK files didn\'t exist for performing NN on %s' % item['_id'])
            return None

        # TODO Should these be Girder data elements? Unnecessary HTTP requests.
        functor = ItqFunctor(mean_vec_cache=_GirderDataElement(meanVecFileId),
                             rotation_cache=_GirderDataElement(rotationFileId))

        hash2uuidsKV = MemoryKeyValueStore(_GirderDataElement(hash2uuidsFileId))

        return LSHNearestNeighborIndex(functor, descriptorSet,
                                       hash2uuidsKV, read_only=True)
Example #21
0
 def test_get_dist_func_euclidean(self):
     f = LSHNearestNeighborIndex._get_dist_func('euclidean')
     self.assertIsInstance(f, types.FunctionType)
     self.assertAlmostEqual(
         f(np.array([0, 0]), np.array([0, 1])),
         1.0
     )
Example #22
0
    def _nearestNeighborIndex(sid, descriptor_set):
        """
        Retrieve the Nearest neighbor index for a given session.

        :param sid: ID of the session
        :param descriptor_set: The descriptor set corresponding to the session id,
        see _descriptorSetFromSessionId.
        :returns: Nearest neighbor index or None if no session exists
        :rtype: LSHNearestNeighborIndex|None
        """
        session = ModelImporter.model('item').findOne({'_id': ObjectId(sid)})

        if not session:
            return None
        else:
            smqtkFolder = {'_id': ObjectId(session['meta']['smqtk_folder_id'])}

            functor = ItqFunctor(
                smqtkDataElementFromGirderFileId(
                    localSmqtkFileIdFromName(smqtkFolder, 'mean_vec.npy')),
                smqtkDataElementFromGirderFileId(
                    localSmqtkFileIdFromName(smqtkFolder, 'rotation.npy')))
            hash2uuidsKV = MemoryKeyValueStore(
                smqtkDataElementFromGirderFileId(
                    localSmqtkFileIdFromName(smqtkFolder,
                                             'hash2uuids.pickle')))

            return LSHNearestNeighborIndex(functor,
                                           descriptor_set,
                                           hash2uuidsKV,
                                           read_only=True)
Example #23
0
 def test_get_dist_func_cosine(self):
     f = LSHNearestNeighborIndex._get_dist_func('cosine')
     ntools.assert_is_instance(f, types.FunctionType)
     ntools.assert_almost_equal(f(numpy.array([1, 0]), numpy.array([0, 1])),
                                1.0)
     ntools.assert_almost_equal(f(numpy.array([1, 0]), numpy.array([1, 1])),
                                0.5)
Example #24
0
 def test_update_index_read_only(self):
     index = LSHNearestNeighborIndex(DummyHashFunctor(),
                                     MemoryDescriptorIndex(),
                                     MemoryKeyValueStore(), read_only=True)
     self.assertRaises(
         ReadOnlyError,
         index._update_index, []
     )
Example #25
0
    def test_configuration(self):
        c = LSHNearestNeighborIndex.get_default_config()

        ntools.assert_equal(json.loads(json.dumps(c)), c)

        # Make a simple configuration
        c["lsh_functor"]["type"] = "ItqFunctor"
        c["descriptor_index"]["type"] = "MemoryDescriptorIndex"
        c["hash_index"]["type"] = "LinearHashIndex"
        index = LSHNearestNeighborIndex.from_config(c)

        ntools.assert_is_instance(index.lsh_functor, ItqFunctor)
        ntools.assert_is_instance(index.descriptor_index, MemoryDescriptorIndex)
        ntools.assert_is_instance(index.hash_index, LinearHashIndex)

        # Can convert instance config to JSON
        ntools.assert_equal(json.loads(json.dumps(index.get_config())), index.get_config())
Example #26
0
    def test_count_empty_hash2uid(self):
        """
        Test that an empty hash-to-uid mapping results in a 0 return regardless
        of descriptor-set state.
        """
        descr_set = MemoryDescriptorIndex()
        hash_kvs = MemoryKeyValueStore()
        self.assertEqual(descr_set.count(), 0)
        self.assertEqual(hash_kvs.count(), 0)

        lsh = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set, hash_kvs)
        self.assertEqual(lsh.count(), 0)

        # Additions to the descriptor-set should not impact LSH index "size"
        lsh.descriptor_index.add_descriptor(DescriptorMemoryElement('t', 0))
        self.assertEqual(lsh.descriptor_index.count(), 1)
        self.assertEqual(lsh.hash2uuids_kvstore.count(), 0)
        self.assertEqual(lsh.count(), 0)

        lsh.descriptor_index.add_descriptor(DescriptorMemoryElement('t', 1))
        self.assertEqual(lsh.descriptor_index.count(), 2)
        self.assertEqual(lsh.hash2uuids_kvstore.count(), 0)
        self.assertEqual(lsh.count(), 0)

        lsh.hash2uuids_kvstore.add(0, {0})
        self.assertEqual(lsh.descriptor_index.count(), 2)
        self.assertEqual(lsh.count(), 1)

        lsh.hash2uuids_kvstore.add(0, {0, 1})
        self.assertEqual(lsh.descriptor_index.count(), 2)
        self.assertEqual(lsh.count(), 2)

        lsh.hash2uuids_kvstore.add(0, {0, 1, 2})
        self.assertEqual(lsh.descriptor_index.count(), 2)
        self.assertEqual(lsh.count(), 3)
Example #27
0
 def test_remove_from_index_no_existing_index(self):
     # Test that attempting to remove from an instance with no existing
     # index (meaning empty descriptor-set and key-value-store) results in
     # a key error.
     d_set = MemoryDescriptorSet()
     hash_kvs = MemoryKeyValueStore()
     idx = LSHNearestNeighborIndex(DummyHashFunctor(), d_set, hash_kvs)
     self.assertRaisesRegex(KeyError, 'uid1', idx.remove_from_index,
                            ['uid1'])
Example #28
0
 def test_remove_from_index_read_only(self):
     d_set = MemoryDescriptorSet()
     hash_kvs = MemoryKeyValueStore()
     idx = LSHNearestNeighborIndex(DummyHashFunctor(),
                                   d_set,
                                   hash_kvs,
                                   read_only=True)
     self.assertRaises(ReadOnlyError, idx.remove_from_index,
                       ['uid1', 'uid2'])
Example #29
0
    def test_update_index_similar_descriptors(self):
        """
        Test that updating a built index with similar descriptors (same
        vectors, different UUIDs) results in contained structures having an
        expected state.
        """
        descr_index = MemoryDescriptorIndex()
        hash_kvs = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(DummyHashFunctor(),
                                        descr_index, hash_kvs)

        # Similar Descriptors to build and update on (different instances)
        descriptors1 = [
            DescriptorMemoryElement('t', 0).set_vector([0]),
            DescriptorMemoryElement('t', 1).set_vector([1]),
            DescriptorMemoryElement('t', 2).set_vector([2]),
            DescriptorMemoryElement('t', 3).set_vector([3]),
            DescriptorMemoryElement('t', 4).set_vector([4]),
        ]
        descriptors2 = [
            DescriptorMemoryElement('t', 5).set_vector([0]),
            DescriptorMemoryElement('t', 6).set_vector([1]),
            DescriptorMemoryElement('t', 7).set_vector([2]),
            DescriptorMemoryElement('t', 8).set_vector([3]),
            DescriptorMemoryElement('t', 9).set_vector([4]),
        ]

        index.build_index(descriptors1)
        index.update_index(descriptors2)

        assert descr_index.count() == 10
        # Above descriptors should be considered "in" the descriptor set now.
        for d in descriptors1:
            assert d in descr_index
        for d in descriptors2:
            assert d in descr_index
        # Known hashes of the above descriptors should be in the KVS
        assert set(hash_kvs.keys()) == {0, 1, 2, 3, 4}
        assert hash_kvs.get(0) == {0, 5}
        assert hash_kvs.get(1) == {1, 6}
        assert hash_kvs.get(2) == {2, 7}
        assert hash_kvs.get(3) == {3, 8}
        assert hash_kvs.get(4) == {4, 9}
Example #30
0
    def test_update_index_add_new_descriptors(self):
        # Test that calling update index after a build index causes index
        # components to be properly updated.
        descr_index = MemoryDescriptorIndex()
        hash_kvs = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(DummyHashFunctor(),
                                        descr_index, hash_kvs)
        descriptors1 = [
            DescriptorMemoryElement('t', 0),
            DescriptorMemoryElement('t', 1),
            DescriptorMemoryElement('t', 2),
            DescriptorMemoryElement('t', 3),
            DescriptorMemoryElement('t', 4),
        ]
        descriptors2 = [
            DescriptorMemoryElement('t', 5),
            DescriptorMemoryElement('t', 6),
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for d in descriptors1 + descriptors2:
            d.set_vector(np.ones(1, float) * d.uuid())

        # Build initial index.
        index.build_index(descriptors1)
        self.assertEqual(descr_index.count(), 5)
        for d in descriptors1:
            self.assertIn(d, descr_index)
        for d in descriptors2:
            self.assertNotIn(d, descr_index)
        # Dummy hash function bins sum of descriptor vectors.
        self.assertEqual(hash_kvs.count(), 5)
        for i in range(5):
            self.assertSetEqual(hash_kvs.get(i), {i})

        # Update index and check that components have new data.
        index.update_index(descriptors2)
        self.assertEqual(descr_index.count(), 7)
        for d in descriptors1 + descriptors2:
            self.assertIn(d, descr_index)
        # Dummy hash function bins sum of descriptor vectors.
        self.assertEqual(hash_kvs.count(), 7)
        for i in range(7):
            self.assertSetEqual(hash_kvs.get(i), {i})
Example #31
0
    def test_configuration(self):
        c = LSHNearestNeighborIndex.get_default_config()

        ntools.assert_equal(json.loads(json.dumps(c)), c)

        # Make a simple configuration
        c['lsh_functor']['type'] = 'ItqFunctor'
        c['descriptor_index']['type'] = 'MemoryDescriptorIndex'
        c['hash_index']['type'] = 'LinearHashIndex'
        index = LSHNearestNeighborIndex.from_config(c)

        ntools.assert_is_instance(index.lsh_functor, ItqFunctor)
        ntools.assert_is_instance(index.descriptor_index,
                                  MemoryDescriptorIndex)
        ntools.assert_is_instance(index.hash_index, LinearHashIndex)

        # Can convert instance config to JSON
        ntools.assert_equal(json.loads(json.dumps(index.get_config())),
                            index.get_config())
Example #32
0
    def test_update_index_duplicate_descriptors(self):
        """
        Test that updating a built index with the same descriptors results in
        idempotent behavior.
        """
        descr_index = MemoryDescriptorIndex()
        hash_kvs = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(DummyHashFunctor(),
                                        descr_index, hash_kvs)

        # Identical Descriptors to build and update on (different instances)
        descriptors1 = [
            DescriptorMemoryElement('t', 0).set_vector([0]),
            DescriptorMemoryElement('t', 1).set_vector([1]),
            DescriptorMemoryElement('t', 2).set_vector([2]),
            DescriptorMemoryElement('t', 3).set_vector([3]),
            DescriptorMemoryElement('t', 4).set_vector([4]),
        ]
        descriptors2 = [
            DescriptorMemoryElement('t', 0).set_vector([0]),
            DescriptorMemoryElement('t', 1).set_vector([1]),
            DescriptorMemoryElement('t', 2).set_vector([2]),
            DescriptorMemoryElement('t', 3).set_vector([3]),
            DescriptorMemoryElement('t', 4).set_vector([4]),
        ]

        index.build_index(descriptors1)
        index.update_index(descriptors2)

        assert descr_index.count() == 5
        # Above descriptors should be considered "in" the descriptor set now.
        for d in descriptors1:
            assert d in descr_index
        for d in descriptors2:
            assert d in descr_index
        # Known hashes of the above descriptors should be in the KVS
        assert set(hash_kvs.keys()) == {0, 1, 2, 3, 4}
        assert hash_kvs.get(0) == {0}
        assert hash_kvs.get(1) == {1}
        assert hash_kvs.get(2) == {2}
        assert hash_kvs.get(3) == {3}
        assert hash_kvs.get(4) == {4}
Example #33
0
 def test_get_dist_func_cosine(self):
     f = LSHNearestNeighborIndex._get_dist_func('cosine')
     self.assertIsInstance(f, types.FunctionType)
     self.assertAlmostEqual(
         f(np.array([1, 0]), np.array([0, 1])),
         1.0
     )
     self.assertAlmostEqual(
         f(np.array([1, 0]), np.array([1, 1])),
         0.5
     )
Example #34
0
 def test_get_dist_func_cosine(self):
     f = LSHNearestNeighborIndex._get_dist_func('cosine')
     self.assertIsInstance(f, types.FunctionType)
     self.assertAlmostEqual(
         f(np.array([1, 0]), np.array([0, 1])),
         1.0
     )
     self.assertAlmostEqual(
         f(np.array([1, 0]), np.array([1, 1])),
         0.5
     )
Example #35
0
    def test_build_index_fresh_build_with_hash_index(self):
        descr_index = MemoryDescriptorIndex()
        hash_kvs = MemoryKeyValueStore()
        linear_hi = LinearHashIndex()  # simplest hash index, heap-sorts.
        index = LSHNearestNeighborIndex(DummyHashFunctor(),
                                        descr_index, hash_kvs, linear_hi)

        descriptors = [
            DescriptorMemoryElement('t', 0),
            DescriptorMemoryElement('t', 1),
            DescriptorMemoryElement('t', 2),
            DescriptorMemoryElement('t', 3),
            DescriptorMemoryElement('t', 4),
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for i, d in enumerate(descriptors):
            d.set_vector(np.ones(1, float) * i)
        index.build_index(descriptors)
        # Hash index should have been built with hash vectors, and linearHI
        # converts those to integers for storage.
        self.assertEqual(linear_hi.index, {0, 1, 2, 3, 4})
Example #36
0
    def test_build_index_fresh_build_with_hash_index(self):
        descr_set = MemoryDescriptorSet()
        hash_kvs = MemoryKeyValueStore()
        linear_hi = LinearHashIndex()  # simplest hash index, heap-sorts.
        index = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set,
                                        hash_kvs, linear_hi)

        descriptors = [
            DescriptorMemoryElement('t', 0),
            DescriptorMemoryElement('t', 1),
            DescriptorMemoryElement('t', 2),
            DescriptorMemoryElement('t', 3),
            DescriptorMemoryElement('t', 4),
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for i, d in enumerate(descriptors):
            d.set_vector(np.ones(1, float) * i)
        index.build_index(descriptors)
        # Hash index should have been built with hash vectors, and linearHI
        # converts those to integers for storage.
        self.assertEqual(linear_hi.index, {0, 1, 2, 3, 4})
Example #37
0
    def test_remove_from_index(self):
        # Test that removing by UIDs does the correct thing.

        # Descriptors are 1 dim, value == index.
        descriptors = [
            DescriptorMemoryElement('t', 0),
            DescriptorMemoryElement('t', 1),
            DescriptorMemoryElement('t', 2),
            DescriptorMemoryElement('t', 3),
            DescriptorMemoryElement('t', 4),
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for d in descriptors:
            d.set_vector(np.ones(1, float) * d.uuid())
        d_set = MemoryDescriptorSet()
        hash_kvs = MemoryKeyValueStore()
        idx = LSHNearestNeighborIndex(DummyHashFunctor(), d_set, hash_kvs)
        idx.build_index(descriptors)

        # Attempt removing 1 uid.
        idx.remove_from_index([3])
        self.assertEqual(
            idx.descriptor_set._table, {
                0: descriptors[0],
                1: descriptors[1],
                2: descriptors[2],
                4: descriptors[4],
            })
        self.assertEqual(idx.hash2uuids_kvstore._table, {
            0: {0},
            1: {1},
            2: {2},
            4: {4},
        })
Example #38
0
    def test_update_index_with_hash_index(self):
        # Similar test to `test_update_index_add_new_descriptors` but with a
        # linear hash index.
        descr_set = MemoryDescriptorSet()
        hash_kvs = MemoryKeyValueStore()
        linear_hi = LinearHashIndex()  # simplest hash index, heap-sorts.
        index = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set,
                                        hash_kvs, linear_hi)

        descriptors1 = [
            DescriptorMemoryElement('t', 0),
            DescriptorMemoryElement('t', 1),
            DescriptorMemoryElement('t', 2),
            DescriptorMemoryElement('t', 3),
            DescriptorMemoryElement('t', 4),
        ]
        descriptors2 = [
            DescriptorMemoryElement('t', 5),
            DescriptorMemoryElement('t', 6),
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for d in descriptors1 + descriptors2:
            d.set_vector(np.ones(1, float) * d.uuid())

        # Build initial index.
        index.build_index(descriptors1)
        # Initial hash index should only encode hashes for first batch of
        # descriptors.
        self.assertSetEqual(linear_hi.index, {0, 1, 2, 3, 4})

        # Update index and check that components have new data.
        index.update_index(descriptors2)
        # Now the hash index should include all descriptor hashes.
        self.assertSetEqual(linear_hi.index, {0, 1, 2, 3, 4, 5, 6})
Example #39
0
    def _random_euclidean(self, hash_ftor, hash_idx,
                          ftor_train_hook=lambda d: None):
        # :param hash_ftor: Hash function class for generating hash codes for
        #   descriptors.
        # :param hash_idx: Hash index instance to use in local LSH algo
        #   instance.
        # :param ftor_train_hook: Function for training functor if necessary.

        # make random descriptors
        i = 1000
        dim = 256
        td = []
        np.random.seed(self.RANDOM_SEED)
        for j in range(i):
            d = DescriptorMemoryElement('random', j)
            d.set_vector(np.random.rand(dim))
            td.append(d)

        ftor_train_hook(td)

        di = MemoryDescriptorIndex()
        kvstore = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(hash_ftor, di, kvstore,
                                        hash_index=hash_idx,
                                        distance_method='euclidean')
        index.build_index(td)

        # test query from build set -- should return same descriptor when k=1
        q = td[255]
        r, dists = index.nn(q, 1)
        self.assertEqual(r[0], q)

        # test query very near a build vector
        td_q = td[0]
        q = DescriptorMemoryElement('query', i)
        v = td_q.vector().copy()
        v_min = max(v.min(), 0.1)
        v[0] += v_min
        v[dim-1] -= v_min
        q.set_vector(v)
        r, dists = index.nn(q, 1)
        self.assertFalse(np.array_equal(q.vector(), td_q.vector()))
        self.assertEqual(r[0], td_q)

        # random query
        q = DescriptorMemoryElement('query', i+1)
        q.set_vector(np.random.rand(dim))

        # for any query of size k, results should at least be in distance order
        r, dists = index.nn(q, 10)
        for j in range(1, len(dists)):
            self.assertGreater(dists[j], dists[j-1])
        r, dists = index.nn(q, i)
        for j in range(1, len(dists)):
            self.assertGreater(dists[j], dists[j-1])
Example #40
0
    def _known_ordered_euclidean(self, hash_ftor, hash_idx,
                                 ftor_train_hook=lambda d: None):
        # make vectors to return in a known euclidean distance order
        i = 1000
        test_descriptors = []
        for j in range(i):
            d = DescriptorMemoryElement('ordered', j)
            d.set_vector(np.array([j, j*2], float))
            test_descriptors.append(d)
        random.shuffle(test_descriptors)

        ftor_train_hook(test_descriptors)

        di = MemoryDescriptorIndex()
        kvstore = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(hash_ftor, di, kvstore,
                                        hash_index=hash_idx,
                                        distance_method='euclidean')
        index.build_index(test_descriptors)

        # Since descriptors were built in increasing distance from (0,0),
        # returned descriptors for a query of [0,0] should be in index order.
        q = DescriptorMemoryElement('query', i)
        q.set_vector(np.array([0, 0], float))
        # top result should have UUID == 0 (nearest to query)
        r, dists = index.nn(q, 5)
        self.assertEqual(r[0].uuid(), 0)
        self.assertEqual(r[1].uuid(), 1)
        self.assertEqual(r[2].uuid(), 2)
        self.assertEqual(r[3].uuid(), 3)
        self.assertEqual(r[4].uuid(), 4)
        # global search should be in complete order
        r, dists = index.nn(q, i)
        for j, d, dist in zip(range(i), r, dists):
            self.assertEqual(d.uuid(), j)
Example #41
0
    def test_remove_from_index_shared_hashes(self):
        """
        Test that removing a descriptor (by UID) that shares a hash with other
        descriptors does not trigger removal of its hash.
        """
        # Simulate descriptors all hashing to the same hash value: 0
        hash_func = DummyHashFunctor()
        hash_func.get_hash = mock.Mock(return_value=np.asarray([0], bool))

        d_set = MemoryDescriptorSet()
        hash2uids_kvs = MemoryKeyValueStore()
        idx = LSHNearestNeighborIndex(hash_func, d_set, hash2uids_kvs)

        # Descriptors are 1 dim, value == index.
        descriptors = [
            DescriptorMemoryElement('t', 0),
            DescriptorMemoryElement('t', 1),
            DescriptorMemoryElement('t', 2),
            DescriptorMemoryElement('t', 3),
            DescriptorMemoryElement('t', 4),
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for d in descriptors:
            d.set_vector(np.ones(1, float) * d.uuid())
        idx.build_index(descriptors)
        # We expect the descriptor-set and kvs to look like the following now:
        self.assertDictEqual(
            d_set._table, {
                0: descriptors[0],
                1: descriptors[1],
                2: descriptors[2],
                3: descriptors[3],
                4: descriptors[4],
            })
        self.assertDictEqual(hash2uids_kvs._table, {0: {0, 1, 2, 3, 4}})

        # Mock out hash index as if we had an implementation so we can check
        # call to its remove_from_index method.
        idx.hash_index = mock.Mock(spec=HashIndex)

        idx.remove_from_index([2, 4])

        # Only uid 2 and 4 descriptors should be gone from d-set, kvs should
        # still have the 0 key and its set value should only contain uids 0, 1
        # and 3.  `hash_index.remove_from_index` should not be called because
        # no hashes should be marked for removal.
        self.assertDictEqual(d_set._table, {
            0: descriptors[0],
            1: descriptors[1],
            3: descriptors[3],
        })
        self.assertDictEqual(hash2uids_kvs._table, {0: {0, 1, 3}})
        idx.hash_index.remove_from_index.assert_not_called()
Example #42
0
    def test_configuration_none_HI(self):
        c = LSHNearestNeighborIndex.get_default_config()

        # Check that default is in JSON format and is decoded to the same
        # result.
        self.assertEqual(json.loads(json.dumps(c)), c)

        # Make a simple configuration
        c['lsh_functor']['type'] = 'ItqFunctor'
        c['descriptor_index']['type'] = 'MemoryDescriptorIndex'
        c['hash2uuids_kvstore']['type'] = 'MemoryKeyValueStore'
        c['hash_index']['type'] = None
        index = LSHNearestNeighborIndex.from_config(c)

        self.assertIsInstance(index.lsh_functor, ItqFunctor)
        self.assertIsInstance(index.descriptor_index, MemoryDescriptorIndex)
        self.assertIsNone(index.hash_index)
        self.assertIsInstance(index.hash2uuids_kvstore, MemoryKeyValueStore)

        # Can convert instance config to JSON
        self.assertEqual(
            json.loads(json.dumps(index.get_config())),
            index.get_config()
        )
Example #43
0
    def test_configuration_none_HI(self):
        c = LSHNearestNeighborIndex.get_default_config()

        # Check that default is in JSON format and is decoded to the same
        # result.
        ntools.assert_equal(json.loads(json.dumps(c)), c)

        # Make a simple configuration
        c['lsh_functor']['type'] = 'ItqFunctor'
        c['descriptor_index']['type'] = 'MemoryDescriptorIndex'
        c['hash2uuids_kvstore']['type'] = 'MemoryKeyValueStore'
        c['hash_index']['type'] = None
        index = LSHNearestNeighborIndex.from_config(c)

        ntools.assert_is_instance(index.lsh_functor, ItqFunctor)
        ntools.assert_is_instance(index.descriptor_index,
                                  MemoryDescriptorIndex)
        ntools.assert_is_none(index.hash_index)
        ntools.assert_is_instance(index.hash2uuids_kvstore,
                                  MemoryKeyValueStore)

        # Can convert instance config to JSON
        ntools.assert_equal(json.loads(json.dumps(index.get_config())),
                            index.get_config())
Example #44
0
 def test_configuration(self):
     i = LSHNearestNeighborIndex(lsh_functor=ItqFunctor(),
                                 descriptor_set=MemoryDescriptorSet(),
                                 hash2uuids_kvstore=MemoryKeyValueStore(),
                                 hash_index=LinearHashIndex(),
                                 distance_method='euclidean',
                                 read_only=True)
     for inst in configuration_test_helper(
             i):  # type: LSHNearestNeighborIndex
         assert isinstance(inst.lsh_functor, LshFunctor)
         assert isinstance(inst.descriptor_set, MemoryDescriptorSet)
         assert isinstance(inst.hash_index, LinearHashIndex)
         assert isinstance(inst.hash2uuids_kvstore, MemoryKeyValueStore)
         assert inst.distance_method == 'euclidean'
         assert inst.read_only is True
Example #45
0
    def test_remove_from_index_shared_hashes(self):
        """
        Test that removing a descriptor (by UID) that shares a hash with other
        descriptors does not trigger removal of its hash.
        """
        # Simulate descriptors all hashing to the same hash value: 0
        hash_func = DummyHashFunctor()
        hash_func.get_hash = mock.Mock(return_value=np.asarray([0], bool))

        d_set = MemoryDescriptorIndex()
        hash2uids_kvs = MemoryKeyValueStore()
        idx = LSHNearestNeighborIndex(hash_func, d_set, hash2uids_kvs)

        # Descriptors are 1 dim, value == index.
        descriptors = [
            DescriptorMemoryElement('t', 0),
            DescriptorMemoryElement('t', 1),
            DescriptorMemoryElement('t', 2),
            DescriptorMemoryElement('t', 3),
            DescriptorMemoryElement('t', 4),
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for d in descriptors:
            d.set_vector(np.ones(1, float) * d.uuid())
        idx.build_index(descriptors)
        # We expect the descriptor-set and kvs to look like the following now:
        self.assertDictEqual(d_set._table, {
            0: descriptors[0],
            1: descriptors[1],
            2: descriptors[2],
            3: descriptors[3],
            4: descriptors[4],
        })
        self.assertDictEqual(hash2uids_kvs._table, {0: {0, 1, 2, 3, 4}})

        # Mock out hash index as if we had an implementation so we can check
        # call to its remove_from_index method.
        idx.hash_index = mock.Mock(spec=HashIndex)

        idx.remove_from_index([2, 4])

        # Only uid 2 and 4 descriptors should be gone from d-set, kvs should
        # still have the 0 key and its set value should only contain uids 0, 1
        # and 3.  `hash_index.remove_from_index` should not be called because
        # no hashes should be marked for removal.
        self.assertDictEqual(d_set._table, {
            0: descriptors[0],
            1: descriptors[1],
            3: descriptors[3],
        })
        self.assertDictEqual(hash2uids_kvs._table, {0: {0, 1, 3}})
        idx.hash_index.remove_from_index.assert_not_called()
Example #46
0
    def _known_unit(self,
                    hash_ftor,
                    hash_idx,
                    dist_method,
                    ftor_train_hook=lambda d: None):
        ###
        # Unit vectors - Equal distance
        #
        dim = 5
        test_descriptors = []
        for i in range(dim):
            v = numpy.zeros(dim, float)
            v[i] = 1.
            d = DescriptorMemoryElement('unit', i)
            d.set_vector(v)
            test_descriptors.append(d)

        ftor_train_hook(test_descriptors)

        di = MemoryDescriptorIndex()
        kvstore = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(hash_ftor,
                                        di,
                                        kvstore,
                                        hash_index=hash_idx,
                                        distance_method=dist_method)
        index.build_index(test_descriptors)

        # query with zero vector
        # -> all modeled descriptors have no intersection, dists should be 1.0,
        #    or maximum distance by histogram intersection
        q = DescriptorMemoryElement('query', 0)
        q.set_vector(numpy.zeros(dim, float))
        r, dists = index.nn(q, dim)
        # All dists should be 1.0, r order doesn't matter
        for d in dists:
            ntools.assert_equal(d, 1.)

        # query with index element
        q = test_descriptors[3]
        r, dists = index.nn(q, 1)
        ntools.assert_equal(r[0], q)
        ntools.assert_equal(dists[0], 0.)

        r, dists = index.nn(q, dim)
        ntools.assert_equal(r[0], q)
        ntools.assert_equal(dists[0], 0.)
Example #47
0
    def _random_euclidean(self, hash_ftor, hash_idx, ftor_train_hook=lambda d: None):
        # make random descriptors
        i = 1000
        dim = 256
        td = []
        numpy.random.seed(self.RANDOM_SEED)
        for j in xrange(i):
            d = DescriptorMemoryElement("random", j)
            d.set_vector(numpy.random.rand(dim))
            td.append(d)

        ftor_train_hook(td)

        di = MemoryDescriptorIndex()
        index = LSHNearestNeighborIndex(hash_ftor, di, hash_idx, distance_method="euclidean")
        index.build_index(td)

        # test query from build set -- should return same descriptor when k=1
        q = td[255]
        r, dists = index.nn(q, 1)
        ntools.assert_equal(r[0], q)

        # test query very near a build vector
        td_q = td[0]
        q = DescriptorMemoryElement("query", i)
        v = td_q.vector().copy()
        v_min = max(v.min(), 0.1)
        v[0] += v_min
        v[dim - 1] -= v_min
        q.set_vector(v)
        r, dists = index.nn(q, 1)
        ntools.assert_false(numpy.array_equal(q.vector(), td_q.vector()))
        ntools.assert_equal(r[0], td_q)

        # random query
        q = DescriptorMemoryElement("query", i + 1)
        q.set_vector(numpy.random.rand(dim))

        # for any query of size k, results should at least be in distance order
        r, dists = index.nn(q, 10)
        for j in xrange(1, len(dists)):
            ntools.assert_greater(dists[j], dists[j - 1])
        r, dists = index.nn(q, i)
        for j in xrange(1, len(dists)):
            ntools.assert_greater(dists[j], dists[j - 1])

        DescriptorMemoryElement.MEMORY_CACHE = {}
Example #48
0
    def test_update_index_add_new_descriptors(self):
        # Test that calling update index after a build index causes index
        # components to be properly updated.
        descr_set = MemoryDescriptorSet()
        hash_kvs = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set,
                                        hash_kvs)
        descriptors1 = [
            DescriptorMemoryElement('t', 0),
            DescriptorMemoryElement('t', 1),
            DescriptorMemoryElement('t', 2),
            DescriptorMemoryElement('t', 3),
            DescriptorMemoryElement('t', 4),
        ]
        descriptors2 = [
            DescriptorMemoryElement('t', 5),
            DescriptorMemoryElement('t', 6),
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for d in descriptors1 + descriptors2:
            d.set_vector(np.ones(1, float) * d.uuid())

        # Build initial index.
        index.build_index(descriptors1)
        self.assertEqual(descr_set.count(), 5)
        for d in descriptors1:
            self.assertIn(d, descr_set)
        for d in descriptors2:
            self.assertNotIn(d, descr_set)
        # Dummy hash function bins sum of descriptor vectors.
        self.assertEqual(hash_kvs.count(), 5)
        for i in range(5):
            self.assertSetEqual(hash_kvs.get(i), {i})

        # Update index and check that components have new data.
        index.update_index(descriptors2)
        self.assertEqual(descr_set.count(), 7)
        for d in descriptors1 + descriptors2:
            self.assertIn(d, descr_set)
        # Dummy hash function bins sum of descriptor vectors.
        self.assertEqual(hash_kvs.count(), 7)
        for i in range(7):
            self.assertSetEqual(hash_kvs.get(i), {i})
Example #49
0
    def test_update_index_similar_descriptors(self):
        """
        Test that updating a built index with similar descriptors (same
        vectors, different UUIDs) results in contained structures having an
        expected state.
        """
        descr_set = MemoryDescriptorSet()
        hash_kvs = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set,
                                        hash_kvs)

        # Similar Descriptors to build and update on (different instances)
        descriptors1 = [
            DescriptorMemoryElement('t', 0).set_vector([0]),
            DescriptorMemoryElement('t', 1).set_vector([1]),
            DescriptorMemoryElement('t', 2).set_vector([2]),
            DescriptorMemoryElement('t', 3).set_vector([3]),
            DescriptorMemoryElement('t', 4).set_vector([4]),
        ]
        descriptors2 = [
            DescriptorMemoryElement('t', 5).set_vector([0]),
            DescriptorMemoryElement('t', 6).set_vector([1]),
            DescriptorMemoryElement('t', 7).set_vector([2]),
            DescriptorMemoryElement('t', 8).set_vector([3]),
            DescriptorMemoryElement('t', 9).set_vector([4]),
        ]

        index.build_index(descriptors1)
        index.update_index(descriptors2)

        assert descr_set.count() == 10
        # Above descriptors should be considered "in" the descriptor set now.
        for d in descriptors1:
            assert d in descr_set
        for d in descriptors2:
            assert d in descr_set
        # Known hashes of the above descriptors should be in the KVS
        assert set(hash_kvs.keys()) == {0, 1, 2, 3, 4}
        assert hash_kvs.get(0) == {0, 5}
        assert hash_kvs.get(1) == {1, 6}
        assert hash_kvs.get(2) == {2, 7}
        assert hash_kvs.get(3) == {3, 8}
        assert hash_kvs.get(4) == {4, 9}
Example #50
0
    def _known_unit(self, hash_ftor, hash_idx, dist_method,
                    ftor_train_hook=lambda d: None):
        ###
        # Unit vectors - Equal distance
        #
        dim = 5
        test_descriptors = []
        for i in range(dim):
            v = np.zeros(dim, float)
            v[i] = 1.
            d = DescriptorMemoryElement('unit', i)
            d.set_vector(v)
            test_descriptors.append(d)

        ftor_train_hook(test_descriptors)

        di = MemoryDescriptorIndex()
        kvstore = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(hash_ftor, di, kvstore,
                                        hash_index=hash_idx,
                                        distance_method=dist_method)
        index.build_index(test_descriptors)

        # query with zero vector
        # -> all modeled descriptors have no intersection, dists should be 1.0,
        #    or maximum distance by histogram intersection
        q = DescriptorMemoryElement('query', 0)
        q.set_vector(np.zeros(dim, float))
        r, dists = index.nn(q, dim)
        # All dists should be 1.0, r order doesn't matter
        for d in dists:
            self.assertEqual(d, 1.)

        # query with index element
        q = test_descriptors[3]
        r, dists = index.nn(q, 1)
        self.assertEqual(r[0], q)
        self.assertEqual(dists[0], 0.)

        r, dists = index.nn(q, dim)
        self.assertEqual(r[0], q)
        self.assertEqual(dists[0], 0.)
Example #51
0
    def test_update_index_duplicate_descriptors(self):
        """
        Test that updating a built index with the same descriptors results in
        idempotent behavior.
        """
        descr_set = MemoryDescriptorSet()
        hash_kvs = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set,
                                        hash_kvs)

        # Identical Descriptors to build and update on (different instances)
        descriptors1 = [
            DescriptorMemoryElement('t', 0).set_vector([0]),
            DescriptorMemoryElement('t', 1).set_vector([1]),
            DescriptorMemoryElement('t', 2).set_vector([2]),
            DescriptorMemoryElement('t', 3).set_vector([3]),
            DescriptorMemoryElement('t', 4).set_vector([4]),
        ]
        descriptors2 = [
            DescriptorMemoryElement('t', 0).set_vector([0]),
            DescriptorMemoryElement('t', 1).set_vector([1]),
            DescriptorMemoryElement('t', 2).set_vector([2]),
            DescriptorMemoryElement('t', 3).set_vector([3]),
            DescriptorMemoryElement('t', 4).set_vector([4]),
        ]

        index.build_index(descriptors1)
        index.update_index(descriptors2)

        assert descr_set.count() == 5
        # Above descriptors should be considered "in" the descriptor set now.
        for d in descriptors1:
            assert d in descr_set
        for d in descriptors2:
            assert d in descr_set
        # Known hashes of the above descriptors should be in the KVS
        assert set(hash_kvs.keys()) == {0, 1, 2, 3, 4}
        assert hash_kvs.get(0) == {0}
        assert hash_kvs.get(1) == {1}
        assert hash_kvs.get(2) == {2}
        assert hash_kvs.get(3) == {3}
        assert hash_kvs.get(4) == {4}
Example #52
0
    def _known_unit(self, hash_ftor, hash_idx, dist_method, ftor_train_hook=lambda d: None):
        ###
        # Unit vectors - Equal distance
        #
        dim = 5
        test_descriptors = []
        for i in xrange(dim):
            v = numpy.zeros(dim, float)
            v[i] = 1.0
            d = DescriptorMemoryElement("unit", i)
            d.set_vector(v)
            test_descriptors.append(d)

        ftor_train_hook(test_descriptors)

        di = MemoryDescriptorIndex()
        index = LSHNearestNeighborIndex(hash_ftor, di, hash_idx, distance_method=dist_method)
        index.build_index(test_descriptors)

        # query with zero vector
        # -> all modeled descriptors have no intersection, dists should be 1.0,
        #    or maximum distance by histogram intersection
        q = DescriptorMemoryElement("query", 0)
        q.set_vector(numpy.zeros(dim, float))
        r, dists = index.nn(q, dim)
        # All dists should be 1.0, r order doesn't matter
        for d in dists:
            ntools.assert_equal(d, 1.0)

        # query with index element
        q = test_descriptors[3]
        r, dists = index.nn(q, 1)
        ntools.assert_equal(r[0], q)
        ntools.assert_equal(dists[0], 0.0)

        r, dists = index.nn(q, dim)
        ntools.assert_equal(r[0], q)
        ntools.assert_equal(dists[0], 0.0)

        DescriptorMemoryElement.MEMORY_CACHE = {}
Example #53
0
 def test_is_usable(self):
     # Should always be usable since this is a shell class.
     self.assertTrue(LSHNearestNeighborIndex.is_usable())
Example #54
0
 def test_get_dist_func_euclidean(self):
     f = LSHNearestNeighborIndex._get_dist_func('euclidean')
     ntools.assert_is_instance(f, types.FunctionType)
     ntools.assert_almost_equal(f(numpy.array([0, 0]), numpy.array([0, 1])),
                                1.0)