def test_classify_missing_label(self):
    """
    Requesting labels that have no configured classifier should raise
    ``MissingLabelError`` whose ``labels`` attribute holds exactly the
    unknown labels.
    """
    ccol = ClassifierCollection({
        'subjectA': DummyClassifier(),
        'subjectB': DummyClassifier(),
    })

    d = DescriptorMemoryElement('memory', '0')
    d.set_vector([0, 1, 2, 3, 4])

    # (labels requested, labels expected to be reported as missing)
    cases = (
        (['subjectC'], {'subjectC'}),
        (['subjectA', 'subjectC'], {'subjectC'}),
        (['subjectC', 'subjectD'], {'subjectC', 'subjectD'}),
        (['subjectA', 'subjectC', 'subjectD'], {'subjectC', 'subjectD'}),
    )
    for requested, expected_missing in cases:
        # Each request should throw a MissingLabelError.
        with self.assertRaises(MissingLabelError) as cm:
            ccol.classify(d, labels=requested)
        self.assertSetEqual(cm.exception.labels, expected_missing)
def test_build_index_with_cache(self) -> None:
    """
    Building an index constructed with (initially empty) cache elements
    should leave the index, parameter and descriptor-cache elements with
    non-empty byte content.
    """
    # Empty in-memory data element URI used for all three storage slots.
    empty_uri = 'base64://'
    f = FlannNearestNeighborsIndex(empty_uri, empty_uri, empty_uri)

    # Internal elements should exist and start out with zero-length bytes.
    assert f._index_elem is not None
    assert f._index_param_elem is not None
    assert f._descr_cache_elem is not None
    for elem in (f._index_elem, f._index_param_elem, f._descr_cache_elem):
        self.assertEqual(len(elem.get_bytes()), 0)

    # One unit vector per feature dimension.
    dim = 8
    test_descriptors = []
    for axis in range(dim):
        unit = numpy.zeros(dim, float)
        unit[axis] = 1.
        descr = DescriptorMemoryElement('unit', axis)
        descr.set_vector(unit)
        test_descriptors.append(descr)

    f.build_index(test_descriptors)

    # Internal elements should now hold a non-zero number of bytes.
    for elem in (f._index_elem, f._index_param_elem, f._descr_cache_elem):
        self.assertGreater(len(elem.get_bytes()), 0)
def test_classify(self):
    """
    Test invoking `classify` in a valid manner.
    """
    labels = ('subjectA', 'subjectB')
    ccol = ClassifierCollection({
        label: DummyClassifier() for label in labels
    })

    d = DescriptorMemoryElement('memory', '0')
    d.set_vector([0, 1, 2, 3, 4])
    result = ccol.classify(d)

    # One result entry is expected per configured classifier.
    self.assertEqual(len(result), 2)
    for label in labels:
        self.assertIn(label, result)

    # Every entry should be a classification element (the in-memory kind,
    # since the default factory is in use).
    for label in labels:
        self.assertIsInstance(result[label], MemoryClassificationElement)

    # The dummy classifier is deterministic: class label is "test" and the
    # classification value is the index of the descriptor.
    for label in labels:
        self.assertDictEqual(result[label].get_classification(),
                             {'test': 0})
def test_remove_from_index(self) -> None:
    """
    Removing a single UID should drop that descriptor from the descriptor
    set and its entry from the hash-to-UIDs key-value store.
    """
    # Descriptors are 1 dim, value == index.
    descriptors = [DescriptorMemoryElement(uid) for uid in range(5)]
    # Length-1 vectors keep the dummy hash output easy to predict.
    for descr in descriptors:
        descr.set_vector(np.ones(1, float) * descr.uuid())

    d_set = MemoryDescriptorSet()
    hash_kvs = MemoryKeyValueStore()
    idx = LSHNearestNeighborIndex(DummyHashFunctor(), d_set, hash_kvs)
    idx.build_index(descriptors)

    # Attempt removing 1 uid.
    idx.remove_from_index([3])

    remaining = (0, 1, 2, 4)
    assert isinstance(idx.descriptor_set, MemoryDescriptorSet)
    self.assertEqual(
        idx.descriptor_set._table,
        {uid: descriptors[uid] for uid in remaining},
    )
    assert isinstance(idx.hash2uuids_kvstore, MemoryKeyValueStore)
    self.assertEqual(
        idx.hash2uuids_kvstore._table,
        {uid: {uid} for uid in remaining},
    )
def create_de(v: np.ndarray) -> DescriptorElement: nonlocal i # Hopefully type_str doesn't matter de = DescriptorMemoryElement(i) de.set_vector(v) i += 1 return de
def test_update_index_additive(self) -> None:
    """
    Updating an already-built index should add the new descriptors while
    keeping the original ones, and the added descriptors should be
    retrievable through ``nn``.
    """
    n1 = 100
    n2 = 10
    dim = 8
    set1 = {DescriptorMemoryElement('test', i) for i in range(n1)}
    set2 = {DescriptorMemoryElement('test', i) for i in range(n1, n1 + n2)}
    set_all = set1 | set2
    # Plain loop: we only want the side effect of assigning a random vector
    # to every descriptor, not a list of return values.
    for d in set_all:
        d.set_vector(np.random.rand(dim))

    # Create and build initial index.
    index = self._make_inst()
    index.build_index(set1)
    self.assertEqual(index.count(), len(set1))
    for d in set1:
        self.assertIn(d, index._descriptor_set)

    # Update and check that all intended descriptors are present in index.
    index.update_index(set2)
    self.assertEqual(index.count(), len(set_all))
    for d in set_all:
        self.assertIn(d, index._descriptor_set)

    # Check that NN can return something from the updated set.
    # - nearest element to the query element when the query is in the index
    #   should be the query element.
    for q in set2:
        n_elems, n_dists = index.nn(q)
        self.assertEqual(n_elems[0], q)
def test_classify_subset(self):
    """
    Classifying with an explicit ``labels`` subset should only invoke, and
    only return results for, the requested classifiers.
    """
    ccol = ClassifierCollection({
        'subjectA': DummyClassifier(),
        'subjectB': DummyClassifier(),
    })

    # Replace the un-requested classifier's method with a mock so that any
    # call to it can be detected below.
    unrequested = ccol._label_to_classifier['subjectB']
    unrequested.classify_one_element = mock.Mock()

    d = DescriptorMemoryElement('memory', '0')
    d.set_vector([0, 1, 2, 3, 4])
    result = ccol.classify(d, labels=['subjectA'])

    # Only the requested classifier should have produced an entry.
    self.assertEqual(len(result), 1)
    self.assertIn('subjectA', result)
    self.assertNotIn('subjectB', result)
    unrequested.classify_one_element.assert_not_called()

    # The entry should be a classification element (the in-memory kind,
    # since the default factory is in use).
    self.assertIsInstance(result['subjectA'], MemoryClassificationElement)

    # The dummy classifier output is deterministic; for this descriptor it
    # is expected to be the single "test" label mapped to 0.
    self.assertDictEqual(result['subjectA'].get_classification(),
                         {'test': 0})
def test_update_index_no_existing_index(self) -> None:
    """
    Calling ``update_index`` on an index that was never built should act
    like a fresh build (mirrors ``test_build_index_fresh_build``).
    """
    descr_set = MemoryDescriptorSet()
    hash_kvs = MemoryKeyValueStore()
    index = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set,
                                    hash_kvs)

    n_descr = 5
    descriptors = [DescriptorMemoryElement('t', uid)
                   for uid in range(n_descr)]
    # Length-1 vectors keep the dummy hash output easy to predict.
    for descr in descriptors:
        descr.set_vector(np.ones(1, float) * descr.uuid())

    index.update_index(descriptors)

    # Descriptors should now be in the attached descriptor set ...
    self.assertEqual(descr_set.count(), 5)
    for descr in descriptors:
        self.assertIn(descr, descr_set)
    # ... and in the key-value store. The dummy hash function bins by the
    # sum of the descriptor vector.
    self.assertEqual(hash_kvs.count(), 5)
    for key in range(n_descr):
        self.assertSetEqual(hash_kvs.get(key), {key})
def test_nn_small_leaves(self) -> None:
    """
    With small leaves and enough trees, a ``k``-neighbor query should
    return exactly ``k`` neighbors with a matching number of distances.
    """
    np.random.seed(0)

    n = 10**4
    dim = 256
    depth = 10
    # L ~ n/2**depth = 10^4 / 2^10 ~ 10
    k = 200
    # 3k/L = 60
    num_trees = 60

    d_set = [DescriptorMemoryElement('test', i) for i in range(n)]
    # Plain loop: only the side effect of setting each vector is wanted.
    # Iteration order matches the list order, so the seeded random stream
    # is consumed exactly as before.
    for d in d_set:
        d.set_vector(np.random.rand(dim))
    q = DescriptorMemoryElement('q', -1)
    q.set_vector(np.zeros((dim, )))

    di = MemoryDescriptorSet()
    mrpt = MRPTNearestNeighborsIndex(di, num_trees=num_trees, depth=depth,
                                     random_seed=0)
    mrpt.build_index(d_set)

    nbrs, dists = mrpt.nn(q, k)
    self.assertEqual(len(nbrs), len(dists))
    self.assertEqual(len(nbrs), k)
def test_count_empty_hash2uid(self) -> None:
    """
    Test that an empty hash-to-uid mapping results in a 0 return regardless
    of descriptor-set state, and that the count otherwise follows the UIDs
    recorded in the key-value store.
    """
    descr_set = MemoryDescriptorSet()
    hash_kvs = MemoryKeyValueStore()
    self.assertEqual(descr_set.count(), 0)
    self.assertEqual(hash_kvs.count(), 0)

    lsh = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set, hash_kvs)
    self.assertEqual(lsh.count(), 0)

    # Additions to the descriptor-set should not impact LSH index "size".
    for uid, expected_dset_size in ((0, 1), (1, 2)):
        lsh.descriptor_set.add_descriptor(DescriptorMemoryElement('t', uid))
        self.assertEqual(lsh.descriptor_set.count(), expected_dset_size)
        self.assertEqual(lsh.hash2uuids_kvstore.count(), 0)
        self.assertEqual(lsh.count(), 0)

    # Growing the UID set stored under hash key 0 should be reflected in the
    # index count while the descriptor-set size stays at 2.
    for uids, expected_count in (({0}, 1), ({0, 1}, 2), ({0, 1, 2}, 3)):
        lsh.hash2uuids_kvstore.add(0, uids)
        self.assertEqual(lsh.descriptor_set.count(), 2)
        self.assertEqual(lsh.count(), expected_count)
def test_nn_known_descriptors_euclidean_ordered(self) -> None:
    """
    Neighbors of the origin should come back ordered by increasing
    euclidean distance, independent of the order they were indexed in.
    """
    index = self._make_inst()

    # Vectors (j, 2j) are strictly farther from the origin as j grows.
    n_descr = 100
    test_descriptors = [
        DescriptorMemoryElement(j).set_vector(np.array([j, j * 2], float))
        for j in range(n_descr)
    ]
    random.shuffle(test_descriptors)
    index.build_index(test_descriptors)

    # Since descriptors were built in increasing distance from (0,0),
    # returned descriptors for a query of [0,0] should be in index order.
    q = DescriptorMemoryElement(99)
    q.set_vector(np.array([0, 0], float))
    r, dists = index.nn(q, n=n_descr)

    self.assertEqual(len(dists), n_descr)
    for j, descr, _ in zip(range(n_descr), r, dists):
        self.assertEqual(descr.uuid(), j)
        np.testing.assert_equal(descr.vector(), [j, j * 2])
def test_fit_with_cache(self) -> None:
    """
    Fitting a functor constructed with cache elements should set the mean
    vector and rotation, and store the same values in the cache elements.
    """
    fit_descriptors = []
    for i in range(5):
        elem = DescriptorMemoryElement('test', i)
        elem.set_vector([-2. + i, -2. + i])
        fit_descriptors.append(elem)

    itq = ItqFunctor(DataMemoryElement(), DataMemoryElement(),
                     bit_length=1, random_seed=0)
    assert itq.mean_vec_cache_elem is not None
    assert itq.rotation_cache_elem is not None
    itq.fit(fit_descriptors)

    # TODO: Explanation as to why this is the expected result.
    expected_mean = [0, 0]
    expected_rotation = [[1 / sqrt(2)], [1 / sqrt(2)]]
    numpy.testing.assert_array_almost_equal(itq.mean_vec, expected_mean)
    numpy.testing.assert_array_almost_equal(itq.rotation,
                                            expected_rotation)

    # The cache elements should hold numpy-serialized copies of the model.
    self.assertIsNotNone(itq.mean_vec_cache_elem)
    # noinspection PyTypeChecker
    cached_mean = numpy.load(BytesIO(itq.mean_vec_cache_elem.get_bytes()))
    numpy.testing.assert_array_almost_equal(cached_mean, expected_mean)

    self.assertIsNotNone(itq.rotation_cache_elem)
    # noinspection PyTypeChecker
    cached_rotation = numpy.load(
        BytesIO(itq.rotation_cache_elem.get_bytes())
    )
    numpy.testing.assert_array_almost_equal(cached_rotation,
                                            expected_rotation)
def test_get_hash(self) -> None:
    """
    Check the single-bit hash produced for 2-feature vectors on either
    side of (and on) the line ``y = -x`` with a hand-set model.

    The previously constructed ``fit_descriptors`` list was never used
    (the model is assigned directly rather than fit), so it is no longer
    built here.
    """
    # The following "rotation" matrix should cause any 2-feature descriptor
    # to the right of the line ``y = -x`` to be True, and to the left as
    # False. If on the line, should be True.
    itq = ItqFunctor(bit_length=1, random_seed=0)
    itq.mean_vec = numpy.array([0., 0.])
    itq.rotation = numpy.array([[1. / sqrt(2)], [1. / sqrt(2)]])

    # (input vector, expected single hash bit)
    cases = (
        ([1, 1], True),
        ([-1, -1], False),
        ([-1, 1], True),          # exactly on the line
        ([-1.001, 1], False),
        ([-1, 1.001], True),
        ([1, -1], True),          # exactly on the line
        ([1, -1.001], False),
        ([1.001, -1], True),
    )
    for vec, expected_bit in cases:
        numpy.testing.assert_array_equal(
            itq.get_hash(numpy.array(vec)), [expected_bit])
def test_fit_short_descriptors_for_bit_length(self) -> None:
    """
    Fitting should raise ``ValueError`` when input descriptors have fewer
    dimensions than the configured bit length (limitation of the PCA
    method currently used), leaving the model unset.
    """
    fit_descriptors = []
    for i in range(3):
        elem = DescriptorMemoryElement('test', i)
        elem.set_vector([-1 + i, -1 + i])
        fit_descriptors.append(elem)

    itq = ItqFunctor(bit_length=8)
    with self.assertRaisesRegex(
        ValueError,
        "Input descriptors have fewer features than requested bit encoding"
    ):
        itq.fit(fit_descriptors)
    self.assertIsNone(itq.mean_vec)
    self.assertIsNone(itq.rotation)

    # Should behave the same when input is an iterable
    with self.assertRaisesRegex(
        ValueError,
        "Input descriptors have fewer features than requested bit encoding"
    ):
        itq.fit(iter(fit_descriptors))
    self.assertIsNone(itq.mean_vec)
    self.assertIsNone(itq.rotation)
def test_nn_pathological_example(self) -> None:
    """
    When every descriptor lies on one line, a ``k``-neighbor query is
    expected to return far fewer than ``k`` results.
    """
    n = 10**4
    dim = 256
    depth = 10
    # L ~ n/2**depth = 10^4 / 2^10 ~ 10
    k = 200
    # 3k/L = 60
    num_trees = 60

    d_set = [DescriptorMemoryElement('test', i) for i in range(n)]
    # Put all descriptors on a line so that different trees get same
    # divisions. A plain loop is used because only the side effect of
    # setting each vector is wanted.
    for d in d_set:
        # noinspection PyTypeChecker
        d.set_vector(np.full(dim, d.uuid(), dtype=np.float64))
    q = DescriptorMemoryElement('q', -1)
    q.set_vector(np.zeros((dim, )))

    di = MemoryDescriptorSet()
    mrpt = MRPTNearestNeighborsIndex(di, num_trees=num_trees, depth=depth,
                                     random_seed=0)
    mrpt.build_index(d_set)

    nbrs, dists = mrpt.nn(q, k)
    self.assertEqual(len(nbrs), len(dists))
    # We should get about 10 descriptors back instead of the requested
    # 200
    self.assertLess(len(nbrs), 20)
def test_nn_known_descriptors_euclidean_ordered(self) -> None:
    """
    Neighbors of the origin should come back ordered by increasing
    euclidean distance; for this index only half of the requested
    neighbors are expected back.
    """
    index = self._make_inst()

    # Vectors (j, 2j) are strictly farther from the origin as j grows.
    n_descr = 100
    test_descriptors = [
        DescriptorMemoryElement('ordered', j).set_vector(
            np.array([j, j * 2], float)
        )
        for j in range(n_descr)
    ]
    random.shuffle(test_descriptors)
    index.build_index(test_descriptors)

    # Since descriptors were built in increasing distance from (0,0),
    # returned descriptors for a query of [0,0] should be in index order.
    q = DescriptorMemoryElement('query', 99)
    q.set_vector(np.array([0, 0], float))
    r, dists = index.nn(q, n=n_descr)

    # Because the data is one-dimensional, all of the cells will have
    # the same points (any division will just correspond to a point on
    # the line), and a cell can't have more than half of the points
    self.assertEqual(len(dists), n_descr // 2)
    for j, descr, _ in zip(range(n_descr), r, dists):
        self.assertEqual(descr.uuid(), j)
        np.testing.assert_equal(descr.vector(), [j, j * 2])
def test_remove_then_add(self) -> None:
    """
    Test that we can remove from the index and then add to it again.

    Expected sizes are derived from ``n1``, ``n2`` and the removal list
    instead of being hard-coded, so they stay correct if those change.
    """
    n1 = 100
    n2 = 10
    dim = 8
    set1 = [DescriptorMemoryElement(i) for i in range(n1)]
    set2 = [DescriptorMemoryElement(i) for i in range(n1, n1 + n2)]
    all_descriptors = set1 + set2
    # Plain loop: only the side effect of setting each vector is wanted.
    for d in all_descriptors:
        d.set_vector(np.random.rand(dim))
    uids_to_remove = [10, 98]

    index = self._make_inst()
    index.build_index(set1)
    index.remove_from_index(uids_to_remove)
    index.update_index(set2)

    # 100 built + 10 added - 2 removed == 108
    self.assertEqual(len(index), n1 + n2 - len(uids_to_remove))

    # Removed descriptors should not be in return queries.
    for uid in uids_to_remove:
        self.assertNotEqual(index.nn(set1[uid], 1)[0][0], set1[uid])

    # Every other descriptor should be queryable
    for d in all_descriptors:
        if d.uuid() not in uids_to_remove:
            self.assertEqual(index.nn(d, 1)[0][0], d)

    # 100 + 10 == 110; presumably the internal counter is not rewound by
    # removals -- NOTE(review): confirm against the index implementation.
    self.assertEqual(index._next_index, n1 + n2)
def test_build_index_fresh_build(self) -> None:
    """
    Building a fresh index should populate both the attached descriptor
    set and the hash key-value store.
    """
    descr_set = MemoryDescriptorSet()
    hash_kvs = MemoryKeyValueStore()
    index = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set,
                                    hash_kvs)

    n_descr = 5
    descriptors = [DescriptorMemoryElement('t', uid)
                   for uid in range(n_descr)]
    # Length-1 vectors keep the dummy hash output easy to predict.
    for position, descr in enumerate(descriptors):
        descr.set_vector(np.ones(1, float) * position)

    index.build_index(descriptors)

    # Descriptors should now be in the attached descriptor set ...
    self.assertEqual(descr_set.count(), 5)
    for descr in descriptors:
        self.assertIn(descr, descr_set)
    # ... and in the key-value store. The dummy hash function bins by the
    # sum of the descriptor vector.
    self.assertEqual(hash_kvs.count(), 5)
    for key in range(n_descr):
        self.assertSetEqual(hash_kvs.get(key), {key})
def test_nn_known_descriptors_euclidean_unit(self) -> None:
    """
    Querying with the zero vector against an index of unit vectors should
    return every unit vector at distance 1.
    """
    dim = 5

    ###
    # Unit vectors -- Equal distance
    #
    index = self._make_inst()
    test_descriptors = []
    for axis in range(dim):
        unit = np.zeros(dim, float)
        unit[axis] = 1.
        test_descriptors.append(
            DescriptorMemoryElement(axis).set_vector(unit)
        )
    index.build_index(test_descriptors)

    # Query descriptor is the zero vector, so all modeled descriptors should
    # be equally distant (unit corners).
    q = DescriptorMemoryElement(0)
    q.set_vector(np.zeros(dim, float))
    r, dists = index.nn(q, n=dim)

    self.assertEqual(len(dists), dim)
    # All dists should be 1.0, r order doesn't matter
    for dist in dists:
        self.assertEqual(dist, 1.)
def test_nn_known_descriptors_hik_unit(self) -> None:
    """
    With the 'hik' distance, unit vectors should all be at distance 1 from
    the zero vector, and an indexed element should be its own nearest
    neighbor at distance 0.
    """
    dim = 5

    ###
    # Unit vectors - Equal distance
    #
    index = self._make_inst('hik')
    test_descriptors = []
    for axis in range(dim):
        unit = numpy.zeros(dim, float)
        unit[axis] = 1.
        test_descriptors.append(
            DescriptorMemoryElement('unit', axis).set_vector(unit)
        )
    index.build_index(test_descriptors)

    # Query with the zero vector: no modeled descriptor intersects it, so
    # every distance should be 1.0, the maximum distance by histogram
    # intersection.
    zero_query = DescriptorMemoryElement('query', 0)
    zero_query.set_vector(numpy.zeros(dim, float))
    r, dists = index.nn(zero_query, dim)
    # All dists should be 1.0, r order doesn't matter
    for dist in dists:
        self.assertEqual(dist, 1.)

    # Query with an element that is in the index, asking first for a single
    # neighbor and then for all of them.
    member = test_descriptors[3]
    for n_requested in (1, dim):
        r, dists = index.nn(member, n_requested)
        self.assertEqual(r[0], member)
        self.assertEqual(dists[0], 0.)
def test_remove_from_index_invalid_uid(self) -> None:
    """
    Attempting to remove an unknown UID should raise ``KeyError`` and leave
    the index untouched, even when valid UIDs are part of the same request.
    """
    # Descriptors are 1 dim, value == index.
    descriptors = [DescriptorMemoryElement(uid) for uid in range(5)]
    # Length-1 vectors keep the dummy hash output easy to predict.
    for descr in descriptors:
        descr.set_vector(np.ones(1, float) * descr.uuid())

    # uid -> descriptor
    expected_dset_table = dict(enumerate(descriptors))
    # hash int -> set[uid]
    expected_kvs_table = {uid: {uid} for uid in range(5)}

    d_set = MemoryDescriptorSet()
    hash_kvs = MemoryKeyValueStore()
    idx = LSHNearestNeighborIndex(DummyHashFunctor(), d_set, hash_kvs)
    idx.build_index(descriptors)

    # Sanity check: the freshly built index has the expected content.
    assert isinstance(idx.descriptor_set, MemoryDescriptorSet)
    self.assertEqual(idx.descriptor_set._table, expected_dset_table)
    assert isinstance(idx.hash2uuids_kvstore, MemoryKeyValueStore)
    self.assertEqual(idx.hash2uuids_kvstore._table, expected_kvs_table)

    # Attempt to remove descriptor with a UID we did not build with.
    with self.assertRaisesRegex(KeyError, '5'):
        idx.remove_from_index([5])
    # Index should not have been modified.
    self.assertEqual(idx.descriptor_set._table, expected_dset_table)
    self.assertEqual(idx.hash2uuids_kvstore._table, expected_kvs_table)

    # Attempt to remove multiple UIDs, one valid and one invalid
    with self.assertRaisesRegex(KeyError, '5'):
        idx.remove_from_index([2, 5])
    # Index should not have been modified.
    self.assertEqual(idx.descriptor_set._table, expected_dset_table)
    self.assertEqual(idx.hash2uuids_kvstore._table, expected_kvs_table)
def test_build_index_one(self) -> None:
    """
    Building a 'euclidean' index from a single descriptor should cache that
    descriptor and set the FLANN handle and build parameters.
    """
    only_descr = DescriptorMemoryElement(0)
    only_descr.set_vector(numpy.zeros(8, float))

    index = self._make_inst('euclidean')
    index.build_index([only_descr])

    self.assertListEqual(index._descr_cache, [only_descr])
    self.assertIsNotNone(index._flann)
    self.assertIsInstance(index._flann_build_params, dict)
def test_read_only(self) -> None:
    """
    Building an index created with ``read_only=True`` should raise
    ``ReadOnlyError``.
    """
    unit = np.zeros(5, float)
    unit[0] = 1.
    descr = DescriptorMemoryElement('unit', 0)
    descr.set_vector(unit)
    test_descriptors = [descr]

    index = self._make_inst(read_only=True)
    with self.assertRaises(ReadOnlyError):
        index.build_index(test_descriptors)
def test_classify_elements_missing_vector(self) -> None:
    """
    Test that we get a ValueError when one of the input descriptor
    elements has no vector stored.
    """
    with_vector_first = DescriptorMemoryElement('', 0).set_vector([1, 2, 3])
    # Deliberately left without a vector.
    without_vector = DescriptorMemoryElement('', 0)
    with_vector_last = DescriptorMemoryElement('', 0).set_vector([4, 5, 6])
    elems = [with_vector_first, without_vector, with_vector_last]

    with pytest.raises(ValueError, match=r"no vector stored"):
        # Wrapped in list() so the returned iterable is fully consumed.
        list(self.inst.classify_elements(elems))
def test_remove_from_index_shared_hashes_partial(self) -> None:
    """
    Test that only some hashes are removed from the hash index, but not
    others when those hashes still refer to other descriptors.
    """
    # Simulate an initial state where some descriptors hash to one value
    # and the rest to another. Length-1 vectors keep the hashing easy to
    # predict.
    descriptors = [
        DescriptorMemoryElement(uid).set_vector([uid]) for uid in range(5)
    ]

    def parity_hash(vec):
        # Vectors of even sum hash to 0, odd to 1.
        return [vec.sum() % 2]

    # Dummy hash function to do the simulated thing
    hash_func = DummyHashFunctor()
    hash_func.get_hash = mock.Mock(side_effect=parity_hash)  # type: ignore

    d_set = MemoryDescriptorSet()
    d_set._table = dict(enumerate(descriptors))

    hash2uid_kvs = MemoryKeyValueStore()
    hash2uid_kvs._table = {
        0: {0, 2, 4},
        1: {1, 3},
    }

    idx = LSHNearestNeighborIndex(hash_func, d_set, hash2uid_kvs)
    idx.hash_index = mock.Mock(spec=HashIndex)

    idx.remove_from_index([1, 2, 3])

    # Only one hash vector should have been handed to the hash index's
    # removal method (as a deque of hash-code vectors).
    expected_removed_hashes = collections.deque([
        [1],
    ])
    idx.hash_index.remove_from_index.assert_called_once_with(
        expected_removed_hashes
    )
    self.assertDictEqual(d_set._table, {
        0: descriptors[0],
        4: descriptors[4],
    })
    self.assertDictEqual(hash2uid_kvs._table, {0: {0, 4}})
def test_remove_from_index_shared_hashes(self) -> None:
    """
    Test that removing a descriptor (by UID) that shares a hash with other
    descriptors does not trigger removal of its hash.
    """
    # Simulate descriptors all hashing to the same hash value: 0
    constant_hash = np.asarray([0], bool)
    hash_func = DummyHashFunctor()
    hash_func.get_hash = mock.Mock(return_value=constant_hash)  # type: ignore

    d_set = MemoryDescriptorSet()
    hash2uids_kvs = MemoryKeyValueStore()
    idx = LSHNearestNeighborIndex(hash_func, d_set, hash2uids_kvs)

    # Descriptors are 1 dim, value == index.
    descriptors = [DescriptorMemoryElement('t', uid) for uid in range(5)]
    # Length-1 vectors keep the dummy hash output easy to predict.
    for descr in descriptors:
        descr.set_vector(np.ones(1, float) * descr.uuid())
    idx.build_index(descriptors)

    # After the build, every descriptor is expected in the descriptor set
    # and every UID under the single hash key 0.
    self.assertDictEqual(d_set._table, dict(enumerate(descriptors)))
    self.assertDictEqual(hash2uids_kvs._table, {0: {0, 1, 2, 3, 4}})

    # Mock out hash index as if we had an implementation so we can check
    # call to its remove_from_index method.
    idx.hash_index = mock.Mock(spec=HashIndex)

    idx.remove_from_index([2, 4])

    # Only the uid 2 and 4 descriptors should be gone from the descriptor
    # set; the kvs should still have key 0 with just uids 0, 1 and 3.
    # `hash_index.remove_from_index` should not be called because no hashes
    # should be marked for removal.
    self.assertDictEqual(
        d_set._table,
        {uid: descriptors[uid] for uid in (0, 1, 3)},
    )
    self.assertDictEqual(hash2uids_kvs._table, {0: {0, 1, 3}})
    idx.hash_index.remove_from_index.assert_not_called()
def test_nn_empty_index(self) -> None:
    """
    ``nn`` should raise ``ValueError`` when the index reports a size of 0.
    """
    index = DummySI()
    # Force the reported index size to zero and stub out the backend query.
    # noinspection PyTypeHints
    index.count = mock.MagicMock(return_value=0)  # type: ignore
    # noinspection PyTypeHints
    index._nn = mock.MagicMock()  # type: ignore

    query = DescriptorMemoryElement('q', 0)
    query.set_vector(numpy.random.rand(4))

    with self.assertRaises(ValueError):
        index.nn(query)
def test_nn_normal_conditions(self) -> None:
    """
    ``nn`` should run without raising when the index reports a non-zero
    size and the query has a vector.
    """
    index = DummySI()
    # Need to force a non-zero index size for knn to be performed.
    # noinspection PyTypeHints
    index.count = mock.MagicMock(return_value=1)  # type: ignore

    query = DescriptorMemoryElement('q', 0)
    query.set_vector(numpy.random.rand(4))

    # Basically this shouldn't crash
    index.nn(query)
def _known_unit(
    self,
    hash_ftor: LshFunctor,
    hash_idx: Optional[HashIndex],
    dist_method: str,
    ftor_train_hook: Callable[[Iterable[DescriptorElement]], None] = lambda d: None
) -> None:
    """
    Shared check: build an LSH index over unit vectors with the given
    functor / hash index / distance method and verify known distances.

    :param hash_ftor: Hash functor handed to the index.
    :param hash_idx: Optional hash index handed to the index.
    :param dist_method: Distance method name handed to the index.
    :param ftor_train_hook: Callable invoked with the test descriptors
        before the index is built; does nothing by default.
    """
    ###
    # Unit vectors - Equal distance
    #
    dim = 5
    test_descriptors = []
    for axis in range(dim):
        unit = np.zeros(dim, float)
        unit[axis] = 1.
        test_descriptors.append(
            DescriptorMemoryElement('unit', axis).set_vector(unit))

    ftor_train_hook(test_descriptors)

    descr_set = MemoryDescriptorSet()
    kvstore = MemoryKeyValueStore()
    index = LSHNearestNeighborIndex(hash_ftor, descr_set, kvstore,
                                    hash_index=hash_idx,
                                    distance_method=dist_method)
    index.build_index(test_descriptors)

    # Query with the zero vector: no modeled descriptor intersects it, so
    # every distance should be 1.0, or maximum distance by histogram
    # intersection.
    zero_query = DescriptorMemoryElement('query', 0)
    zero_query.set_vector(np.zeros(dim, float))
    r, dists = index.nn(zero_query, dim)
    # All dists should be 1.0, r order doesn't matter
    for dist in dists:
        self.assertEqual(dist, 1.)

    # Query with an element that is in the index, asking first for a single
    # neighbor and then for all of them.
    member = test_descriptors[3]
    for n_requested in (1, dim):
        r, dists = index.nn(member, n_requested)
        self.assertEqual(r[0], member)
        self.assertEqual(dists[0], 0.)
def test_persistence_with_update_index(self) -> None:
    """
    An index rebuilt from the same persistent components after a build and
    an update should still return each indexed descriptor as its own
    nearest neighbor.
    """
    n1 = 100
    n2 = 10
    dim = 8
    set1 = {DescriptorMemoryElement(i) for i in range(n1)}
    set2 = {DescriptorMemoryElement(i) for i in range(n1, n1 + n2)}
    set_all = set1 | set2
    # Plain loop: only the side effect of setting each vector is wanted.
    for d in set_all:
        d.set_vector(np.random.rand(dim))

    # Create index with persistent entities
    index_element = DataMemoryElement(
        content_type='application/octet-stream')
    index_param_element = DataMemoryElement(
        content_type='text/plain')
    index = self._make_inst(
        index_element=index_element,
        index_param_element=index_param_element)
    # Keep handles on the components the first instance uses so a second
    # instance can be constructed over the same state below.
    descriptor_set = index._descriptor_set
    idx2uid_kvs = index._idx2uid_kvs
    uid2idx_kvs = index._uid2idx_kvs

    # Build initial index.
    index.build_index(set1)
    self.assertEqual(index.count(), len(set1))
    for d in set1:
        self.assertIn(d, index._descriptor_set)

    # Update and check that all intended descriptors are present in
    # index.
    index.update_index(set2)
    self.assertEqual(index.count(), len(set_all))
    for d in set_all:
        self.assertIn(d, index._descriptor_set)

    # Drop the first instance and create a new one over the same
    # components.
    del index
    index = self._make_inst(
        descriptor_set=descriptor_set,
        idx2uid_kvs=idx2uid_kvs,
        uid2idx_kvs=uid2idx_kvs,
        index_element=index_element,
        index_param_element=index_param_element)

    # Check that NN can return something from the updated set.
    # - nearest element to the query element when the query is in the
    #   index should be the query element.
    for q in set_all:
        n_elems, n_dists = index.nn(q)
        self.assertEqual(n_elems[0], q)