def test_pickle_dump_load(self):
    # Round-trip two elements through pickle and check that the class-level
    # MEMORY_CACHE holds their keys again once they have been loaded.
    cache_keys = (('test', 0), ('test', 1))

    # Begin with an empty cache.
    DescriptorMemoryElement.MEMORY_CACHE = {}

    # Two descriptors with known vectors.
    vec_a = numpy.array([1, 2, 3])
    elem_a = DescriptorMemoryElement('test', 0)
    elem_a.set_vector(vec_a)
    vec_b = numpy.array([4, 5, 6])
    elem_b = DescriptorMemoryElement('test', 1)
    elem_b.set_vector(vec_b)

    for key in cache_keys:
        ntools.assert_in(key, DescriptorMemoryElement.MEMORY_CACHE)

    pickled_a = cPickle.dumps(elem_a)
    pickled_b = cPickle.dumps(elem_b)

    # Empty the cache a second time so that loading has to refill it.
    DescriptorMemoryElement.MEMORY_CACHE = {}
    for key in cache_keys:
        ntools.assert_not_in(key, DescriptorMemoryElement.MEMORY_CACHE)

    # Rebuild the elements from their pickled form.
    restored_a = cPickle.loads(pickled_a)
    restored_b = cPickle.loads(pickled_b)
    numpy.testing.assert_array_equal(vec_a, restored_a.vector())
    numpy.testing.assert_array_equal(vec_b, restored_b.vector())

    # Both keys are expected to be present in the cache again.
    for key in cache_keys:
        ntools.assert_in(key, DescriptorMemoryElement.MEMORY_CACHE)
def test_get_feedback(self):
    """
    Request feedback results for a session and check the JSON payload that
    comes back.
    """
    # Replace controller hooks so that a mock IqrSession is handed out.
    self.app.controller.has_session_uuid = \
        mock.MagicMock(return_value=True)
    self.app.controller.get_session = mock.MagicMock()

    # The mocked session reports three descriptors, in the order 0, 2, 1.
    d0 = DescriptorMemoryElement('', 0).set_vector([0])
    d1 = DescriptorMemoryElement('', 1).set_vector([1])
    d2 = DescriptorMemoryElement('', 2).set_vector([2])
    mock_session = self.app.controller.get_session()
    mock_session.feedback_results.return_value = [d0, d2, d1]

    test_sid = '0000'
    with self.app.test_client() as tc:
        r = tc.get('/get_feedback?sid={}'.format(test_sid))
        self.assertStatusCode(r, 200)
        self.assertJsonMessageRegex(r, "Returning feedback uuids")
        payload = r.json
        assert payload['total_results'] == 3
        assert payload['results'] == [0, 2, 1]
        self.app.controller.has_session_uuid.assert_called_once_with(
            test_sid
        )
def test_remove_from_index(self):
    # Removing a single UID should drop it from both the descriptor set
    # table and the hash-to-UIDs key-value store table.
    #
    # Descriptors are 1-dimensional with value == UID, which keeps the
    # dummy hash values easy to predict.
    descriptors = [DescriptorMemoryElement('t', uid) for uid in range(5)]
    for descr in descriptors:
        descr.set_vector(np.ones(1, float) * descr.uuid())

    d_set = MemoryDescriptorSet()
    hash_kvs = MemoryKeyValueStore()
    idx = LSHNearestNeighborIndex(DummyHashFunctor(), d_set, hash_kvs)
    idx.build_index(descriptors)

    # Remove exactly one UID.
    idx.remove_from_index([3])

    remaining = (0, 1, 2, 4)
    self.assertEqual(
        idx.descriptor_set._table,
        {uid: descriptors[uid] for uid in remaining},
    )
    self.assertEqual(
        idx.hash2uuids_kvstore._table,
        {uid: {uid} for uid in remaining},
    )
def test_nn_pathological_example(self):
    # Query an MRPT index built from descriptors that all lie on a single
    # line; the assertions below expect far fewer than ``k`` neighbors.
    n = 10 ** 4
    dim = 256
    depth = 10
    # L ~ n/2**depth = 10^4 / 2^10 ~ 10
    k = 200
    # 3k/L = 60
    num_trees = 60

    d_set = [DescriptorMemoryElement('test', i) for i in range(n)]
    # Put all descriptors on a line so that different trees get same
    # divisions.  A plain loop is used because ``set_vector`` is called
    # purely for its side effect; no result list is needed.
    # noinspection PyTypeChecker
    for d in d_set:
        d.set_vector(np.full(dim, d.uuid(), dtype=np.float64))

    q = DescriptorMemoryElement('q', -1)
    q.set_vector(np.zeros((dim, )))

    di = MemoryDescriptorSet()
    mrpt = MRPTNearestNeighborsIndex(di, num_trees=num_trees, depth=depth,
                                     random_seed=0)
    mrpt.build_index(d_set)

    nbrs, dists = mrpt.nn(q, k)
    self.assertEqual(len(nbrs), len(dists))
    # We should get about 10 descriptors back instead of the requested
    # 200
    self.assertLess(len(nbrs), 20)
def test_adjudicate_add_duplicates(self):
    """
    Re-submitting descriptors that are already adjudicated positive or
    negative should leave the stored collections unchanged, as expected
    of sets.
    """
    iqrs = IqrSession()

    p0 = DescriptorMemoryElement('', 0).set_vector([0])
    p2 = DescriptorMemoryElement('', 2).set_vector([2])
    n1 = DescriptorMemoryElement('', 1).set_vector([1])
    p3 = DescriptorMemoryElement('', 3).set_vector([3])
    n4 = DescriptorMemoryElement('', 4).set_vector([4])

    # Start by adjudicating only a subset.
    iqrs.adjudicate(new_positives=[p0], new_negatives=[n1])
    assert iqrs.positive_descriptors == {p0}
    assert iqrs.negative_descriptors == {n1}

    all_positives = [p0, p2, p3]
    all_negatives = [n1, n4]

    # First pass adds everything (p0 / n1 are already present); the second
    # pass repeats the identical call and must change nothing.
    for _ in range(2):
        iqrs.adjudicate(new_positives=all_positives,
                        new_negatives=all_negatives)
        assert iqrs.positive_descriptors == {p0, p2, p3}
        assert iqrs.negative_descriptors == {n1, n4}
def test_adjudication_switch(self):
    """
    Adjudicate descriptors the opposite way from their existing state and
    check that they move between the positive and negative collections.
    """
    iqrs = IqrSession()

    p0 = DescriptorMemoryElement('', 0).set_vector([0])
    p1 = DescriptorMemoryElement('', 1).set_vector([1])
    p2 = DescriptorMemoryElement('', 2).set_vector([2])
    n3 = DescriptorMemoryElement('', 3).set_vector([3])
    n4 = DescriptorMemoryElement('', 4).set_vector([4])

    # Seed the session state directly.
    iqrs.positive_descriptors = {p0, p1, p2}
    iqrs.negative_descriptors = {n3, n4}

    # Flip a single negative to positive.
    iqrs.adjudicate(new_positives=[n3])
    assert iqrs.positive_descriptors == {p0, p1, p2, n3}
    assert iqrs.negative_descriptors == {n4}

    # Flip a single positive to negative.
    iqrs.adjudicate(new_negatives=[p1])
    assert iqrs.positive_descriptors == {p0, p2, n3}
    assert iqrs.negative_descriptors == {n4, p1}

    # Flip everything that is left in one call.
    iqrs.adjudicate(new_positives=[n4], new_negatives=[p0, p2])
    assert iqrs.positive_descriptors == {n3, n4}
    assert iqrs.negative_descriptors == {p0, p1, p2}
def test_count_empty_hash2uid(self):
    """
    The LSH index count should be 0 while the hash-to-uid mapping is
    empty, whatever the descriptor-set holds, and should follow the
    mapping contents once entries are added.
    """
    descr_set = MemoryDescriptorSet()
    hash_kvs = MemoryKeyValueStore()
    self.assertEqual(descr_set.count(), 0)
    self.assertEqual(hash_kvs.count(), 0)

    lsh = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set, hash_kvs)
    self.assertEqual(lsh.count(), 0)

    # Growing only the descriptor-set must not change the LSH index "size".
    for uid in (0, 1):
        lsh.descriptor_set.add_descriptor(DescriptorMemoryElement('t', uid))
        self.assertEqual(lsh.descriptor_set.count(), uid + 1)
        self.assertEqual(lsh.hash2uuids_kvstore.count(), 0)
        self.assertEqual(lsh.count(), 0)

    # Re-assigning hash key 0 to progressively larger UID sets: the index
    # count is expected to equal the size of the stored set each time.
    for uid_group in ({0}, {0, 1}, {0, 1, 2}):
        lsh.hash2uuids_kvstore.add(0, uid_group)
        self.assertEqual(lsh.descriptor_set.count(), 2)
        self.assertEqual(lsh.count(), len(uid_group))
def test_remove_then_add(self):
    """
    Test that we can remove from the index and then add to it again.
    """
    n1 = 100
    n2 = 10
    dim = 8
    set1 = [DescriptorMemoryElement('test', i) for i in range(n1)]
    set2 = [DescriptorMemoryElement('test', i)
            for i in range(n1, n1 + n2)]
    # Plain loop: ``set_vector`` is invoked for its side effect only, so
    # there is no reason to build a list of its return values.
    for d in set1 + set2:
        d.set_vector(np.random.rand(dim))
    uids_to_remove = [10, 98]

    index = self._make_inst()
    index.build_index(set1)
    index.remove_from_index(uids_to_remove)
    index.update_index(set2)

    # 100 built - 2 removed + 10 added.
    self.assertEqual(len(index), 108)

    # Removed descriptors should not be in return queries.
    self.assertNotEqual(index.nn(set1[10], 1)[0][0], set1[10])
    self.assertNotEqual(index.nn(set1[98], 1)[0][0], set1[98])

    # Every other descriptor should be queryable
    for d in set1 + set2:
        if d.uuid() not in uids_to_remove:
            self.assertEqual(index.nn(d, 1)[0][0], d)

    # Expected value is n1 + n2; presumably the internal counter is not
    # rewound by removals -- confirm against the index implementation.
    self.assertEqual(index._next_index, 110)
def test_nn_known_descriptors_euclidean_ordered(self):
    # Build an index from vectors whose euclidean distance from the origin
    # grows with their UID, then check the order of a query at the origin.
    index = self._make_inst()

    i = 100
    test_descriptors = []
    for uid in range(i):
        elem = DescriptorMemoryElement('ordered', uid)
        elem.set_vector(np.array([uid, uid * 2], float))
        test_descriptors.append(elem)
    # Shuffle so that build order gives no hint about distance order.
    random.shuffle(test_descriptors)
    index.build_index(test_descriptors)

    # Descriptors were created in increasing distance from (0, 0), so a
    # query of [0, 0] should return them in UID order.
    q = DescriptorMemoryElement('query', 99)
    q.set_vector(np.array([0, 0], float))
    r, dists = index.nn(q, n=i)

    # Original rationale: all points lie on one line, so every cell holds
    # the same points and no cell can hold more than half of them; only
    # i // 2 results are therefore expected.
    self.assertEqual(len(dists), i // 2)
    for expected_uid, neighbor, _dist in zip(range(i), r, dists):
        self.assertEqual(neighbor.uuid(), expected_uid)
        np.testing.assert_equal(neighbor.vector(),
                                [expected_uid, expected_uid * 2])
def _known_ordered_euclidean(self, hash_ftor, hash_idx,
                             ftor_train_hook=lambda d: None):
    # Shared check: build an LSH index (euclidean distance) over vectors of
    # known distance order and verify that query results come back in that
    # order.
    #
    # :param hash_ftor: Hash functor handed to the LSH index.
    # :param hash_idx: Hash index handed to the LSH index (``hash_index``).
    # :param ftor_train_hook: Callable invoked with the shuffled descriptor
    #   list before the index is built; the default does nothing.
    i = 1000
    test_descriptors = []
    for uid in range(i):
        elem = DescriptorMemoryElement('ordered', uid)
        elem.set_vector(np.array([uid, uid*2], float))
        test_descriptors.append(elem)
    random.shuffle(test_descriptors)

    ftor_train_hook(test_descriptors)

    di = MemoryDescriptorIndex()
    kvstore = MemoryKeyValueStore()
    index = LSHNearestNeighborIndex(hash_ftor, di, kvstore,
                                    hash_index=hash_idx,
                                    distance_method='euclidean')
    index.build_index(test_descriptors)

    # Vectors were created in increasing distance from (0, 0), so results
    # for a [0, 0] query should come back in UID order.
    q = DescriptorMemoryElement('query', i)
    q.set_vector(np.array([0, 0], float))

    # The five nearest results should be UIDs 0 through 4, in that order.
    r, dists = index.nn(q, 5)
    for rank in range(5):
        self.assertEqual(r[rank].uuid(), rank)

    # A query for everything should be completely ordered.
    r, dists = index.nn(q, i)
    for expected_uid, neighbor, _dist in zip(range(i), r, dists):
        self.assertEqual(neighbor.uuid(), expected_uid)
def test_update_index_additive(self):
    # Build an index from one set of descriptors, update it with a second
    # disjoint set, and check that both sets are present and queryable.
    n1 = 100
    n2 = 10
    dim = 8
    set1 = {DescriptorMemoryElement('test', i) for i in range(n1)}
    set2 = {DescriptorMemoryElement('test', i)
            for i in range(n1, n1 + n2)}
    set_all = set1 | set2
    # Every descriptor gets a random vector.  ``set1 | set2`` already
    # covers all elements (the previous extra ``set1.union(...)`` was
    # redundant), and a plain loop avoids building a throwaway list.
    for d in set_all:
        d.set_vector(np.random.rand(dim))

    # Create and build initial index.
    index = self._make_inst()
    index.build_index(set1)
    self.assertEqual(index.count(), len(set1))
    for d in set1:
        self.assertIn(d, index._descriptor_set)

    # Update and check that all intended descriptors are present in index.
    index.update_index(set2)
    self.assertEqual(index.count(), len(set_all))
    for d in set_all:
        self.assertIn(d, index._descriptor_set)

    # Check that NN can return something from the updated set.
    # - nearest element to the query element when the query is in the index
    #   should be the query element.
    for q in set2:
        n_elems, n_dists = index.nn(q)
        self.assertEqual(n_elems[0], q)
def test_nn_known_descriptors_hik_unit(self):
    # Index the ``dim`` unit vectors with the 'hik' configuration and check
    # the distances reported for a zero-vector query and a self query.
    dim = 5

    index = self._make_inst('hik')
    test_descriptors = []
    for axis in range(dim):
        unit = numpy.zeros(dim, float)
        unit[axis] = 1.
        elem = DescriptorMemoryElement('unit', axis)
        elem.set_vector(unit)
        test_descriptors.append(elem)
    index.build_index(test_descriptors)

    # Zero-vector query: no indexed descriptor intersects it, so every
    # distance should be 1.0, the maximum for histogram intersection.
    q = DescriptorMemoryElement('query', 0)
    q.set_vector(numpy.zeros(dim, float))
    r, dists = index.nn(q, dim)
    # Result order does not matter here, only the distances.
    for dist in dists:
        self.assertEqual(dist, 1.)

    # Query with an element of the index: it should be its own nearest
    # neighbor at distance 0, whether one or ``dim`` results are requested.
    q = test_descriptors[3]
    for n_requested in (1, dim):
        r, dists = index.nn(q, n_requested)
        self.assertEqual(r[0], q)
        self.assertEqual(dists[0], 0.)
def test_known_descriptors_euclidean_ordered(self):
    # Build a euclidean index over vectors of known distance order and
    # verify that query results come back in that order.
    index = self._make_inst('euclidean')

    # make vectors to return in a known euclidean distance order
    i = 1000
    test_descriptors = []
    # ``range`` rather than ``xrange``: ``xrange`` does not exist on
    # Python 3, and ``range`` (already used further down) works on both.
    for j in range(i):
        d = DescriptorMemoryElement('ordered', j)
        d.set_vector(numpy.array([j, j * 2], float))
        test_descriptors.append(d)
    random.shuffle(test_descriptors)
    index.build_index(test_descriptors)

    # Since descriptors were build in increasing distance from (0,0),
    # returned descriptors for a query of [0,0] should be in index order.
    q = DescriptorMemoryElement('query', i)
    q.set_vector(numpy.array([0, 0], float))

    # top result should have UUID == 0 (nearest to query)
    r, dists = index.nn(q, 5)
    ntools.assert_equal(r[0].uuid(), 0)
    ntools.assert_equal(r[1].uuid(), 1)
    ntools.assert_equal(r[2].uuid(), 2)
    ntools.assert_equal(r[3].uuid(), 3)
    ntools.assert_equal(r[4].uuid(), 4)

    # global search should be in complete order
    r, dists = index.nn(q, i)
    for j, d, dist in zip(range(i), r, dists):
        ntools.assert_equal(d.uuid(), j)
def test_feedback_results_has_results_post_reset(self):
    """
    After a reset, ``feedback_results`` should return an empty list even
    though a feedback list had been populated beforehand.
    """
    # Populate the session's feedback list directly.
    d0 = DescriptorMemoryElement('', 0).set_vector([0])
    d1 = DescriptorMemoryElement('', 1).set_vector([1])
    d2 = DescriptorMemoryElement('', 2).set_vector([2])
    d3 = DescriptorMemoryElement('', 3).set_vector([3])
    self.iqrs.feedback_list = {d0, d1, d2, d3}

    # A first call to ``feedback_results`` should return something
    # other than None.
    assert self.iqrs.feedback_results() is not None

    self.iqrs.reset()

    # Once reset, the call should yield an empty list.
    post_reset = self.iqrs.feedback_results()
    assert post_reset == []
def test_build_index_fresh_build(self):
    # Building on empty structures should fill both the descriptor set and
    # the hash key-value store.
    descr_set = MemoryDescriptorSet()
    hash_kvs = MemoryKeyValueStore()
    index = LSHNearestNeighborIndex(DummyHashFunctor(),
                                    descr_set, hash_kvs)

    descriptors = [DescriptorMemoryElement('t', uid) for uid in range(5)]
    # Length-1 vectors whose value is the list position, which keeps the
    # dummy hash values easy to predict.
    for position, descr in enumerate(descriptors):
        descr.set_vector(np.ones(1, float) * position)
    index.build_index(descriptors)

    # Every descriptor should now be in the attached descriptor set.
    self.assertEqual(descr_set.count(), 5)
    for descr in descriptors:
        self.assertIn(descr, descr_set)

    # Dummy hash function bins sum of descriptor vectors, so hash ``i`` is
    # expected to map to exactly UID ``i``.
    self.assertEqual(hash_kvs.count(), 5)
    for hash_key in range(5):
        self.assertSetEqual(hash_kvs.get(hash_key), {hash_key})
def test_nn_small_leaves(self):
    # Query an MRPT index of random descriptors and check that exactly
    # ``k`` neighbors are returned.
    np.random.seed(0)

    n = 10 ** 4
    dim = 256
    depth = 10
    # L ~ n/2**depth = 10^4 / 2^10 ~ 10
    k = 200
    # 3k/L = 60
    num_trees = 60

    d_set = [DescriptorMemoryElement('test', i) for i in range(n)]
    # Plain loop: ``set_vector`` is called for its side effect only.  The
    # iteration order (and so the sequence of seeded random draws) is the
    # same as before.
    for d in d_set:
        d.set_vector(np.random.rand(dim))
    q = DescriptorMemoryElement('q', -1)
    q.set_vector(np.zeros((dim, )))

    di = MemoryDescriptorSet()
    mrpt = MRPTNearestNeighborsIndex(di, num_trees=num_trees, depth=depth,
                                     random_seed=0)
    mrpt.build_index(d_set)

    nbrs, dists = mrpt.nn(q, k)
    self.assertEqual(len(nbrs), len(dists))
    self.assertEqual(len(nbrs), k)
def test_update_index_no_existing_index(self):
    # ``update_index`` on an index that has never been built should behave
    # like a fresh build.  This mirrors test_build_index_fresh_build, with
    # update_index in place of build_index.
    descr_set = MemoryDescriptorSet()
    hash_kvs = MemoryKeyValueStore()
    index = LSHNearestNeighborIndex(DummyHashFunctor(),
                                    descr_set, hash_kvs)

    descriptors = [DescriptorMemoryElement('t', uid) for uid in range(5)]
    # Length-1 vectors with value == UID, which keeps the dummy hash
    # values easy to predict.
    for descr in descriptors:
        descr.set_vector(np.ones(1, float) * descr.uuid())
    index.update_index(descriptors)

    # Every descriptor should now be in the attached descriptor set.
    self.assertEqual(descr_set.count(), 5)
    for descr in descriptors:
        self.assertIn(descr, descr_set)

    # Dummy hash function bins sum of descriptor vectors, so hash ``i`` is
    # expected to map to exactly UID ``i``.
    self.assertEqual(hash_kvs.count(), 5)
    for hash_key in range(5):
        self.assertSetEqual(hash_kvs.get(hash_key), {hash_key})
def test_adjudicate_remove_pos_neg(self):
    """
    Existing positive and negative adjudications can be withdrawn through
    the "un_*" parameters of ``adjudicate``.
    """
    iqrs = IqrSession()

    p0 = DescriptorMemoryElement('', 0).set_vector([0])
    p1 = DescriptorMemoryElement('', 1).set_vector([1])
    p2 = DescriptorMemoryElement('', 2).set_vector([2])
    n3 = DescriptorMemoryElement('', 3).set_vector([3])
    n4 = DescriptorMemoryElement('', 4).set_vector([4])

    # Seed the session state directly.
    iqrs.positive_descriptors = {p0, p1, p2}
    iqrs.negative_descriptors = {n3, n4}

    # Withdraw one positive; negatives stay as they were.
    iqrs.adjudicate(un_positives=[p1])
    assert iqrs.positive_descriptors == {p0, p2}
    assert iqrs.negative_descriptors == {n3, n4}

    # Withdraw one negative; positives stay as they were.
    iqrs.adjudicate(un_negatives=[n3])
    assert iqrs.positive_descriptors == {p0, p2}
    assert iqrs.negative_descriptors == {n4}

    # Withdraw everything that remains in a single call.
    iqrs.adjudicate(un_positives=[p0, p2], un_negatives=[n4])
    assert iqrs.positive_descriptors == set()
    assert iqrs.negative_descriptors == set()
def test_nn_known_descriptors_euclidean_unit(self):
    # Index the ``dim`` unit vectors and query with the zero vector: every
    # returned distance should be exactly 1.
    dim = 5

    index = self._make_inst()
    test_descriptors = []
    for axis in range(dim):
        unit = np.zeros(dim, float)
        unit[axis] = 1.
        elem = DescriptorMemoryElement('unit', axis)
        elem.set_vector(unit)
        test_descriptors.append(elem)
    index.build_index(test_descriptors)

    # The zero vector is equally distant from all unit corners.
    q = DescriptorMemoryElement('query', 0)
    q.set_vector(np.zeros(dim, float))
    r, dists = index.nn(q, n=dim)
    self.assertEqual(len(dists), dim)

    # Result order does not matter here, only the distances.
    for dist in dists:
        self.assertEqual(dist, 1.)
def test_ordered_results_has_results_post_reset(self):
    """
    After a reset, ``ordered_results`` should return an empty list even
    though a results map had been populated beforehand.
    """
    iqrs = IqrSession()

    # Populate the session's results map directly.
    d0 = DescriptorMemoryElement('', 0).set_vector([0])
    d1 = DescriptorMemoryElement('', 1).set_vector([1])
    d2 = DescriptorMemoryElement('', 2).set_vector([2])
    d3 = DescriptorMemoryElement('', 3).set_vector([3])
    iqrs.results = {d0: 0.0, d1: 0.8, d2: 0.2, d3: 0.4}

    # A first call to ``ordered_results`` should return something other
    # than None.
    assert iqrs.ordered_results() is not None

    iqrs.reset()

    # Once reset, the call should yield an empty list.
    post_reset = iqrs.ordered_results()
    assert post_reset == []
def test_get_unadjudicated_relevancy(self):
    """
    Request unadjudicated relevancy results for a session and check the
    JSON payload that comes back.
    """
    # Replace controller hooks so that a mock IqrSession is handed out.
    self.app.controller.has_session_uuid = \
        mock.MagicMock(return_value=True)
    self.app.controller.get_session = mock.MagicMock()

    # The mocked session reports three (descriptor, score) pairs.
    d0 = DescriptorMemoryElement('', 0).set_vector([0])
    d1 = DescriptorMemoryElement('', 1).set_vector([1])
    d2 = DescriptorMemoryElement('', 2).set_vector([2])
    mock_session = self.app.controller.get_session()
    mock_session.get_unadjudicated_relevancy.return_value = [
        [d0, 0.3],
        [d2, 0.2],
        [d1, 0.1],
    ]

    test_sid = '0000'
    with self.app.test_client() as tc:
        r = tc.get('/get_unadjudicated_relevancy?sid={}'.format(test_sid))
        self.assertStatusCode(r, 200)
        self.assertJsonMessageRegex(r, "success")
        payload = r.json
        assert payload['total'] == 3
        # Descriptors are expected to be reported by UID, in order.
        assert payload['results'] == [[0, 0.3], [2, 0.2], [1, 0.1]]
        self.app.controller.has_session_uuid.assert_called_once_with(
            test_sid
        )
def _random_euclidean(self, hash_ftor, hash_idx,
                      ftor_train_hook=lambda d: None):
    # Shared check: build an LSH index (euclidean distance) over seeded
    # random descriptors and run several nearest-neighbor sanity queries.
    #
    # :param hash_ftor: Hash functor used to generate hash codes for
    #   descriptors.
    # :param hash_idx: Hash index instance given to the local LSH index.
    # :param ftor_train_hook: Callable invoked with the descriptor list
    #   before the index is built; the default does nothing.
    i = 1000
    dim = 256
    td = []
    # Seed first so that the generated vectors are reproducible.
    numpy.random.seed(self.RANDOM_SEED)
    for uid in range(i):
        elem = DescriptorMemoryElement('random', uid)
        elem.set_vector(numpy.random.rand(dim))
        td.append(elem)

    ftor_train_hook(td)

    di = MemoryDescriptorIndex()
    kvstore = MemoryKeyValueStore()
    index = LSHNearestNeighborIndex(hash_ftor, di, kvstore,
                                    hash_index=hash_idx,
                                    distance_method='euclidean')
    index.build_index(td)

    # A query taken from the build set should return itself for k=1.
    q = td[255]
    r, dists = index.nn(q, 1)
    ntools.assert_equal(r[0], q)

    # A query nudged slightly away from a build vector should still
    # resolve to that build vector.
    td_q = td[0]
    q = DescriptorMemoryElement('query', i)
    nudged = td_q.vector().copy()
    offset = max(nudged.min(), 0.1)
    nudged[0] += offset
    nudged[dim - 1] -= offset
    q.set_vector(nudged)
    r, dists = index.nn(q, 1)
    ntools.assert_false(numpy.array_equal(q.vector(), td_q.vector()))
    ntools.assert_equal(r[0], td_q)

    # Fully random query.
    q = DescriptorMemoryElement('query', i + 1)
    q.set_vector(numpy.random.rand(dim))

    # Whatever k is requested, distances must be strictly increasing.
    for k in (10, i):
        r, dists = index.nn(q, k)
        for pos in range(1, len(dists)):
            ntools.assert_greater(dists[pos], dists[pos - 1])
def test_simple_multiclass_classification(self):
    """
    Train on three well-separated 1-D classes and check that test vectors
    drawn from the same ranges get their own label as the top score.
    """
    # Seed for deterministic data.
    numpy.random.seed(0)

    N = 1000
    LABEL_1 = 'p1'
    LABEL_2 = 'p2'
    LABEL_3 = 'p3'

    def _sample(low, high):
        # N uniform samples mapped onto [low, high], as an (N, 1) column.
        return numpy.interp(numpy.random.rand(N),
                            [0, 1], [low, high])[:, numpy.newaxis]

    # Training data: 1-dimensional for obvious separation; this is not a
    # performance test.
    train1 = _sample(0.0, .30)
    train2 = _sample(.40, .60)
    train3 = _sample(.70, 1.0)

    # UIDs continue across the three lists so that they never collide.
    train1_e = [DescriptorMemoryElement('train', i).set_vector(v)
                for i, v in enumerate(train1)]
    train2_e = [DescriptorMemoryElement('train', i).set_vector(v)
                for i, v in enumerate(train2, start=len(train1_e))]
    train3_e = [DescriptorMemoryElement('train', i).set_vector(v)
                for i, v in enumerate(train3,
                                      start=len(train1_e) + len(train2_e))]

    # Testing data drawn from the same three ranges.
    test1 = _sample(0.0, .30)
    test2 = _sample(.40, .60)
    test3 = _sample(.70, 1.0)

    # Train, then classify each test set.
    classifier = SkLearnLogisticRegression(random_state=0)
    classifier.train({
        LABEL_1: train1_e,
        LABEL_2: train2_e,
        LABEL_3: train3_e,
    })
    c_maps_l1 = list(classifier._classify_arrays(test1))
    c_maps_l2 = list(classifier._classify_arrays(test2))
    c_maps_l3 = list(classifier._classify_arrays(test3))

    # (expected label, the two competing labels, test vectors, class maps)
    cases = (
        (LABEL_1, (LABEL_2, LABEL_3), test1, c_maps_l1),
        (LABEL_2, (LABEL_1, LABEL_3), test2, c_maps_l2),
        (LABEL_3, (LABEL_2, LABEL_1), test3, c_maps_l3),
    )
    for label, (rival_a, rival_b), vectors, c_maps in cases:
        for v, m in zip(vectors, c_maps):
            assert m[label] > max(m[rival_a], m[rival_b]), \
                "Incorrect {} label: c_map={} :: test_vector={}".format(
                    label, m, v
                )
def test_remove_from_index_invalid_uid(self):
    # Removing a UID that was never indexed should raise KeyError and leave
    # the index untouched, even when a valid UID is passed alongside it.
    #
    # Descriptors are 1-dimensional with value == UID, which keeps the
    # dummy hash values easy to predict.
    descriptors = [DescriptorMemoryElement('t', uid) for uid in range(5)]
    for descr in descriptors:
        descr.set_vector(np.ones(1, float) * descr.uuid())

    # uid -> descriptor
    expected_dset_table = dict(enumerate(descriptors))
    # hash int -> set[uid]
    expected_kvs_table = {uid: {uid} for uid in range(5)}

    d_set = MemoryDescriptorIndex()
    hash_kvs = MemoryKeyValueStore()
    idx = LSHNearestNeighborIndex(DummyHashFunctor(), d_set, hash_kvs)
    idx.build_index(descriptors)

    # Sanity check of the state right after building.
    self.assertEqual(idx.descriptor_index._table, expected_dset_table)
    self.assertEqual(idx.hash2uuids_kvstore._table, expected_kvs_table)

    # First a lone unknown UID, then a valid UID together with the unknown
    # one.  Both calls must raise and neither may modify the index.
    for uids in ([5], [2, 5]):
        self.assertRaisesRegexp(
            KeyError, '5',
            idx.remove_from_index, uids
        )
        self.assertEqual(idx.descriptor_index._table, expected_dset_table)
        self.assertEqual(idx.hash2uuids_kvstore._table,
                         expected_kvs_table)
def test_classify_elements_missing_vector(self):
    """
    Test that we get a ValueError when one of the elements to classify has
    no vector stored.
    """
    with_vector_a = DescriptorMemoryElement('', 0).set_vector([1, 2, 3])
    # Deliberately left without a vector.
    without_vector = DescriptorMemoryElement('', 0)
    with_vector_b = DescriptorMemoryElement('', 0).set_vector([4, 5, 6])
    elems = [with_vector_a, without_vector, with_vector_b]

    with pytest.raises(ValueError, match=r"no vector stored"):
        # ``list`` consumes the whole result so that the error is
        # triggered even if the classification is produced lazily.
        list(self.inst.classify_elements(elems))
def test_remove_from_index_shared_hashes_partial(self):
    """
    When removed descriptors account for every UID under one hash but only
    some under another, only the fully emptied hash should be removed from
    the hash index.
    """
    # Length-1 vectors with value == UID, which keeps the simulated hash
    # values easy to predict.
    descriptors = [
        DescriptorMemoryElement('t', uid).set_vector([uid])
        for uid in range(5)
    ]

    # Hash functor stand-in: even vector sums hash to 0, odd sums to 1.
    hash_func = DummyHashFunctor()
    hash_func.get_hash = mock.Mock(
        side_effect=lambda vec: [vec.sum() % 2]
    )

    # Seed the backing tables directly instead of building the index.
    d_set = MemoryDescriptorIndex()
    d_set._table = dict(enumerate(descriptors))

    hash2uid_kvs = MemoryKeyValueStore()
    hash2uid_kvs._table = {
        0: {0, 2, 4},
        1: {1, 3},
    }

    idx = LSHNearestNeighborIndex(hash_func, d_set, hash2uid_kvs)
    idx.hash_index = mock.Mock(spec=HashIndex)

    idx.remove_from_index([1, 2, 3])

    # Exactly one hash vector (hash 1) should have been handed to the hash
    # index's removal method, as a deque of hash-code vectors.
    idx.hash_index.remove_from_index.assert_called_once_with(
        collections.deque([
            [1],
        ])
    )
    self.assertDictEqual(d_set._table, {
        0: descriptors[0],
        4: descriptors[4],
    })
    self.assertDictEqual(hash2uid_kvs._table, {0: {0, 4}})
def test_remove_from_index_shared_hashes(self):
    """
    Removing descriptors (by UID) whose hash is still shared by other
    descriptors should not trigger removal of that hash.
    """
    # Hash functor stand-in: every descriptor hashes to the same value, 0.
    hash_func = DummyHashFunctor()
    hash_func.get_hash = mock.Mock(return_value=np.asarray([0], bool))

    d_set = MemoryDescriptorSet()
    hash2uids_kvs = MemoryKeyValueStore()
    idx = LSHNearestNeighborIndex(hash_func, d_set, hash2uids_kvs)

    # Descriptors are 1-dimensional with value == UID.
    descriptors = [DescriptorMemoryElement('t', uid) for uid in range(5)]
    for descr in descriptors:
        descr.set_vector(np.ones(1, float) * descr.uuid())
    idx.build_index(descriptors)

    # State expected right after the build: all five descriptors present,
    # all five UIDs under the single hash key 0.
    self.assertDictEqual(d_set._table, dict(enumerate(descriptors)))
    self.assertDictEqual(hash2uids_kvs._table, {0: {0, 1, 2, 3, 4}})

    # Swap in a mock hash index so that calls to its remove_from_index
    # method can be inspected.
    idx.hash_index = mock.Mock(spec=HashIndex)

    idx.remove_from_index([2, 4])

    # Only UIDs 2 and 4 should be gone from the descriptor set.  The key 0
    # should remain in the kvs with UIDs 0, 1 and 3, and the hash index
    # should not have been asked to remove anything.
    self.assertDictEqual(
        d_set._table,
        {uid: descriptors[uid] for uid in (0, 1, 3)},
    )
    self.assertDictEqual(hash2uids_kvs._table, {0: {0, 1, 3}})
    idx.hash_index.remove_from_index.assert_not_called()
def test_input_immutability(self):
    # Stored vectors must not change when the array (or parent matrix) they
    # were set from is modified afterwards.

    #
    # Single vector
    #
    source_vec = numpy.random.rand(16)
    truth = tuple(source_vec.copy())
    elem = DescriptorMemoryElement('test', 0)
    elem.set_vector(source_vec)
    # Zero the source in place; the stored vector should keep its values.
    source_vec[:] = 0
    ntools.assert_true((source_vec == 0).all())
    ntools.assert_false(sum(truth) == 0.)
    numpy.testing.assert_equal(elem.vector(), truth)

    #
    # Rows taken from a matrix
    #
    m = numpy.random.rand(20, 16)
    rows = [m[3], m[15], m[19]]
    # Keep the true values as immutable tuples (copies).
    truths = [tuple(row.copy()) for row in rows]

    elems = []
    for uid, row in enumerate(rows, start=1):
        row_elem = DescriptorMemoryElement('test', uid)
        row_elem.set_vector(row)
        elems.append(row_elem)

    for row, row_elem in zip(rows, elems):
        numpy.testing.assert_equal(row, row_elem.vector())

    # Zeroing the matrix changes the row arrays taken from it, but should
    # not change what was stored.
    m[:, :] = 0.
    for row in rows:
        ntools.assert_true((row == 0).all())
    for row_truth in truths:
        ntools.assert_false(sum(row_truth) == 0.)
    for row_elem, row_truth in zip(elems, truths):
        numpy.testing.assert_equal(row_elem.vector(), row_truth)
def test_update_index_nonzero_descriptors(self):
    # ``update_index`` with a non-empty input should call ``_update_index``
    # exactly once, passing along the same descriptors.
    index = DummySI()
    index._update_index = mock.MagicMock()

    # Dummy input data.
    d_set = {DescriptorMemoryElement('test', uid) for uid in range(4)}
    index.update_index(d_set)

    index._update_index.assert_called_once()
    # First positional argument of the recorded call.
    forwarded = index._update_index.call_args[0][0]
    self.assertSetEqual(set(forwarded), d_set)
def test_update_index_existing_descriptors_frozenset(self):
    """
    Same as ``test_update_index_similar_descriptors``, but the index is
    seeded with structures that already hold values (the key-value store
    entries being frozensets) before it is updated.
    """
    # Two groups of distinct elements with pairwise-equal vectors:
    # UIDs 0-4 are pre-seeded, UIDs 5-9 arrive through the update.
    descriptors1 = [
        DescriptorMemoryElement('t', i).set_vector([i])
        for i in range(5)
    ]
    descriptors2 = [
        DescriptorMemoryElement('t', i + 5).set_vector([i])
        for i in range(5)
    ]

    descr_set = MemoryDescriptorSet()
    descr_set.add_many_descriptors(descriptors1)

    hash_kvs = MemoryKeyValueStore()
    for i in range(5):
        hash_kvs.add(i, frozenset({i}))

    index = LSHNearestNeighborIndex(DummyHashFunctor(),
                                    descr_set, hash_kvs)
    index.update_index(descriptors2)

    assert descr_set.count() == 10
    # Both groups should be considered "in" the descriptor set now.
    for d in descriptors1:
        assert d in descr_set
    for d in descriptors2:
        assert d in descr_set

    # Each known hash should now hold the seeded UID and the new UID that
    # shares its vector.
    assert set(hash_kvs.keys()) == {0, 1, 2, 3, 4}
    for i in range(5):
        assert hash_kvs.get(i) == {i, i + 5}