def test_classify_missing_label(self):
        """
        Requesting labels that are not configured in the collection should
        raise a ``MissingLabelError`` reporting exactly the unknown labels.
        """
        collection = ClassifierCollection({
            'subjectA': DummyClassifier(),
            'subjectB': DummyClassifier(),
        })

        elem = DescriptorMemoryElement('memory', '0')
        elem.set_vector([0, 1, 2, 3, 4])

        # (requested labels, labels expected to be reported as missing)
        cases = (
            (['subjectC'], {'subjectC'}),
            (['subjectA', 'subjectC'], {'subjectC'}),
            (['subjectC', 'subjectD'], {'subjectC', 'subjectD'}),
            (['subjectA', 'subjectC', 'subjectD'], {'subjectC', 'subjectD'}),
        )
        for requested, expected_missing in cases:
            # Should throw a MissingLabelError
            with self.assertRaises(MissingLabelError) as cm:
                collection.classify(elem, labels=requested)
            self.assertSetEqual(cm.exception.labels, expected_missing)
Esempio n. 2
0
    def test_build_index_with_cache(self) -> None:
        """
        Build an index whose three backing elements start out empty and check
        that each one holds bytes once the build has completed.
        """
        # Empty memory data elements for storage.
        empty_uri = 'base64://'
        index = FlannNearestNeighborsIndex(empty_uri, empty_uri, empty_uri)
        # Freshly constructed: internal elements exist with zero-length bytes.
        assert index._index_elem is not None
        assert index._index_param_elem is not None
        assert index._descr_cache_elem is not None
        self.assertEqual(len(index._index_elem.get_bytes()), 0)
        self.assertEqual(len(index._index_param_elem.get_bytes()), 0)
        self.assertEqual(len(index._descr_cache_elem.get_bytes()), 0)

        # One unit vector per feature dimension.
        n_dims = 8
        unit_descriptors = []
        for axis in range(n_dims):
            vec = numpy.zeros(n_dims, float)
            vec[axis] = 1.
            elem = DescriptorMemoryElement('unit', axis)
            elem.set_vector(vec)
            unit_descriptors.append(elem)

        index.build_index(unit_descriptors)

        # After the build, internal elements should now hold non-zero bytes.
        for backing_elem in (index._index_elem,
                             index._index_param_elem,
                             index._descr_cache_elem):
            self.assertGreater(len(backing_elem.get_bytes()), 0)
    def test_classify(self):
        """ Test invoking `classify` in a valid manner. """
        labels = ('subjectA', 'subjectB')
        collection = ClassifierCollection(
            {label: DummyClassifier() for label in labels}
        )

        elem = DescriptorMemoryElement('memory', '0')
        elem.set_vector([0, 1, 2, 3, 4])
        result = collection.classify(elem)

        # Should contain one entry for each configured classifier.
        self.assertEqual(len(result), 2)
        for label in labels:
            self.assertIn(label, result)
        # Each key should map to a classification element (memory in this case
        # because we're using the default factory)
        for label in labels:
            self.assertIsInstance(result[label], MemoryClassificationElement)
        # Each dummy classifier is expected to report the single class label
        # "test" with a value of 0 for this descriptor.
        for label in labels:
            self.assertDictEqual(result[label].get_classification(),
                                 {'test': 0})
Esempio n. 4
0
    def test_remove_from_index(self) -> None:
        """
        Removing a single UID should drop that descriptor from the descriptor
        set and its entry from the hash key-value store, leaving the rest.
        """
        # Descriptors are 1 dim, value == index.
        descriptors = [DescriptorMemoryElement(uid) for uid in range(5)]
        # Vectors of length 1 for easy dummy hashing prediction.
        for descr in descriptors:
            descr.set_vector(np.ones(1, float) * descr.uuid())
        d_set = MemoryDescriptorSet()
        hash_kvs = MemoryKeyValueStore()
        idx = LSHNearestNeighborIndex(DummyHashFunctor(), d_set, hash_kvs)
        idx.build_index(descriptors)

        # Attempt removing 1 uid.
        idx.remove_from_index([3])
        remaining_uids = (0, 1, 2, 4)
        assert isinstance(idx.descriptor_set, MemoryDescriptorSet)
        self.assertEqual(
            idx.descriptor_set._table,
            {uid: descriptors[uid] for uid in remaining_uids},
        )
        assert isinstance(idx.hash2uuids_kvstore, MemoryKeyValueStore)
        self.assertEqual(
            idx.hash2uuids_kvstore._table,
            {uid: {uid} for uid in remaining_uids},
        )
Esempio n. 5
0
 def create_de(v: np.ndarray) -> DescriptorElement:
     """
     Wrap vector ``v`` in a new ``DescriptorMemoryElement`` and return it.

     The counter ``i`` from the enclosing scope is passed to the element's
     constructor and is incremented once per call.
     """
     nonlocal i
     # Hopefully type_str doesn't matter
     de = DescriptorMemoryElement(i)
     de.set_vector(v)
     # Advance the shared counter so the next call uses a different value.
     i += 1
     return de
Esempio n. 6
0
    def test_update_index_additive(self) -> None:
        """
        Updating an already-built index with new descriptors should add them
        to the existing content, and the new descriptors should be queryable.
        """
        n1 = 100
        n2 = 10
        dim = 8
        set1 = {DescriptorMemoryElement('test', i) for i in range(n1)}
        set2 = {DescriptorMemoryElement('test', i) for i in range(n1, n1 + n2)}
        set_all = set1 | set2
        # Plain loop: we only want the side effect of assigning vectors.
        for d in set_all:
            d.set_vector(np.random.rand(dim))

        # Create and build initial index.
        index = self._make_inst()
        index.build_index(set1)
        self.assertEqual(index.count(), len(set1))
        for d in set1:
            self.assertIn(d, index._descriptor_set)

        # Update and check that all intended descriptors are present in index.
        index.update_index(set2)
        self.assertEqual(index.count(), len(set_all))
        for d in set_all:
            self.assertIn(d, index._descriptor_set)

        # Check that NN can return something from the updated set.
        # - nearest element to the query element when the query is in the index
        #   should be the query element.
        for q in set2:
            n_elems, _ = index.nn(q)
            self.assertEqual(n_elems[0], q)
    def test_classify_subset(self):
        """
        Classifying with an explicit ``labels`` subset should only invoke, and
        only return results for, the requested classifiers.
        """
        collection = ClassifierCollection({
            'subjectA': DummyClassifier(),
            'subjectB': DummyClassifier(),
        })

        # Swap subjectB's classification method for a mock so we can verify
        # below that it is never called.
        unrequested = collection._label_to_classifier['subjectB']
        unrequested.classify_one_element = mock.Mock()

        elem = DescriptorMemoryElement('memory', '0')
        elem.set_vector([0, 1, 2, 3, 4])
        result = collection.classify(elem, labels=['subjectA'])

        # Should contain one entry for each requested classifier.
        self.assertEqual(len(result), 1)
        self.assertIn('subjectA', result)
        self.assertNotIn('subjectB', result)
        unrequested.classify_one_element.assert_not_called()
        # Each key should map to a classification element (memory in this case
        # because we're using the default factory)
        subject_a_result = result['subjectA']
        self.assertIsInstance(subject_a_result, MemoryClassificationElement)
        # The dummy classifier is expected to report the single class label
        # "test" with a value of 0 for this descriptor.
        self.assertDictEqual(subject_a_result.get_classification(),
                             {'test': 0})
Esempio n. 8
0
    def test_update_index_no_existing_index(self) -> None:
        """
        Calling ``update_index`` with no existing index should act like a
        fresh build.  Basically the same as test_build_index_fresh_build but
        using ``update_index`` instead.
        """
        descr_set = MemoryDescriptorSet()
        hash_kvs = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set,
                                        hash_kvs)

        n_descriptors = 5
        descriptors = [
            DescriptorMemoryElement('t', uid) for uid in range(n_descriptors)
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for descr in descriptors:
            descr.set_vector(np.ones(1, float) * descr.uuid())
        index.update_index(descriptors)

        # Make sure descriptors are now in attached index and in key-value-store
        self.assertEqual(descr_set.count(), 5)
        for descr in descriptors:
            self.assertIn(descr, descr_set)
        # Dummy hash function bins sum of descriptor vectors.
        self.assertEqual(hash_kvs.count(), 5)
        for hash_key in range(5):
            self.assertSetEqual(hash_kvs.get(hash_key), {hash_key})
Esempio n. 9
0
    def test_nn_small_leaves(self) -> None:
        """
        Query ``k`` neighbors from an index built over seeded random vectors
        and expect exactly ``k`` neighbors with matching distances.
        """
        np.random.seed(0)

        n = 10**4
        dim = 256
        depth = 10
        # L ~ n/2**depth = 10^4 / 2^10 ~ 10
        k = 200
        # 3k/L = 60
        num_trees = 60

        d_set = [DescriptorMemoryElement('test', uid) for uid in range(n)]
        for descr in d_set:
            descr.set_vector(np.random.rand(dim))
        query = DescriptorMemoryElement('q', -1)
        query.set_vector(np.zeros((dim, )))

        mrpt = MRPTNearestNeighborsIndex(MemoryDescriptorSet(),
                                         num_trees=num_trees,
                                         depth=depth,
                                         random_seed=0)
        mrpt.build_index(d_set)

        neighbors, distances = mrpt.nn(query, k)
        self.assertEqual(len(neighbors), len(distances))
        self.assertEqual(len(neighbors), k)
Esempio n. 10
0
    def test_count_empty_hash2uid(self) -> None:
        """
        Test that an empty hash-to-uid mapping results in a 0 return regardless
        of descriptor-set state.
        """
        descr_set = MemoryDescriptorSet()
        hash_kvs = MemoryKeyValueStore()
        self.assertEqual(descr_set.count(), 0)
        self.assertEqual(hash_kvs.count(), 0)

        lsh = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set, hash_kvs)
        self.assertEqual(lsh.count(), 0)

        # Additions to the descriptor-set should not impact LSH index "size"
        for uid, n_in_descr_set in ((0, 1), (1, 2)):
            lsh.descriptor_set.add_descriptor(
                DescriptorMemoryElement('t', uid))
            self.assertEqual(lsh.descriptor_set.count(), n_in_descr_set)
            self.assertEqual(lsh.hash2uuids_kvstore.count(), 0)
            self.assertEqual(lsh.count(), 0)

        # The index count should follow the number of UIDs stored under the
        # hash key, even once that exceeds the descriptor-set size of 2.
        for uid_set in ({0}, {0, 1}, {0, 1, 2}):
            lsh.hash2uuids_kvstore.add(0, uid_set)
            self.assertEqual(lsh.descriptor_set.count(), 2)
            self.assertEqual(lsh.count(), len(uid_set))
Esempio n. 11
0
    def test_nn_known_descriptors_euclidean_ordered(self) -> None:
        """
        Neighbors of the origin should come back ordered by UID when each
        descriptor's vector is ``[uid, 2 * uid]``.
        """
        index = self._make_inst()

        # make vectors to return in a known euclidean distance order
        count = 100
        test_descriptors = []
        for uid in range(count):
            vec = np.array([uid, uid * 2], float)
            test_descriptors.append(
                DescriptorMemoryElement(uid).set_vector(vec))
        random.shuffle(test_descriptors)
        index.build_index(test_descriptors)

        # Since descriptors were built in increasing distance from (0,0),
        # returned descriptors for a query of [0,0] should be in index
        # order.
        query = DescriptorMemoryElement(99)
        query.set_vector(np.array([0, 0], float))
        neighbors, dists = index.nn(query, n=count)

        self.assertEqual(len(dists), count)
        for expected_uid, neighbor, _ in zip(range(count), neighbors, dists):
            self.assertEqual(neighbor.uuid(), expected_uid)
            np.testing.assert_equal(neighbor.vector(),
                                    [expected_uid, expected_uid*2])
Esempio n. 12
0
    def test_fit_with_cache(self) -> None:
        """
        Fit an ITQ functor that was given cache elements and check both the
        fitted model and the arrays loadable from those cache elements.
        """
        fit_descriptors = []
        for offset in range(5):
            elem = DescriptorMemoryElement('test', offset)
            elem.set_vector([-2. + offset, -2. + offset])
            fit_descriptors.append(elem)

        itq = ItqFunctor(DataMemoryElement(),
                         DataMemoryElement(),
                         bit_length=1,
                         random_seed=0)
        assert itq.mean_vec_cache_elem is not None
        assert itq.rotation_cache_elem is not None
        itq.fit(fit_descriptors)

        # TODO: Explanation as to why this is the expected result.
        expected_mean = [0, 0]
        expected_rotation = [[1 / sqrt(2)], [1 / sqrt(2)]]
        numpy.testing.assert_array_almost_equal(itq.mean_vec, expected_mean)
        numpy.testing.assert_array_almost_equal(itq.rotation,
                                                expected_rotation)

        # The mean-vector cache element should hold the same array.
        self.assertIsNotNone(itq.mean_vec_cache_elem)
        # noinspection PyTypeChecker
        cached_mean = numpy.load(BytesIO(itq.mean_vec_cache_elem.get_bytes()))
        numpy.testing.assert_array_almost_equal(cached_mean, expected_mean)

        # Likewise for the rotation-matrix cache element.
        self.assertIsNotNone(itq.rotation_cache_elem)
        # noinspection PyTypeChecker
        cached_rotation = numpy.load(
            BytesIO(itq.rotation_cache_elem.get_bytes()))
        numpy.testing.assert_array_almost_equal(cached_rotation,
                                                expected_rotation)
Esempio n. 13
0
    def test_get_hash(self) -> None:
        """
        Check ``get_hash`` output for points on and around the line
        ``y = -x`` using a manually assigned mean vector and rotation matrix.

        No fitting is performed here: the model attributes are set directly,
        so no fit descriptors are required.
        """
        # The following "rotation" matrix should cause any 2-feature descriptor
        # to the right of the line ``y = -x`` to be True, and to the left as
        # False. If on the line, should be True.
        itq = ItqFunctor(bit_length=1, random_seed=0)
        itq.mean_vec = numpy.array([0., 0.])
        itq.rotation = numpy.array([[1. / sqrt(2)], [1. / sqrt(2)]])

        # (input vector, expected single-bit hash)
        cases = [
            ([1, 1], True),
            ([-1, -1], False),
            # On the line at (-1, 1), then nudged to either side of it.
            ([-1, 1], True),
            ([-1.001, 1], False),
            ([-1, 1.001], True),
            # On the line at (1, -1), then nudged to either side of it.
            ([1, -1], True),
            ([1, -1.001], False),
            ([1.001, -1], True),
        ]
        for vec, expected_bit in cases:
            numpy.testing.assert_array_equal(
                itq.get_hash(numpy.array(vec)), [expected_bit])
Esempio n. 14
0
    def test_fit_short_descriptors_for_bit_length(self) -> None:
        """
        Fitting should raise a ``ValueError`` and leave the model unset when
        the input descriptors have fewer dimensions than the configured bit
        length (limitation of PCA method currently used).
        """
        fit_descriptors = []
        for offset in range(3):
            elem = DescriptorMemoryElement('test', offset)
            elem.set_vector([-1 + offset, -1 + offset])
            fit_descriptors.append(elem)

        itq = ItqFunctor(bit_length=8)
        with self.assertRaisesRegex(
                ValueError,
                "Input descriptors have fewer features than requested bit "
                "encoding"):
            itq.fit(fit_descriptors)
        self.assertIsNone(itq.mean_vec)
        self.assertIsNone(itq.rotation)

        # Should behave the same when input is an iterable
        with self.assertRaisesRegex(
                ValueError,
                "Input descriptors have fewer features than requested bit "
                "encoding"):
            itq.fit(iter(fit_descriptors))
        self.assertIsNone(itq.mean_vec)
        self.assertIsNone(itq.rotation)
Esempio n. 15
0
    def test_nn_pathological_example(self) -> None:
        """
        With every descriptor placed on a single line, a query for ``k``
        neighbors is expected to return far fewer than ``k`` results.
        """
        n = 10**4
        dim = 256
        depth = 10
        # L ~ n/2**depth = 10^4 / 2^10 ~ 10
        k = 200
        # 3k/L = 60
        num_trees = 60

        d_set = [DescriptorMemoryElement('test', uid) for uid in range(n)]
        # Put all descriptors on a line so that different trees get same
        # divisions.
        for descr in d_set:
            # noinspection PyTypeChecker
            descr.set_vector(np.full(dim, descr.uuid(), dtype=np.float64))
        query = DescriptorMemoryElement('q', -1)
        query.set_vector(np.zeros((dim, )))

        mrpt = MRPTNearestNeighborsIndex(MemoryDescriptorSet(),
                                         num_trees=num_trees,
                                         depth=depth,
                                         random_seed=0)
        mrpt.build_index(d_set)

        neighbors, distances = mrpt.nn(query, k)
        self.assertEqual(len(neighbors), len(distances))
        # We should get about 10 descriptors back instead of the requested
        # 200
        self.assertLess(len(neighbors), 20)
Esempio n. 16
0
    def test_nn_known_descriptors_euclidean_ordered(self) -> None:
        """
        Neighbors of the origin should come back ordered by UID when each
        descriptor's vector is ``[uid, 2 * uid]``; only half of the indexed
        descriptors are expected back.
        """
        index = self._make_inst()

        # make vectors to return in a known euclidean distance order
        count = 100
        test_descriptors = []
        for uid in range(count):
            vec = np.array([uid, uid * 2], float)
            test_descriptors.append(
                DescriptorMemoryElement('ordered', uid).set_vector(vec))
        random.shuffle(test_descriptors)
        index.build_index(test_descriptors)

        # Since descriptors were built in increasing distance from (0,0),
        # returned descriptors for a query of [0,0] should be in index
        # order.
        query = DescriptorMemoryElement('query', 99)
        query.set_vector(np.array([0, 0], float))
        neighbors, dists = index.nn(query, n=count)
        # Because the data is one-dimensional, all of the cells will have
        # the same points (any division will just correspond to a point on
        # the line), and a cell can't have more than half of the points
        self.assertEqual(len(dists), count // 2)
        for expected_uid, neighbor, _ in zip(range(count), neighbors, dists):
            self.assertEqual(neighbor.uuid(), expected_uid)
            np.testing.assert_equal(neighbor.vector(),
                                    [expected_uid, expected_uid * 2])
Esempio n. 17
0
    def test_remove_then_add(self) -> None:
        """
        Test that we can remove from the index and then add to it again.
        """
        n1 = 100
        n2 = 10
        dim = 8
        set1 = [DescriptorMemoryElement(uid) for uid in range(n1)]
        set2 = [DescriptorMemoryElement(uid) for uid in range(n1, n1 + n2)]
        everything = set1 + set2
        for descr in everything:
            descr.set_vector(np.random.rand(dim))
        uids_to_remove = [10, 98]

        index = self._make_inst()
        index.build_index(set1)
        index.remove_from_index(uids_to_remove)
        index.update_index(set2)

        self.assertEqual(len(index), 108)
        # Removed descriptors should not be in return queries.  ``set1`` is
        # ordered by UID, so a removed UID is also its position in the list.
        for uid in uids_to_remove:
            removed = set1[uid]
            self.assertNotEqual(index.nn(removed, 1)[0][0], removed)
        # Every other descriptor should be queryable
        for descr in everything:
            if descr.uuid() in uids_to_remove:
                continue
            self.assertEqual(index.nn(descr, 1)[0][0], descr)
        self.assertEqual(index._next_index, 110)
Esempio n. 18
0
    def test_build_index_fresh_build(self) -> None:
        """
        Building on empty stores should place every descriptor in the
        descriptor set and one UID set per hash in the key-value store.
        """
        descr_set = MemoryDescriptorSet()
        hash_kvs = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set,
                                        hash_kvs)

        descriptors = [DescriptorMemoryElement('t', uid) for uid in range(5)]
        # Vectors of length 1 for easy dummy hashing prediction.
        for position, descr in enumerate(descriptors):
            descr.set_vector(np.ones(1, float) * position)
        index.build_index(descriptors)

        # Make sure descriptors are now in attached index and in
        # key-value-store.
        self.assertEqual(descr_set.count(), 5)
        for descr in descriptors:
            self.assertIn(descr, descr_set)
        # Dummy hash function bins sum of descriptor vectors.
        self.assertEqual(hash_kvs.count(), 5)
        for hash_key in range(5):
            self.assertSetEqual(hash_kvs.get(hash_key), {hash_key})
Esempio n. 19
0
    def test_nn_known_descriptors_euclidean_unit(self) -> None:
        """
        Every unit vector in the index should be reported at distance 1.0
        from a zero-vector query.
        """
        dim = 5

        ###
        # Unit vectors -- Equal distance
        #
        index = self._make_inst()
        unit_descriptors = []
        for axis in range(dim):
            vec = np.zeros(dim, float)
            vec[axis] = 1.
            elem = DescriptorMemoryElement(axis).set_vector(vec)
            unit_descriptors.append(elem)
        index.build_index(unit_descriptors)
        # query descriptor -- zero vector
        # -> all modeled descriptors should be equally distant (unit
        # corners)
        query = DescriptorMemoryElement(0)
        query.set_vector(np.zeros(dim, float))
        _, dists = index.nn(query, n=dim)
        self.assertEqual(len(dists), dim)
        # All dists should be 1.0, r order doesn't matter
        for dist in dists:
            self.assertEqual(dist, 1.)
Esempio n. 20
0
    def test_nn_known_descriptors_hik_unit(self) -> None:
        """
        With the 'hik' instance, unit vectors should all be at distance 1.0
        from a zero-vector query, and an indexed element queried against the
        index should be its own nearest neighbor at distance 0.
        """
        dim = 5

        ###
        # Unit vectors - Equal distance
        #
        index = self._make_inst('hik')
        unit_descriptors = []
        for axis in range(dim):
            vec = numpy.zeros(dim, float)
            vec[axis] = 1.
            elem = DescriptorMemoryElement('unit', axis).set_vector(vec)
            unit_descriptors.append(elem)
        index.build_index(unit_descriptors)
        # query with zero vector
        # -> all modeled descriptors have no intersection, dists should be
        #    1.0, or maximum distance by histogram intersection.
        query = DescriptorMemoryElement('query', 0)
        query.set_vector(numpy.zeros(dim, float))
        _, dists = index.nn(query, dim)
        # All dists should be 1.0, r order doesn't matter
        for dist in dists:
            self.assertEqual(dist, 1.)

        # query with index element, first asking for one neighbor and then
        # for as many neighbors as there are descriptors.
        query = unit_descriptors[3]
        for n_requested in (1, dim):
            neighbors, dists = index.nn(query, n_requested)
            self.assertEqual(neighbors[0], query)
            self.assertEqual(dists[0], 0.)
Esempio n. 21
0
    def test_remove_from_index_invalid_uid(self) -> None:
        """
        Removing a UID that was never indexed should raise a ``KeyError`` and
        leave both the descriptor set and the hash key-value store unchanged,
        even when the request also names a valid UID.
        """
        # Descriptors are 1 dim, value == index.
        descriptors = [DescriptorMemoryElement(uid) for uid in range(5)]
        # Vectors of length 1 for easy dummy hashing prediction.
        for descr in descriptors:
            descr.set_vector(np.ones(1, float) * descr.uuid())
        # uid -> descriptor
        expected_dset_table = dict(enumerate(descriptors))
        # hash int -> set[uid]
        expected_kvs_table = {uid: {uid} for uid in range(5)}

        d_set = MemoryDescriptorSet()
        hash_kvs = MemoryKeyValueStore()
        idx = LSHNearestNeighborIndex(DummyHashFunctor(), d_set, hash_kvs)
        idx.build_index(descriptors)
        # Assert we have the correct expected values
        assert isinstance(idx.descriptor_set, MemoryDescriptorSet)
        self.assertEqual(idx.descriptor_set._table, expected_dset_table)
        assert isinstance(idx.hash2uuids_kvstore, MemoryKeyValueStore)
        self.assertEqual(idx.hash2uuids_kvstore._table, expected_kvs_table)

        # First a UID we did not build with on its own, then multiple UIDs
        # where one is valid and one is invalid.
        for bad_request in ([5], [2, 5]):
            with self.assertRaisesRegex(KeyError, '5'):
                idx.remove_from_index(bad_request)
            # Index should not have been modified.
            self.assertEqual(idx.descriptor_set._table, expected_dset_table)
            self.assertEqual(idx.hash2uuids_kvstore._table,
                             expected_kvs_table)
Esempio n. 22
0
 def test_build_index_one(self) -> None:
     """
     Building from a single descriptor should cache exactly that descriptor
     and leave ``_flann`` set and ``_flann_build_params`` as a dict.
     """
     lone_descr = DescriptorMemoryElement(0)
     lone_descr.set_vector(numpy.zeros(8, float))

     index = self._make_inst('euclidean')
     index.build_index([lone_descr])

     self.assertListEqual(index._descr_cache, [lone_descr])
     self.assertIsNotNone(index._flann)
     self.assertIsInstance(index._flann_build_params, dict)
Esempio n. 23
0
    def test_read_only(self) -> None:
        """
        Building an index on an instance created with ``read_only=True``
        should raise a ``ReadOnlyError``.
        """
        unit_vec = np.zeros(5, float)
        unit_vec[0] = 1.
        elem = DescriptorMemoryElement('unit', 0)
        elem.set_vector(unit_vec)

        index = self._make_inst(read_only=True)
        with self.assertRaises(ReadOnlyError):
            index.build_index([elem])
 def test_classify_elements_missing_vector(self) -> None:
     """
     Test that we get a ValueError when one of the input elements has no
     vector stored.
     """
     with_vector_a = DescriptorMemoryElement('', 0).set_vector([1, 2, 3])
     without_vector = DescriptorMemoryElement('', 0)  # no set vector
     with_vector_b = DescriptorMemoryElement('', 0).set_vector([4, 5, 6])
     elems = [with_vector_a, without_vector, with_vector_b]
     with pytest.raises(ValueError, match=r"no vector stored"):
         # Wrap in ``list`` so the returned iterable is fully consumed.
         list(self.inst.classify_elements(elems))
Esempio n. 25
0
    def test_remove_from_index_shared_hashes_partial(self) -> None:
        """
        Test that only some hashes are removed from the hash index, but not
        others when those hashes still refer to other descriptors.
        """
        # Simulate initial state with some descriptor hashed to one value and
        # other descriptors hashed to another.

        # Vectors of length 1 for easy dummy hashing prediction.
        descriptors = [
            DescriptorMemoryElement(uid).set_vector([uid])
            for uid in range(5)
        ]

        def parity_hash(vec):
            # Vectors of even sum hash to 0, odd to 1.
            return [vec.sum() % 2]

        # Dummy hash function to do the simulated thing
        hash_func = DummyHashFunctor()
        hash_func.get_hash = mock.Mock(  # type: ignore
            side_effect=parity_hash
        )

        # Pre-populate both stores directly: UID -> descriptor, and
        # hash -> set of UIDs.
        d_set = MemoryDescriptorSet()
        d_set._table = dict(enumerate(descriptors))

        hash2uid_kvs = MemoryKeyValueStore()
        hash2uid_kvs._table = {
            0: {0, 2, 4},
            1: {1, 3},
        }

        idx = LSHNearestNeighborIndex(hash_func, d_set, hash2uid_kvs)
        idx.hash_index = mock.Mock(spec=HashIndex)

        idx.remove_from_index([1, 2, 3])
        # Check that only one hash vector was passed to hash_index's removal
        # method (deque of hash-code vectors).
        expected_removed_hashes = collections.deque([[1]])
        idx.hash_index.remove_from_index.assert_called_once_with(
            expected_removed_hashes
        )
        self.assertDictEqual(
            d_set._table,
            {uid: descriptors[uid] for uid in (0, 4)},
        )
        self.assertDictEqual(hash2uid_kvs._table, {0: {0, 4}})
Esempio n. 26
0
    def test_remove_from_index_shared_hashes(self) -> None:
        """
        Test that removing a descriptor (by UID) that shares a hash with other
        descriptors does not trigger removal of its hash.
        """
        # Simulate descriptors all hashing to the same hash value: 0
        hash_func = DummyHashFunctor()
        hash_func.get_hash = mock.Mock(  # type: ignore
            return_value=np.asarray([0], bool))

        d_set = MemoryDescriptorSet()
        hash2uids_kvs = MemoryKeyValueStore()
        idx = LSHNearestNeighborIndex(hash_func, d_set, hash2uids_kvs)

        # Descriptors are 1 dim, value == index.
        descriptors = [DescriptorMemoryElement('t', uid) for uid in range(5)]
        # Vectors of length 1 for easy dummy hashing prediction.
        for descr in descriptors:
            descr.set_vector(np.ones(1, float) * descr.uuid())
        idx.build_index(descriptors)
        # We expect the descriptor-set and kvs to look like the following now:
        self.assertDictEqual(d_set._table, dict(enumerate(descriptors)))
        self.assertDictEqual(hash2uids_kvs._table, {0: {0, 1, 2, 3, 4}})

        # Mock out hash index as if we had an implementation so we can check
        # call to its remove_from_index method.
        idx.hash_index = mock.Mock(spec=HashIndex)

        idx.remove_from_index([2, 4])

        # Only uid 2 and 4 descriptors should be gone from d-set, kvs should
        # still have the 0 key and its set value should only contain uids 0, 1
        # and 3.  `hash_index.remove_from_index` should not be called because
        # no hashes should be marked for removal.
        self.assertDictEqual(
            d_set._table,
            {uid: descriptors[uid] for uid in (0, 1, 3)},
        )
        self.assertDictEqual(hash2uids_kvs._table, {0: {0, 1, 3}})
        idx.hash_index.remove_from_index.assert_not_called()
Esempio n. 27
0
    def test_nn_empty_index(self) -> None:
        """``nn`` should raise a ``ValueError`` when the index count is 0."""
        index = DummySI()
        # Force the reported index size to zero and stub out ``_nn``.
        # noinspection PyTypeHints
        index.count = mock.MagicMock(return_value=0)  # type: ignore
        # noinspection PyTypeHints
        index._nn = mock.MagicMock()  # type: ignore

        query = DescriptorMemoryElement('q', 0)
        query.set_vector(numpy.random.rand(4))
        with self.assertRaises(ValueError):
            index.nn(query)
Esempio n. 28
0
    def test_nn_normal_conditions(self) -> None:
        """``nn`` should run without error when the index count is non-zero."""
        index = DummySI()
        # Need to force a non-zero index size for knn to be performed.
        # noinspection PyTypeHints
        index.count = mock.MagicMock(return_value=1)  # type: ignore

        query = DescriptorMemoryElement('q', 0)
        query.set_vector(numpy.random.rand(4))
        # Basically this shouldn't crash
        index.nn(query)
Esempio n. 29
0
    def _known_unit(
        self,
        hash_ftor: LshFunctor,
        hash_idx: Optional[HashIndex],
        dist_method: str,
        ftor_train_hook: Callable[[Iterable[DescriptorElement]],
                                  None] = lambda d: None
    ) -> None:
        """
        Shared check: build an LSH index over unit vectors and verify the
        distances returned for a zero-vector query and for an indexed element.

        :param hash_ftor: Hash functor handed to the LSH index.
        :param hash_idx: Optional hash index handed to the LSH index.
        :param dist_method: Distance method name handed to the LSH index.
        :param ftor_train_hook: Callable invoked with the test descriptors
            before the index is constructed. Does nothing by default.
        """
        ###
        # Unit vectors - Equal distance
        #
        dim = 5
        unit_descriptors = []
        for axis in range(dim):
            vec = np.zeros(dim, float)
            vec[axis] = 1.
            elem = DescriptorMemoryElement('unit', axis).set_vector(vec)
            unit_descriptors.append(elem)

        ftor_train_hook(unit_descriptors)

        descr_set = MemoryDescriptorSet()
        kvstore = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(hash_ftor,
                                        descr_set,
                                        kvstore,
                                        hash_index=hash_idx,
                                        distance_method=dist_method)
        index.build_index(unit_descriptors)

        # query with zero vector
        # -> all modeled descriptors have no intersection, dists should be 1.0,
        #    or maximum distance by histogram intersection
        query = DescriptorMemoryElement('query', 0)
        query.set_vector(np.zeros(dim, float))
        _, dists = index.nn(query, dim)
        # All dists should be 1.0, r order doesn't matter
        for dist in dists:
            self.assertEqual(dist, 1.)

        # query with index element, first asking for one neighbor and then
        # for as many neighbors as there are descriptors.
        query = unit_descriptors[3]
        for n_requested in (1, dim):
            neighbors, dists = index.nn(query, n_requested)
            self.assertEqual(neighbors[0], query)
            self.assertEqual(dists[0], 0.)
Esempio n. 30
0
    def test_persistence_with_update_index(self) -> None:
        """
        After a build followed by an update, a new index instance created
        from the same backing stores and elements should still return each
        indexed descriptor as its own nearest neighbor.
        """
        n1 = 100
        n2 = 10
        dim = 8
        set1 = {DescriptorMemoryElement(uid) for uid in range(n1)}
        set2 = {DescriptorMemoryElement(uid) for uid in range(n1, n1+n2)}
        for descr in (set1 | set2):
            descr.set_vector(np.random.rand(dim))

        # Create index with persistent entities
        index_element = DataMemoryElement(
            content_type='application/octet-stream')
        index_param_element = DataMemoryElement(
            content_type='text/plain')
        index = self._make_inst(
            index_element=index_element,
            index_param_element=index_param_element)
        # Keep references to the stores so they can seed the second instance.
        descriptor_set = index._descriptor_set
        idx2uid_kvs = index._idx2uid_kvs
        uid2idx_kvs = index._uid2idx_kvs

        # Build initial index.
        index.build_index(set1)
        self.assertEqual(index.count(), len(set1))
        for descr in set1:
            self.assertIn(descr, index._descriptor_set)

        # Update and check that all intended descriptors are present in
        # index.
        index.update_index(set2)
        set_all = set1 | set2
        self.assertEqual(index.count(), len(set_all))
        for descr in set_all:
            self.assertIn(descr, index._descriptor_set)

        # Drop the first instance and make a new one from the same parts.
        del index
        index = self._make_inst(
            descriptor_set=descriptor_set,
            idx2uid_kvs=idx2uid_kvs,
            uid2idx_kvs=uid2idx_kvs,
            index_element=index_element,
            index_param_element=index_param_element)

        # Check that NN can return something from the updated set.
        # - nearest element to the query element when the query is in the
        #   index should be the query element.
        for query in set_all:
            n_elems, n_dists = index.nn(query)
            self.assertEqual(n_elems[0], query)