Ejemplo n.º 1
0
    def test_pickle_dump_load(self):
        """
        Round-trip two descriptor elements through pickle and check that their
        vectors and ``MEMORY_CACHE`` entries are present again after loading.
        """
        cache_keys = [('test', 0), ('test', 1)]

        # Start from an empty class-level cache.
        DescriptorMemoryElement.MEMORY_CACHE = {}

        # Two descriptors, each given a known vector.
        v1 = numpy.array([1, 2, 3])
        d1 = DescriptorMemoryElement('test', 0)
        d1.set_vector(v1)

        v2 = numpy.array([4, 5, 6])
        d2 = DescriptorMemoryElement('test', 1)
        d2.set_vector(v2)

        for key in cache_keys:
            ntools.assert_in(key, DescriptorMemoryElement.MEMORY_CACHE)

        # Serialize while the elements are still cached.
        d1_s = cPickle.dumps(d1)
        d2_s = cPickle.dumps(d2)

        # Drop the cache so that loading has to repopulate it.
        DescriptorMemoryElement.MEMORY_CACHE = {}
        for key in cache_keys:
            ntools.assert_not_in(key, DescriptorMemoryElement.MEMORY_CACHE)

        # Reconstitute from the pickled data and compare vectors.
        d1_r = cPickle.loads(d1_s)
        d2_r = cPickle.loads(d2_s)
        numpy.testing.assert_array_equal(v1, d1_r.vector())
        numpy.testing.assert_array_equal(v2, d2_r.vector())

        # Loading is expected to have put both entries back in the cache.
        for key in cache_keys:
            ntools.assert_in(key, DescriptorMemoryElement.MEMORY_CACHE)
Ejemplo n.º 2
0
    def test_get_feedback(self):
        """
        A GET on ``/get_feedback`` for a known session should answer 200 and
        list the feedback descriptor UIDs in the order the session gave them.
        """
        controller = self.app.controller
        # The controller reports that the session exists and hands out a mock
        # session object.
        controller.has_session_uuid = mock.MagicMock(return_value=True)
        controller.get_session = mock.MagicMock()

        # The mock session's ``feedback_results`` returns three descriptors in
        # a deliberately non-sorted order.
        d0, d1, d2 = (
            DescriptorMemoryElement('', uid).set_vector([uid])
            for uid in range(3)
        )
        controller.get_session().feedback_results.return_value = [d0, d2, d1]

        test_sid = '0000'
        with self.app.test_client() as tc:
            r = tc.get('/get_feedback?sid={}'.format(test_sid))
            self.assertStatusCode(r, 200)
            self.assertJsonMessageRegex(r, "Returning feedback uuids")
            payload = r.json
            assert payload['total_results'] == 3
            assert payload['results'] == [0, 2, 1]

        controller.has_session_uuid.assert_called_once_with(test_sid)
Ejemplo n.º 3
0
    def test_remove_from_index(self):
        """
        Removing a single UID should drop it from both the descriptor set
        table and the hash-to-UIDs table, leaving the other entries intact.
        """
        # Five descriptors with 1-element vectors whose value equals the UID,
        # which keeps the dummy hash easy to predict.
        descriptors = [DescriptorMemoryElement('t', uid) for uid in range(5)]
        for elem in descriptors:
            elem.set_vector(np.ones(1, float) * elem.uuid())

        d_set = MemoryDescriptorSet()
        hash_kvs = MemoryKeyValueStore()
        idx = LSHNearestNeighborIndex(DummyHashFunctor(), d_set, hash_kvs)
        idx.build_index(descriptors)

        # Remove exactly one UID.
        idx.remove_from_index([3])

        remaining = (0, 1, 2, 4)
        self.assertEqual(
            idx.descriptor_set._table,
            {uid: descriptors[uid] for uid in remaining},
        )
        self.assertEqual(
            idx.hash2uuids_kvstore._table,
            {uid: {uid} for uid in remaining},
        )
Ejemplo n.º 4
0
    def test_nn_pathological_example(self):
        """
        Query an MRPT index built from collinear descriptors and check that
        far fewer neighbors than requested come back.
        """
        n = 10**4
        dim = 256
        depth = 10
        # L ~ n / 2**depth = 10^4 / 2^10 ~ 10
        k = 200
        # 3k / L = 60
        num_trees = 60

        d_set = [DescriptorMemoryElement('test', uid) for uid in range(n)]
        # Each vector is constant with value == UID, so all descriptors lie on
        # one line and different trees end up with the same divisions.
        for elem in d_set:
            # noinspection PyTypeChecker
            elem.set_vector(np.full(dim, elem.uuid(), dtype=np.float64))

        q = DescriptorMemoryElement('q', -1)
        q.set_vector(np.zeros((dim, )))

        mrpt = MRPTNearestNeighborsIndex(
            MemoryDescriptorSet(),
            num_trees=num_trees,
            depth=depth,
            random_seed=0,
        )
        mrpt.build_index(d_set)

        nbrs, dists = mrpt.nn(q, k)
        self.assertEqual(len(nbrs), len(dists))
        # Roughly 10 descriptors are expected back rather than the 200
        # requested.
        self.assertLess(len(nbrs), 20)
Ejemplo n.º 5
0
    def test_adjudicate_add_duplicates(self):
        """
        Re-submitting already adjudicated descriptors must leave the positive
        and negative collections unchanged, i.e. they behave like sets.
        """
        iqrs = IqrSession()

        p0 = DescriptorMemoryElement('', 0).set_vector([0])
        p2 = DescriptorMemoryElement('', 2).set_vector([2])
        n1 = DescriptorMemoryElement('', 1).set_vector([1])
        p3 = DescriptorMemoryElement('', 3).set_vector([3])
        n4 = DescriptorMemoryElement('', 4).set_vector([4])

        # First, adjudicate only a subset of the descriptors.
        iqrs.adjudicate(new_positives=[p0], new_negatives=[n1])
        assert iqrs.positive_descriptors == {p0}
        assert iqrs.negative_descriptors == {n1}

        # Round 1 submits everything (overlapping with the subset above);
        # round 2 repeats the identical call. The resulting state must be the
        # same after each round.
        for _ in range(2):
            iqrs.adjudicate(new_positives=[p0, p2, p3],
                            new_negatives=[n1, n4])
            assert iqrs.positive_descriptors == {p0, p2, p3}
            assert iqrs.negative_descriptors == {n1, n4}
Ejemplo n.º 6
0
    def test_adjudication_switch(self):
        """
        Adjudicating a descriptor with the opposite label should move it from
        one collection to the other (positive becomes negative and vice
        versa).
        """
        iqrs = IqrSession()

        p0, p1, p2, n3, n4 = (
            DescriptorMemoryElement('', uid).set_vector([uid])
            for uid in range(5)
        )

        # Seed the session: UIDs 0-2 positive, UIDs 3-4 negative.
        iqrs.positive_descriptors = {p0, p1, p2}
        iqrs.negative_descriptors = {n3, n4}

        # Flip one negative to positive.
        iqrs.adjudicate(new_positives=[n3])
        assert iqrs.positive_descriptors == {n3, p0, p1, p2}
        assert iqrs.negative_descriptors == {n4}

        # Flip one positive to negative.
        iqrs.adjudicate(new_negatives=[p1])
        assert iqrs.positive_descriptors == {n3, p0, p2}
        assert iqrs.negative_descriptors == {p1, n4}

        # Flip everything that is left in a single call.
        iqrs.adjudicate(new_positives=[n4], new_negatives=[p0, p2])
        assert iqrs.positive_descriptors == {n3, n4}
        assert iqrs.negative_descriptors == {p0, p1, p2}
Ejemplo n.º 7
0
    def test_count_empty_hash2uid(self):
        """
        The LSH index count should follow the hash-to-UID store contents and
        not the number of descriptors held by the descriptor set.
        """
        descr_set = MemoryDescriptorSet()
        hash_kvs = MemoryKeyValueStore()
        self.assertEqual(descr_set.count(), 0)
        self.assertEqual(hash_kvs.count(), 0)

        lsh = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set, hash_kvs)
        self.assertEqual(lsh.count(), 0)

        # Growing only the descriptor set must leave the index count at 0.
        for uid, expected_descriptors in ((0, 1), (1, 2)):
            lsh.descriptor_set.add_descriptor(DescriptorMemoryElement('t', uid))
            self.assertEqual(lsh.descriptor_set.count(), expected_descriptors)
            self.assertEqual(lsh.hash2uuids_kvstore.count(), 0)
            self.assertEqual(lsh.count(), 0)

        # Storing progressively larger UID sets under hash 0 changes the index
        # count, while the descriptor set stays at two elements.
        for uid_set in ({0}, {0, 1}, {0, 1, 2}):
            lsh.hash2uuids_kvstore.add(0, uid_set)
            self.assertEqual(lsh.descriptor_set.count(), 2)
            self.assertEqual(lsh.count(), len(uid_set))
Ejemplo n.º 8
0
    def test_remove_then_add(self):
        """
        Build an index, remove two descriptors from it, extend it with a
        second batch and check that queries reflect both operations.
        """
        n1 = 100
        n2 = 10
        dim = 8
        set1 = [DescriptorMemoryElement('test', i) for i in range(n1)]
        set2 = [DescriptorMemoryElement('test', i) for i in range(n1, n1 + n2)]
        everything = set1 + set2
        for elem in everything:
            elem.set_vector(np.random.rand(dim))
        uids_to_remove = [10, 98]

        index = self._make_inst()
        index.build_index(set1)
        index.remove_from_index(uids_to_remove)
        index.update_index(set2)

        self.assertEqual(len(index), 108)

        # A removed descriptor must not be its own nearest neighbor any more.
        # UIDs equal list positions in ``set1``.
        for uid in uids_to_remove:
            gone = set1[uid]
            self.assertNotEqual(index.nn(gone, 1)[0][0], gone)

        # Everything else should still come back as its own nearest neighbor.
        for elem in everything:
            if elem.uuid() in uids_to_remove:
                continue
            self.assertEqual(index.nn(elem, 1)[0][0], elem)

        self.assertEqual(index._next_index, 110)
Ejemplo n.º 9
0
    def test_nn_known_descriptors_euclidean_ordered(self):
        """
        Neighbors of the origin should be returned in increasing UID order
        when descriptor ``j`` is the vector ``[j, 2j]``.
        """
        index = self._make_inst()

        # Descriptor j sits at (j, 2j), so its distance from (0, 0) grows
        # with j.
        i = 100
        test_descriptors = []
        for uid in range(i):
            elem = DescriptorMemoryElement('ordered', uid)
            elem.set_vector(np.array([uid, uid * 2], float))
            test_descriptors.append(elem)
        # Build order should not matter, so shuffle before indexing.
        random.shuffle(test_descriptors)
        index.build_index(test_descriptors)

        q = DescriptorMemoryElement('query', 99)
        q.set_vector(np.array([0, 0], float))
        r, dists = index.nn(q, n=i)

        # All points lie on a single line, so every cell holds the same
        # points (any division is just a point on that line), and a cell
        # cannot hold more than half of the points.
        self.assertEqual(len(dists), i // 2)
        for expected_uid, elem, _dist in zip(range(i), r, dists):
            self.assertEqual(elem.uuid(), expected_uid)
            np.testing.assert_equal(elem.vector(),
                                    [expected_uid, expected_uid * 2])
Ejemplo n.º 10
0
    def _known_ordered_euclidean(self, hash_ftor, hash_idx,
                                 ftor_train_hook=lambda d: None):
        """
        Shared check that an LSH index using euclidean distance returns
        descriptors in increasing distance from the query.

        :param hash_ftor: Hash functor handed to the LSH index.
        :param hash_idx: Hash index handed to the LSH index.
        :param ftor_train_hook: Callable invoked with the descriptor list
            before the index is built; does nothing by default.
        """
        # Descriptor j sits at (j, 2j), so its distance from (0, 0) grows
        # with j.
        i = 1000
        test_descriptors = []
        for uid in range(i):
            elem = DescriptorMemoryElement('ordered', uid)
            elem.set_vector(np.array([uid, uid*2], float))
            test_descriptors.append(elem)
        random.shuffle(test_descriptors)

        ftor_train_hook(test_descriptors)

        index = LSHNearestNeighborIndex(hash_ftor,
                                        MemoryDescriptorIndex(),
                                        MemoryKeyValueStore(),
                                        hash_index=hash_idx,
                                        distance_method='euclidean')
        index.build_index(test_descriptors)

        q = DescriptorMemoryElement('query', i)
        q.set_vector(np.array([0, 0], float))

        # The five nearest results should be UIDs 0..4, in that order.
        r, dists = index.nn(q, 5)
        for rank in range(5):
            self.assertEqual(r[rank].uuid(), rank)

        # Asking for every descriptor should give the complete ordering.
        r, dists = index.nn(q, i)
        for expected_uid, elem, _dist in zip(range(i), r, dists):
            self.assertEqual(elem.uuid(), expected_uid)
Ejemplo n.º 11
0
    def test_update_index_additive(self):
        """
        Updating an already built index with new descriptors should add them
        on top of the existing ones and make them queryable.
        """
        n1 = 100
        n2 = 10
        dim = 8
        set1 = {DescriptorMemoryElement('test', i) for i in range(n1)}
        set2 = {DescriptorMemoryElement('test', i) for i in range(n1, n1 + n2)}
        set_all = set1 | set2
        # Give every descriptor a random vector. A plain loop is used because
        # this is a pure side effect, and a single union covers both sets.
        for d in set_all:
            d.set_vector(np.random.rand(dim))

        # Create and build initial index.
        index = self._make_inst()
        index.build_index(set1)
        self.assertEqual(index.count(), len(set1))
        for d in set1:
            self.assertIn(d, index._descriptor_set)

        # Update and check that all intended descriptors are present in index.
        index.update_index(set2)
        self.assertEqual(index.count(), len(set_all))
        for d in set_all:
            self.assertIn(d, index._descriptor_set)

        # Check that NN can return something from the updated set.
        # - nearest element to the query element when the query is in the index
        #   should be the query element.
        for q in set2:
            n_elems, n_dists = index.nn(q)
            self.assertEqual(n_elems[0], q)
Ejemplo n.º 12
0
    def test_nn_known_descriptors_hik_unit(self):
        """
        With unit vectors indexed under the 'hik' method, a zero-vector query
        should be at distance 1 from everything and an indexed element at
        distance 0 from itself.
        """
        dim = 5
        index = self._make_inst('hik')

        # One descriptor per axis: all zeros except a 1 at position ``axis``.
        test_descriptors = []
        for axis in range(dim):
            unit = numpy.zeros(dim, float)
            unit[axis] = 1.
            elem = DescriptorMemoryElement('unit', axis)
            elem.set_vector(unit)
            test_descriptors.append(elem)
        index.build_index(test_descriptors)

        # The zero vector shares nothing with any unit vector, so every
        # distance should be 1.0; the result order is irrelevant.
        q = DescriptorMemoryElement('query', 0)
        q.set_vector(numpy.zeros(dim, float))
        r, dists = index.nn(q, dim)
        for dist in dists:
            self.assertEqual(dist, 1.)

        # Querying with an indexed element should return that element first
        # at distance 0, both for a single result and for all results.
        q = test_descriptors[3]
        for n_results in (1, dim):
            r, dists = index.nn(q, n_results)
            self.assertEqual(r[0], q)
            self.assertEqual(dists[0], 0.)
Ejemplo n.º 13
0
    def test_known_descriptors_euclidean_ordered(self):
        """
        Neighbors of the origin should be returned in increasing UID order
        when descriptor ``j`` is the vector ``[j, 2j]``.
        """
        index = self._make_inst('euclidean')

        # make vectors to return in a known euclidean distance order
        i = 1000
        test_descriptors = []
        # ``range`` instead of ``xrange``: the latter does not exist on
        # Python 3, and ``range`` is already used further down in this test.
        for j in range(i):
            d = DescriptorMemoryElement('ordered', j)
            d.set_vector(numpy.array([j, j * 2], float))
            test_descriptors.append(d)
        # Build order should not matter, so shuffle before indexing.
        random.shuffle(test_descriptors)
        index.build_index(test_descriptors)

        # Since descriptors were built in increasing distance from (0,0),
        # returned descriptors for a query of [0,0] should be in index order.
        q = DescriptorMemoryElement('query', i)
        q.set_vector(numpy.array([0, 0], float))
        # top result should have UUID == 0 (nearest to query)
        r, dists = index.nn(q, 5)
        ntools.assert_equal(r[0].uuid(), 0)
        ntools.assert_equal(r[1].uuid(), 1)
        ntools.assert_equal(r[2].uuid(), 2)
        ntools.assert_equal(r[3].uuid(), 3)
        ntools.assert_equal(r[4].uuid(), 4)
        # global search should be in complete order
        r, dists = index.nn(q, i)
        for j, d, dist in zip(range(i), r, dists):
            ntools.assert_equal(d.uuid(), j)
Ejemplo n.º 14
0
    def test_feedback_results_has_results_post_reset(self):
        """
        ``feedback_results`` should return an empty list after ``reset`` even
        though a feedback list was present beforehand.
        """
        # Populate the session's feedback list directly.
        d0, d1, d2, d3 = (
            DescriptorMemoryElement('', uid).set_vector([uid])
            for uid in range(4)
        )
        self.iqrs.feedback_list = {d0, d1, d2, d3}

        # Before the reset, ``feedback_results`` must return something.
        assert self.iqrs.feedback_results() is not None

        self.iqrs.reset()

        # After the reset the returned list must be empty.
        post_reset = self.iqrs.feedback_results()
        assert post_reset == []
Ejemplo n.º 15
0
    def test_build_index_fresh_build(self):
        """
        Building an index from scratch should fill both the attached
        descriptor set and the hash key-value store.
        """
        descr_set = MemoryDescriptorSet()
        hash_kvs = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set,
                                        hash_kvs)

        # 1-element vectors whose value equals the list position keep the
        # dummy hash easy to predict.
        descriptors = [DescriptorMemoryElement('t', uid) for uid in range(5)]
        for position, elem in enumerate(descriptors):
            elem.set_vector(np.ones(1, float) * position)
        index.build_index(descriptors)

        # Every descriptor should now be in the attached descriptor set.
        self.assertEqual(descr_set.count(), 5)
        for elem in descriptors:
            self.assertIn(elem, descr_set)

        # Each hash key 0..4 is expected to map to the single matching UID.
        self.assertEqual(hash_kvs.count(), 5)
        for key in range(5):
            self.assertSetEqual(hash_kvs.get(key), {key})
Ejemplo n.º 16
0
    def test_nn_small_leaves(self):
        """
        With randomly distributed descriptors, an MRPT query should return
        exactly the requested number of neighbors.
        """
        np.random.seed(0)

        n = 10**4
        dim = 256
        depth = 10
        # L ~ n / 2**depth = 10^4 / 2^10 ~ 10
        k = 200
        # 3k / L = 60
        num_trees = 60

        d_set = [DescriptorMemoryElement('test', uid) for uid in range(n)]
        for elem in d_set:
            elem.set_vector(np.random.rand(dim))

        q = DescriptorMemoryElement('q', -1)
        q.set_vector(np.zeros((dim, )))

        mrpt = MRPTNearestNeighborsIndex(
            MemoryDescriptorSet(),
            num_trees=num_trees,
            depth=depth,
            random_seed=0,
        )
        mrpt.build_index(d_set)

        nbrs, dists = mrpt.nn(q, k)
        self.assertEqual(len(nbrs), len(dists))
        self.assertEqual(len(nbrs), k)
Ejemplo n.º 17
0
    def test_update_index_no_existing_index(self):
        """
        Calling ``update_index`` on an index that was never built should
        behave like a fresh build. Mirrors the fresh-build test but goes
        through ``update_index``.
        """
        descr_set = MemoryDescriptorSet()
        hash_kvs = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set,
                                        hash_kvs)

        # 1-element vectors whose value equals the UID keep the dummy hash
        # easy to predict.
        descriptors = [DescriptorMemoryElement('t', uid) for uid in range(5)]
        for elem in descriptors:
            elem.set_vector(np.ones(1, float) * elem.uuid())
        index.update_index(descriptors)

        # Every descriptor should now be in the attached descriptor set.
        self.assertEqual(descr_set.count(), 5)
        for elem in descriptors:
            self.assertIn(elem, descr_set)

        # Each hash key 0..4 is expected to map to the single matching UID.
        self.assertEqual(hash_kvs.count(), 5)
        for key in range(5):
            self.assertSetEqual(hash_kvs.get(key), {key})
Ejemplo n.º 18
0
    def test_adjudicate_remove_pos_neg(self):
        """
        The ``un_positives`` / ``un_negatives`` parameters should remove
        descriptors from the respective adjudication collections.
        """
        iqrs = IqrSession()

        p0, p1, p2, n3, n4 = (
            DescriptorMemoryElement('', uid).set_vector([uid])
            for uid in range(5)
        )

        # Seed the session: UIDs 0-2 positive, UIDs 3-4 negative.
        iqrs.positive_descriptors = {p0, p1, p2}
        iqrs.negative_descriptors = {n3, n4}

        # Withdraw one positive; the negatives stay untouched.
        iqrs.adjudicate(un_positives=[p1])
        assert iqrs.positive_descriptors == {p0, p2}
        assert iqrs.negative_descriptors == {n3, n4}

        # Withdraw one negative; the positives stay untouched.
        iqrs.adjudicate(un_negatives=[n3])
        assert iqrs.positive_descriptors == {p0, p2}
        assert iqrs.negative_descriptors == {n4}

        # Withdraw everything that is left in one call.
        iqrs.adjudicate(un_positives=[p0, p2], un_negatives=[n4])
        assert iqrs.positive_descriptors == set()
        assert iqrs.negative_descriptors == set()
Ejemplo n.º 19
0
    def test_nn_known_descriptors_euclidean_unit(self):
        """
        With one unit vector per axis indexed, a zero-vector query should get
        every descriptor back at distance 1.
        """
        dim = 5
        index = self._make_inst()

        # One descriptor per axis: all zeros except a 1 at position ``axis``.
        test_descriptors = []
        for axis in range(dim):
            unit = np.zeros(dim, float)
            unit[axis] = 1.
            elem = DescriptorMemoryElement('unit', axis)
            elem.set_vector(unit)
            test_descriptors.append(elem)
        index.build_index(test_descriptors)

        # The origin is equally far from every unit corner.
        q = DescriptorMemoryElement('query', 0)
        q.set_vector(np.zeros(dim, float))
        r, dists = index.nn(q, n=dim)
        self.assertEqual(len(dists), dim)
        # Result order is irrelevant; only the distances are checked.
        for dist in dists:
            self.assertEqual(dist, 1.)
Ejemplo n.º 20
0
    def test_ordered_results_has_results_post_reset(self):
        """
        ``ordered_results`` should return an empty list after ``reset`` even
        though a results mapping was present beforehand.
        """
        iqrs = IqrSession()

        # Populate the session's results mapping directly.
        d0, d1, d2, d3 = (
            DescriptorMemoryElement('', uid).set_vector([uid])
            for uid in range(4)
        )
        iqrs.results = dict(zip((d0, d1, d2, d3), (0.0, 0.8, 0.2, 0.4)))

        # Before the reset, ``ordered_results`` must return something.
        assert iqrs.ordered_results() is not None

        iqrs.reset()

        # After the reset the returned list must be empty.
        post_reset = iqrs.ordered_results()
        assert post_reset == []
Ejemplo n.º 21
0
    def test_get_unadjudicated_relevancy(self):
        """
        A GET on ``/get_unadjudicated_relevancy`` for a known session should
        answer 200 and return ``[uid, score]`` pairs in the order the session
        gave them.
        """
        controller = self.app.controller
        # The controller reports that the session exists and hands out a mock
        # session object.
        controller.has_session_uuid = mock.MagicMock(return_value=True)
        controller.get_session = mock.MagicMock()

        # The mock session's ``get_unadjudicated_relevancy`` returns three
        # descriptor/score pairs.
        d0, d1, d2 = (
            DescriptorMemoryElement('', uid).set_vector([uid])
            for uid in range(3)
        )
        session = controller.get_session()
        session.get_unadjudicated_relevancy.return_value = [
            [d0, 0.3], [d2, 0.2], [d1, 0.1],
        ]

        test_sid = '0000'
        with self.app.test_client() as tc:
            r = tc.get('/get_unadjudicated_relevancy?sid={}'.format(test_sid))
            self.assertStatusCode(r, 200)
            self.assertJsonMessageRegex(r, "success")
            payload = r.json
            assert payload['total'] == 3
            assert payload['results'] == [[0, 0.3], [2, 0.2], [1, 0.1]]

        controller.has_session_uuid.assert_called_once_with(test_sid)
Ejemplo n.º 22
0
    def _random_euclidean(self,
                          hash_ftor,
                          hash_idx,
                          ftor_train_hook=lambda d: None):
        """
        Shared check of an LSH index using euclidean distance over random
        descriptors.

        :param hash_ftor: Hash function class for generating hash codes for
            descriptors.
        :param hash_idx: Hash index instance to use in local LSH algo
            instance.
        :param ftor_train_hook: Function for training functor if necessary.
        """
        # Seeded random descriptors so the data is reproducible.
        i = 1000
        dim = 256
        numpy.random.seed(self.RANDOM_SEED)
        td = []
        for uid in range(i):
            elem = DescriptorMemoryElement('random', uid)
            elem.set_vector(numpy.random.rand(dim))
            td.append(elem)

        ftor_train_hook(td)

        index = LSHNearestNeighborIndex(hash_ftor,
                                        MemoryDescriptorIndex(),
                                        MemoryKeyValueStore(),
                                        hash_index=hash_idx,
                                        distance_method='euclidean')
        index.build_index(td)

        # A descriptor from the build set should be its own single nearest
        # neighbor.
        q = td[255]
        r, dists = index.nn(q, 1)
        ntools.assert_equal(r[0], q)

        # Nudge a copy of a build vector at its first and last positions; the
        # original should still be the nearest neighbor.
        td_q = td[0]
        q = DescriptorMemoryElement('query', i)
        nudged = td_q.vector().copy()
        shift = max(nudged.min(), 0.1)
        nudged[0] += shift
        nudged[dim - 1] -= shift
        q.set_vector(nudged)
        r, dists = index.nn(q, 1)
        ntools.assert_false(numpy.array_equal(q.vector(), td_q.vector()))
        ntools.assert_equal(r[0], td_q)

        # For a fresh random query, distances must be strictly increasing
        # both for a small result set and for the full set.
        q = DescriptorMemoryElement('query', i + 1)
        q.set_vector(numpy.random.rand(dim))
        for n_results in (10, i):
            r, dists = index.nn(q, n_results)
            for pos in range(1, len(dists)):
                ntools.assert_greater(dists[pos], dists[pos - 1])
    def test_simple_multiclass_classification(self):
        """
        Train on three well separated 1-D value ranges and check that test
        samples from each range score highest for their own label.
        """
        # Fixed seed keeps the generated data deterministic.
        numpy.random.seed(0)

        N = 1000
        LABEL_1 = 'p1'
        LABEL_2 = 'p2'
        LABEL_3 = 'p3'
        all_labels = (LABEL_1, LABEL_2, LABEL_3)

        def sample(lo, hi):
            # N random draws rescaled to [lo, hi], as a column of shape
            # (N, 1).
            return numpy.interp(numpy.random.rand(N), [0, 1],
                                [lo, hi])[:, numpy.newaxis]

        def as_elements(vectors, first_uid):
            # Wrap each row in a descriptor element with consecutive UIDs.
            return [DescriptorMemoryElement('train', uid).set_vector(vec)
                    for uid, vec in enumerate(vectors, start=first_uid)]

        # Training data: one dimension is enough for an obvious separation;
        # this is not a performance test.
        train1 = sample(0.0, .30)
        train2 = sample(.40, .60)
        train3 = sample(.70, 1.0)

        train1_e = as_elements(train1, 0)
        train2_e = as_elements(train2, len(train1_e))
        train3_e = as_elements(train3, len(train1_e) + len(train2_e))

        # Test data drawn from the same three ranges.
        test1 = sample(0.0, .30)
        test2 = sample(.40, .60)
        test3 = sample(.70, 1.0)

        classifier = SkLearnLogisticRegression(random_state=0)
        classifier.train({
            LABEL_1: train1_e,
            LABEL_2: train2_e,
            LABEL_3: train3_e,
        })
        c_maps_l1 = list(classifier._classify_arrays(test1))
        c_maps_l2 = list(classifier._classify_arrays(test2))
        c_maps_l3 = list(classifier._classify_arrays(test3))

        # For every test vector, its own label must beat both other labels.
        checks = (
            (LABEL_1, test1, c_maps_l1),
            (LABEL_2, test2, c_maps_l2),
            (LABEL_3, test3, c_maps_l3),
        )
        for label, vectors, c_maps in checks:
            others = [other for other in all_labels if other != label]
            for v, m in zip(vectors, c_maps):
                assert m[label] > max(m[other] for other in others), \
                    "Incorrect {} label: c_map={} :: test_vector={}".format(
                        label, m, v
                    )
Ejemplo n.º 24
0
    def test_remove_from_index_invalid_uid(self):
        """
        Removing a UID that is not in the index should raise a ``KeyError``
        and leave the index untouched, even when a valid UID is part of the
        same request.
        """
        # 1-element vectors whose value equals the UID keep the dummy hash
        # easy to predict.
        descriptors = [DescriptorMemoryElement('t', uid) for uid in range(5)]
        for elem in descriptors:
            elem.set_vector(np.ones(1, float) * elem.uuid())

        # uid -> descriptor
        expected_dset_table = {uid: descriptors[uid] for uid in range(5)}
        # hash int -> set[uid]
        expected_kvs_table = {uid: {uid} for uid in range(5)}

        d_set = MemoryDescriptorIndex()
        hash_kvs = MemoryKeyValueStore()
        idx = LSHNearestNeighborIndex(DummyHashFunctor(), d_set, hash_kvs)
        idx.build_index(descriptors)

        def assert_tables_as_built():
            # Both backing tables must match the freshly built state.
            self.assertEqual(idx.descriptor_index._table, expected_dset_table)
            self.assertEqual(idx.hash2uuids_kvstore._table,
                             expected_kvs_table)

        assert_tables_as_built()

        # First a lone invalid UID, then a valid UID paired with an invalid
        # one. Either request must fail and change nothing.
        for bad_request in ([5], [2, 5]):
            self.assertRaisesRegexp(
                KeyError, '5',
                idx.remove_from_index, bad_request
            )
            assert_tables_as_built()
Ejemplo n.º 25
0
 def test_classify_elements_missing_vector(self):
     """
     Classifying a batch in which one element has no vector set should
     raise a ``ValueError`` whose message matches "no vector stored".
     """
     with_vector_a = DescriptorMemoryElement('', 0).set_vector([1, 2, 3])
     without_vector = DescriptorMemoryElement('', 0)  # vector never set
     with_vector_b = DescriptorMemoryElement('', 0).set_vector([4, 5, 6])
     elems = [with_vector_a, without_vector, with_vector_b]
     # ``list`` drains the returned iterable so the error is raised inside
     # the ``raises`` context.
     with pytest.raises(ValueError, match=r"no vector stored"):
         list(self.inst.classify_elements(elems))
Ejemplo n.º 26
0
    def test_remove_from_index_shared_hashes_partial(self):
        """
        Test that removing descriptors only drops the hashes that end up
        with no remaining UIDs, while hashes still referenced by other
        descriptors are kept.
        """
        def parity_hash(vec):
            # Vectors with an even sum hash to 0, odd sums hash to 1.
            return [vec.sum() % 2]

        # Length-1 vectors whose single value equals the descriptor UID, so
        # the parity hash of each descriptor is easy to predict.
        descriptors = [
            DescriptorMemoryElement('t', uid).set_vector([uid])
            for uid in range(5)
        ]

        # Dummy hash functor whose hashing is replaced by the parity rule.
        hash_func = DummyHashFunctor()
        hash_func.get_hash = mock.Mock(side_effect=parity_hash)

        # Seed the descriptor index directly: uid -> descriptor.
        d_set = MemoryDescriptorIndex()
        d_set._table = dict(enumerate(descriptors))

        # Seed the key-value store directly: hash int -> set of uids.
        hash2uid_kvs = MemoryKeyValueStore()
        hash2uid_kvs._table = {0: {0, 2, 4}, 1: {1, 3}}

        idx = LSHNearestNeighborIndex(hash_func, d_set, hash2uid_kvs)
        idx.hash_index = mock.Mock(spec=HashIndex)

        idx.remove_from_index([1, 2, 3])

        # UIDs 1 and 3 were the only members of hash 1, so exactly that one
        # hash vector should be handed to the hash index's removal method
        # (as a deque of hash-code vectors).
        expected_removed = collections.deque([[1]])
        idx.hash_index.remove_from_index.assert_called_once_with(
            expected_removed
        )

        # Only UIDs 0 and 4 remain, both under hash 0.
        remaining = {uid: descriptors[uid] for uid in (0, 4)}
        self.assertDictEqual(d_set._table, remaining)
        self.assertDictEqual(hash2uid_kvs._table, {0: {0, 4}})
Ejemplo n.º 27
0
    def test_remove_from_index_shared_hashes(self):
        """
        Test that removing descriptors (by UID) whose hash is shared with
        other, remaining descriptors does not trigger removal of that hash.
        """
        # Every descriptor is made to hash to the same value: 0
        hash_func = DummyHashFunctor()
        hash_func.get_hash = mock.Mock(return_value=np.asarray([0], bool))

        d_set = MemoryDescriptorSet()
        hash2uids_kvs = MemoryKeyValueStore()
        idx = LSHNearestNeighborIndex(hash_func, d_set, hash2uids_kvs)

        # Five descriptors, each given a length-1 vector holding its own UID.
        descriptors = [DescriptorMemoryElement('t', uid) for uid in range(5)]
        for descr in descriptors:
            descr.set_vector(np.ones(1, float) * descr.uuid())
        idx.build_index(descriptors)

        # After the build, the descriptor-set maps each UID to its descriptor
        # and the kvs holds all UIDs under the single hash 0.
        self.assertDictEqual(d_set._table, dict(enumerate(descriptors)))
        self.assertDictEqual(hash2uids_kvs._table, {0: {0, 1, 2, 3, 4}})

        # Stand-in hash index so that calls to its ``remove_from_index``
        # method can be inspected.
        idx.hash_index = mock.Mock(spec=HashIndex)

        idx.remove_from_index([2, 4])

        # Descriptors 2 and 4 should be gone from the descriptor-set, and the
        # kvs should keep key 0 with only UIDs 0, 1 and 3 left in its set.
        kept = {uid: descriptors[uid] for uid in (0, 1, 3)}
        self.assertDictEqual(d_set._table, kept)
        self.assertDictEqual(hash2uids_kvs._table, {0: {0, 1, 3}})
        # No hash became empty, so nothing should have been removed from the
        # hash index.
        idx.hash_index.remove_from_index.assert_not_called()
Ejemplo n.º 28
0
    def test_input_immutability(self):
        """
        Check that a vector stored in a descriptor element is unaffected by
        later in-place changes to the array it was set from.
        """
        #
        # Case 1: a standalone vector
        #
        source_vec = numpy.random.rand(16)
        truth = tuple(source_vec.copy())
        elem = DescriptorMemoryElement('test', 0)
        elem.set_vector(source_vec)
        # Zero the source in place; the stored vector must keep its values.
        source_vec[:] = 0
        ntools.assert_true((source_vec == 0).all())
        ntools.assert_false(sum(truth) == 0.)
        numpy.testing.assert_equal(elem.vector(), truth)

        #
        # Case 2: rows pulled out of a matrix
        #
        matrix = numpy.random.rand(20, 16)
        rows = [matrix[r] for r in (3, 15, 19)]

        # Keep the expected values as immutable tuples built from copies.
        truths = [tuple(row.copy()) for row in rows]

        # One element per row, with UIDs 1, 2, 3.
        elems = []
        for uid, row in enumerate(rows, start=1):
            row_elem = DescriptorMemoryElement('test', uid)
            row_elem.set_vector(row)
            elems.append(row_elem)

        for row, row_elem in zip(rows, elems):
            numpy.testing.assert_equal(row, row_elem.vector())

        # Zeroing the whole matrix zeroes the row arrays taken from it, but
        # must not change what the elements have stored.
        matrix[:, :] = 0.
        for row in rows:
            ntools.assert_true((row == 0).all())
        for expected in truths:
            ntools.assert_false(sum(expected) == 0.)
        for row_elem, expected in zip(elems, truths):
            numpy.testing.assert_equal(row_elem.vector(), expected)
Ejemplo n.º 29
0
    def test_update_index_nonzero_descriptors(self):
        """
        Test that ``update_index`` with a non-empty set of descriptors calls
        ``_update_index`` exactly once with those same descriptors.
        """
        index = DummySI()
        index._update_index = mock.MagicMock()

        # Dummy input data: four descriptor elements with UIDs 0 through 3.
        d_set = {DescriptorMemoryElement('test', uid) for uid in range(4)}
        index.update_index(d_set)

        index._update_index.assert_called_once()
        # The first positional argument of the recorded call should contain
        # exactly the descriptors that were passed in.
        positional_args = index._update_index.call_args[0]
        self.assertSetEqual(set(positional_args[0]), d_set)
Ejemplo n.º 30
0
    def test_update_index_existing_descriptors_frozenset(self):
        """
        Same as ``test_update_index_similar_descriptors``, but checks that
        the index can be updated when its structures are seeded with
        existing values, here with frozenset values in the key-value store.
        """
        # Two batches of distinct descriptor instances with matching vectors:
        # UIDs 0-4 to seed with and UIDs 5-9 to update with.
        descriptors1 = [
            DescriptorMemoryElement('t', i).set_vector([i])
            for i in range(5)
        ]
        descriptors2 = [
            DescriptorMemoryElement('t', 5 + i).set_vector([i])
            for i in range(5)
        ]

        descr_set = MemoryDescriptorSet()
        descr_set.add_many_descriptors(descriptors1)

        # Seed the KVS with one frozenset of UIDs per hash value.
        hash_kvs = MemoryKeyValueStore()
        for hash_val in range(5):
            hash_kvs.add(hash_val, frozenset({hash_val}))

        index = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set,
                                        hash_kvs)
        index.update_index(descriptors2)

        assert descr_set.count() == 10
        # Both batches should be considered "in" the descriptor set now.
        for descr in descriptors1 + descriptors2:
            assert descr in descr_set
        # The KVS keys are unchanged and each hash now maps to the seeded
        # UID plus the UID of the new descriptor with the same vector.
        assert set(hash_kvs.keys()) == {0, 1, 2, 3, 4}
        for hash_val in range(5):
            assert hash_kvs.get(hash_val) == {hash_val, hash_val + 5}