Esempio n. 1
0
 def test_save_model_with_cache(self, m_savez: mock.MagicMock) -> None:
     """
     Building the internal ball tree with a cache element set should
     invoke the patched save function exactly once.

     :param m_savez: Mock injected by a patch decorator that is not part
         of this snippet (presumably wrapping the model save routine --
         confirm against the decorator).
     """
     n_rows, n_bits = 1000, 256
     cache = DataMemoryElement()
     hash_index = SkLearnBallTreeHashIndex(cache, random_seed=0)
     flat_bits = np.random.randint(0, 2, n_rows * n_bits)
     hash_index._build_bt_internal(flat_bits.reshape(n_rows, n_bits))
     self.assertTrue(m_savez.called)
     self.assertEqual(m_savez.call_count, 1)
Esempio n. 2
0
 def test_save_model_no_cache(self, m_savez: mock.MagicMock) -> None:
     """
     Building the internal ball tree without a cache element must not
     touch the patched save function.

     :param m_savez: Mock injected by a patch decorator that is not part
         of this snippet.
     """
     n_rows, n_bits = 1000, 256
     hash_index = SkLearnBallTreeHashIndex()
     flat_bits = np.random.randint(0, 2, n_rows * n_bits)
     hash_index._build_bt_internal(flat_bits.reshape(n_rows, n_bits))
     # No cache element was configured, so nothing should be serialized.
     self.assertFalse(m_savez.called)
Esempio n. 3
0
    def test_count_nonempty(self) -> None:
        """
        ``count`` should report the number of hash vectors the index was
        built with.
        """
        n_rows, n_bits = 234, 256
        hash_index = SkLearnBallTreeHashIndex()
        # 234 random bit vectors, each 256 bits long.
        bit_matrix = np.random.randint(0, 2, n_rows * n_bits)
        bit_matrix = bit_matrix.reshape(n_rows, n_bits)
        hash_index.build_index(bit_matrix)

        self.assertEqual(hash_index.count(), 234)
Esempio n. 4
0
    def test_get_config(self) -> None:
        """
        A default-constructed index should report exactly three
        configuration keys, with a dict-valued cache element whose type
        is unset.
        """
        config = SkLearnBallTreeHashIndex().get_config()

        self.assertEqual(len(config), 3)
        for expected_key in ('cache_element', 'leaf_size', 'random_seed'):
            self.assertIn(expected_key, config)

        cache_config = config['cache_element']
        self.assertIsInstance(cache_config, dict)
        self.assertIsNone(cache_config['type'])
Esempio n. 5
0
    def test_init_consistency(self) -> None:
        """
        An instance created via ``from_config`` should hand back the very
        same configuration from ``get_config``.
        """
        config = SkLearnBallTreeHashIndex.get_default_config()

        # The default configuration must itself round-trip.
        from_default = SkLearnBallTreeHashIndex.from_config(config)
        self.assertEqual(from_default.get_config(), config)

        # Same check again with a concrete (non-null) cache element type.
        memory_element_type = (
            'smqtk_dataprovider.impls.data_element.memory.DataMemoryElement'
        )
        config['cache_element']['type'] = memory_element_type
        from_cached = SkLearnBallTreeHashIndex.from_config(config)
        self.assertEqual(from_cached.get_config(), config)
Esempio n. 6
0
    def test_build_index(self) -> None:
        """
        After ``build_index`` the internal ball tree should exist and hold
        the same rows as the input matrix.
        """
        n_rows, n_bits = 1000, 256
        hash_index = SkLearnBallTreeHashIndex(random_seed=0)
        # 1000 random bit vectors, each 256 bits long.
        bit_matrix = np.random.randint(0, 2, n_rows * n_bits)
        bit_matrix = bit_matrix.reshape(n_rows, n_bits)
        hash_index.build_index(bit_matrix)
        assert hash_index.bt is not None, (
            "Internal ball-tree structure should be constructed at this point."
        )

        self.assertIsNotNone(hash_index.bt)
        # Row order inside the tree is not assumed, so both sides are
        # sorted before being compared.
        indexed_rows = sorted(np.array(hash_index.bt.data).tolist())
        source_rows = sorted(bit_matrix.tolist())
        np.testing.assert_array_almost_equal(indexed_rows, source_rows)
Esempio n. 7
0
    def test_remove_from_index_invalid_key_single(self) -> None:
        """
        Removing a single hash that was never indexed should raise
        ``KeyError`` and leave the indexed data untouched.
        """
        hash_index = SkLearnBallTreeHashIndex(random_seed=0)
        hashes = np.ndarray((1000, 256), bool)
        for row in range(1000):
            hashes[row] = int_to_bit_vector_large(row, 256)
        hash_index.build_index(hashes)
        assert hash_index.bt is not None, (
            "Internal ball-tree structure should be constructed at this point."
        )
        # Snapshot of the data block, used to show nothing was removed.
        data_before = np.copy(hash_index.bt.data)

        # 1001 lies outside the indexed integer range 0..999.
        missing = [
            int_to_bit_vector_large(1001, 256),
        ]
        with self.assertRaises(KeyError):
            hash_index.remove_from_index(missing)
        np.testing.assert_array_equal(data_before,
                                      np.asarray(hash_index.bt.data))
Esempio n. 8
0
 def test_default_configuration(self) -> None:
     """
     The default configuration should carry three entries: an untyped
     cache element dict, a leaf size of 40 and no random seed.
     """
     defaults = SkLearnBallTreeHashIndex.get_default_config()
     self.assertEqual(len(defaults), 3)
     cache_config = defaults['cache_element']
     self.assertIsInstance(cache_config, dict)
     self.assertIsNone(cache_config['type'])
     self.assertEqual(defaults['leaf_size'], 40)
     self.assertIsNone(defaults['random_seed'])
Esempio n. 9
0
 def test_init_without_cache(self) -> None:
     """
     Constructor arguments should be stored as given, and no ball tree
     should exist when there is no cache element.
     """
     ctor_kwargs = dict(cache_element=None, leaf_size=52, random_seed=42)
     hash_index = SkLearnBallTreeHashIndex(**ctor_kwargs)
     self.assertIsNone(hash_index.cache_element)
     self.assertEqual(hash_index.leaf_size, 52)
     self.assertEqual(hash_index.random_seed, 42)
     self.assertIsNone(hash_index.bt)
Esempio n. 10
0
 def test_init_with_empty_cache(self) -> None:
     """
     Constructing with an empty cache element should keep that element
     and the other arguments, and leave the ball tree unset.
     """
     cache = DataMemoryElement()
     hash_index = SkLearnBallTreeHashIndex(
         cache_element=cache, leaf_size=52, random_seed=42,
     )
     self.assertEqual(hash_index.cache_element, cache)
     self.assertEqual(hash_index.leaf_size, 52)
     self.assertEqual(hash_index.random_seed, 42)
     self.assertIsNone(hash_index.bt)
Esempio n. 11
0
    def test_remove_from_index(self) -> None:
        """
        Removing two indexed hashes should shrink the data block by two
        rows and drop exactly those hashes from it.
        """
        removed_ints = (42, 998)

        hash_index = SkLearnBallTreeHashIndex(random_seed=0)
        hashes = np.ndarray((1000, 256), bool)
        for row in range(1000):
            hashes[row] = int_to_bit_vector_large(row, 256)
        hash_index.build_index(hashes)
        assert hash_index.bt is not None, (
            "Internal ball-tree structure should be constructed at this point."
        )
        # All 1000 entries should be present right after the build.
        self.assertEqual(hash_index.bt.data.shape, (1000, 256))

        hash_index.remove_from_index(
            [int_to_bit_vector_large(value, 256) for value in removed_ints]
        )
        # The data block should now be two rows shorter.
        remaining = np.asarray(hash_index.bt.data)
        self.assertEqual(remaining.shape, (998, 256))
        # Neither removed hash may still appear among the remaining rows.
        remaining_rows = {tuple(row) for row in remaining.tolist()}
        for value in removed_ints:
            self.assertNotIn(tuple(int_to_bit_vector_large(value, 256)),
                             remaining_rows)
Esempio n. 12
0
    def test_remove_from_index_last_element_with_cache(self) -> None:
        """
        Removing the only indexed hash should also empty the cache
        element.
        """
        cache = DataMemoryElement()
        hash_index = SkLearnBallTreeHashIndex(cache_element=cache,
                                              random_seed=0)
        single_hash = np.ndarray((1, 256), bool)
        single_hash[0] = int_to_bit_vector_large(1, 256)

        # After the build, one hash is indexed and the cache is populated.
        hash_index.build_index(single_hash)
        self.assertEqual(hash_index.count(), 1)
        self.assertFalse(cache.is_empty())

        # After removal, the index and the cache are both empty.
        hash_index.remove_from_index(single_hash)
        self.assertEqual(hash_index.count(), 0)
        self.assertTrue(cache.is_empty())
Esempio n. 13
0
    def test_load_model(self) -> None:
        """
        A second index constructed on the cache element of a built index
        should load its own model instance, yet answer a query with the
        same neighbors and distances.
        """
        n_rows, n_bits = 1000, 256
        shared_cache = DataMemoryElement()

        # First instance builds the model, which populates the cache.
        builder = SkLearnBallTreeHashIndex(shared_cache, random_seed=0)
        bit_matrix = np.random.randint(0, 2, n_rows * n_bits)
        builder.build_index(bit_matrix.reshape(n_rows, n_bits))

        # Second instance is only given the cache element.
        loader = SkLearnBallTreeHashIndex(shared_cache)
        self.assertIsNotNone(loader.bt)

        query = np.random.randint(0, 2, n_bits).astype(bool)
        expected_neighbors, expected_dists = builder.nn(query, 10)
        loaded_neighbors, loaded_dists = loader.nn(query, 10)

        # Distinct objects ...
        self.assertIsNot(builder, loader)
        self.assertIsNot(builder.bt, loader.bt)
        # ... but identical query results.
        np.testing.assert_equal(loaded_neighbors, expected_neighbors)
        np.testing.assert_equal(loaded_dists, expected_dists)
Esempio n. 14
0
    def test_update_index_additive(self) -> None:
        """
        Updating an already built index should leave it holding the union
        of the previously indexed and the newly supplied hashes.
        """
        n_bits = 256
        hash_index = SkLearnBallTreeHashIndex(random_seed=0)
        # 1000 initial and 100 additional random bit vectors.
        initial = np.random.randint(0, 2, 1000 * n_bits)
        initial = initial.reshape(1000, n_bits).astype(bool)
        extra = np.random.randint(0, 2, 100 * n_bits)
        extra = extra.reshape(100, n_bits).astype(bool)

        hash_index.build_index(initial)
        assert hash_index.bt is not None, (
            "Internal ball-tree structure should be constructed at this point."
        )
        # Only the initial hashes should be present before the update.
        rows_after_build = sorted(np.array(hash_index.bt.data).tolist())
        np.testing.assert_array_almost_equal(rows_after_build,
                                             sorted(initial.tolist()))

        hash_index.update_index(extra)
        # Both batches should be present after the update.
        rows_after_update = sorted(np.array(hash_index.bt.data).tolist())
        combined = np.concatenate([initial, extra], axis=0)
        np.testing.assert_array_almost_equal(rows_after_update,
                                             sorted(combined.tolist()))
Esempio n. 15
0
 def test_remove_from_index_no_index(self) -> None:
     """
     Removal from an index that was never built should raise a
     ``KeyError`` whose message matches the first element of the
     requested hash.
     """
     hash_index = SkLearnBallTreeHashIndex(random_seed=0)
     missing_hash = np.random.randint(0, 2, 256)
     expected_pattern = str(missing_hash[0])
     with self.assertRaisesRegex(KeyError, expected_pattern):
         hash_index.remove_from_index([missing_hash])
Esempio n. 16
0
 def test_save_model_with_readonly_cache(self) -> None:
     """
     Building the internal ball tree with a read-only cache element
     should raise ``ValueError``.
     """
     n_rows, n_bits = 1000, 256
     readonly_cache = DataMemoryElement(readonly=True)
     hash_index = SkLearnBallTreeHashIndex(readonly_cache)
     bit_matrix = np.random.randint(0, 2, n_rows * n_bits)
     bit_matrix = bit_matrix.reshape(n_rows, n_bits)
     with self.assertRaises(ValueError):
         hash_index._build_bt_internal(bit_matrix)
Esempio n. 17
0
 def test_is_usable(self) -> None:
     """
     The implementation should report itself usable; its major
     dependency (sklearn) is a package requirement.
     """
     usable = SkLearnBallTreeHashIndex.is_usable()
     self.assertTrue(usable)
Esempio n. 18
0
class TestLshIndexAlgorithms(unittest.TestCase):
    """
    Checks of ``LSHNearestNeighborIndex.nn`` over several data sets,
    distance methods and hash-index back-ends.
    """

    RANDOM_SEED: int = 0

    def _make_ftor_itq(
        self,
        bits: int = 32
    ) -> Tuple[ItqFunctor, Callable[[Iterable[DescriptorElement]], None]]:
        """
        Create an ITQ functor seeded with ``RANDOM_SEED`` together with a
        hook that fits it.

        :param bits: Bit length handed to the functor.
        :return: The functor and a callable that fits it on an iterable
            of descriptors.
        """
        functor = ItqFunctor(bit_length=bits, random_seed=self.RANDOM_SEED)

        def fit_hook(descriptors: Iterable[DescriptorElement]) -> None:
            functor.fit(descriptors)

        return functor, fit_hook

    # noinspection PyMethodMayBeStatic
    def _make_hi_linear(self) -> LinearHashIndex:
        """ Create a fresh linear hash index. """
        linear_index = LinearHashIndex()
        return linear_index

    def _make_hi_balltree(self) -> SkLearnBallTreeHashIndex:
        """ Create a ball-tree hash index seeded with ``RANDOM_SEED``. """
        seed = self.RANDOM_SEED
        return SkLearnBallTreeHashIndex(random_seed=seed)

    #
    # Test LSH with random vectors
    #
    def _random_euclidean(
        self,
        hash_ftor: LshFunctor,
        hash_idx: Optional[HashIndex],
        ftor_train_hook: Callable[[Iterable[DescriptorElement]],
                                  None] = lambda d: None
    ) -> None:
        """
        Run nearest-neighbor checks on 1000 random 256-dimensional
        descriptors using the euclidean distance method.

        :param hash_ftor: Hash function class for generating hash codes
            for descriptors.
        :param hash_idx: Hash index instance to use in the local LSH algo
            instance.
        :param ftor_train_hook: Function for training the functor if
            necessary.
        """
        n_descriptors = 1000
        dim = 256

        # Random descriptors, reproducible through the fixed numpy seed.
        np.random.seed(self.RANDOM_SEED)
        descriptors = []
        for uid in range(n_descriptors):
            element = DescriptorMemoryElement('random', uid)
            element.set_vector(np.random.rand(dim))
            descriptors.append(element)

        ftor_train_hook(descriptors)

        descriptor_set = MemoryDescriptorSet()
        kv_store = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(hash_ftor,
                                        descriptor_set,
                                        kv_store,
                                        hash_index=hash_idx,
                                        distance_method='euclidean')
        index.build_index(descriptors)

        # A descriptor from the build set must be its own nearest neighbor.
        member_query = descriptors[255]
        neighbors, dists = index.nn(member_query, 1)
        self.assertEqual(neighbors[0], member_query)

        # A query that slightly perturbs a build vector should still
        # resolve to that build vector.
        base = descriptors[0]
        near_query = DescriptorMemoryElement('query', n_descriptors)
        base_vector = base.vector()
        assert base_vector is not None
        perturbed = base_vector.copy()
        shift = max(perturbed.min(), 0.1)
        perturbed[0] += shift
        perturbed[-1] -= shift
        near_query.set_vector(perturbed)
        neighbors, dists = index.nn(near_query, 1)
        self.assertFalse(np.array_equal(near_query.vector(), base.vector()))
        self.assertEqual(neighbors[0], base)

        # Fully random query vector.
        random_query = DescriptorMemoryElement('query', n_descriptors + 1)
        random_query.set_vector(np.random.rand(dim))

        # Whatever the requested k, distances must come back in
        # increasing order.
        for k in (10, n_descriptors):
            neighbors, dists = index.nn(random_query, k)
            for previous, current in zip(dists, dists[1:]):
                self.assertGreater(current, previous)

    def test_random_euclidean__itq__None(self) -> None:
        functor, fit_hook = self._make_ftor_itq()
        self._random_euclidean(hash_ftor=functor,
                               hash_idx=None,
                               ftor_train_hook=fit_hook)

    def test_random_euclidean__itq__linear(self) -> None:
        functor, fit_hook = self._make_ftor_itq()
        linear_index = self._make_hi_linear()
        self._random_euclidean(hash_ftor=functor,
                               hash_idx=linear_index,
                               ftor_train_hook=fit_hook)

    @pytest.mark.skipif(
        not SkLearnBallTreeHashIndex.is_usable(),
        reason="SkLearnBallTreeHashIndex is not usable in the current environment."
    )
    def test_random_euclidean__itq__balltree(self) -> None:
        functor, fit_hook = self._make_ftor_itq()
        balltree_index = self._make_hi_balltree()
        self._random_euclidean(hash_ftor=functor,
                               hash_idx=balltree_index,
                               ftor_train_hook=fit_hook)

    #
    # Test unit vectors
    #
    def _known_unit(
        self,
        hash_ftor: LshFunctor,
        hash_idx: Optional[HashIndex],
        dist_method: str,
        ftor_train_hook: Callable[[Iterable[DescriptorElement]],
                                  None] = lambda d: None
    ) -> None:
        """
        Run nearest-neighbor checks on the five 5-dimensional unit
        vectors using the given distance method.

        :param hash_ftor: Hash functor handed to the LSH index.
        :param hash_idx: Optional hash index handed to the LSH index.
        :param dist_method: Name of the distance method to use.
        :param ftor_train_hook: Callable invoked with the descriptors
            before the index is built.
        """
        dim = 5

        def unit_vector(axis: int) -> np.ndarray:
            vector = np.zeros(dim, float)
            vector[axis] = 1.
            return vector

        # One descriptor per axis.
        unit_descriptors = [
            DescriptorMemoryElement('unit', axis).set_vector(unit_vector(axis))
            for axis in range(dim)
        ]

        ftor_train_hook(unit_descriptors)

        descriptor_set = MemoryDescriptorSet()
        kv_store = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(hash_ftor,
                                        descriptor_set,
                                        kv_store,
                                        hash_index=hash_idx,
                                        distance_method=dist_method)
        index.build_index(unit_descriptors)

        # Zero-vector query: no indexed descriptor intersects it, so each
        # distance should be 1.0 (the maximum distance for histogram
        # intersection). The order of the results does not matter.
        zero_query = DescriptorMemoryElement('query', 0)
        zero_query.set_vector(np.zeros(dim, float))
        neighbors, dists = index.nn(zero_query, dim)
        for distance in dists:
            self.assertEqual(distance, 1.)

        # Querying with an indexed element must return that element first
        # at distance zero, both for k=1 and for k=dim.
        member_query = unit_descriptors[3]
        for k in (1, dim):
            neighbors, dists = index.nn(member_query, k)
            self.assertEqual(neighbors[0], member_query)
            self.assertEqual(dists[0], 0.)

    def test_known_unit__euclidean__itq__None(self) -> None:
        functor, fit_hook = self._make_ftor_itq(5)
        self._known_unit(hash_ftor=functor,
                         hash_idx=None,
                         dist_method='euclidean',
                         ftor_train_hook=fit_hook)

    def test_known_unit__hik__itq__None(self) -> None:
        functor, fit_hook = self._make_ftor_itq(5)
        self._known_unit(hash_ftor=functor,
                         hash_idx=None,
                         dist_method='hik',
                         ftor_train_hook=fit_hook)

    def test_known_unit__euclidean__itq__linear(self) -> None:
        functor, fit_hook = self._make_ftor_itq(5)
        linear_index = self._make_hi_linear()
        self._known_unit(hash_ftor=functor,
                         hash_idx=linear_index,
                         dist_method='euclidean',
                         ftor_train_hook=fit_hook)

    def test_known_unit__hik__itq__linear(self) -> None:
        functor, fit_hook = self._make_ftor_itq(5)
        linear_index = self._make_hi_linear()
        self._known_unit(hash_ftor=functor,
                         hash_idx=linear_index,
                         dist_method='hik',
                         ftor_train_hook=fit_hook)

    @pytest.mark.skipif(
        not SkLearnBallTreeHashIndex.is_usable(),
        reason="SkLearnBallTreeHashIndex is not usable in the current environment."
    )
    def test_known_unit__euclidean__itq__balltree(self) -> None:
        functor, fit_hook = self._make_ftor_itq(5)
        balltree_index = self._make_hi_balltree()
        self._known_unit(hash_ftor=functor,
                         hash_idx=balltree_index,
                         dist_method='euclidean',
                         ftor_train_hook=fit_hook)

    @pytest.mark.skipif(
        not SkLearnBallTreeHashIndex.is_usable(),
        reason="SkLearnBallTreeHashIndex is not usable in the current environment."
    )
    def test_known_unit__hik__itq__balltree(self) -> None:
        functor, fit_hook = self._make_ftor_itq(5)
        balltree_index = self._make_hi_balltree()
        self._known_unit(hash_ftor=functor,
                         hash_idx=balltree_index,
                         dist_method='hik',
                         ftor_train_hook=fit_hook)

    #
    # Test with known vectors and euclidean dist
    #
    def _known_ordered_euclidean(
        self,
        hash_ftor: LshFunctor,
        hash_idx: Optional[HashIndex],
        ftor_train_hook: Callable[[Iterable[DescriptorElement]],
                                  None] = lambda d: None
    ) -> None:
        """
        Run nearest-neighbor checks on 1000 descriptors whose euclidean
        distance from the origin grows with their UUID.

        :param hash_ftor: Hash functor handed to the LSH index.
        :param hash_idx: Optional hash index handed to the LSH index.
        :param ftor_train_hook: Callable invoked with the descriptors
            before the index is built.
        """
        n_descriptors = 1000

        # Descriptor ``uid`` sits at (uid, 2 * uid), so a larger UUID
        # means a larger distance from (0, 0).
        ordered_descriptors = []
        for uid in range(n_descriptors):
            vector = np.array([uid, uid * 2], float)
            ordered_descriptors.append(
                DescriptorMemoryElement('ordered', uid).set_vector(vector))
        # Shuffle so that the build order carries no information.
        random.shuffle(ordered_descriptors)

        ftor_train_hook(ordered_descriptors)

        descriptor_set = MemoryDescriptorSet()
        kv_store = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(hash_ftor,
                                        descriptor_set,
                                        kv_store,
                                        hash_index=hash_idx,
                                        distance_method='euclidean')
        index.build_index(ordered_descriptors)

        # Querying from the origin should return descriptors in UUID order.
        origin_query = DescriptorMemoryElement('query', n_descriptors)
        origin_query.set_vector(np.array([0, 0], float))

        # The five nearest results should carry UUIDs 0 through 4.
        neighbors, dists = index.nn(origin_query, 5)
        for rank in range(5):
            self.assertEqual(neighbors[rank].uuid(), rank)

        # A search over everything should come back completely ordered.
        neighbors, dists = index.nn(origin_query, n_descriptors)
        for rank, neighbor, _ in zip(range(n_descriptors), neighbors, dists):
            self.assertEqual(neighbor.uuid(), rank)

    def test_known_ordered_euclidean__itq__None(self) -> None:
        functor, fit_hook = self._make_ftor_itq(1)
        self._known_ordered_euclidean(hash_ftor=functor,
                                      hash_idx=None,
                                      ftor_train_hook=fit_hook)

    def test_known_ordered_euclidean__itq__linear(self) -> None:
        functor, fit_hook = self._make_ftor_itq(1)
        linear_index = self._make_hi_linear()
        self._known_ordered_euclidean(hash_ftor=functor,
                                      hash_idx=linear_index,
                                      ftor_train_hook=fit_hook)

    @pytest.mark.skipif(
        not SkLearnBallTreeHashIndex.is_usable(),
        reason="SkLearnBallTreeHashIndex is not usable in the current environment."
    )
    def test_known_ordered_euclidean__itq__balltree(self) -> None:
        functor, fit_hook = self._make_ftor_itq(1)
        balltree_index = self._make_hi_balltree()
        self._known_ordered_euclidean(hash_ftor=functor,
                                      hash_idx=balltree_index,
                                      ftor_train_hook=fit_hook)
Esempio n. 19
0
 def test_count_empty(self) -> None:
     """ A freshly constructed index should report a count of zero. """
     empty_index = SkLearnBallTreeHashIndex()
     n_indexed = empty_index.count()
     self.assertEqual(n_indexed, 0)
Esempio n. 20
0
    def test_remove_from_index_last_element(self) -> None:
        """
        Test removing the only element, and removing the final elements,
        from the index: the count should drop to zero and the ball tree
        should be unset.
        """
        # Case 1: index a single hash, then remove it again.
        hash_index = SkLearnBallTreeHashIndex(random_seed=0)
        single_hash = np.ndarray((1, 256), bool)
        single_hash[0] = int_to_bit_vector_large(1, 256)
        hash_index.build_index(single_hash)
        self.assertEqual(hash_index.count(), 1)
        hash_index.remove_from_index(single_hash)
        self.assertEqual(hash_index.count(), 0)
        self.assertIsNone(hash_index.bt)

        # Case 2: index 1000 hashes and remove them in four batches of 250.
        hash_index = SkLearnBallTreeHashIndex(random_seed=0)
        hashes = np.ndarray((1000, 256), bool)
        for row in range(1000):
            hashes[row] = int_to_bit_vector_large(row, 256)
        hash_index.build_index(hashes)
        # The ball tree must survive each of the first three batches.
        for start, expected_count in ((0, 750), (250, 500), (500, 250)):
            hash_index.remove_from_index(hashes[start:start + 250])
            self.assertEqual(hash_index.count(), expected_count)
            self.assertIsNotNone(hash_index.bt)
        # The last batch empties the index entirely.
        hash_index.remove_from_index(hashes[750:])
        self.assertEqual(hash_index.count(), 0)
        self.assertIsNone(hash_index.bt)
Esempio n. 21
0
 def test_update_index_no_input(self) -> None:
     """ Updating with an empty input should raise ``ValueError``. """
     hash_index = SkLearnBallTreeHashIndex(random_seed=0)
     with self.assertRaises(ValueError):
         hash_index.update_index([])
Esempio n. 22
0
    def test_nn_no_index(self) -> None:
        """
        Querying before any index has been built should raise a
        ``ValueError`` with the expected message.
        """
        unbuilt_index = SkLearnBallTreeHashIndex()
        expected_message = "No index currently set to query from"

        with self.assertRaisesRegex(ValueError, expected_message):
            unbuilt_index.nn([0, 0, 0])
Esempio n. 23
0
 def _make_hi_balltree(self) -> SkLearnBallTreeHashIndex:
     """ Create a ball-tree hash index seeded with ``RANDOM_SEED``. """
     seed = self.RANDOM_SEED
     return SkLearnBallTreeHashIndex(random_seed=seed)
Esempio n. 24
0
import unittest
import unittest.mock as mock

import numpy as np
import pytest

from smqtk_dataprovider.impls.data_element.memory import DataMemoryElement
from smqtk_indexing.impls.hash_index.sklearn_balltree import SkLearnBallTreeHashIndex
from smqtk_indexing.utils.bits import int_to_bit_vector_large


@pytest.mark.skipif(
    not SkLearnBallTreeHashIndex.is_usable(),
    reason="SkLearnBallTreeHashIndex is usable in the current environment.")
class TestBallTreeHashIndex(unittest.TestCase):
    def test_is_usable(self) -> None:
        """
        The implementation should report itself usable; its major
        dependency (sklearn) is a package requirement.
        """
        usable = SkLearnBallTreeHashIndex.is_usable()
        self.assertTrue(usable)

    def test_default_configuration(self) -> None:
        """
        The default configuration should carry three entries: an untyped
        cache element dict, a leaf size of 40 and no random seed.
        """
        defaults = SkLearnBallTreeHashIndex.get_default_config()
        self.assertEqual(len(defaults), 3)
        cache_config = defaults['cache_element']
        self.assertIsInstance(cache_config, dict)
        self.assertIsNone(cache_config['type'])
        self.assertEqual(defaults['leaf_size'], 40)
        self.assertIsNone(defaults['random_seed'])

    def test_init_without_cache(self) -> None:
        i = SkLearnBallTreeHashIndex(cache_element=None,
                                     leaf_size=52,