Ejemplo n.º 1
0
 def test_remove_from_index(self):
     # Removing hashes that are present must actually drop them from the
     # index: starting from {0, 1, 2}, only 1 is expected to remain.
     hash_index = LinearHashIndex()
     hash_index.index = {0, 1, 2}
     to_remove = [[0, 0], [1, 0]]
     # noinspection PyTypeChecker
     hash_index.remove_from_index(to_remove)
     self.assertSetEqual(hash_index.index, {1})
Ejemplo n.º 2
0
    def nn(self, d, n=1):
        """
        Return the nearest `N` neighbors to the given descriptor element.

        The query descriptor is hashed with ``self.lsh_functor``, nearby hash
        codes are requested from the hash index (a temporary
        ``LinearHashIndex`` over the keys of ``self._hash2uuid`` when
        ``self.hash_index`` is None), and the descriptors mapped to those hash
        codes are ranked by ``self._distance_function`` against the query
        vector.

        :param d: Descriptor element to compute the neighbors of.
        :type d: smqtk.representation.DescriptorElement

        :param n: Number of nearest neighbors to find.
        :type n: int

        :return: Tuple of nearest N DescriptorElement instances, and a tuple
            of the distance values to those neighbors.
        :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

        """
        super(LSHNearestNeighborIndex, self).nn(d, n)

        self._log.debug("generating hash for descriptor")
        d_v = d.vector()
        d_h = self.lsh_functor.get_hash(d_v)

        def comp_descr_dist(d2_v):
            return self._distance_function(d_v, d2_v)

        self._log.debug("getting near hashes")
        hi = self.hash_index
        # Make on-the-fly linear index if we weren't originally set with one
        if hi is None:
            hi = LinearHashIndex()
            # not calling ``build_index`` because we already have the int
            # hashes.
            with self._hash2uuid_lock:
                # Materialize the keys first: on Python 3 ``keys()`` is a
                # view, and ``numpy.array`` of a view is a 0-d object array
                # rather than an array of the hash integers.
                hi.index = numpy.array(list(self._hash2uuid.keys()))
        # Only the hash codes are needed; the hash-space distances are
        # discarded because ranking is redone in descriptor space below.
        hashes, _ = hi.nn(d_h, n)

        self._log.debug("getting UUIDs of descriptors for nearby hashes")
        neighbor_uuids = []
        with self._hash2uuid_lock:
            for h_int in map(bit_vector_to_int_large, hashes):
                # If descriptor hash not in our map, we effectively skip it
                neighbor_uuids.extend(self._hash2uuid.get(h_int, ()))
        self._log.debug("-- matched %d UUIDs", len(neighbor_uuids))

        self._log.debug("getting descriptors for neighbor_uuids")
        neighbors = \
            list(self.descriptor_index.get_many_descriptors(neighbor_uuids))

        self._log.debug("ordering descriptors via distance method '%s'",
                        self.distance_method)
        self._log.debug('-- getting element vectors')
        neighbor_vectors = elements_to_matrix(neighbors,
                                              report_interval=1.0)
        self._log.debug('-- calculating distances')
        # ``list`` keeps this a concrete sequence on Python 3, where ``map``
        # is a lazy, single-use iterator.
        distances = list(map(comp_descr_dist, neighbor_vectors))
        self._log.debug('-- ordering')
        ordered = sorted(zip(neighbors, distances),
                         key=lambda p: p[1])
        self._log.debug('-- slicing top n=%d', n)
        # Unzip the (descriptor, distance) pairs into parallel tuples.
        # Materialized so the result is a real sequence on Python 3 as well.
        # NOTE(review): if ``ordered`` is empty the unzip yields an empty
        # list rather than a pair -- confirm callers handle that case.
        return list(zip(*(ordered[:n])))
Ejemplo n.º 3
0
 def test_remove_from_index(self):
     # Hashes present in the index must really be removed: of the initial
     # {0, 1, 2}, only 1 is expected to survive the removal below.
     lhi = LinearHashIndex()
     lhi.index = {0, 1, 2}
     bit_vectors = [
         [0, 0],
         [1, 0],
     ]
     # noinspection PyTypeChecker
     lhi.remove_from_index(bit_vectors)
     self.assertSetEqual(lhi.index, {1})
Ejemplo n.º 4
0
    def nn(self, d, n=1):
        """
        Return the nearest `N` neighbors to the given descriptor element.

        The query descriptor is hashed with ``self.lsh_functor``, nearby hash
        codes are requested from the hash index (a temporary
        ``LinearHashIndex`` over the keys of ``self.hash2uuids_kvstore`` when
        ``self.hash_index`` is None), and the descriptors mapped to those hash
        codes are ranked by ``self._distance_function`` against the query
        vector.

        :param d: Descriptor element to compute the neighbors of.
        :type d: smqtk.representation.DescriptorElement

        :param n: Number of nearest neighbors to find.
        :type n: int

        :return: Tuple of nearest N DescriptorElement instances, and a tuple
            of the distance values to those neighbors.
        :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

        """
        super(LSHNearestNeighborIndex, self).nn(d, n)

        self._log.debug("generating hash for descriptor")
        d_v = d.vector()
        d_h = self.lsh_functor.get_hash(d_v)

        def comp_descr_dist(d2_v):
            return self._distance_function(d_v, d2_v)

        self._log.debug("getting near hashes")
        hi = self.hash_index
        if hi is None:
            # Make on-the-fly linear index
            hi = LinearHashIndex()
            # not calling ``build_index`` because we already have the int
            # hashes.
            hi.index = numpy.array(list(self.hash2uuids_kvstore.keys()))
        # Hash-space distances are discarded; ranking is redone in descriptor
        # space below.
        near_hashes, _ = hi.nn(d_h, n)

        self._log.debug("getting UUIDs of descriptors for nearby hashes")
        neighbor_uuids = []
        for h_int in map(bit_vector_to_int_large, near_hashes):
            # If descriptor hash not in our map, we effectively skip it
            #: :type: collections.Iterable
            near_uuids = self.hash2uuids_kvstore.get(h_int, ())
            neighbor_uuids.extend(near_uuids)
        self._log.debug("-- matched %d UUIDs", len(neighbor_uuids))

        self._log.debug("getting descriptors for neighbor_uuids")
        neighbors = \
            list(self.descriptor_index.get_many_descriptors(neighbor_uuids))

        self._log.debug("ordering descriptors via distance method '%s'",
                        self.distance_method)
        self._log.debug('-- getting element vectors')
        neighbor_vectors = elements_to_matrix(neighbors, report_interval=1.0)
        self._log.debug('-- calculating distances')
        # ``list`` keeps this a concrete sequence on Python 3, where ``map``
        # is a lazy, single-use iterator.
        distances = list(map(comp_descr_dist, neighbor_vectors))
        self._log.debug('-- ordering')
        ordered = sorted(zip(neighbors, distances), key=lambda p: p[1])
        self._log.debug('-- slicing top n=%d', n)
        # Unzip the (descriptor, distance) pairs into parallel tuples.
        # Materialized so the result is a real sequence on Python 3 as well.
        # NOTE(review): if ``ordered`` is empty the unzip yields an empty
        # list rather than a pair -- confirm callers handle that case.
        return list(zip(*(ordered[:n])))
Ejemplo n.º 5
0
 def test_remove_from_index_single_not_in_index(self):
     # Removing a lone hash that the index does not hold must raise a
     # KeyError and leave the index contents as they were.
     hash_index = LinearHashIndex()
     hash_index.index = {0, 1, 2}
     missing = [[1, 0, 0]]  # 4
     with self.assertRaises(KeyError):
         hash_index.remove_from_index(missing)
     self.assertSetEqual(hash_index.index, {0, 1, 2})
Ejemplo n.º 6
0
 def test_remove_from_index_single_not_in_index(self):
     # A single hash absent from the index cannot be removed: a KeyError is
     # expected and the stored hashes must stay unchanged.
     lhi = LinearHashIndex()
     lhi.index = {0, 1, 2}
     absent_hash_vectors = [
         [1, 0, 0],  # 4
     ]
     with self.assertRaises(KeyError):
         lhi.remove_from_index(absent_hash_vectors)
     self.assertSetEqual(lhi.index, {0, 1, 2})
Ejemplo n.º 7
0
 def test_remove_from_index_one_of_many_not_in_index(self):
     # A batch removal in which one hash (3) is absent from the index must
     # raise a KeyError.
     hash_index = LinearHashIndex()
     hash_index.index = {0, 1, 2}
     to_remove = [
         [0, 0],  # 0
         [0, 1],  # 1
         [1, 1],  # 3
     ]
     with self.assertRaises(KeyError):
         hash_index.remove_from_index(to_remove)
     # The failed removal must not have altered the index.
     self.assertSetEqual(hash_index.index, {0, 1, 2})
Ejemplo n.º 8
0
 def test_remove_from_index_one_of_many_not_in_index(self):
     # When several hashes are submitted for removal and one of them (3) is
     # not in the index, a KeyError is expected.
     lhi = LinearHashIndex()
     lhi.index = {0, 1, 2}
     bit_vectors = [
         [0, 0],  # 0
         [0, 1],  # 1
         [1, 1],  # 3
     ]
     with self.assertRaises(KeyError):
         lhi.remove_from_index(bit_vectors)
     # Verify the index still holds exactly its original hashes.
     self.assertSetEqual(lhi.index, {0, 1, 2})
Ejemplo n.º 9
0
    def _nn(self, d, n=1):
        """
        Sub-class implementation hook that finds the `N` nearest neighbors of
        the given descriptor element.

        By the time this internal method runs, the caller has already checked
        that ``d`` holds a vector and that our index is not empty.

        :param d: Descriptor element to compute the neighbors of.
        :type d: smqtk.representation.DescriptorElement

        :param n: Number of nearest neighbors to find.
        :type n: int

        :return: Tuple of nearest N DescriptorElement instances, and a tuple of
            the distance values to those neighbors.
        :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

        """
        self._log.debug("generating hash for descriptor")
        query_vec = d.vector()
        query_hash = self.lsh_functor.get_hash(query_vec)

        with self._model_lock:
            self._log.debug("getting near hashes")
            hash_idx = self.hash_index
            if hash_idx is None:
                # No hash index configured: build a throw-away linear one.
                # ``build_index`` is bypassed since the integer hashes are
                # already available as the key-value store's keys.
                hash_idx = LinearHashIndex()
                known_hashes = list(self.hash2uuids_kvstore.keys())
                hash_idx.index = numpy.array(known_hashes)
            near_hashes, _ = hash_idx.nn(query_hash, n)

            self._log.debug("getting UUIDs of descriptors for nearby hashes")
            neighbor_uuids = []
            for hash_int in map(bit_vector_to_int_large, near_hashes):
                # A hash code missing from the store contributes nothing,
                # since the fallback is an empty set of UUIDs.
                neighbor_uuids.extend(
                    self.hash2uuids_kvstore.get(hash_int, set())
                )
            self._log.debug("-- matched %d UUIDs", len(neighbor_uuids))

            self._log.debug("getting descriptors for neighbor_uuids")
            neighbors = list(
                self.descriptor_index.get_many_descriptors(neighbor_uuids)
            )

        # Model state is no longer touched past this point, hence the lock
        # has been released.

        self._log.debug("ordering descriptors via distance method '%s'",
                        self.distance_method)
        self._log.debug('-- getting element vectors')
        neighbor_vectors = elements_to_matrix(neighbors, report_interval=1.0)
        self._log.debug('-- calculating distances')
        distances = [
            self._distance_function(query_vec, neighbor_vec)
            for neighbor_vec in neighbor_vectors
        ]
        self._log.debug('-- ordering')
        # Rank (descriptor, distance) pairs by ascending distance.
        ranked = list(zip(neighbors, distances))
        ranked.sort(key=lambda pair: pair[1])
        self._log.debug('-- slicing top n=%d', n)
        # Unzip the closest ``n`` pairs into parallel tuples.
        closest = ranked[:n]
        return list(zip(*closest))
Ejemplo n.º 10
0
    def _nn(self, d, n=1):
        """
        Sub-class implementation hook that finds the `N` nearest neighbors of
        the given descriptor element.

        By the time this internal method runs, the caller has already checked
        that ``d`` holds a vector and that our index is not empty.

        :param d: Descriptor element to compute the neighbors of.
        :type d: smqtk.representation.DescriptorElement

        :param n: Number of nearest neighbors to find.
        :type n: int

        :return: Tuple of nearest N DescriptorElement instances, and a tuple of
            the distance values to those neighbors.
        :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

        """
        self._log.debug("generating hash for descriptor")
        query_vec = d.vector()
        query_hash = self.lsh_functor.get_hash(query_vec)

        with self._model_lock:
            self._log.debug("getting near hashes")
            hash_idx = self.hash_index
            if hash_idx is None:
                # No hash index configured: build a throw-away linear one.
                # ``build_index`` is bypassed since the integer hashes are
                # already available as the key-value store's keys.
                hash_idx = LinearHashIndex()
                known_hashes = self.hash2uuids_kvstore.keys()
                hash_idx.index = set(known_hashes)
            near_hashes, _ = hash_idx.nn(query_hash, n)

            self._log.debug("getting UUIDs of descriptors for nearby hashes")
            neighbor_uuids = []
            for hash_int in map(bit_vector_to_int_large, near_hashes):
                # A hash code missing from the store contributes nothing,
                # since the fallback is an empty set of UUIDs.
                neighbor_uuids.extend(
                    self.hash2uuids_kvstore.get(hash_int, set())
                )
            self._log.debug("-- matched %d UUIDs", len(neighbor_uuids))

            self._log.debug("getting descriptors for neighbor_uuids")
            neighbors = list(
                self.descriptor_index.get_many_descriptors(neighbor_uuids)
            )

        # Model state is no longer touched past this point, hence the lock
        # has been released.

        self._log.debug("ordering descriptors via distance method '%s'",
                        self.distance_method)
        self._log.debug('-- getting element vectors')
        neighbor_vectors = elements_to_matrix(neighbors, report_interval=1.0)
        self._log.debug('-- calculating distances')
        distances = [
            self._distance_function(query_vec, neighbor_vec)
            for neighbor_vec in neighbor_vectors
        ]
        self._log.debug('-- ordering')
        # Rank (descriptor, distance) pairs by ascending distance.
        ranked = list(zip(neighbors, distances))
        ranked.sort(key=lambda pair: pair[1])
        self._log.debug('-- slicing top n=%d', n)
        # Unzip the closest ``n`` pairs into parallel tuples.
        closest = ranked[:n]
        return list(zip(*closest))