def test_nn(self) -> None:
    """
    Check that ``LinearHashIndex.nn`` returns the indexed codes ordered by
    distance from the query code, along with the matching distances.

    The query is all zeros, so the two codes with one set bit are expected
    first (distance 1/3) and the two codes with two set bits are expected
    last (distance 2/3).  Order within each tie group is not asserted, so
    each group is compared as a set.
    """
    indexed_codes = [[0, 1, 0],
                     [1, 1, 0],
                     [0, 1, 1],
                     [0, 0, 1]]
    query_code = [0, 0, 0]

    index = LinearHashIndex()
    # noinspection PyTypeChecker
    index.build_index(indexed_codes)
    # noinspection PyTypeChecker
    near_codes, near_dists = index.nn(query_code, 4)

    # Closest pair: codes that differ from the query in a single bit.
    closest_pair = {tuple(code) for code in near_codes[:2]}
    self.assertEqual(closest_pair, {(0, 1, 0), (0, 0, 1)})

    # Remaining pair: codes that differ from the query in two bits.
    farthest_pair = {tuple(code) for code in near_codes[2:]}
    self.assertEqual(farthest_pair, {(1, 1, 0), (0, 1, 1)})

    expected_dists = (1 / 3., 1 / 3., 2 / 3., 2 / 3.)
    numpy.testing.assert_array_almost_equal(near_dists, expected_dists)
def _nn(
    self,
    d: DescriptorElement,
    n: int = 1
) -> Tuple[Tuple[DescriptorElement, ...], Tuple[float, ...]]:
    """
    Internal method to be implemented by sub-classes to return the nearest
    `N` neighbors to the given descriptor element.

    When this internal method is called, we have already checked that there
    is a vector in ``d`` and our index is not empty.

    The query descriptor is hashed with ``self.lsh_functor``, nearby hash
    codes are looked up via ``self.hash_index`` (or an on-the-fly
    ``LinearHashIndex`` over the keys of ``self.hash2uuids_kvstore`` when no
    hash index is set), and the descriptors registered under those hashes
    are then ranked by ``self._distance_function`` against the query vector.

    :param d: Descriptor element to compute the neighbors of.
    :param n: Number of nearest neighbors to find.

    :return: Tuple of nearest N DescriptorElement instances, and a tuple of
        the distance values to those neighbors. Both tuples are ordered by
        ascending distance. Both tuples are empty when no candidate
        descriptors were found for the nearby hashes (or when the requested
        slice is empty).
    """
    LOG.debug("generating hash for descriptor")
    d_v = d.vector()
    d_h = self.lsh_functor.get_hash(d_v)

    def comp_descr_dist(d2_v: numpy.ndarray) -> float:
        # Distance between the query vector and a candidate vector.
        return self._distance_function(d_v, d2_v)

    # Hold the model lock only while reading from the model components
    # (hash index, hash-to-UUID store and descriptor set).
    with self._model_lock:
        LOG.debug("getting near hashes")
        hi = self.hash_index
        if hi is None:
            # Make on-the-fly linear index
            hi = LinearHashIndex()
            # not calling ``build_index`` because we already have the int
            # hashes.
            hi.index = set(cast(Iterator[int], self.hash2uuids_kvstore.keys()))
        near_hashes, _ = hi.nn(d_h, n)

        LOG.debug("getting UUIDs of descriptors for nearby hashes")
        neighbor_uuids: List[Hashable] = []
        for h_int in map(bit_vector_to_int_large, near_hashes):
            # If descriptor hash not in our map, we effectively skip it.
            # Get set of descriptor UUIDs for a hash code.
            near_uuids: Set[Hashable] = self.hash2uuids_kvstore.get(h_int, set())
            # Accumulate matching descriptor UUIDs to a list.
            neighbor_uuids.extend(near_uuids)
        LOG.debug("-- matched %d UUIDs", len(neighbor_uuids))

        LOG.debug("getting descriptors for neighbor_uuids")
        neighbors = \
            list(self.descriptor_set.get_many_descriptors(neighbor_uuids))

    # Done with model parts at this point, so releasing lock.

    LOG.debug(f"ordering descriptors via distance method {self.distance_method}")
    LOG.debug('-- getting element vectors')
    neighbor_vectors = numpy.asarray(list(
        parallel_map(lambda d_: d_.vector(), neighbors)
    ))
    LOG.debug('-- calculating distances')
    distances = list(map(comp_descr_dist, neighbor_vectors))
    LOG.debug('-- ordering')
    ordered = sorted(zip(neighbors, distances), key=lambda p: p[1])

    LOG.debug(f'-- slicing top n={n}')
    top_n = ordered[:n]
    if not top_n:
        # ``zip(*[])`` yields nothing, so unpacking it into two names would
        # raise ``ValueError``. This happens when none of the near hashes
        # mapped to any descriptor UUIDs; report "no neighbors" instead.
        return (), ()

    r_descrs: Tuple[DescriptorElement, ...]
    r_dists: Tuple[float, ...]
    r_descrs, r_dists = zip(*top_n)
    return r_descrs, r_dists