def _nn(
    self, d: DescriptorElement, n: int = 1
) -> Tuple[Tuple[DescriptorElement, ...], Tuple[float, ...]]:
    """
    Return the nearest ``n`` neighbors of the given descriptor element
    by querying the FLANN index held by this instance.

    The caller has already verified that ``d`` carries a vector and that
    the index is not empty.

    :param d: Descriptor element to compute the neighbors of.
    :param n: Number of nearest neighbors to find.

    :return: Tuple of nearest N DescriptorElement instances, and a tuple of
        the distance values to those neighbors.

    """
    with self._model_lock:
        self._restore_index()
        assert self._flann is not None, (
            "We should have an index after restoration.")

        query_vec = d.vector()

        # HIK yields a similarity score rather than a distance, so for that
        # method the whole data set is queried and the ordering is flipped
        # further down. For every other method we only ask for ``n``
        # results, capped to the index size because FLANN asserts that the
        # query size is <= the index size.
        uses_hik = self._distance_method == 'hik'
        cache_size = len(self._descr_cache)
        k = cache_size if uses_hik else min(n, cache_size)

        neighbor_idxs: numpy.ndarray
        neighbor_dists: numpy.ndarray
        neighbor_idxs, neighbor_dists = self._flann.nn_index(
            query_vec, k, **self._flann_build_params)

        # A query for more than one neighbor comes back as 2D arrays. Only
        # a single descriptor is ever queried here, so keep the first row.
        if neighbor_idxs.ndim > 1:
            neighbor_idxs = neighbor_idxs[0]
            neighbor_dists = neighbor_dists[0]

        if uses_hik:
            # Turn similarities into distance-like values, reverse the
            # order so the "nearest" entries come first, and re-apply the
            # ``n`` limit. This must stay *after* the row reduction above.
            neighbor_dists = (1.0 - neighbor_dists)[::-1][:n]
            neighbor_idxs = neighbor_idxs[::-1][:n]

        matched = tuple(self._descr_cache[i] for i in neighbor_idxs)
        return matched, tuple(neighbor_dists)
def _nn(
    self, d: DescriptorElement, n: int = 1
) -> Tuple[Tuple[DescriptorElement, ...], Tuple[float, ...]]:
    """
    Internal method to be implemented by sub-classes to return the nearest
    `N` neighbors to the given descriptor element.

    When this internal method is called, we have already checked that there
    is a vector in ``d`` and our index is not empty.

    Candidate neighbors come from the FAISS index; their distances are then
    recomputed against the stored descriptor vectors and the result is
    ordered by those recomputed distances. The returned descriptors and
    distances are index-aligned.

    :param d: Descriptor element to compute the neighbors of.
    :param n: Number of nearest neighbors to find.

    :raises RuntimeError: There is no FAISS index to query.

    :return: Tuple of nearest N DescriptorElement instances, and a tuple of
        the distance values to those neighbors. Both tuples are empty when
        no neighbor could be resolved.

    """
    # Parent template method already assures there is a vector stored in
    # the input.
    d_vector = d.vector()
    assert d_vector is not None
    # Reshape into a 1xD vector with float32 type, which is required for
    # use with FAISS search.
    q = d_vector[np.newaxis, :].astype(np.float32)

    LOG.debug("Received query for %d nearest neighbors", n)

    with self._model_lock:
        if self._faiss_index is None:
            raise RuntimeError("No index currently available to query.")

        # Attempt to set n-probe of an IVF index
        self._set_index_nprobe()

        s_dists: np.ndarray
        s_ids: np.ndarray
        s_dists, s_ids = self._faiss_index.search(
            q, k=min(n, self._faiss_index.ntotal))
        # Only one query row was submitted, so take row 0. The square root
        # is applied to the FAISS values for the debug report below.
        s_dists, s_ids = np.sqrt(s_dists[0, :]), s_ids[0, :]
        # Convert numpy.int64 type values into python integer values.
        # This is for compatibility when comparing values in some KVS
        # impls (postgres...).
        s_ids = s_ids.astype(object)

        # s_id (the FAISS index indices) can equal -1 if fewer than the
        # requested number of nearest neighbors is returned. In this case,
        # eliminate the -1 entries
        LOG.debug("Getting descriptor UIDs from idx2uid mapping.")
        uuids = list(
            self._idx2uid_kvs.get_many(
                cast(Iterator[Hashable],
                     filter(lambda s_id_: s_id_ >= 0, s_ids))))
        if len(uuids) < n:
            warnings.warn(
                f"Less than n={n} neighbors were retrieved from "
                "the FAISS index instance. Maybe increase "
                "nprobe if this is an IVF index?",
                RuntimeWarning)

        # Nothing resolved: there is nothing to stack, measure or sort, so
        # report an empty result instead of failing further down.
        if not uuids:
            LOG.debug("Returning query result of size %g", 0)
            return (), ()

        descriptors = tuple(
            self._descriptor_set.get_many_descriptors(uuids))

        LOG.debug("Min and max FAISS distances: %g, %g",
                  min(s_dists), max(s_dists))

        # Recompute distances from the stored descriptor vectors. Row ``i``
        # of ``d_vectors`` (and so ``d_dists[i]``) belongs to
        # ``descriptors[i]``.
        d_vectors = np.vstack(DescriptorElement.get_many_vectors(descriptors))
        d_dists = metrics.euclidean_distance(d_vectors, q)

        LOG.debug("Min and max descriptor distances: %g, %g",
                  min(d_dists), max(d_dists))

        # Apply the same ascending-distance permutation to BOTH outputs so
        # that the i-th descriptor always pairs with the i-th distance.
        order = d_dists.argsort()
        ordered_descriptors = tuple(descriptors[oidx] for oidx in order)
        ordered_dists = tuple(d_dists[order])

        LOG.debug("Returning query result of size %g",
                  len(ordered_descriptors))

        return ordered_descriptors, ordered_dists
def _nn(
    self, d: DescriptorElement, n: int = 1
) -> Tuple[Tuple[DescriptorElement, ...], Tuple[float, ...]]:
    """
    Internal method to be implemented by sub-classes to return the nearest
    `N` neighbors to the given descriptor element.

    When this internal method is called, we have already checked that there
    is a vector in ``d`` and our index is not empty.

    The query descriptor is hashed, near hashes are looked up in the hash
    index (or in an on-the-fly linear index when none is set), and the
    descriptors mapped to those hashes are ranked by
    ``self._distance_function``.

    :param d: Descriptor element to compute the neighbors of.
    :param n: Number of nearest neighbors to find.

    :return: Tuple of nearest N DescriptorElement instances, and a tuple of
        the distance values to those neighbors. Both tuples are empty when
        no descriptor is mapped to any of the near hashes.

    """
    LOG.debug("generating hash for descriptor")
    d_v = d.vector()
    d_h = self.lsh_functor.get_hash(d_v)

    def comp_descr_dist(d2_v: numpy.ndarray) -> float:
        return self._distance_function(d_v, d2_v)

    with self._model_lock:
        LOG.debug("getting near hashes")
        hi = self.hash_index
        if hi is None:
            # Make on-the-fly linear index
            hi = LinearHashIndex()
            # not calling ``build_index`` because we already have the int
            # hashes.
            hi.index = set(cast(Iterator[int],
                                self.hash2uuids_kvstore.keys()))
        near_hashes, _ = hi.nn(d_h, n)

        LOG.debug("getting UUIDs of descriptors for nearby hashes")
        neighbor_uuids: List[Hashable] = []
        for h_int in map(bit_vector_to_int_large, near_hashes):
            # If descriptor hash not in our map, we effectively skip it.
            # Get set of descriptor UUIDs for a hash code.
            near_uuids: Set[Hashable] = \
                self.hash2uuids_kvstore.get(h_int, set())
            # Accumulate matching descriptor UUIDs to a list.
            neighbor_uuids.extend(near_uuids)
        LOG.debug("-- matched %d UUIDs", len(neighbor_uuids))

        LOG.debug("getting descriptors for neighbor_uuids")
        neighbors = \
            list(self.descriptor_set.get_many_descriptors(neighbor_uuids))

    # Done with model parts at this point, so releasing lock.

    LOG.debug(f"ordering descriptors via distance method {self.distance_method}")
    LOG.debug('-- getting element vectors')
    neighbor_vectors = numpy.asarray(list(
        parallel_map(lambda d_: d_.vector(), neighbors)
    ))
    LOG.debug('-- calculating distances')
    distances = list(map(comp_descr_dist, neighbor_vectors))
    LOG.debug('-- ordering')
    ordered = sorted(zip(neighbors, distances), key=lambda p: p[1])
    LOG.debug(f'-- slicing top n={n}')
    top_pairs = ordered[:n]
    if not top_pairs:
        # ``zip(*[])`` produces nothing, so unpacking it into two names
        # would raise ValueError. An empty match set is a valid outcome
        # (every near hash may be absent from the map), so report it as an
        # empty result instead.
        return (), ()
    r_descrs: Tuple[DescriptorElement, ...]
    r_dists: Tuple[float, ...]
    r_descrs, r_dists = zip(*top_pairs)
    return r_descrs, r_dists
def _nn(
    self, d: DescriptorElement, n: int = 1
) -> Tuple[Tuple[DescriptorElement, ...], Tuple[float, ...]]:
    """
    Return the nearest ``n`` neighbors of ``d`` found among the leaves that
    the query lands in across all trees of this index.

    Each tree is descended to a single leaf using the query's projection
    onto that tree's random basis. The union of the leaf contents is then
    searched exhaustively.

    NOTE: the distance values returned are *squared* Euclidean distances;
    no square root is applied.

    :param d: Descriptor element to compute the neighbors of.
    :param n: Number of nearest neighbors to find.

    :return: Tuple of nearest N DescriptorElement instances, and a tuple of
        the distance values to those neighbors, ordered by increasing
        distance. Both tuples are empty when no candidate was found.

    """
    # Parent template method already checks that `d` has a non-None vector
    d_v = d.vector()

    def _ratio(numerator: float, denominator: float) -> float:
        # Diagnostic-only division: a small index can legitimately have a
        # nominal leaf size of 0, and a debug message must never be the
        # reason a query fails.
        return numerator / denominator if denominator else float("nan")

    def _query_single(tree: TreeElement) -> List[Hashable]:
        # Search a single tree for the leaf that matches the query
        # NB: random_basis has shape (levels, N)
        # ``depth`` is bound in the enclosing scope before this is called.
        random_basis = tree.random_basis
        assert d_v is not None
        proj_query = d_v.dot(random_basis)
        splits = tree.splits
        idx = 0
        for level in range(depth):
            split_point = splits[idx]
            # Look at the level'th coordinate of proj_query
            if proj_query[level] < split_point:
                idx = 2 * idx + 1
            else:
                idx = 2 * idx + 2

        # idx will be `2^depth - 1` greater than the position of the leaf
        # in the list
        idx -= ((1 << depth) - 1)
        return tree.leaves[idx]

    def _exact_query(
        _uuids: Sequence[Hashable]
    ) -> Tuple[Sequence[Hashable], np.ndarray]:
        set_size = len(_uuids)
        LOG.debug(f"Exact query requested with {set_size} descriptors")

        # Assemble the array to query from the descriptors that match
        assert d_v is not None
        pts_array = np.empty((set_size, d_v.size), dtype=d_v.dtype)
        descriptors = self._descriptor_set.get_many_descriptors(_uuids)
        for i, desc in enumerate(descriptors):
            pts_array[i, :] = desc.vector()

        # Squared Euclidean distance of every candidate to the query.
        dists: np.ndarray = ((pts_array - d_v)**2).sum(axis=1)

        if n > dists.shape[0]:
            LOG.warning(
                f"There were fewer descriptors ({dists.shape[0]}) in the "
                f"set than requested in the query ({n}). Returning entire "
                f"set.")
        if n >= dists.shape[0]:
            return _uuids, dists

        # Select (unordered) the n smallest; the caller sorts them.
        near_indices = np.argpartition(dists, n - 1)[:n]
        return ([_uuids[idx] for idx in near_indices],
                dists[near_indices])

    with self._model_lock:
        LOG.debug(f"Received query for {n} nearest neighbors")

        depth, ntrees, db_size = self._depth, self._num_trees, self.count()
        leaf_size = db_size // (1 << depth)
        if leaf_size * ntrees < n:
            LOG.warning(
                f"The number of descriptors in a leaf ({leaf_size}) times "
                f"the number of trees ({ntrees}) is less than the number "
                f"of descriptors requested by the query ({n}). The query "
                f"result will be deficient.")

        # Take union of all tree hits
        tree_hits: Set[Hashable] = set()
        for t in self._trees:
            tree_hits.update(_query_single(t))

        hit_union = len(tree_hits)
        LOG.debug(
            f"Query (k): {n}, Hit union (h): {hit_union}, "
            f"DB (N): {db_size}, Leaf size (L = N/2^l): {leaf_size}, "
            f"Examined (T*L): {leaf_size * ntrees}")
        LOG.debug(f"k/L     = {_ratio(n, leaf_size):.3f}")
        LOG.debug(f"h/N     = {_ratio(hit_union, db_size):.3f}")
        LOG.debug(f"h/L     = {_ratio(hit_union, leaf_size):.3f}")
        LOG.debug(
            f"h/(T*L) = {_ratio(hit_union, leaf_size * ntrees):.3f}")

        uuids, distances = _exact_query(list(tree_hits))
        if len(uuids) == 0:
            # Nothing to order: ``zip(*...)`` over an empty sequence cannot
            # be unpacked into two names.
            LOG.debug(f"Returning query result of size {0}")
            return (), ()

        order = distances.argsort()
        uuids, distances = zip(
            *((uuids[oidx], distances[oidx]) for oidx in order))

        LOG.debug(f"Returning query result of size {len(uuids)}")

        return (tuple(self._descriptor_set.get_many_descriptors(uuids)),
                tuple(distances))