Example 1
    def _nn(self,
            d: DescriptorElement,
            n: int = 1
            ) -> Tuple[Tuple[DescriptorElement, ...], Tuple[float, ...]]:
        """
        Internal method to be implemented by sub-classes to return the nearest
        `N` neighbors to the given descriptor element.

        When this internal method is called, we have already checked that there
        is a vector in ``d`` and our index is not empty.

        :param d: Descriptor element to compute the neighbors of.
        :param n: Number of nearest neighbors to find.

        :return: Tuple of nearest N DescriptorElement instances, and a tuple of
            the distance values to those neighbors.

        """
        with self._model_lock:
            self._restore_index()
            assert self._flann is not None, (
                "We should have an index after restoration.")

            vec = d.vector()

            # If the distance method is HIK, we need to treat it specially
            # since that method produces a similarity score, not a distance
            # score.
            #
            # FLANN asserts that we query for <= index size, thus the use of
            # min().
            idxs: numpy.ndarray
            dists: numpy.ndarray
            if self._distance_method == 'hik':
                # This call differs from the else branch in that k is the
                # size of the full data set, so that we can invert and
                # reverse the full set of distances below.
                idxs, dists = self._flann.nn_index(vec, len(self._descr_cache),
                                                   **self._flann_build_params)
            else:
                idxs, dists = self._flann.nn_index(
                    vec, min(n, len(self._descr_cache)),
                    **self._flann_build_params)

            # When N>1, the return value is a 2D array. Since this method
            # queries with a single descriptor, we reduce to 1D arrays.
            if len(idxs.shape) > 1:
                idxs = idxs[0]
                dists = dists[0]

            if self._distance_method == 'hik':
                # Invert values to stay consistent with other distance value
                # norms. This also means we reverse the "nearest" order and
                # reintroduce the `n` size limit.
                # - This is intentionally happening *after* the "squeeze" op
                #   above.
                dists = (1.0 - dists)[::-1][:n]
                idxs = idxs[::-1][:n]

            return tuple(self._descr_cache[i] for i in idxs), tuple(dists)
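
The HIK branch above queries against the full index so that the similarity scores FLANN returns can be converted into distances. A minimal standalone sketch of that inversion, using hypothetical score and index arrays (not from the source), and assuming, as the comments imply, that FLANN returns the raw HIK values in ascending order:

import numpy as np

# Hypothetical HIK "distances" (really similarity scores) as FLANN would
# return them in ascending order, plus the matching index array.
sims = np.array([0.1, 0.4, 0.7, 0.9])
idxs = np.array([3, 0, 2, 1])

n = 2
# Invert similarity into distance, then reverse so the most similar entries
# (smallest resulting distance) come first, and keep only n of them.
dists = (1.0 - sims)[::-1][:n]   # -> [0.1, 0.3]
near = idxs[::-1][:n]            # -> [1, 2]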
Example 2
    def _nn(self,
            d: DescriptorElement,
            n: int = 1
            ) -> Tuple[Tuple[DescriptorElement, ...], Tuple[float, ...]]:
        """
        Internal method to be implemented by sub-classes to return the nearest
        `N` neighbors to the given descriptor element.

        When this internal method is called, we have already checked that there
        is a vector in ``d`` and our index is not empty.

        :param d: Descriptor element to compute the neighbors of.
        :param n: Number of nearest neighbors to find.

        :return: Tuple of nearest N DescriptorElement instances, and a tuple of
            the distance values to those neighbors.

        """
        # Parent template method already assures there is a vector stored in
        # the input.
        d_vector = d.vector()
        assert d_vector is not None
        # Reshape into a 1xD vector with float32 type, which is required for
        # use with FAISS search.
        q = d_vector[np.newaxis, :].astype(np.float32)
        LOG.debug("Received query for %d nearest neighbors", n)

        with self._model_lock:
            if self._faiss_index is None:
                raise RuntimeError("No index currently available to remove "
                                   "from.")

            # Attempt to set n-probe of an IVF index
            self._set_index_nprobe()

            s_dists: np.ndarray
            s_ids: np.ndarray
            s_dists, s_ids = self._faiss_index.search(
                q, k=min(n, self._faiss_index.ntotal))
            s_dists, s_ids = np.sqrt(s_dists[0, :]), s_ids[0, :]
            # Convert numpy.int64 type values into python integer values.
            # This is for compatibility when comparing values in some KVS
            # impls (postgres...).
            s_ids = s_ids.astype(object)
            # s_ids (the FAISS index indices) can contain -1 if fewer than
            # the requested number of nearest neighbors are returned; in that
            # case, eliminate the -1 entries before mapping to UIDs.
            LOG.debug("Getting descriptor UIDs from idx2uid mapping.")
            uuids = list(
                self._idx2uid_kvs.get_many(
                    cast(Iterator[Hashable],
                         filter(lambda s_id_: s_id_ >= 0, s_ids))))
            if len(uuids) < n:
                warnings.warn(
                    f"Less than n={n} neighbors were retrieved from "
                    "the FAISS index instance. Maybe increase "
                    "nprobe if this is an IVF index?", RuntimeWarning)

            descriptors = tuple(
                self._descriptor_set.get_many_descriptors(uuids))

        LOG.debug("Min and max FAISS distances: %g, %g", min(s_dists),
                  max(s_dists))

        d_vectors = np.vstack(DescriptorElement.get_many_vectors(descriptors))
        d_dists = metrics.euclidean_distance(d_vectors, q)

        LOG.debug("Min and max descriptor distances: %g, %g", min(d_dists),
                  max(d_dists))

        order = d_dists.argsort()
        uuids, d_dists = zip(*((uuids[oidx], d_dists[oidx]) for oidx in order))

        LOG.debug("Returning query result of size %g", len(uuids))

        return (tuple(self._descriptor_set.get_many_descriptors(uuids)),
                tuple(d_dists))
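
For context, FAISS expects queries as a (num_queries, D) float32 array and, for L2 indexes, reports squared distances, which is why the method reshapes the vector and applies a square root. A minimal standalone sketch with a hypothetical flat L2 index (not the class's configured index or data):

import faiss
import numpy as np

dim = 8
index = faiss.IndexFlatL2(dim)                          # hypothetical flat index
index.add(np.random.rand(100, dim).astype(np.float32))  # hypothetical data

d_vector = np.random.rand(dim)
q = d_vector[np.newaxis, :].astype(np.float32)           # shape (1, D), float32

n = 3
s_dists, s_ids = index.search(q, min(n, index.ntotal))
# IndexFlatL2 reports squared L2 distances; take the square root and reduce
# the (1, k) results to 1D arrays, as the method above does.
s_dists, s_ids = np.sqrt(s_dists[0, :]), s_ids[0, :]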
Example 3
    def _nn(
        self,
        d: DescriptorElement,
        n: int = 1
    ) -> Tuple[Tuple[DescriptorElement, ...], Tuple[float, ...]]:
        """
        Internal method to be implemented by sub-classes to return the nearest
        `N` neighbors to the given descriptor element.

        When this internal method is called, we have already checked that there
        is a vector in ``d`` and our index is not empty.

        :param d: Descriptor element to compute the neighbors of.
        :param n: Number of nearest neighbors to find.

        :return: Tuple of nearest N DescriptorElement instances, and a tuple of
            the distance values to those neighbors.

        """
        LOG.debug("generating hash for descriptor")
        d_v = d.vector()
        d_h = self.lsh_functor.get_hash(d_v)

        def comp_descr_dist(d2_v: numpy.ndarray) -> float:
            return self._distance_function(d_v, d2_v)

        with self._model_lock:
            LOG.debug("getting near hashes")
            hi = self.hash_index
            if hi is None:
                # Make on-the-fly linear index
                hi = LinearHashIndex()
                # not calling ``build_index`` because we already have the int
                # hashes.
                hi.index = set(cast(Iterator[int], self.hash2uuids_kvstore.keys()))
            near_hashes, _ = hi.nn(d_h, n)

            LOG.debug("getting UUIDs of descriptors for nearby hashes")
            neighbor_uuids: List[Hashable] = []
            for h_int in map(bit_vector_to_int_large, near_hashes):
                # If descriptor hash not in our map, we effectively skip it.
                # Get set of descriptor UUIDs for a hash code.
                near_uuids: Set[Hashable] = self.hash2uuids_kvstore.get(h_int, set())
                # Accumulate matching descriptor UUIDs to a list.
                neighbor_uuids.extend(near_uuids)
            LOG.debug("-- matched %d UUIDs", len(neighbor_uuids))

            LOG.debug("getting descriptors for neighbor_uuids")
            neighbors = list(
                self.descriptor_set.get_many_descriptors(neighbor_uuids)
            )

        # Done with model parts at this point, so releasing lock.

        LOG.debug(f"ordering descriptors via distance method {self.distance_method}")
        LOG.debug('-- getting element vectors')
        neighbor_vectors = numpy.asarray(list(
            parallel_map(lambda d_: d_.vector(), neighbors)
        ))
        LOG.debug('-- calculating distances')
        distances = list(map(comp_descr_dist, neighbor_vectors))
        LOG.debug('-- ordering')
        ordered = sorted(zip(neighbors, distances),
                         key=lambda p: p[1])
        LOG.debug(f'-- slicing top n={n}')
        r_descrs: Tuple[DescriptorElement, ...]
        r_dists: Tuple[float, ...]
        r_descrs, r_dists = zip(*(ordered[:n]))
        return r_descrs, r_dists
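
The tail of this method is a standard hash-then-refine pattern: the hash index only proposes candidates, and the true distance function re-ranks them. A small self-contained sketch of that re-ranking step, with hypothetical candidate names and Euclidean distance standing in for the configured distance function:

import numpy as np

def refine(query_vec, candidate_vecs, candidates, n, dist_fn):
    # Compute true distances for all candidates, sort ascending, keep top n.
    distances = [dist_fn(query_vec, v) for v in candidate_vecs]
    ordered = sorted(zip(candidates, distances), key=lambda p: p[1])
    descrs, dists = zip(*ordered[:n])
    return descrs, dists

q = np.array([0.0, 0.0])
vecs = [np.array([1.0, 0.0]), np.array([0.0, 3.0]), np.array([0.5, 0.5])]
print(refine(q, vecs, ["a", "b", "c"], 2,
             lambda a, b: float(np.linalg.norm(a - b))))
# -> (('c', 'a'), (0.7071..., 1.0))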
Example 4
    def _nn(self,
            d: DescriptorElement,
            n: int = 1
            ) -> Tuple[Tuple[DescriptorElement, ...], Tuple[float, ...]]:
        # Parent template method already checks that `d` has a non-None vector
        d_v = d.vector()

        def _query_single(tree: TreeElement) -> List[Hashable]:
            # Search a single tree for the leaf that matches the query
            # NB: random_basis has shape (levels, N)
            random_basis = tree.random_basis
            assert d_v is not None
            proj_query = d_v.dot(random_basis)
            splits = tree.splits
            idx = 0
            for level in range(depth):
                split_point = splits[idx]
                # Look at the level'th coordinate of proj_query
                if proj_query[level] < split_point:
                    idx = 2 * idx + 1
                else:
                    idx = 2 * idx + 2

            # idx will be `2^depth - 1` greater than the position of the leaf
            # in the list
            idx -= ((1 << depth) - 1)
            return tree.leaves[idx]

        def _exact_query(
            _uuids: Sequence[Hashable]
        ) -> Tuple[Sequence[Hashable], np.ndarray]:
            set_size = len(_uuids)
            LOG.debug(f"Exact query requested with {set_size} descriptors")

            # Assemble the array to query from the descriptors that match
            assert d_v is not None
            pts_array = np.empty((set_size, d_v.size), dtype=d_v.dtype)
            descriptors = self._descriptor_set.get_many_descriptors(_uuids)
            for i, desc in enumerate(descriptors):
                pts_array[i, :] = desc.vector()

            dists: np.ndarray = ((pts_array - d_v)**2).sum(axis=1)

            if n > dists.shape[0]:
                LOG.warning(
                    f"There were fewer descriptors ({dists.shape[0]}) in the "
                    f"set than requested in the query ({n}). Returning entire "
                    f"set.")
            if n >= dists.shape[0]:
                return _uuids, dists

            near_indices = np.argpartition(dists, n - 1)[:n]
            return ([_uuids[idx] for idx in near_indices], dists[near_indices])

        with self._model_lock:
            LOG.debug(f"Received query for {n} nearest neighbors")

            depth, ntrees, db_size = self._depth, self._num_trees, self.count()
            leaf_size = db_size // (1 << depth)
            if leaf_size * ntrees < n:
                LOG.warning(
                    f"The number of descriptors in a leaf ({leaf_size}) times "
                    f"the number of trees ({ntrees}) is less than the number "
                    f"of descriptors requested by the query ({n}). The query "
                    f"result will be deficient.")

            # Take union of all tree hits
            tree_hits: Set[Hashable] = set()
            for t in self._trees:
                tree_hits.update(_query_single(t))

            hit_union = len(tree_hits)
            LOG.debug(
                f"Query (k): {n}, Hit union (h): {hit_union}, "
                f"DB (N): {db_size}, Leaf size (L = N/2^l): {leaf_size}, "
                f"Examined (T*L): {leaf_size * ntrees}")
            LOG.debug(f"k/L     = {n / leaf_size:.3f}")
            LOG.debug(f"h/N     = {hit_union / db_size:.3f}")
            LOG.debug(f"h/L     = {hit_union / leaf_size:.3f}")
            LOG.debug(f"h/(T*L) = {hit_union / (leaf_size * ntrees):.3f}")

            uuids, distances = _exact_query(list(tree_hits))
            order = distances.argsort()
            uuids, distances = zip(*((uuids[oidx], distances[oidx])
                                     for oidx in order))

            LOG.debug(f"Returning query result of size {len(uuids)}")

            return (tuple(self._descriptor_set.get_many_descriptors(uuids)),
                    tuple(distances))
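
The per-tree descent in `_query_single` uses heap-style indexing over a complete binary tree of split points: the children of node `i` sit at `2*i + 1` and `2*i + 2`, and the leaves start at index `2^depth - 1`. A worked sketch of that arithmetic for a hypothetical depth-3 tree and an invented descent path:

depth = 3
idx = 0
# Suppose the projected query goes right, then left, then right at the
# successive split points (hypothetical path, not from the source).
for go_right in (True, False, True):
    idx = 2 * idx + (2 if go_right else 1)   # 0 -> 2 -> 5 -> 12

# Internal nodes occupy indices 0 .. 2^depth - 2, so subtracting
# (1 << depth) - 1 maps the final index onto a position in the leaf list.
leaf_slot = idx - ((1 << depth) - 1)         # 12 - 7 = 5
print(idx, leaf_slot)                        # -> 12 5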