Ejemplo n.º 1
0
def get_invlist(invlists, l):
    """Copy one inverted list out of `invlists` into numpy arrays.

    Args:
        invlists: faiss inverted-lists object; it is downcast with
            ``faiss.downcast_InvertedLists`` before use.
        l: number of the inverted list to read.

    Returns:
        The pair ``(list_ids, list_codes)``. ``list_ids`` is an int64 array
        with one entry per element of the list. ``list_codes`` is a uint8
        array of shape ``(list_size, code_size)``, or, when the code size is
        ``INVALID_CODE_SIZE`` (block inverted lists), of shape
        ``(n_blocks, block_size // n_per_block, n_per_block)``.
    """
    invlists = faiss.downcast_InvertedLists(invlists)
    n_entries = invlists.list_size(l)
    has_entries = n_entries > 0
    list_ids = np.zeros(n_entries, dtype='int64')

    # Both pointers start as None so the cleanup below only releases what
    # was actually acquired.
    ids_ptr = None
    codes_ptr = None
    try:
        ids_ptr = invlists.get_ids(l)
        if has_entries:
            faiss.memcpy(faiss.swig_ptr(list_ids), ids_ptr, list_ids.nbytes)

        codes_ptr = invlists.get_codes(l)
        if invlists.code_size == faiss.InvertedLists.INVALID_CODE_SIZE:
            # it's a BlockInvertedLists: size the output from the block
            # geometry, rounding the number of blocks up.
            per_block = invlists.n_per_block
            block_bytes = invlists.block_size
            n_blocks = (n_entries + per_block - 1) // per_block
            codes_shape = (n_blocks, block_bytes // per_block, per_block)
        else:
            codes_shape = (n_entries, invlists.code_size)
        list_codes = np.zeros(codes_shape, dtype='uint8')

        if has_entries:
            faiss.memcpy(
                faiss.swig_ptr(list_codes), codes_ptr, list_codes.nbytes)
    finally:
        # Hand the buffers back to the inverted lists even if a copy failed.
        if ids_ptr is not None:
            invlists.release_ids(l, ids_ptr)
        if codes_ptr is not None:
            invlists.release_codes(l, codes_ptr)
    return list_ids, list_codes
Ejemplo n.º 2
0
def run_kmeans(x, nmb_clusters, verbose=False, init_cents=None):
    """Run faiss k-means on `x` and assign each point to its nearest centroid.

    Clustering runs for 20 iterations against a ``faiss.IndexFlatL2`` index;
    no GPU resources are set up in this function.

    Args:
        x: 2-D array of points, one row per point.
        nmb_clusters (int): number of clusters.
        verbose (bool): if True, print the loss recorded in ``clus.obj``.
        init_cents: optional initial centroids. They are converted to a
            C-contiguous float32 array before being copied into the
            clustering object (presumably ``nmb_clusters * d`` values --
            verify against callers).

    Returns:
        tuple: ``(assignments, final_loss, centroids)`` where ``assignments``
        is a list with the index of the nearest centroid for each row of
        `x`, ``final_loss`` is the last entry of the loss history, and
        ``centroids`` is an array of shape ``(nmb_clusters, d)``.
    """
    _, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000

    if init_cents is not None:
        import numpy as np

        # The centroid vector stores float32 values and the raw copy below
        # needs contiguous memory, so coerce the input first; a float64 or
        # strided array would otherwise be copied with the wrong byte layout.
        init_cents = np.ascontiguousarray(init_cents, dtype='float32')
        clus.centroids.resize(init_cents.size)
        faiss.memcpy(clus.centroids.data(), faiss.swig_ptr(init_cents),
                     init_cents.nbytes)

    index = faiss.IndexFlatL2(d)

    # perform the training
    clus.train(x, index)
    # After training, search the same index for the single nearest entry
    # of each point to obtain its cluster assignment.
    _, nearest = index.search(x, 1)
    losses = faiss.vector_to_array(clus.obj)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    centroids = faiss.vector_to_array(clus.centroids).reshape(
        (nmb_clusters, d))

    return [int(row[0]) for row in nearest], losses[-1], centroids
Ejemplo n.º 3
0
    def get_cluster_ids(self, list_num: int) -> np.ndarray:
        """Return the ids stored in one inverted list of the merged index.

        The index is read from ``self.tempdir / self.MERGED_INDEX_NAME`` on
        each call, and the ids of list `list_num` are copied into a new
        numpy array.

        Args:
            list_num: number of the inverted list to read. It is converted
                to a plain ``int`` before being passed to faiss.

        Returns:
            An int64 array holding a copy of the list's ids. When
            ``self.multi_id`` is truthy, the result of
            ``self._invert_cantor_pairing_vec`` applied to that array is
            returned instead.
        """

        # TODO: assert IVF
        assert self.is_trained

        # This fixes problem with SWIG and numpy int
        list_num = int(list_num)

        index = faiss.read_index(str(self.tempdir / self.MERGED_INDEX_NAME))

        # Get the IVF from potentially opaque index
        invlists = faiss.extract_index_ivf(index).invlists
        list_size = invlists.list_size(list_num)
        list_ids = np.zeros(list_size, dtype=np.int64)

        temp_ids = None
        try:
            temp_ids = invlists.get_ids(list_num)
            # Need to copy since memory will be deallocated along with the
            # invlist. Nothing to copy for an empty list.
            if list_size > 0:
                faiss.memcpy(
                    faiss.swig_ptr(list_ids), temp_ids, list_ids.nbytes)
        finally:
            # Release the ids buffer even if the copy raised.
            if temp_ids is not None:
                invlists.release_ids(list_num, temp_ids)

        if self.multi_id:
            list_ids = self._invert_cantor_pairing_vec(list_ids)

        return list_ids
Ejemplo n.º 4
0
def get_invlist(invlists, l):
    """Return the content of one inverted list as numpy arrays.

    Args:
        invlists: faiss inverted-lists object exposing ``list_size``,
            ``code_size``, ``get_ids``/``release_ids`` and
            ``get_codes``/``release_codes``.
        l: number of the inverted list to read.

    Returns:
        The pair ``(list_ids, list_codes)``: an int64 array with one id per
        list entry and a uint8 array of shape ``(list_size, code_size)``.
        Both are empty when the list has no entries.
    """
    ls = invlists.list_size(l)
    list_ids = np.zeros(ls, dtype='int64')
    list_codes = np.zeros((ls, invlists.code_size), dtype='uint8')
    ids = codes = None
    try:
        ids = invlists.get_ids(l)
        # Skip the raw copies for an empty list: there is nothing to copy.
        if ls > 0:
            faiss.memcpy(faiss.swig_ptr(list_ids), ids, list_ids.nbytes)
        codes = invlists.get_codes(l)
        if ls > 0:
            faiss.memcpy(faiss.swig_ptr(list_codes), codes, list_codes.nbytes)
    finally:
        # Release whatever was acquired, even if a copy raised.
        if ids is not None:
            invlists.release_ids(l, ids)
        if codes is not None:
            invlists.release_codes(l, codes)
    return list_ids, list_codes