Exemple #1
0
    def neighbors(self, k=10, queue_size=5, random_state=0):
        """\
        Find neighbors in `adata` for every observation of `adata_new`.

        The reference representation is read from `self._rep` and the query
        representation from `self._obsm['rep']`. The first `k` columns of the
        index and distance arrays returned by `deheap_sort` are stored in
        `self._indices` and `self._distances`; nothing is returned.

        Parameters
        ----------
        k
            Number of neighbors kept for each query observation.
        queue_size
            Multiplier on `k`; `int(k * queue_size)` is handed to the search
            initialisation.
        random_state
            Passed through `check_random_state` and used to draw the three
            integers that seed the search.
        """
        from umap.utils import deheap_sort
        from umap.umap_ import INT32_MAX, INT32_MIN

        rng = check_random_state(random_state)
        seed_draw = rng.randint(INT32_MIN, INT32_MAX, 3)
        rng_state = seed_draw.astype(np.int64)

        reference = self._rep
        queries = self._obsm['rep']
        n_candidates = int(k * queue_size)

        # Build the starting heap from the stored forest, then refine it by
        # walking the stored search graph.
        start_heap = self._initialise_search(
            self._rp_forest, reference, queries, n_candidates, rng_state=rng_state
        )
        found = self._search(
            reference,
            self._search_graph.indptr,
            self._search_graph.indices,
            start_heap,
            queries,
        )

        sorted_idx, sorted_dist = deheap_sort(found)
        # The search ran with a larger pool than requested; keep only k columns.
        self._indices, self._distances = sorted_idx[:, :k], sorted_dist[:, :k]
Exemple #2
0
    def neighbors(self, k=None, queue_size=5, epsilon=0.1, random_state=0):
        """\
        Find neighbors in `adata` for every observation of `adata_new`.

        Results are written to `self._indices` and `self._distances`; nothing
        is returned. When `self._use_pynndescent` is set the query is delegated
        to `self._nnd_idx.query`; otherwise the stored initialisation and
        search routines are used.

        Parameters
        ----------
        k
            Number of neighbors per query observation. `None` falls back to
            `self._n_neighbors`.
        queue_size
            Multiplier on `k` for the candidate pool (non-pynndescent path
            only).
        epsilon
            Forwarded to `self._nnd_idx.query` (pynndescent path only).
        random_state
            Passed through `check_random_state` and used to draw the three
            integers that seed the search.
        """
        from umap.umap_ import INT32_MAX, INT32_MIN

        rng = check_random_state(random_state)
        rng_state = rng.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)

        reference = self._rep
        queries = self._obsm['rep']

        k = self._n_neighbors if k is None else k

        if self._use_pynndescent:
            # Hand the freshly drawn seed to the index before querying it.
            self._nnd_idx.search_rng_state = rng_state
            self._indices, self._distances = self._nnd_idx.query(queries, k, epsilon)
            return

        # Only needed on this path, hence the late import.
        from umap.utils import deheap_sort

        n_candidates = int(k * queue_size)
        start_heap = self._initialise_search(
            self._rp_forest, reference, queries, n_candidates, rng_state=rng_state
        )
        found = self._search(
            reference,
            self._search_graph.indptr,
            self._search_graph.indices,
            start_heap,
            queries,
        )

        sorted_idx, sorted_dist = deheap_sort(found)
        # The search ran with a larger pool than requested; keep only k columns.
        self._indices, self._distances = sorted_idx[:, :k], sorted_dist[:, :k]
Exemple #3
0
def nn_descent(
    data,
    n_neighbors,
    rng_state,
    max_candidates=50,
    dist=dist.euclidean,
    n_iters=10,
    delta=0.001,
    rho=0.5,
    rp_tree_init=True,
    leaf_array=None,
    low_memory=False,
    verbose=False,
):
    """Build a neighbor graph over the rows of `data` and return it sorted.

    The heap is first filled with `n_neighbors` randomly sampled partners per
    row, optionally augmented by `init_rp_tree`, and then refined by either the
    low-memory or the high-memory internal routine. The return value is
    whatever `deheap_sort` produces for the final heap.

    Parameters
    ----------
    data
        Array whose rows are the points; `data.shape[0]` is the point count.
    n_neighbors
        Heap width per point and the number of random partners sampled.
    rng_state
        State passed to `rejection_sample` and to the internal routines.
    dist
        Callable taking two rows and returning their distance.
    rp_tree_init
        When true, `init_rp_tree` is run with `leaf_array`.
    low_memory
        Selects the low-memory internal routine, which does not receive the
        `tried` set.
    max_candidates, n_iters, delta, rho, verbose
        Forwarded unchanged to the internal routine.
    """
    n_points = data.shape[0]

    # Pairs whose distance has been evaluated. The placeholder element is kept
    # exactly as in the original so the container starts non-empty.
    tried = set([(-1, -1)])

    current_graph = make_heap(n_points, n_neighbors)
    for point in range(n_points):
        sampled = rejection_sample(n_neighbors, n_points, rng_state)
        for slot in range(sampled.shape[0]):
            other = sampled[slot]
            d = dist(data[point], data[other])
            # Record the edge in both directions.
            heap_push(current_graph, point, d, other, 1)
            heap_push(current_graph, other, d, point, 1)
            tried.add((point, other))
            tried.add((other, point))

    if rp_tree_init:
        init_rp_tree(data, dist, current_graph, leaf_array, tried=tried)

    if not low_memory:
        nn_descent_internal_high_memory(
            current_graph,
            data,
            n_neighbors,
            rng_state,
            tried,
            max_candidates=max_candidates,
            dist=dist,
            n_iters=n_iters,
            delta=delta,
            rho=rho,
            verbose=verbose,
        )
    else:
        nn_descent_internal_low_memory(
            current_graph,
            data,
            n_neighbors,
            rng_state,
            max_candidates=max_candidates,
            dist=dist,
            n_iters=n_iters,
            delta=delta,
            rho=rho,
            verbose=verbose,
        )

    return deheap_sort(current_graph)
Exemple #4
0
def test_nn_search(nn_data):
    """Check that the dense graph search recovers at least 99% of the true 10-NN.

    Rows of `nn_data` from index 100 onward form the training set and the
    first 100 rows are the queries. Approximate neighbors obtained through
    `initialise_search` and `initialized_nnd_search` are compared against an
    exact `KDTree` query on the same data.
    """
    n_neighbors = 10
    train = nn_data[100:]
    test = nn_data[:100]

    (knn_indices, knn_dists, rp_forest) = nearest_neighbors(
        train,
        n_neighbors,
        "euclidean",
        {},
        False,
        np.random,
        use_pynndescent=False,
    )

    search_graph = setup_search_graph(knn_dists, knn_indices, train)
    rng_state = np.random.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)
    # Search with a pool three times larger than the number of neighbors kept.
    init = initialise_search(
        rp_forest, train, test, int(n_neighbors * 3), rng_state, dist.euclidean
    )
    result = initialized_nnd_search(
        train, search_graph.indptr, search_graph.indices, init, test, dist.euclidean
    )

    indices, _ = deheap_sort(result)
    indices = indices[:, :n_neighbors]

    tree = KDTree(train)
    true_indices = tree.query(test, n_neighbors, return_distance=False)

    # Count, per query, how many exact neighbors were also found by the search.
    # np.isin replaces np.in1d, which is deprecated in NumPy 2.0.
    num_correct = 0.0
    for i in range(test.shape[0]):
        num_correct += np.sum(np.isin(true_indices[i], indices[i]))

    percent_correct = num_correct / (test.shape[0] * n_neighbors)
    assert_greater_equal(
        percent_correct,
        0.99,
        "NN-descent did not get "
        "99% accuracy on nearest "
        "neighbors",
    )
Exemple #5
0
def test_nn_search():
    """Check that the dense graph search recovers at least 99% of the true 10-NN.

    Uses the module-level `nn_data`: rows from index 100 onward form the
    training set and the first 100 rows are the queries. Approximate neighbors
    from the factory-built search routines are compared against an exact
    `KDTree` query on the same data.
    """
    n_neighbors = 10
    train = nn_data[100:]
    test = nn_data[:100]
    (knn_indices, knn_dists, rp_forest) = nearest_neighbors(
        train, n_neighbors, "euclidean", {}, False, np.random
    )

    # The result is not read by any assertion below; the call is kept so the
    # test still runs fuzzy_simplicial_set on these inputs.
    fuzzy_simplicial_set(
        nn_data,
        n_neighbors,
        np.random,
        "euclidean",
        {},
        knn_indices,
        knn_dists,
        False,
        1.0,
        1.0,
        False,
    )

    # Build a symmetric 0/1 adjacency matrix from the k-NN result.
    search_graph = sparse.lil_matrix((train.shape[0], train.shape[0]),
                                     dtype=np.int8)
    search_graph.rows = knn_indices
    search_graph.data = (knn_dists != 0).astype(np.int8)
    search_graph = search_graph.maximum(search_graph.transpose()).tocsr()

    random_init, tree_init = make_initialisations(dist.euclidean, ())
    search = make_initialized_nnd_search(dist.euclidean, ())

    rng_state = np.random.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)
    # Search with a pool three times larger than the number of neighbors kept.
    init = initialise_search(rp_forest, train, test, int(n_neighbors * 3),
                             random_init, tree_init, rng_state)
    result = search(train, search_graph.indptr, search_graph.indices, init,
                    test)

    indices, _ = deheap_sort(result)
    indices = indices[:, :n_neighbors]

    tree = KDTree(train)
    true_indices = tree.query(test, n_neighbors, return_distance=False)

    # Count, per query, how many exact neighbors were also found by the search.
    # np.isin replaces np.in1d, which is deprecated in NumPy 2.0.
    num_correct = 0.0
    for i in range(test.shape[0]):
        num_correct += np.sum(np.isin(true_indices[i], indices[i]))

    percent_correct = num_correct / (test.shape[0] * n_neighbors)
    assert_greater_equal(
        percent_correct,
        0.99,
        "NN-descent did not get "
        "99% accuracy on nearest "
        "neighbors",
    )
Exemple #6
0
def _nhood_search(umap_object, nhood_size):
    """Find the `nhood_size` nearest neighbors of every point in `_raw_data`.

    Two routes are used depending on `umap_object._small_data`:

    * small data: an exact search over the full pairwise distance matrix;
    * otherwise: an approximate search that queries `_raw_data` against
      itself through `initialise_search` and `initialized_nnd_search`.

    Parameters
    ----------
    umap_object
        Fitted object providing `_small_data`, `_raw_data`, `_metric_kwds`,
        `_input_distance_func`, `_rp_forest`, `_search_graph` and
        `transform_queue_size`.
    nhood_size
        Number of neighbors to return per point.

    Returns
    -------
    indices, dists
        Two arrays with `nhood_size` columns holding neighbor indices and the
        matching distances.
    """
    if umap_object._small_data:
        dmat = sklearn.metrics.pairwise_distances(umap_object._raw_data)
        # argpartition places the nhood_size smallest entries of each row in
        # the leading columns, in no particular order.
        indices = np.argpartition(dmat, nhood_size)[:, :nhood_size]
        dmat_shortened = submatrix(dmat, indices, nhood_size)
        # Sort each row's candidates by distance, then reorder both arrays.
        indices_sorted = np.argsort(dmat_shortened)
        indices = submatrix(indices, indices_sorted, nhood_size)
        dists = submatrix(dmat_shortened, indices_sorted, nhood_size)
    else:
        # NOTE(review): np.empty leaves the three state values uninitialised,
        # so the search seed is arbitrary -- confirm this is intended.
        rng_state = np.empty(3, dtype=np.int64)

        if len(umap_object._metric_kwds) >= 1:
            _dist = umap_object._input_distance_func
            _args = tuple(umap_object._metric_kwds.values())

            @numba.njit()
            def _metric(x, y):
                # The wrapper must hand the distance back; without the
                # `return` every call would yield None.
                return _dist(x, y, *_args)

        else:
            _metric = umap_object._input_distance_func

        init = initialise_search(
            umap_object._rp_forest,
            umap_object._raw_data,
            umap_object._raw_data,
            int(nhood_size * umap_object.transform_queue_size),
            rng_state,
            _metric,
        )

        result = initialized_nnd_search(
            umap_object._raw_data,
            umap_object._search_graph.indptr,
            umap_object._search_graph.indices,
            init,
            umap_object._raw_data,
            _metric,
        )

        indices, dists = deheap_sort(result)
        # The search pool is larger than requested; keep nhood_size columns.
        indices = indices[:, :nhood_size]
        dists = dists[:, :nhood_size]

    return indices, dists
Exemple #7
0
def neighbors_update(adata, adata_new, k=10, queue_size=5, random_state=0):
    """Find, for each row of `adata_new.X`, its `k` neighbors among `adata.X`.

    The metric, its keyword arguments, the stored distance graph and (when
    present) the stored forest are all read from `adata.uns['neighbors']`.

    Parameters
    ----------
    adata
        Reference object; supplies `X` and `uns['neighbors']`.
    adata_new
        Query object; supplies `X`.
    k
        Number of neighbors returned per query row.
    queue_size
        Multiplier on `k`; `int(k * queue_size)` is handed to the search
        initialisation.
    random_state
        Passed through `check_random_state` and used to draw the three
        integers that seed the search.

    Returns
    -------
    indices, dists
        The first `k` columns of the two arrays returned by `deheap_sort`.
    """
    # only with use_rep='X' for now
    from umap.nndescent import make_initialisations, make_initialized_nnd_search, initialise_search
    from umap.umap_ import INT32_MAX, INT32_MIN
    from umap.utils import deheap_sort
    import umap.distances as dist

    neighbors_info = adata.uns['neighbors']
    params = neighbors_info['params']

    dist_args = (
        tuple(params['metric_kwds'].values()) if 'metric_kwds' in params else ()
    )
    dist_func = dist.named_distances[params['metric']]

    random_init, tree_init = make_initialisations(dist_func, dist_args)
    search = make_initialized_nnd_search(dist_func, dist_args)

    # Turn the stored distances into a symmetric 0/1 adjacency structure.
    adjacency = neighbors_info['distances'].copy()
    adjacency.data = (adjacency.data > 0).astype(np.int8)
    adjacency = adjacency.maximum(adjacency.transpose())
    # prune it?

    rng = check_random_state(random_state)
    rng_state = rng.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)

    rp_forest = (
        _rp_forest_generate(neighbors_info['rp_forest'])
        if 'rp_forest' in neighbors_info
        else None
    )
    reference = adata.X
    queries = adata_new.X

    start_heap = initialise_search(
        rp_forest, reference, queries, int(k * queue_size),
        random_init, tree_init, rng_state,
    )
    found = search(reference, adjacency.indptr, adjacency.indices, start_heap, queries)

    sorted_idx, sorted_dist = deheap_sort(found)
    return sorted_idx[:, :k], sorted_dist[:, :k]
Exemple #8
0
def _nhood_search(umap_object, nhood_size):
    """Find the `nhood_size` nearest neighbors of every point in `_raw_data`.

    When `umap_object._small_data` is set, the full pairwise distance matrix
    is computed and the closest entries of each row are selected and sorted.
    Otherwise `_raw_data` is queried against itself through
    `initialise_search` and `initialized_nnd_search`, using the object's
    `_distance_func` and `_dist_args`.

    Returns
    -------
    indices, dists
        Two arrays with `nhood_size` columns holding neighbor indices and the
        matching distances.
    """
    is_small = umap_object._small_data
    points = umap_object._raw_data

    if is_small:
        full_dmat = sklearn.metrics.pairwise_distances(points)
        # argpartition places the nhood_size smallest entries of each row in
        # the leading columns, in no particular order.
        nearest = np.argpartition(full_dmat, nhood_size)[:, :nhood_size]
        nearest_dists = submatrix(full_dmat, nearest, nhood_size)
        # Sort each row's candidates by distance, then reorder both arrays.
        order = np.argsort(nearest_dists)
        return (
            submatrix(nearest, order, nhood_size),
            submatrix(nearest_dists, order, nhood_size),
        )

    rng_state = np.empty(3, dtype=np.int64)
    metric = umap_object._distance_func
    metric_args = umap_object._dist_args
    pool_size = int(nhood_size * umap_object.transform_queue_size)

    start_heap = initialise_search(
        umap_object._rp_forest,
        points,
        points,
        pool_size,
        rng_state,
        metric,
        metric_args,
    )

    graph = umap_object._search_graph
    found = initialized_nnd_search(
        points,
        graph.indptr,
        graph.indices,
        start_heap,
        points,
        metric,
        metric_args,
    )

    sorted_idx, sorted_dist = deheap_sort(found)
    # The search pool is larger than requested; keep nhood_size columns.
    return sorted_idx[:, :nhood_size], sorted_dist[:, :nhood_size]
Exemple #9
0
def test_sparse_nn_search(sparse_nn_data):
    """Check that the sparse graph search recovers at least 85% of the true 10-NN.

    Rows of `sparse_nn_data` from index 100 onward form the training set and
    the first 100 rows are the queries. Approximate neighbors obtained through
    `sparse_initialise_search` and `sparse_initialized_nnd_search` are compared
    against an exact `KDTree` query on the densified data.
    """
    n_neighbors = 10
    train = sparse_nn_data[100:]
    test = sparse_nn_data[:100]
    (knn_indices, knn_dists, rp_forest) = nearest_neighbors(
        train,
        15,
        "euclidean",
        {},
        False,
        np.random,
        use_pynndescent=False,
    )

    search_graph = setup_search_graph(knn_dists, knn_indices, train)
    rng_state = np.random.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)

    # Search with a pool six times larger than the number of neighbors kept.
    init = sparse_initialise_search(
        rp_forest,
        train.indices,
        train.indptr,
        train.data,
        test.indices,
        test.indptr,
        test.data,
        int(n_neighbors * 6),
        rng_state,
        spdist.sparse_euclidean,
    )

    result = sparse_initialized_nnd_search(
        train.indices,
        train.indptr,
        train.data,
        search_graph.indptr,
        search_graph.indices,
        init,
        test.indices,
        test.indptr,
        test.data,
        spdist.sparse_euclidean,
    )
    indices, _ = deheap_sort(result)
    indices = indices[:, :n_neighbors]

    # KDTree needs dense input, so both sets are densified for the reference.
    tree = KDTree(train.toarray())
    true_indices = tree.query(test.toarray(), n_neighbors, return_distance=False)

    # Count, per query, how many exact neighbors were also found by the search.
    # np.isin replaces np.in1d, which is deprecated in NumPy 2.0.
    num_correct = 0.0
    for i in range(test.shape[0]):
        num_correct += np.sum(np.isin(true_indices[i], indices[i]))

    percent_correct = num_correct / (test.shape[0] * n_neighbors)
    assert_greater_equal(
        percent_correct,
        0.85,
        "Sparse NN-descent did not get "
        "85% accuracy on nearest "
        "neighbors",
    )
def sparse_nn_descent(
    inds,
    indptr,
    data,
    n_vertices,
    n_neighbors,
    rng_state,
    max_candidates=50,
    sparse_dist=umap.sparse.sparse_euclidean,
    n_iters=10,
    delta=0.001,
    rho=0.5,
    low_memory=False,
    rp_tree_init=True,
    leaf_array=None,
    verbose=False,
):
    """Build a neighbor graph over sparse rows and return it sorted.

    Row `r` is described by `inds[indptr[r]:indptr[r + 1]]` and
    `data[indptr[r]:indptr[r + 1]]`. The heap is first filled with
    `n_neighbors` randomly sampled partners per row, optionally augmented by
    `sparse_init_rp_tree`, and then refined by either the low-memory or the
    high-memory internal routine. The return value is whatever `deheap_sort`
    produces for the final heap.

    Parameters
    ----------
    inds, indptr, data
        Compressed row arrays, sliced as described above.
    n_vertices
        Number of rows.
    n_neighbors
        Heap width per row and the number of random partners sampled.
    rng_state
        State passed to `rejection_sample` and to the internal routines.
    sparse_dist
        Callable taking `(inds_a, data_a, inds_b, data_b)` and returning a
        distance.
    rp_tree_init
        When true, `sparse_init_rp_tree` is run with `leaf_array`.
    low_memory
        Selects the low-memory internal routine, which does not receive the
        `tried` set.
    max_candidates, n_iters, delta, rho, verbose
        Forwarded unchanged to the internal routine.
    """
    # Pairs whose distance has been evaluated. The placeholder element is kept
    # exactly as in the original so the container starts non-empty.
    tried = set([(-1, -1)])

    current_graph = make_heap(n_vertices, n_neighbors)
    for row in range(n_vertices):
        sampled = rejection_sample(n_neighbors, n_vertices, rng_state)
        for slot in range(sampled.shape[0]):
            other = sampled[slot]

            row_start, row_stop = indptr[row], indptr[row + 1]
            from_inds = inds[row_start:row_stop]
            from_data = data[row_start:row_stop]

            other_start, other_stop = indptr[other], indptr[other + 1]
            to_inds = inds[other_start:other_stop]
            to_data = data[other_start:other_stop]

            d = sparse_dist(from_inds, from_data, to_inds, to_data)

            # Record the edge in both directions.
            heap_push(current_graph, row, d, other, 1)
            heap_push(current_graph, other, d, row, 1)
            tried.add((row, other))
            tried.add((other, row))

    if rp_tree_init:
        sparse_init_rp_tree(
            inds,
            indptr,
            data,
            sparse_dist,
            current_graph,
            leaf_array,
            tried=tried,
        )

    if not low_memory:
        sparse_nn_descent_internal_high_memory(
            current_graph,
            inds,
            indptr,
            data,
            n_vertices,
            n_neighbors,
            rng_state,
            tried,
            max_candidates=max_candidates,
            sparse_dist=sparse_dist,
            n_iters=n_iters,
            delta=delta,
            rho=rho,
            verbose=verbose,
        )
    else:
        sparse_nn_descent_internal_low_memory(
            current_graph,
            inds,
            indptr,
            data,
            n_vertices,
            n_neighbors,
            rng_state,
            max_candidates=max_candidates,
            sparse_dist=sparse_dist,
            n_iters=n_iters,
            delta=delta,
            rho=rho,
            verbose=verbose,
        )

    return deheap_sort(current_graph)
Exemple #11
0
    def nn_descent(
        data,
        n_neighbors,
        rng_state,
        max_candidates=50,
        n_iters=10,
        delta=0.001,
        rho=0.5,
        rp_tree_init=True,
        leaf_array=None,
        verbose=False,
    ):
        """Build a neighbor graph over the rows of `data` and return it sorted.

        `dist` and `dist_args` are not parameters; they come from the
        enclosing scope, which is not visible here.

        Steps: fill the heap with `n_neighbors` random partners per row;
        optionally push every pair of points sharing a row of `leaf_array`;
        then run up to `n_iters` refinement rounds, stopping early once a
        round produces at most `delta * n_neighbors * n_vertices` successful
        heap pushes.

        Parameters
        ----------
        data
            Array whose rows are the points.
        n_neighbors
            Heap width per point and the number of random partners sampled.
        rng_state
            State passed to `rejection_sample`, `build_candidates` and
            `tau_rand`.
        max_candidates
            Number of candidate slots examined per vertex in each round.
        rho
            A candidate is skipped when `tau_rand(rng_state) < rho`.
        rp_tree_init
            When true, `leaf_array` is used for the pairwise initialisation.
        leaf_array
            2-D integer array; a negative entry ends the useful part of a row.
        verbose
            Print the round counter when true.

        Returns
        -------
        The result of `deheap_sort` on the final heap.
        """
        n_vertices = data.shape[0]

        # Random initialisation.
        current_graph = make_heap(n_vertices, n_neighbors)
        for point in range(n_vertices):
            sampled = rejection_sample(n_neighbors, n_vertices, rng_state)
            for slot in range(sampled.shape[0]):
                other = sampled[slot]
                d = dist(data[point], data[other], *dist_args)
                heap_push(current_graph, point, d, other, 1)
                heap_push(current_graph, other, d, point, 1)

        # Leaf initialisation: every pair of points within one leaf row.
        if rp_tree_init:
            leaf_width = leaf_array.shape[1]
            for leaf in range(leaf_array.shape[0]):
                for a_pos in range(leaf_width):
                    a = leaf_array[leaf, a_pos]
                    if a < 0:
                        break
                    for b_pos in range(a_pos + 1, leaf_width):
                        b = leaf_array[leaf, b_pos]
                        if b < 0:
                            break
                        d = dist(data[a], data[b], *dist_args)
                        heap_push(current_graph, a, d, b, 1)
                        heap_push(current_graph, b, d, a, 1)

        # Refinement rounds.
        for iteration in range(n_iters):
            if verbose:
                print("\t", iteration, " / ", n_iters)

            candidate_neighbors = build_candidates(
                current_graph, n_vertices, n_neighbors, max_candidates, rng_state
            )

            n_updates = 0
            for vertex in range(n_vertices):
                for a_slot in range(max_candidates):
                    p = int(candidate_neighbors[0, vertex, a_slot])
                    # tau_rand is only drawn for non-negative candidates.
                    if p < 0 or tau_rand(rng_state) < rho:
                        continue
                    for b_slot in range(max_candidates):
                        q = int(candidate_neighbors[0, vertex, b_slot])
                        if q < 0:
                            continue
                        # Layer 2 holds a per-candidate flag; skip the pair
                        # when neither candidate is flagged.
                        if (
                            not candidate_neighbors[2, vertex, a_slot]
                            and not candidate_neighbors[2, vertex, b_slot]
                        ):
                            continue

                        d = dist(data[p], data[q], *dist_args)
                        n_updates += heap_push(current_graph, p, d, q, 1)
                        n_updates += heap_push(current_graph, q, d, p, 1)

            # Too few successful pushes this round: stop iterating.
            if n_updates <= delta * n_neighbors * n_vertices:
                break

        return deheap_sort(current_graph)
Exemple #12
0
    def nn_descent(inds,
                   indptr,
                   data,
                   n_vertices,
                   n_neighbors,
                   rng_state,
                   max_candidates=50,
                   n_iters=10,
                   delta=0.001,
                   rho=0.5,
                   rp_tree_init=True,
                   leaf_array=None,
                   verbose=False):
        """Build a neighbor graph over sparse rows and return it sorted.

        Row `r` is described by `inds[indptr[r]:indptr[r + 1]]` and
        `data[indptr[r]:indptr[r + 1]]`. `sparse_dist` and `dist_args` are not
        parameters; they come from the enclosing scope, which is not visible
        here.

        Steps: fill the heap with `n_neighbors` random partners per row;
        optionally push every pair of points sharing a row of `leaf_array`;
        then run up to `n_iters` refinement rounds, stopping early once a
        round produces at most `delta * n_neighbors * n_vertices` successful
        heap pushes.

        Parameters
        ----------
        inds, indptr, data
            Compressed row arrays, sliced as described above.
        n_vertices
            Number of rows.
        n_neighbors
            Heap width per row and the number of random partners sampled.
        rng_state
            State passed to `rejection_sample`, `build_candidates` and
            `tau_rand`.
        max_candidates
            Number of candidate slots examined per vertex in each round.
        rho
            A candidate is skipped when `tau_rand(rng_state) < rho`.
        rp_tree_init
            When true, `leaf_array` is used for the pairwise initialisation.
        leaf_array
            2-D integer array; a negative entry ends the useful part of a row.
        verbose
            Print the round counter when true.

        Returns
        -------
        The result of `deheap_sort` on the final heap.
        """
        # Random initialisation.
        current_graph = make_heap(n_vertices, n_neighbors)
        for row in range(n_vertices):
            sampled = rejection_sample(n_neighbors, n_vertices, rng_state)
            for slot in range(sampled.shape[0]):
                other = sampled[slot]

                from_inds = inds[indptr[row]:indptr[row + 1]]
                from_data = data[indptr[row]:indptr[row + 1]]

                to_inds = inds[indptr[other]:indptr[other + 1]]
                to_data = data[indptr[other]:indptr[other + 1]]

                d = sparse_dist(from_inds, from_data, to_inds, to_data,
                                *dist_args)

                heap_push(current_graph, row, d, other, 1)
                heap_push(current_graph, other, d, row, 1)

        # Leaf initialisation: every pair of points within one leaf row.
        if rp_tree_init:
            leaf_width = leaf_array.shape[1]
            for leaf in range(leaf_array.shape[0]):
                for a_pos in range(leaf_width):
                    a = leaf_array[leaf, a_pos]
                    if a < 0:
                        break
                    for b_pos in range(a_pos + 1, leaf_width):
                        b = leaf_array[leaf, b_pos]
                        if b < 0:
                            break

                        from_inds = inds[indptr[a]:indptr[a + 1]]
                        from_data = data[indptr[a]:indptr[a + 1]]

                        to_inds = inds[indptr[b]:indptr[b + 1]]
                        to_data = data[indptr[b]:indptr[b + 1]]

                        d = sparse_dist(from_inds, from_data, to_inds, to_data,
                                        *dist_args)

                        heap_push(current_graph, a, d, b, 1)
                        heap_push(current_graph, b, d, a, 1)

        # Refinement rounds.
        for iteration in range(n_iters):
            if verbose:
                print("\t", iteration, " / ", n_iters)

            candidate_neighbors = build_candidates(current_graph, n_vertices,
                                                   n_neighbors, max_candidates,
                                                   rng_state)

            n_updates = 0
            for vertex in range(n_vertices):
                for a_slot in range(max_candidates):
                    p = int(candidate_neighbors[0, vertex, a_slot])
                    # tau_rand is only drawn for non-negative candidates.
                    if p < 0 or tau_rand(rng_state) < rho:
                        continue
                    for b_slot in range(max_candidates):
                        q = int(candidate_neighbors[0, vertex, b_slot])
                        if q < 0:
                            continue
                        # Layer 2 holds a per-candidate flag; skip the pair
                        # when neither candidate is flagged.
                        if not candidate_neighbors[2, vertex, a_slot] and not \
                                candidate_neighbors[2, vertex, b_slot]:
                            continue

                        from_inds = inds[indptr[p]:indptr[p + 1]]
                        from_data = data[indptr[p]:indptr[p + 1]]

                        to_inds = inds[indptr[q]:indptr[q + 1]]
                        to_data = data[indptr[q]:indptr[q + 1]]

                        d = sparse_dist(from_inds, from_data, to_inds, to_data,
                                        *dist_args)

                        n_updates += heap_push(current_graph, p, d, q, 1)
                        n_updates += heap_push(current_graph, q, d, p, 1)

            # Too few successful pushes this round: stop iterating.
            if n_updates <= delta * n_neighbors * n_vertices:
                break

        return deheap_sort(current_graph)