Ejemplo n.º 1
0
    def neighbors(self, k=10, queue_size=5, random_state=0):
        """\
        Find the `k` nearest reference observations for every query row.

        Query points are read from `self._obsm['rep']` and looked up in
        `self._rep`. The outcome is stored on the instance as
        `self._indices` and `self._distances` (both cut down to `k`
        columns); the method itself returns nothing.

        Parameters
        ----------
        k
            Number of neighbors kept per query observation.
        queue_size
            Search-width multiplier: `int(k * queue_size)` candidates are
            requested from `initialise_search` before truncating to `k`.
        random_state
            Handed to `check_random_state`; the resulting generator draws
            the three integers passed to the search as its RNG state.
        """
        from umap.nndescent import initialise_search
        from umap.utils import deheap_sort
        from umap.umap_ import INT32_MAX, INT32_MIN

        rng = check_random_state(random_state)
        seed_triplet = rng.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)

        reference, queries = self._rep, self._obsm['rep']
        n_candidates = int(k * queue_size)

        # Seed the candidate heap from the random-projection forest ...
        heap = initialise_search(
            self._rp_forest,
            reference,
            queries,
            n_candidates,
            self._random_init,
            self._tree_init,
            seed_triplet,
        )

        # ... then refine it by walking the precomputed search graph.
        graph = self._search_graph
        heap = self._search(reference, graph.indptr, graph.indices, heap, queries)

        found_indices, found_dists = deheap_sort(heap)
        self._indices = found_indices[:, :k]
        self._distances = found_dists[:, :k]
Ejemplo n.º 2
0
def test_nn_search(nn_data):
    """Check the initialised NN-descent search against an exact KD-tree query.

    The first 100 rows of `nn_data` act as queries and the remaining rows as
    the reference set. At least 99% of the true 10 nearest neighbours must
    appear among the 10 neighbours returned by the search.
    """
    n_neighbors = 10
    reference = nn_data[100:]
    queries = nn_data[:100]

    knn_indices, knn_dists, rp_forest = nearest_neighbors(
        reference,
        n_neighbors,
        "euclidean",
        {},
        False,
        np.random,
        use_pynndescent=False,
    )
    # A fuzzy_simplicial_set graph used to be built here; it was not really
    # used in the test, so it is no longer constructed.

    search_graph = setup_search_graph(knn_dists, knn_indices, reference)
    rng_state = np.random.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)

    # Search with a queue three times wider than the neighbours we keep.
    heap = initialise_search(
        rp_forest,
        reference,
        queries,
        int(n_neighbors * 3),
        rng_state,
        dist.euclidean,
    )
    heap = initialized_nnd_search(
        reference,
        search_graph.indptr,
        search_graph.indices,
        heap,
        queries,
        dist.euclidean,
    )

    found, _ = deheap_sort(heap)
    found = found[:, :n_neighbors]

    # Exact neighbours for comparison.
    expected = KDTree(reference).query(
        queries, n_neighbors, return_distance=False
    )

    # Count how many of the true neighbours were recovered, row by row.
    hits = sum(
        (
            np.sum(np.in1d(expected_row, found_row))
            for expected_row, found_row in zip(expected, found)
        ),
        0.0,
    )

    recall = hits / (queries.shape[0] * n_neighbors)
    assert_greater_equal(
        recall,
        0.99,
        "Sparse NN-descent did not get 99% accuracy on nearest neighbors",
    )
Ejemplo n.º 3
0
def test_nn_search():
    """Check the factory-built NN-descent search against an exact KD-tree query.

    The first 100 rows of the module-level `nn_data` act as queries and the
    remaining rows as the reference set. At least 99% of the true 10 nearest
    neighbours must appear among the 10 neighbours returned by the search.
    """
    n_neighbors = 10
    reference = nn_data[100:]
    queries = nn_data[:100]

    knn_indices, knn_dists, rp_forest = nearest_neighbors(
        reference, n_neighbors, "euclidean", {}, False, np.random
    )

    # The returned graph is not consulted below; the call itself is kept so
    # the test continues to run fuzzy_simplicial_set on these inputs.
    _unused_graph = fuzzy_simplicial_set(
        nn_data,
        n_neighbors,
        np.random,
        "euclidean",
        {},
        knn_indices,
        knn_dists,
        False,
        1.0,
        1.0,
        False,
    )

    # Build a symmetric 0/1 adjacency structure from the k-NN result.
    n_reference = reference.shape[0]
    adjacency = sparse.lil_matrix((n_reference, n_reference), dtype=np.int8)
    adjacency.rows = knn_indices
    adjacency.data = (knn_dists != 0).astype(np.int8)
    adjacency = adjacency.maximum(adjacency.transpose()).tocsr()

    random_init, tree_init = make_initialisations(dist.euclidean, ())
    run_search = make_initialized_nnd_search(dist.euclidean, ())

    rng_state = np.random.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)

    # Search with a queue three times wider than the neighbours we keep.
    heap = initialise_search(
        rp_forest,
        reference,
        queries,
        int(n_neighbors * 3),
        random_init,
        tree_init,
        rng_state,
    )
    heap = run_search(
        reference, adjacency.indptr, adjacency.indices, heap, queries
    )

    found, _ = deheap_sort(heap)
    found = found[:, :n_neighbors]

    # Exact neighbours for comparison.
    expected = KDTree(reference).query(
        queries, n_neighbors, return_distance=False
    )

    # Count how many of the true neighbours were recovered, row by row.
    hits = sum(
        (
            np.sum(np.in1d(expected_row, found_row))
            for expected_row, found_row in zip(expected, found)
        ),
        0.0,
    )

    recall = hits / (queries.shape[0] * n_neighbors)
    assert_greater_equal(
        recall,
        0.99,
        "Sparse NN-descent did not get 99% accuracy on nearest neighbors",
    )
Ejemplo n.º 4
0
def _nhood_search(umap_object, nhood_size):
    """Find the `nhood_size` nearest neighbours of every row of the raw data.

    The data in `umap_object._raw_data` is used both as the reference set and
    as the query set.

    Parameters
    ----------
    umap_object
        Object providing `_small_data`, `_raw_data`, and (for the
        non-small-data path) `_metric_kwds`, `_input_distance_func`,
        `_rp_forest`, `_search_graph` and `transform_queue_size`.
    nhood_size
        Number of neighbours to return per row.

    Returns
    -------
    indices, dists
        Two arrays with `nhood_size` columns each: neighbour indices and the
        corresponding distances.
    """
    if umap_object._small_data:
        # Full pairwise distance matrix, then keep the nhood_size smallest
        # entries per row and order them by ascending distance.
        dmat = sklearn.metrics.pairwise_distances(umap_object._raw_data)
        indices = np.argpartition(dmat, nhood_size)[:, :nhood_size]
        dmat_shortened = submatrix(dmat, indices, nhood_size)
        indices_sorted = np.argsort(dmat_shortened)
        indices = submatrix(indices, indices_sorted, nhood_size)
        dists = submatrix(dmat_shortened, indices_sorted, nhood_size)
    else:
        # NOTE(review): np.empty leaves the three state words uninitialised,
        # so the search is seeded with arbitrary memory contents -- confirm
        # this is intended.
        rng_state = np.empty(3, dtype=np.int64)

        if umap_object._metric_kwds:
            # Bind the extra metric arguments so the search routines can call
            # the metric with just (x, y).
            _dist = umap_object._input_distance_func
            _args = tuple(umap_object._metric_kwds.values())

            @numba.njit()
            def _metric(x, y):
                # The distance must be returned; without `return` the wrapper
                # would yield None for every pair.
                return _dist(x, y, *_args)

        else:
            _metric = umap_object._input_distance_func

        init = initialise_search(
            umap_object._rp_forest,
            umap_object._raw_data,
            umap_object._raw_data,
            int(nhood_size * umap_object.transform_queue_size),
            rng_state,
            _metric,
        )

        result = initialized_nnd_search(
            umap_object._raw_data,
            umap_object._search_graph.indptr,
            umap_object._search_graph.indices,
            init,
            umap_object._raw_data,
            _metric,
        )

        # The search queue is wider than nhood_size; keep the leading columns.
        indices, dists = deheap_sort(result)
        indices = indices[:, :nhood_size]
        dists = dists[:, :nhood_size]

    return indices, dists
Ejemplo n.º 5
0
def neighbors_update(adata, adata_new, k=10, queue_size=5, random_state=0):
    """Find, for each row of `adata_new.X`, its `k` neighbours in `adata.X`.

    The metric, its optional keyword values, the stored distance graph and
    (when present) the stored random-projection forest are all read from
    `adata.uns['neighbors']`.

    Parameters
    ----------
    adata
        Reference object; supplies `X` and `uns['neighbors']`.
    adata_new
        Query object; only its `X` is read.
    k
        Number of neighbours returned per query row.
    queue_size
        Search-width multiplier: `int(k * queue_size)` candidates are
        requested from `initialise_search` before truncating to `k`.
    random_state
        Handed to `check_random_state`; the resulting generator draws the
        three integers passed to the search as its RNG state.

    Returns
    -------
    indices, dists
        Neighbour indices and distances, each with `k` columns.
    """
    # only with use_rep='X' for now
    from umap.nndescent import (
        make_initialisations,
        make_initialized_nnd_search,
        initialise_search,
    )
    from umap.umap_ import INT32_MAX, INT32_MIN
    from umap.utils import deheap_sort
    import umap.distances as dist

    neighbors_info = adata.uns['neighbors']
    params = neighbors_info['params']

    # Extra positional arguments for the metric, if any were recorded.
    metric_args = (
        tuple(params['metric_kwds'].values())
        if 'metric_kwds' in params
        else ()
    )
    metric = dist.named_distances[params['metric']]

    random_init, tree_init = make_initialisations(metric, metric_args)
    run_search = make_initialized_nnd_search(metric, metric_args)

    # Turn the stored distances into a symmetric 0/1 connectivity structure.
    graph = neighbors_info['distances'].copy()
    graph.data = (graph.data > 0).astype(np.int8)
    graph = graph.maximum(graph.transpose())
    # prune it?

    rng = check_random_state(random_state)
    seed_triplet = rng.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)

    forest = (
        _rp_forest_generate(neighbors_info['rp_forest'])
        if 'rp_forest' in neighbors_info
        else None
    )
    reference, queries = adata.X, adata_new.X

    heap = initialise_search(
        forest,
        reference,
        queries,
        int(k * queue_size),
        random_init,
        tree_init,
        seed_triplet,
    )
    heap = run_search(reference, graph.indptr, graph.indices, heap, queries)

    found_indices, found_dists = deheap_sort(heap)
    return found_indices[:, :k], found_dists[:, :k]
Ejemplo n.º 6
0
def _nhood_search(umap_object, nhood_size):
    """Find the `nhood_size` nearest neighbours of every row of the raw data.

    `umap_object._raw_data` serves as both the reference set and the query
    set. When `umap_object._small_data` is truthy the neighbours come from a
    full pairwise distance matrix; otherwise they come from
    `initialise_search` followed by `initialized_nnd_search`, using
    `umap_object._distance_func` with `umap_object._dist_args`.

    Returns a pair `(indices, dists)`, each with `nhood_size` columns.
    """
    small = umap_object._small_data
    data = umap_object._raw_data

    if small:
        # Keep the nhood_size smallest distances per row, then order them
        # by ascending distance.
        pairwise = sklearn.metrics.pairwise_distances(data)
        nearest = np.argpartition(pairwise, nhood_size)[:, :nhood_size]
        nearest_dists = submatrix(pairwise, nearest, nhood_size)
        order = np.argsort(nearest_dists)
        return (
            submatrix(nearest, order, nhood_size),
            submatrix(nearest_dists, order, nhood_size),
        )

    # NOTE(review): the state array is created with np.empty and never
    # filled in before being handed to the search -- confirm this is intended.
    seed_triplet = np.empty(3, dtype=np.int64)

    heap = initialise_search(
        umap_object._rp_forest,
        data,
        data,
        int(nhood_size * umap_object.transform_queue_size),
        seed_triplet,
        umap_object._distance_func,
        umap_object._dist_args,
    )

    graph = umap_object._search_graph
    heap = initialized_nnd_search(
        data,
        graph.indptr,
        graph.indices,
        heap,
        data,
        umap_object._distance_func,
        umap_object._dist_args,
    )

    # The search queue is wider than nhood_size; keep the leading columns.
    found_indices, found_dists = deheap_sort(heap)
    return found_indices[:, :nhood_size], found_dists[:, :nhood_size]