Ejemplo n.º 1
0
def fit_mlce(X, n_neighbors, metric, n_components=2):
    """
    Maximal Linkage Cross Entropy

    Build the fuzzy simplices Laplacian Eigenmaps-style (via maximal linkage clustering and
        inverse -log fuzzy simplex weights) and then fit the matrix UMAP stype (via fuzzy cross entropy)
    """
    dense_graph = get_adjacency_matrix(X=X,
                                       n_neighbors=n_neighbors,
                                       metric=metric)
    graph = coo_matrix(dense_graph)

    a, b = find_ab_params(spread=1.0, min_dist=0.1)
    return simplicial_set_embedding(
        data=X,
        graph=graph,
        n_components=n_components,
        initial_alpha=1.0,
        a=a,
        b=b,
        gamma=1.0,
        negative_sample_rate=5,
        n_epochs=0,
        init=SPECTRAL_INIT,
        random_state=check_random_state(0),
        metric=metric,
        metric_kwds={},
        output_metric=dist.named_distances_with_gradients[EUCLIDEAN],
        output_metric_kwds={},
        euclidean_output=(metric == EUCLIDEAN),
        parallel=False,
        verbose=False,
    )
Ejemplo n.º 2
0
def umap_embed_inplace(ann_heat,
                       n_components=2,
                       min_dist=0.5,
                       spread=1.0,
                       n_epochs=0,
                       init_coords='spectral',
                       alpha=1.0,
                       gamma=1.0,
                       negative_sample_rate=5,
                       random_state=0):
    """
    Calculate UMAP embedding, with direct control over embedding parameters that control the aesthetics.
    """
    random_state = check_random_state(random_state)
    a, b = find_ab_params(spread, min_dist)
    X_umap = simplicial_set_embedding(
        ann_heat.X,
        ann_heat.uns['neighbors']['connectivities'].tocoo(),
        n_components,
        alpha,
        a,
        b,
        gamma,
        negative_sample_rate,
        n_epochs,
        init_coords,
        random_state,
        'euclidean', {},
        verbose=0)
Ejemplo n.º 3
0
def fit_umap(X, n_neighbors, metric, n_components=2):
    sparse_graph, sigmas, rhos = fuzzy_simplicial_set(
        X=X,
        random_state=check_random_state(0),
        n_neighbors=n_neighbors,
        metric=metric)

    a, b = find_ab_params(spread=1.0, min_dist=0.1)
    return simplicial_set_embedding(
        data=X,
        graph=sparse_graph,
        n_components=n_components,
        initial_alpha=1.0,
        a=a,
        b=b,
        gamma=1.0,
        negative_sample_rate=5,
        n_epochs=0,
        init=SPECTRAL_INIT,
        random_state=check_random_state(0),
        metric=metric,
        metric_kwds={},
        output_metric=dist.named_distances_with_gradients[EUCLIDEAN],
        output_metric_kwds={},
        euclidean_output=(metric == EUCLIDEAN),
        parallel=False,
        verbose=False,
    )
Ejemplo n.º 4
0
        def simplicial_set_embedding(*args, **kwargs):
            from umap.umap_ import simplicial_set_embedding

            X_umap, _ = simplicial_set_embedding(
                *args,
                densmap=False,
                densmap_kwds={},
                output_dens=False,
                **kwargs,
            )
            return X_umap
Ejemplo n.º 5
0
    def run_umap_hnsw(
        self,
        X_input,
        graph,
        n_components=2,
        alpha: float = 1.0,
        negative_sample_rate: int = 5,
        gamma: float = 1.0,
        spread=1.0,
        min_dist=0.1,
        init_pos='spectral',
        random_state=1,
    ):

        from umap.umap_ import find_ab_params, simplicial_set_embedding
        import matplotlib.pyplot as plt

        a, b = find_ab_params(spread, min_dist)
        print('a,b, spread, dist', a, b, spread, min_dist)
        t0 = time.time()
        X_umap = simplicial_set_embedding(
            data=X_input,
            graph=graph,
            n_components=n_components,
            initial_alpha=alpha,
            a=a,
            b=b,
            n_epochs=0,
            metric_kwds={},
            gamma=gamma,
            negative_sample_rate=negative_sample_rate,
            init=init_pos,
            random_state=np.random.RandomState(random_state),
            metric='euclidean',
            verbose=1)
        return X_umap
Ejemplo n.º 6
0
def umap(
    adata,
    min_dist=0.5,
    spread=1.0,
    n_components=2,
    maxiter=None,
    alpha=1.0,
    gamma=1.0,
    negative_sample_rate=5,
    init_pos='spectral',
    random_state=0,
    a=None,
    b=None,
    copy=False,
):
    """Embed the neighborhood graph using UMAP [McInnes18]_.

    UMAP (Uniform Manifold Approximation and Projection) is a manifold learning
    technique suitable for visualizing high-dimensional data. Besides tending to
    be faster than tSNE, it optimizes the embedding such that it best reflects
    the topology of the data, which we represent throughout Scanpy using a
    neighborhood graph. tSNE, by contrast, optimizes the distribution of
    nearest-neighbor distances in the embedding such that these best match the
    distribution of distances in the high-dimensional space.  We use the
    implementation of `umap-learn <https://github.com/lmcinnes/umap>`__
    [McInnes18]_. For a few comparisons of UMAP with tSNE, see this `preprint
    <https://doi.org/10.1101/298430>`__.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix.
    min_dist : `float`, optional (default: 0.5)
        The effective minimum distance between embedded points. Smaller values
        will result in a more clustered/clumped embedding where nearby points on
        the manifold are drawn closer together, while larger values will result
        on a more even dispersal of points. The value should be set relative to
        the ``spread`` value, which determines the scale at which embedded
        points will be spread out. The default of in the `umap-learn` package is
        0.1.
    spread : `float` (optional, default 1.0)
        The effective scale of embedded points. In combination with `min_dist`
        this determines how clustered/clumped the embedded points are.
    n_components : `int`, optional (default: 2)
        The number of dimensions of the embedding.
    maxiter : `int`, optional (default: `None`)
        The number of iterations (epochs) of the optimization. Called `n_epochs`
        in the original UMAP.
    alpha : `float`, optional (default: 1.0)
        The initial learning rate for the embedding optimization.
    gamma : `float` (optional, default 1.0)
        Weighting applied to negative samples in low dimensional embedding
        optimization. Values higher than one will result in greater weight
        being given to negative samples.
    negative_sample_rate : `int` (optional, default 5)
        The number of negative edge/1-simplex samples to use per positive
        edge/1-simplex sample in optimizing the low dimensional embedding.
    init_pos : `string` or `np.array`, optional (default: 'spectral')
        How to initialize the low dimensional embedding. Called `init` in the
        original UMAP.
        Options are:

        * Any key for `adata.obsm`.
        * 'paga': positions from :func:`~scanpy.pl.paga`.
        * 'spectral': use a spectral embedding of the graph.
        * 'random': assign initial embedding positions at random.
        * A numpy array of initial embedding positions.
    random_state : `int`, `RandomState` or `None`, optional (default: 0)
        If `int`, `random_state` is the seed used by the random number generator;
        If `RandomState`, `random_state` is the random number generator;
        If `None`, the random number generator is the `RandomState` instance used
        by `np.random`.
    a : `float` (optional, default `None`)
        More specific parameters controlling the embedding. If `None` these
        values are set automatically as determined by `min_dist` and
        `spread`.
    b : `float` (optional, default `None`)
        More specific parameters controlling the embedding. If `None` these
        values are set automatically as determined by `min_dist` and
        `spread`.
    copy : `bool` (default: `False`)
        Return a copy instead of writing to adata.

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with the following fields.

    **X_umap** : `adata.obsm` field
        UMAP coordinates of data.
    """
    adata = adata.copy() if copy else adata
    if 'neighbors' not in adata.uns:
        raise ValueError(
            'Did not find \'neighbors/connectivities\'. Run `sc.pp.neighbors` first.'
        )
    start = logg.info('computing UMAP')
    if ('params' not in adata.uns['neighbors']
            or adata.uns['neighbors']['params']['method'] != 'umap'):
        logg.warning(
            'neighbors/connectivities have not been computed using umap')
    from umap.umap_ import find_ab_params, simplicial_set_embedding
    if a is None or b is None:
        a, b = find_ab_params(spread, min_dist)
    else:
        a = a
        b = b

    if isinstance(init_pos, str) and init_pos in adata.obsm.keys():
        init_coords = adata.obsm[init_pos]
    elif isinstance(init_pos, str) and init_pos == 'paga':
        init_coords = get_init_pos_from_paga(adata, random_state=random_state)
    else:
        init_coords = init_pos  # Let umap handle it
    if hasattr(init_coords, "dtype"):
        init_coords = check_array(init_coords,
                                  dtype=np.float32,
                                  accept_sparse=False)

    random_state = check_random_state(random_state)
    n_epochs = 0 if maxiter is None else maxiter
    neigh_params = adata.uns['neighbors']['params']
    X = _choose_representation(adata,
                               neigh_params.get('use_rep', None),
                               neigh_params.get('n_pcs', None),
                               silent=True)
    # the data matrix X is really only used for determining the number of connected components
    # for the init condition in the UMAP embedding
    X_umap = simplicial_set_embedding(
        X,
        adata.uns['neighbors']['connectivities'].tocoo(),
        n_components,
        alpha,
        a,
        b,
        gamma,
        negative_sample_rate,
        n_epochs,
        init_coords,
        random_state,
        neigh_params.get('metric', 'euclidean'),
        neigh_params.get('metric_kwds', {}),
        verbose=settings.verbosity > 3,
    )
    adata.obsm['X_umap'] = X_umap  # annotate samples with UMAP coordinates
    logg.info(
        '    finished',
        time=start,
        deep=('added\n'
              "    'X_umap', UMAP coordinates (adata.obsm)"),
    )
    return adata if copy else None
Ejemplo n.º 7
0
def umap_conn_indices_dist_embedding(X,
                                     n_neighbors=15,
                                     n_components=2,
                                     metric="euclidean",
                                     min_dist=0.1,
                                     random_state=0,
                                     verbose=False):
    """Compute connectivity graph, matrices for kNN neighbor indices, distance and low dimension embedding with UMAP.
    This code is adapted from umap-learn (https://github.com/lmcinnes/umap/blob/97d33f57459de796774ab2d7fcf73c639835676d/umap/umap_.py)

    Arguments
    ---------
    X: sparse matrix (`.X`, dtype `float32`)
        expression matrix (n_cell x n_genes)
    n_neighbors: 'int' (optional, default 15)
        The number of nearest neighbors to compute for each sample in ``X``.
    n_components: 'int' (optional, default 2)
        The dimension of the space to embed into.
    metric: 'str' or `callable` (optional, default cosine)
        The metric to use for the computation.
    min_dist: 'float' (optional, default 0.1)
        The effective minimum distance between embedded points. Smaller values will result in a more clustered/clumped
        embedding where nearby points on the manifold are drawn closer together, while larger values will result on a
        more even dispersal of points. The value should be set relative to the ``spread`` value, which determines the
        scale at which embedded points will be spread out.
    random_state: `int`, `RandomState` instance or `None`, optional (default: None)
        If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is
        the random number generator; If None, the random number generator is the RandomState instance used by `numpy.random`.
    verbose: `bool` (optional, default False)
        Controls verbosity of logging.

    Returns
    -------
    Returns an updated `adata` with reduced dimension data for spliced counts, projected future transcript counts 'Y_dim' and adjacency matrix when possible.
    """

    from sklearn.utils import check_random_state
    from sklearn.metrics import pairwise_distances
    from umap.umap_ import nearest_neighbors, fuzzy_simplicial_set, simplicial_set_embedding, find_ab_params

    import umap.sparse as sparse
    import umap.distances as dist

    from umap.utils import tau_rand_int, deheap_sort
    from umap.rp_tree import rptree_leaf_array, make_forest
    # https://github.com/lmcinnes/umap/blob/97d33f57459de796774ab2d7fcf73c639835676d/umap/nndescent.py
    from umap.nndescent import (
        make_nn_descent,
        make_initialisations,
        make_initialized_nnd_search,
        initialise_search,
    )
    from umap.spectral import spectral_layout

    random_state = check_random_state(42)

    _raw_data = X

    if X.shape[0] < 4096:  #1
        dmat = pairwise_distances(X, metric=metric)
        graph = fuzzy_simplicial_set(X=dmat,
                                     n_neighbors=n_neighbors,
                                     random_state=random_state,
                                     metric="precomputed",
                                     verbose=verbose)
        # extract knn_indices, knn_dist
        g_tmp = deepcopy(graph)
        g_tmp[graph.nonzero()] = dmat[graph.nonzero()]
        knn_indices, knn_dists = extract_indices_dist_from_graph(
            g_tmp, n_neighbors=n_neighbors)

    else:
        # Standard case
        (knn_indices, knn_dists,
         rp_forest) = nearest_neighbors(X=X,
                                        n_neighbors=n_neighbors,
                                        metric=metric,
                                        metric_kwds={},
                                        angular=False,
                                        random_state=random_state,
                                        verbose=verbose)

        graph = fuzzy_simplicial_set(X=X,
                                     n_neighbors=n_neighbors,
                                     random_state=random_state,
                                     metric=metric,
                                     knn_indices=knn_indices,
                                     knn_dists=knn_dists,
                                     angular=rp_forest,
                                     verbose=verbose)

        _raw_data = X
        _transform_available = True
        _search_graph = scipy.sparse.lil_matrix((X.shape[0], X.shape[0]),
                                                dtype=np.int8)
        _search_graph.rows = knn_indices  # An array (self.rows) of rows, each of which is a sorted list of column indices of non-zero elements.
        _search_graph.data = (knn_dists != 0).astype(
            np.int8
        )  # The corresponding nonzero values are stored in similar fashion in self.data.
        _search_graph = _search_graph.maximum(  # Element-wise maximum between this and another matrix.
            _search_graph.transpose()).tocsr()

    if verbose:
        print("Construct embedding")

    a, b = find_ab_params(1, min_dist)

    embedding_ = simplicial_set_embedding(
        data=_raw_data,
        graph=graph,
        n_components=n_components,
        initial_alpha=1.0,  # learning_rate
        a=a,
        b=b,
        gamma=1.0,
        negative_sample_rate=5,
        n_epochs=0,
        init="spectral",
        random_state=random_state,
        metric=metric,
        metric_kwds={},
        verbose=verbose)

    return graph, knn_indices, knn_dists, embedding_
Ejemplo n.º 8
0
def umap_conn_indices_dist_embedding(
    X,
    n_neighbors=30,
    n_components=2,
    metric="euclidean",
    min_dist=0.1,
    spread=1.0,
    n_epochs=0,
    alpha=1.0,
    gamma=1.0,
    negative_sample_rate=5,
    init_pos="spectral",
    random_state=0,
    densmap=False,
    dens_lambda=2.0,
    dens_frac=0.3,
    dens_var_shift=0.1,
    output_dens=False,
    return_mapper=True,
    verbose=False,
    **umap_kwargs,
):
    """Compute connectivity graph, matrices for kNN neighbor indices, distance matrix and low dimension embedding with UMAP.
    This code is adapted from umap-learn (https://github.com/lmcinnes/umap/blob/97d33f57459de796774ab2d7fcf73c639835676d/umap/umap_.py)

    Arguments
    ---------
        X: sparse matrix (`.X`, dtype `float32`)
            expression matrix (n_cell x n_genes)
        n_neighbors: 'int' (optional, default 15)
            The number of nearest neighbors to compute for each sample in ``X``.
        n_components: 'int' (optional, default 2)
            The dimension of the space to embed into.
        metric: 'str' or `callable` (optional, default `cosine`)
            The metric to use for the computation.
        min_dist: 'float' (optional, default `0.1`)
            The effective minimum distance between embedded points. Smaller values will result in a more clustered/clumped
            embedding where nearby points on the manifold are drawn closer together, while larger values will result on a
            more even dispersal of points. The value should be set relative to the ``spread`` value, which determines the
            scale at which embedded points will be spread out.
        spread: `float` (optional, default 1.0)
            The effective scale of embedded points. In combination with min_dist this determines how clustered/clumped the
            embedded points are.
        n_epochs: 'int' (optional, default 0)
            The number of training epochs to be used in optimizing the low dimensional embedding. Larger values result in
            more accurate embeddings. If None is specified a value will be selected based on the size of the input dataset
            (200 for large datasets, 500 for small).
        alpha: `float` (optional, default 1.0)
            Initial learning rate for the SGD.
        gamma: `float` (optional, default 1.0)
            Weight to apply to negative samples. Values higher than one will result in greater weight being given to
            negative samples.
        negative_sample_rate: `float` (optional, default 5)
            The number of negative samples to select per positive sample in the optimization process. Increasing this value
             will result in greater repulsive force being applied, greater optimization cost, but slightly more accuracy.
             The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample in optimizing the low
             dimensional embedding.
        init_pos: 'spectral':
            How to initialize the low dimensional embedding. Use a spectral embedding of the fuzzy 1-skeleton
        random_state: `int`, `RandomState` instance or `None`, optional (default: None)
            If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is
            the random number generator; If None, the random number generator is the RandomState instance used by `numpy.random`.
        dens_lambda: float (optional, default 2.0)
            Controls the regularization weight of the density correlation term
            in densMAP. Higher values prioritize density preservation over the
            UMAP objective, and vice versa for values closer to zero. Setting this
            parameter to zero is equivalent to running the original UMAP algorithm.
        dens_frac: float (optional, default 0.3)
            Controls the fraction of epochs (between 0 and 1) where the
            density-augmented objective is used in densMAP. The first
            (1 - dens_frac) fraction of epochs optimize the original UMAP objective
            before introducing the density correlation term.
        dens_var_shift: float (optional, default 0.1)
            A small constant added to the variance of local radii in the
            embedding when calculating the density correlation objective to
            prevent numerical instability from dividing by a small number
        output_dens: float (optional, default False)
            Determines whether the local radii of the final embedding (an inverse
            measure of local density) are computed and returned in addition to
            the embedding. If set to True, local radii of the original data
            are also included in the output for comparison; the output is a tuple
            (embedding, original local radii, embedding local radii). This option
            can also be used when densmap=False to calculate the densities for
            UMAP embeddings.
        verbose: `bool` (optional, default False)
                Controls verbosity of logging.

    Returns
    -------
        graph, knn_indices, knn_dists, embedding_
            A tuple of kNN graph (`graph`), indices of nearest neighbors of each cell (knn_indicies), distances of nearest
            neighbors (knn_dists) and finally the low dimensional embedding (embedding_).
    """

    from sklearn.utils import check_random_state
    from sklearn.metrics import pairwise_distances
    from umap.umap_ import (
        nearest_neighbors,
        fuzzy_simplicial_set,
        simplicial_set_embedding,
        find_ab_params,
    )

    random_state = check_random_state(random_state)

    _raw_data = X

    if X.shape[0] < 4096:  # 1
        dmat = pairwise_distances(X, metric=metric)
        graph = fuzzy_simplicial_set(
            X=dmat,
            n_neighbors=n_neighbors,
            random_state=random_state,
            metric="precomputed",
            verbose=verbose,
        )
        if type(graph) == tuple:
            graph = graph[0]

        # extract knn_indices, knn_dist
        g_tmp = deepcopy(graph)
        g_tmp[graph.nonzero()] = dmat[graph.nonzero()]
        knn_indices, knn_dists = adj_to_knn(g_tmp, n_neighbors=n_neighbors)
    else:
        # Standard case
        (knn_indices, knn_dists, rp_forest) = nearest_neighbors(
            X=X,
            n_neighbors=n_neighbors,
            metric=metric,
            metric_kwds={},
            angular=False,
            random_state=random_state,
            verbose=verbose,
        )

        graph = fuzzy_simplicial_set(
            X=X,
            n_neighbors=n_neighbors,
            random_state=random_state,
            metric=metric,
            knn_indices=knn_indices,
            knn_dists=knn_dists,
            angular=rp_forest,
            verbose=verbose,
        )

        _raw_data = X
        _transform_available = True
        # The corresponding nonzero values are stored in similar fashion in self.data.
        _search_graph, _ = get_conn_dist_graph(knn_indices, knn_dists)
        _search_graph = _search_graph.maximum(  # Element-wise maximum between this and another matrix.
            _search_graph.transpose()
        ).tocsr()

    if verbose:
        print("Construct embedding")

    a, b = find_ab_params(spread, min_dist)
    if type(graph) == tuple:
        graph = graph[0]

    dens_lambda = dens_lambda if densmap else 0.0
    dens_frac = dens_frac if densmap else 0.0

    if dens_lambda < 0.0:
        raise ValueError("dens_lambda cannot be negative")
    if dens_frac < 0.0 or dens_frac > 1.0:
        raise ValueError("dens_frac must be between 0.0 and 1.0")
    if dens_var_shift < 0.0:
        raise ValueError("dens_var_shift cannot be negative")

    densmap_kwds = {
        "lambda": dens_lambda,
        "frac": dens_frac,
        "var_shift": dens_var_shift,
        "n_neighbors": n_neighbors,
    }
    embedding_, aux_data = simplicial_set_embedding(
        data=_raw_data,
        graph=graph,
        n_components=n_components,
        initial_alpha=alpha,  # learning_rate
        a=a,
        b=b,
        gamma=gamma,
        negative_sample_rate=negative_sample_rate,
        n_epochs=n_epochs,
        init=init_pos,
        random_state=random_state,
        metric=metric,
        metric_kwds={},
        verbose=verbose,
        densmap=densmap,
        densmap_kwds=densmap_kwds,
        output_dens=output_dens,
    )

    if return_mapper:
        import umap
        from .utils import update_dict

        if n_epochs == 0:
            n_epochs = None

        _umap_kwargs = {
            "angular_rp_forest": False,
            "local_connectivity": 1.0,
            "metric_kwds": None,
            "set_op_mix_ratio": 1.0,
            "target_metric": "categorical",
            "target_metric_kwds": None,
            "target_n_neighbors": -1,
            "target_weight": 0.5,
            "transform_queue_size": 4.0,
            "transform_seed": 42,
        }
        umap_kwargs = update_dict(_umap_kwargs, umap_kwargs)

        mapper = umap.UMAP(
            n_neighbors=n_neighbors,
            n_components=n_components,
            metric=metric,
            min_dist=min_dist,
            spread=spread,
            n_epochs=n_epochs,
            learning_rate=alpha,
            repulsion_strength=gamma,
            negative_sample_rate=negative_sample_rate,
            init=init_pos,
            random_state=random_state,
            verbose=verbose,
            **umap_kwargs,
        ).fit(X)

        return mapper, graph, knn_indices, knn_dists, embedding_
    else:
        return graph, knn_indices, knn_dists, embedding_
Ejemplo n.º 9
0
def umap(
    adata: AnnData,
    min_dist: float = 0.5,
    spread: float = 1.0,
    n_components: int = 2,
    maxiter: Optional[int] = None,
    alpha: float = 1.0,
    gamma: float = 1.0,
    negative_sample_rate: int = 5,
    init_pos: Union[_InitPos, np.ndarray, None] = 'spectral',
    random_state: AnyRandom = 0,
    a: Optional[float] = None,
    b: Optional[float] = None,
    copy: bool = False,
    method: Literal['umap', 'rapids'] = 'umap',
    neighbors_key: Optional[str] = None,
) -> Optional[AnnData]:
    """\
    Embed the neighborhood graph using UMAP [McInnes18]_.

    UMAP (Uniform Manifold Approximation and Projection) is a manifold learning
    technique suitable for visualizing high-dimensional data. Besides tending to
    be faster than tSNE, it optimizes the embedding such that it best reflects
    the topology of the data, which we represent throughout Scanpy using a
    neighborhood graph. tSNE, by contrast, optimizes the distribution of
    nearest-neighbor distances in the embedding such that these best match the
    distribution of distances in the high-dimensional space.  We use the
    implementation of `umap-learn <https://github.com/lmcinnes/umap>`__
    [McInnes18]_. For a few comparisons of UMAP with tSNE, see this `preprint
    <https://doi.org/10.1101/298430>`__.

    Parameters
    ----------
    adata
        Annotated data matrix.
    min_dist
        The effective minimum distance between embedded points. Smaller values
        will result in a more clustered/clumped embedding where nearby points on
        the manifold are drawn closer together, while larger values will result
        on a more even dispersal of points. The value should be set relative to
        the ``spread`` value, which determines the scale at which embedded
        points will be spread out. The default of in the `umap-learn` package is
        0.1.
    spread
        The effective scale of embedded points. In combination with `min_dist`
        this determines how clustered/clumped the embedded points are.
    n_components
        The number of dimensions of the embedding.
    maxiter
        The number of iterations (epochs) of the optimization. Called `n_epochs`
        in the original UMAP.
    alpha
        The initial learning rate for the embedding optimization.
    gamma
        Weighting applied to negative samples in low dimensional embedding
        optimization. Values higher than one will result in greater weight
        being given to negative samples.
    negative_sample_rate
        The number of negative edge/1-simplex samples to use per positive
        edge/1-simplex sample in optimizing the low dimensional embedding.
    init_pos
        How to initialize the low dimensional embedding. Called `init` in the
        original UMAP. Options are:

        * Any key for `adata.obsm`.
        * 'paga': positions from :func:`~scanpy.pl.paga`.
        * 'spectral': use a spectral embedding of the graph.
        * 'random': assign initial embedding positions at random.
        * A numpy array of initial embedding positions.
    random_state
        If `int`, `random_state` is the seed used by the random number generator;
        If `RandomState` or `Generator`, `random_state` is the random number generator;
        If `None`, the random number generator is the `RandomState` instance used
        by `np.random`.
    a
        More specific parameters controlling the embedding. If `None` these
        values are set automatically as determined by `min_dist` and
        `spread`.
    b
        More specific parameters controlling the embedding. If `None` these
        values are set automatically as determined by `min_dist` and
        `spread`.
    copy
        Return a copy instead of writing to adata.
    method
        Use the original 'umap' implementation, or 'rapids' (experimental, GPU only)
    neighbors_key
        If not specified, umap looks .uns['neighbors'] for neighbors settings
        and .obsp['connectivities'] for connectivities
        (default storage places for pp.neighbors).
        If specified, umap looks .uns[neighbors_key] for neighbors settings and
        .obsp[.uns[neighbors_key]['connectivities_key']] for connectivities.

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with the following fields.

    **X_umap** : `adata.obsm` field
        UMAP coordinates of data.
    """
    adata = adata.copy() if copy else adata

    if neighbors_key is None:
        neighbors_key = 'neighbors'

    if neighbors_key not in adata.uns:
        raise ValueError(
            f'Did not find .uns["{neighbors_key}"]. Run `sc.pp.neighbors` first.')
    start = logg.info('computing UMAP')

    neighbors = NeighborsView(adata, neighbors_key)

    if ('params' not in neighbors
        or neighbors['params']['method'] != 'umap'):
        logg.warning(f'.obsp["{neighbors["connectivities_key"]}"] have not been computed using umap')
    from umap.umap_ import find_ab_params, simplicial_set_embedding
    if a is None or b is None:
        a, b = find_ab_params(spread, min_dist)
    else:
        a = a
        b = b
    adata.uns['umap'] = {'params':{'a': a, 'b': b}}
    if isinstance(init_pos, str) and init_pos in adata.obsm.keys():
        init_coords = adata.obsm[init_pos]
    elif isinstance(init_pos, str) and init_pos == 'paga':
        init_coords = get_init_pos_from_paga(adata, random_state=random_state, neighbors_key=neighbors_key)
    else:
        init_coords = init_pos  # Let umap handle it
    if hasattr(init_coords, "dtype"):
        init_coords = check_array(init_coords, dtype=np.float32, accept_sparse=False)

    if random_state != 0:
        adata.uns['umap']['params']['random_state'] = random_state
    random_state = check_random_state(random_state)

    neigh_params = neighbors['params']
    X = _choose_representation(
        adata, neigh_params.get('use_rep', None), neigh_params.get('n_pcs', None), silent=True)
    if method == 'umap':
        # the data matrix X is really only used for determining the number of connected components
        # for the init condition in the UMAP embedding
        n_epochs = 0 if maxiter is None else maxiter
        X_umap = simplicial_set_embedding(
            X,
            neighbors['connectivities'].tocoo(),
            n_components,
            alpha,
            a,
            b,
            gamma,
            negative_sample_rate,
            n_epochs,
            init_coords,
            random_state,
            neigh_params.get('metric', 'euclidean'),
            neigh_params.get('metric_kwds', {}),
            verbose=settings.verbosity > 3,
        )
    elif method == 'rapids':
        metric = neigh_params.get('metric', 'euclidean')
        if metric != 'euclidean':
            raise ValueError(
                f'`sc.pp.neighbors` was called with `metric` {metric!r}, '
                "but umap `method` 'rapids' only supports the 'euclidean' metric."
            )
        from cuml import UMAP
        n_neighbors = neighbors['params']['n_neighbors']
        n_epochs = 500 if maxiter is None else maxiter # 0 is not a valid value for rapids, unlike original umap
        X_contiguous = np.ascontiguousarray(X, dtype=np.float32)
        umap = UMAP(
            n_neighbors=n_neighbors,
            n_components=n_components,
            n_epochs=n_epochs,
            learning_rate=alpha,
            init=init_pos,
            min_dist=min_dist,
            spread=spread,
            negative_sample_rate=negative_sample_rate,
            a=a,
            b=b,
            verbose=settings.verbosity > 3,
            random_state=random_state
        )
        X_umap = umap.fit_transform(X_contiguous)
    adata.obsm['X_umap'] = X_umap  # annotate samples with UMAP coordinates
    logg.info(
        '    finished',
        time=start,
        deep=(
            'added\n'
            "    'X_umap', UMAP coordinates (adata.obsm)"
        ),
    )
    return adata if copy else None
Ejemplo n.º 10
0
     sse_params = {'data':fit1._raw_data,
               'graph':intersection,
               'n_components':np.int32(fit1.n_components),
               'initial_alpha':np.float32(fit1.learning_rate),
               'a':np.float32(fit1._a),
               'b':np.float32(fit1._b),
               'gamma':np.float32(fit1.repulsion_strength),
               'negative_sample_rate':np.int32(fit1.negative_sample_rate),
               'n_epochs':np.int32(200),
               'init':'random',
               'random_state':np.random,
               'metric':fit1.metric,
               'metric_kwds':fit1._metric_kwds,
               'verbose':False}
 
     embedding = umap2.simplicial_set_embedding(**sse_params)
     
     output[component_i][n_neighbors_i][min_dist_i] = embedding
     output[component_i][n_neighbors_i][min_dist_i] = pd.DataFrame(output[component_i][n_neighbors_i][min_dist_i],index=X.index)
     output[component_i][n_neighbors_i][min_dist_i].to_csv(output_path+'/func_norm_'+feats_i+'_components_c'+str(component_i)+'_n'+str(n_neighbors_i)+'_d'+str(min_dist_i)+'.csv',sep=';')
     
     '''
     test_embedding = []
     for elem_i in range(x_test.shape[0]):
     print('Start transform for sample {}: {}'.format(elem_i,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
         test_embedding.append([reducer_num.transform(x_test[elem_i].reshape(1, -1)).tolist()])
     print('End transform for sample {}: {}'.format(elem_i,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
     
     print('Start transform for test: {}'.format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
     test_embedding = reducer_num.transform(x_test)
     print('End transform for test: {}'.format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
Ejemplo n.º 11
0
    def runUMAP(
        self,
        n_components=2,
        alpha: float = 1.0,
        negative_sample_rate: int = 5,
        gamma: float = 1.0,
        spread=1.0,
        min_dist=0.1,
        init_pos='spectral',
        random_state=1,
    ):
        """ Perform UMAP dimensionality reduction from constructed NN graph 

        :param n_components: number of dimensions to reduce, defaults to 2
        :type n_components: int, optional
        :param alpha: [description], defaults to 1.0
        :type alpha: float, optional
        :param negative_sample_rate: [description], defaults to 5
        :type negative_sample_rate: int, optional
        :param gamma: [description], defaults to 1.0
        :type gamma: float, optional
        :param spread: [description], defaults to 1.0
        :type spread: float, optional
        :param min_dist: [description], defaults to 0.1
        :type min_dist: float, optional
        :param init_pos: [description], defaults to 'spectral'
        :type init_pos: str, optional
        :param random_state: [description], defaults to 1
        :type random_state: int, optional
        :return: [description]
        :rtype: [type]
        """
        ## pre-process data for umap
        n_neighbors = self.nn_neighbor_array.shape[1]
        n_cells = self.nn_neighbor_array.shape[0]
        row_list = np.transpose(
            np.ones((n_neighbors, n_cells)) * range(0, n_cells)).flatten()
        row_min = np.min(self.nn_distance_array, axis=1)
        row_sigma = np.std(self.nn_distance_array, axis=1)

        distance_array = (self.nn_distance_array -
                          row_min[:, np.newaxis]) / row_sigma[:, np.newaxis]
        col_list = self.nn_neighbor_array.flatten().tolist()
        distance_array = distance_array.flatten()
        distance_array = np.sqrt(distance_array)
        distance_array = distance_array * -1

        weight_list = np.exp(distance_array)

        threshold = np.mean(weight_list) + 2 * np.std(weight_list)

        weight_list[weight_list >= threshold] = threshold

        weight_list = weight_list.tolist()

        graph = csr_matrix(
            (np.array(weight_list), (np.array(row_list), np.array(col_list))),
            shape=(n_cells, n_cells))

        graph_transpose = graph.T
        prod_matrix = graph.multiply(graph_transpose)

        graph = graph_transpose + graph - prod_matrix
        from umap.umap_ import find_ab_params, simplicial_set_embedding
        import matplotlib.pyplot as plt

        a, b = find_ab_params(spread, min_dist)
        X_umap = simplicial_set_embedding(
            data=self.data,
            graph=graph,
            n_components=n_components,
            initial_alpha=alpha,
            a=a,
            b=b,
            n_epochs=0,
            metric_kwds={},
            gamma=gamma,
            negative_sample_rate=negative_sample_rate,
            init=init_pos,
            random_state=np.random.RandomState(random_state),
            metric='euclidean',
            verbose=False,
            densmap=False,
            densmap_kwds={},
            output_dens=False)
        return X_umap
Ejemplo n.º 12
0
    def fit_embed_data(self, X, y, index, inverse):
        """
        Performs an embedding on data after a UMAP graph has been constructed.

        Parameters
        ----------
        X : array, shape (n_samples, n_features) or (n_samples, n_samples)
            If the metric is 'precomputed' X must be a square distance
            matrix. Otherwise it contains a sample per row. If the method
            is 'exact', X may be a sparse matrix of type 'csr', 'csc'
            or 'coo'.
        y : array, shape (n_samples)
            A target array for supervised dimension reduction. How this is
            handled is determined by parameters UMAP was instantiated with.
            The relevant attributes are ``target_metric`` and
            ``target_metric_kwds``.
        index : array, shape (n_samples)
            [description]
        inverse : array, shape (n_samples)
            [description]
        """
        if self.n_epochs is None:
            n_epochs = 0
        else:
            n_epochs = self.n_epochs

        if self.densmap or self.output_dens:
            self._densmap_kwds["graph_dists"] = self.graph_dists_

        if self.verbose:
            print(ts(), "Construct embedding")

        self.embedding_, aux_data = simplicial_set_embedding(
            self._raw_data[self.index__],  # JH why raw data?
            self.graph_,
            self.n_components,
            self._initial_alpha,
            self._a,
            self._b,
            self.repulsion_strength,
            self.negative_sample_rate,
            n_epochs,
            init,
            random_state,
            self._input_distance_func,
            self._metric_kwds,
            self.densmap,
            self._densmap_kwds,
            self.output_dens,
            self._output_distance_func,
            self._output_metric_kwds,
            self.output_metric in ("euclidean", "l2"),
            self.random_state is None,
            self.verbose,
        )

        self.embedding_ = self.embedding_[self.inverse__]
        if self.output_dens:
            self.rad_orig_ = aux_data["rad_orig"][self.inverse__]
            self.rad_emb_ = aux_data["rad_emb"][self.inverse__]

        if self.verbose:
            print(ts() + " Finished embedding")

        numba.set_num_threads(self._original_n_threads)
        self._input_hash = joblib.hash(self._raw_data)
Ejemplo n.º 13
0
def umap_conn_indices_dist_embedding(
    X,
    n_neighbors=30,
    n_components=2,
    metric="euclidean",
    min_dist=0.1,
    spread=1.0,
    n_epochs=0,
    alpha=1.0,
    gamma=1.0,
    negative_sample_rate=5,
    init_pos="spectral",
    random_state=0,
    return_mapper=True,
    verbose=False,
    **umap_kwargs
):
    """Compute connectivity graph, matrices for kNN neighbor indices, distance matrix and low dimension embedding with UMAP.
    This code is adapted from umap-learn (https://github.com/lmcinnes/umap/blob/97d33f57459de796774ab2d7fcf73c639835676d/umap/umap_.py)

    Arguments
    ---------
        X: sparse matrix (`.X`, dtype `float32`)
            expression matrix (n_cell x n_genes)
        n_neighbors: 'int' (optional, default 15)
            The number of nearest neighbors to compute for each sample in ``X``.
        n_components: 'int' (optional, default 2)
            The dimension of the space to embed into.
        metric: 'str' or `callable` (optional, default `cosine`)
            The metric to use for the computation.
        min_dist: 'float' (optional, default `0.1`)
            The effective minimum distance between embedded points. Smaller values will result in a more clustered/clumped
            embedding where nearby points on the manifold are drawn closer together, while larger values will result on a
            more even dispersal of points. The value should be set relative to the ``spread`` value, which determines the
            scale at which embedded points will be spread out.
        spread: `float` (optional, default 1.0)
            The effective scale of embedded points. In combination with min_dist this determines how clustered/clumped the
            embedded points are.
        n_epochs: 'int' (optional, default 0)
            The number of training epochs to be used in optimizing the low dimensional embedding. Larger values result in
            more accurate embeddings. If None is specified a value will be selected based on the size of the input dataset
            (200 for large datasets, 500 for small).
        alpha: `float` (optional, default 1.0)
            Initial learning rate for the SGD.
        gamma: `float` (optional, default 1.0)
            Weight to apply to negative samples. Values higher than one will result in greater weight being given to
            negative samples.
        negative_sample_rate: `float` (optional, default 5)
            The number of negative samples to select per positive sample in the optimization process. Increasing this value
             will result in greater repulsive force being applied, greater optimization cost, but slightly more accuracy.
             The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample in optimizing the low
             dimensional embedding.
        init_pos: 'spectral':
            How to initialize the low dimensional embedding. Use a spectral embedding of the fuzzy 1-skeleton
        random_state: `int`, `RandomState` instance or `None`, optional (default: None)
            If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is
            the random number generator; If None, the random number generator is the RandomState instance used by `numpy.random`.
        verbose: `bool` (optional, default False)
            Controls verbosity of logging.

    Returns
    -------
        graph, knn_indices, knn_dists, embedding_
            A tuple of kNN graph (`graph`), indices of nearest neighbors of each cell (knn_indicies), distances of nearest
            neighbors (knn_dists) and finally the low dimensional embedding (embedding_).
    """

    from sklearn.utils import check_random_state
    from sklearn.metrics import pairwise_distances
    from umap.umap_ import (
        nearest_neighbors,
        fuzzy_simplicial_set,
        simplicial_set_embedding,
        find_ab_params,
    )

    random_state = check_random_state(random_state)

    _raw_data = X

    if X.shape[0] < 4096:  # 1
        dmat = pairwise_distances(X, metric=metric)
        graph = fuzzy_simplicial_set(
            X=dmat,
            n_neighbors=n_neighbors,
            random_state=random_state,
            metric="precomputed",
            verbose=verbose,
        )
        if type(graph) == tuple: graph = graph[0]

        # extract knn_indices, knn_dist
        g_tmp = deepcopy(graph)
        g_tmp[graph.nonzero()] = dmat[graph.nonzero()]
        knn_indices, knn_dists = extract_indices_dist_from_graph(
            g_tmp, n_neighbors=n_neighbors
        )
    else:
        # Standard case
        (knn_indices, knn_dists, rp_forest) = nearest_neighbors(
            X=X,
            n_neighbors=n_neighbors,
            metric=metric,
            metric_kwds={},
            angular=False,
            random_state=random_state,
            verbose=verbose,
        )

        graph = fuzzy_simplicial_set(
            X=X,
            n_neighbors=n_neighbors,
            random_state=random_state,
            metric=metric,
            knn_indices=knn_indices,
            knn_dists=knn_dists,
            angular=rp_forest,
            verbose=verbose,
        )

        _raw_data = X
        _transform_available = True
        _search_graph = scipy.sparse.lil_matrix((X.shape[0], X.shape[0]), dtype=np.int8)
        _search_graph.rows = knn_indices  # An array (self.rows) of rows, each of which is a sorted list of column indices of non-zero elements.
        _search_graph.data = (knn_dists != 0).astype(
            np.int8
        )  # The corresponding nonzero values are stored in similar fashion in self.data.
        _search_graph = _search_graph.maximum(  # Element-wise maximum between this and another matrix.
            _search_graph.transpose()
        ).tocsr()

    if verbose:
        print("Construct embedding")

    a, b = find_ab_params(spread, min_dist)
    if type(graph) == tuple: graph = graph[0]
    embedding_ = simplicial_set_embedding(
        data=_raw_data,
        graph=graph,
        n_components=n_components,
        initial_alpha=alpha,  # learning_rate
        a=a,
        b=b,
        gamma=gamma,
        negative_sample_rate=negative_sample_rate,
        n_epochs=n_epochs,
        init=init_pos,
        random_state=random_state,
        metric=metric,
        metric_kwds={},
        verbose=verbose,
    )

    if return_mapper:
        import umap
        from .utils import update_dict

        if n_epochs == 0:
            n_epochs = None

        _umap_kwargs = {
            "angular_rp_forest": False,
            "local_connectivity": 1.0,
            "metric_kwds": None,
            "set_op_mix_ratio": 1.0,
            "target_metric": "categorical",
            "target_metric_kwds": None,
            "target_n_neighbors": -1,
            "target_weight": 0.5,
            "transform_queue_size": 4.0,
            "transform_seed": 42,
        }
        umap_kwargs = update_dict(_umap_kwargs, umap_kwargs)

        mapper = umap.UMAP(
            n_neighbors=n_neighbors,
            n_components=n_components,
            metric=metric,
            min_dist=min_dist,
            spread=spread,
            n_epochs=n_epochs,
            learning_rate=alpha,
            repulsion_strength=gamma,
            negative_sample_rate=negative_sample_rate,
            init=init_pos,
            random_state=random_state,
            verbose=verbose,
            **umap_kwargs
        ).fit(X)

        return mapper, graph, knn_indices, knn_dists, embedding_
    else:
        return graph, knn_indices, knn_dists, embedding_