Code Example #1
def net_fle(
    data: MultimodalData,
    file_name: str = None,
    n_jobs: int = -1,
    rep: str = "diffmap",
    K: int = 50,
    full_speed: bool = False,
    target_change_per_node: float = 2.0,
    target_steps: int = 5000,
    is3d: bool = False,
    memory: int = 8,
    random_state: int = 0,
    select_frac: float = 0.1,
    select_K: int = 25,
    select_alpha: float = 1.0,
    net_alpha: float = 0.1,
    polish_target_steps: int = 1500,
    out_basis: str = "net_fle",
) -> None:
    """Construct Net-Force-directed (FLE) graph.

    Net-FLE is an approximated FLE graph using Deep Learning model to improve the speed.

    In specific, the deep model used is MLPRegressor_, the *scikit-learn* implementation of Multi-layer Perceptron regressor.

    See [Li20]_ for details.

    .. _MLPRegressor: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    file_name: ``str``, optional, default: ``None``
        Temporary file to store the coordinates as the input to forceatlas2. If ``None``, use ``tempfile.mkstemp`` to generate file name.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all available threads.

    rep: ``str``, optional, default: ``"diffmap"``
        Representation of data used for the calculation. By default, use Diffusion Map coordinates. If ``None``, use the count matrix ``data.X``.

    K: ``int``, optional, default: ``50``
        Number of nearest neighbors to be considered during the computation.

    full_speed: ``bool``, optional, default: ``False``
        * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
        * Otherwise, use only one thread to make sure results are reproducible.

    target_change_per_node: ``float``, optional, default: ``2.0``
        Target change per node to stop ForceAtlas2.

    target_steps: ``int``, optional, default: ``5000``
        Maximum number of iterations before stopping the ForceAtlas2 algorithm.

    is3d: ``bool``, optional, default: ``False``
        If ``True``, calculate 3D force-directed layout.

    memory: ``int``, optional, default: ``8``
        Memory size in GB for the Java FA2 component. By default, use 8GB memory.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    select_frac: ``float``, optional, default: ``0.1``
        Down sampling fraction on the cells.

    select_K: ``int``, optional, default: ``25``
        Number of neighbors to be used to estimate local density for each data point for down sampling.

    select_alpha: ``float``, optional, default: ``1.0``
        Weight the down sample to be proportional to ``radius ** select_alpha``.

    net_alpha: ``float``, optional, default: ``0.1``
        L2 penalty (regularization term) parameter of the deep regressor.

    polish_target_steps: ``int``, optional, default: ``1500``
        Number of ForceAtlas2 iterations used to polish the coordinates after the deep regressor has predicted the new coordinates.

    out_basis: ``str``, optional, default: ``"net_fle"``
        Key name for calculated FLE coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: Net FLE coordinates of the data.

    Update ``data.obs``:
        * ``data.obs['ds_selected']``: Boolean array to indicate which cells are selected during the down sampling phase.

    Examples
    --------
    >>> pg.net_fle(data)
    """

    if file_name is None:
        import tempfile

        _, file_name = tempfile.mkstemp()

    n_jobs = effective_n_jobs(n_jobs)
    rep = update_rep(rep)

    if ("W_" + rep) not in data.uns:
        neighbors(
            data,
            K=K,
            rep=rep,
            n_jobs=n_jobs,
            random_state=random_state,
            full_speed=full_speed,
        )

    indices_key = rep + "_knn_indices"
    distances_key = rep + "_knn_distances"

    if not knn_is_cached(data, indices_key, distances_key, select_K):
        raise ValueError("Please run neighbors first!")

    selected = select_cells(
        data.uns[distances_key],
        select_frac,
        K=select_K,
        alpha=select_alpha,
        random_state=random_state,
    )

    X_full = X_from_rep(data, rep)
    X = X_full[selected, :]

    ds_indices_key = "ds_" + rep + "_knn_indices"
    ds_distances_key = "ds_" + rep + "_knn_distances"
    indices, distances = calculate_nearest_neighbors(X,
                                                     K=K,
                                                     n_jobs=n_jobs,
                                                     random_state=random_state,
                                                     full_speed=full_speed)
    data.uns[ds_indices_key] = indices
    data.uns[ds_distances_key] = distances

    W = calculate_affinity_matrix(indices, distances)

    X_fle = calc_force_directed_layout(
        W,
        file_name + ".small",
        n_jobs,
        target_change_per_node,
        target_steps,
        is3d,
        memory,
        random_state,
    )

    data.uns["X_" + out_basis + "_small"] = X_fle
    data.obs["ds_diffmap_selected"] = selected

    n_components = 2 if not is3d else 3
    Y_init = np.zeros((data.shape[0], n_components), dtype=np.float64)
    Y_init[selected, :] = X_fle
    Y_init[~selected, :] = net_train_and_predict(X,
                                                 X_fle,
                                                 X_full[~selected, :],
                                                 net_alpha,
                                                 random_state,
                                                 verbose=True)

    data.obsm["X_" + out_basis + "_pred"] = Y_init

    data.obsm["X_" + out_basis] = calc_force_directed_layout(
        W_from_rep(data, rep),
        file_name,
        n_jobs,
        target_change_per_node,
        polish_target_steps,
        is3d,
        memory,
        random_state,
        init=Y_init,
    )
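
A minimal usage sketch, assuming the standard Pegasus preprocessing has already produced the ``diffmap`` representation that ``net_fle`` uses by default; the input file name is hypothetical and the workflow calls follow the usual Pegasus tutorial order:

import pegasus as pg

data = pg.read_input("sample.zarr.zip")  # hypothetical input file
pg.qc_metrics(data)
pg.filter_data(data)
pg.log_norm(data)
pg.highly_variable_features(data)
pg.pca(data)
pg.neighbors(data)
pg.diffmap(data)   # Net-FLE works on the "diffmap" representation by default
pg.net_fle(data)   # writes data.obsm["X_net_fle"] and data.obs["ds_selected"]
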
Code Example #2
def net_umap(
    data: MultimodalData,
    rep: str = "pca",
    n_jobs: int = -1,
    n_components: int = 2,
    n_neighbors: int = 15,
    min_dist: float = 0.5,
    spread: float = 1.0,
    random_state: int = 0,
    select_frac: float = 0.1,
    select_K: int = 25,
    select_alpha: float = 1.0,
    full_speed: bool = False,
    net_alpha: float = 0.1,
    polish_learning_rate: float = 10.0,
    polish_n_epochs: int = 30,
    out_basis: str = "net_umap",
) -> None:
    """Calculate Net-UMAP embedding of cells.

    Net-UMAP approximates the UMAP embedding with a deep learning model to speed up the computation.

    Specifically, the deep model used is MLPRegressor_, the *scikit-learn* implementation of the Multi-layer Perceptron regressor.

    See [Li20]_ for details.

    .. _MLPRegressor: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all available threads.

    n_components: ``int``, optional, default: ``2``
        Dimension of calculated UMAP coordinates. By default, generate 2-dimensional data for 2D visualization.

    n_neighbors: ``int``, optional, default: ``15``
        Number of nearest neighbors considered during the computation.

    min_dist: ``float``, optional, default: ``0.5``
        The effective minimum distance between embedded data points.

    spread: ``float``, optional, default: ``1.0``
        The effective scale of embedded data points.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    select_frac: ``float``, optional, default: ``0.1``
        Down sampling fraction on the cells.

    select_K: ``int``, optional, default: ``25``
        Number of neighbors to be used to estimate local density for each data point for down sampling.

    select_alpha: ``float``, optional, default: ``1.0``
        Weight the down sample to be proportional to ``radius ** select_alpha``.

    full_speed: ``bool``, optional, default: ``False``
        * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
        * Otherwise, use only one thread to make sure results are reproducible.

    net_alpha: ``float``, optional, default: ``0.1``
        L2 penalty (regularization term) parameter of the deep regressor.

    polish_learning_rate: ``float``, optional, default: ``10.0``
        After running the deep regressor to predict new coordinates, use this learning rate to polish the coordinates with UMAP.

    polish_n_epochs: ``int``, optional, default: ``30``
        Number of epochs for the polishing UMAP run.

    out_basis: ``str``, optional, default: ``"net_umap"``
        Key name for calculated UMAP coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: Net UMAP coordinates of the data.

    Update ``data.obs``:
        * ``data.obs['ds_selected']``: Boolean array to indicate which cells are selected during the down sampling phase.

    Examples
    --------
    >>> pg.net_umap(data)
    """

    rep = update_rep(rep)
    indices_key = rep + "_knn_indices"
    distances_key = rep + "_knn_distances"

    if not knn_is_cached(data, indices_key, distances_key, select_K):
        raise ValueError("Please run neighbors first!")

    n_jobs = effective_n_jobs(n_jobs)

    selected = select_cells(
        data.uns[distances_key],
        select_frac,
        K=select_K,
        alpha=select_alpha,
        random_state=random_state,
    )
    X_full = X_from_rep(data, rep)
    X = X_full[selected, :]

    ds_indices_key = "ds_" + rep + "_knn_indices"  # ds refers to down-sampling
    ds_distances_key = "ds_" + rep + "_knn_distances"
    indices, distances = calculate_nearest_neighbors(
        X,
        K=n_neighbors,
        n_jobs=n_jobs,
        random_state=random_state,
        full_speed=full_speed,
    )
    data.uns[ds_indices_key] = indices
    data.uns[ds_distances_key] = distances

    knn_indices = np.insert(data.uns[ds_indices_key][:, 0:n_neighbors - 1],
                            0,
                            range(X.shape[0]),
                            axis=1)
    knn_dists = np.insert(data.uns[ds_distances_key][:, 0:n_neighbors - 1],
                          0,
                          0.0,
                          axis=1)

    X_umap = calc_umap(
        X,
        n_components,
        n_neighbors,
        min_dist,
        spread,
        random_state,
        knn_indices=knn_indices,
        knn_dists=knn_dists,
    )

    data.uns["X_" + out_basis + "_small"] = X_umap
    data.obs["ds_selected"] = selected

    Y_init = np.zeros((data.shape[0], n_components), dtype=np.float64)
    Y_init[selected, :] = X_umap
    Y_init[~selected, :] = net_train_and_predict(X,
                                                 X_umap,
                                                 X_full[~selected, :],
                                                 net_alpha,
                                                 random_state,
                                                 verbose=True)

    data.obsm["X_" + out_basis + "_pred"] = Y_init

    knn_indices = np.insert(data.uns[indices_key][:, 0:n_neighbors - 1],
                            0,
                            range(data.shape[0]),
                            axis=1)
    knn_dists = np.insert(data.uns[distances_key][:, 0:n_neighbors - 1],
                          0,
                          0.0,
                          axis=1)

    data.obsm["X_" + out_basis] = calc_umap(
        X_full,
        n_components,
        n_neighbors,
        min_dist,
        spread,
        random_state,
        init=Y_init,
        n_epochs=polish_n_epochs,
        learning_rate=polish_learning_rate,
        knn_indices=knn_indices,
        knn_dists=knn_dists,
    )
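
A minimal usage sketch for this version, which requires the kNN graph for the chosen representation to be cached beforehand (otherwise it raises "Please run neighbors first!"); it assumes PCA has already been computed on preprocessed data:

import pegasus as pg

pg.pca(data)
pg.neighbors(data, rep="pca")  # caches pca_knn_indices / pca_knn_distances in data.uns
pg.net_umap(data)              # writes data.obsm["X_net_umap"] and data.obs["ds_selected"]
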
Code Example #3
def integrative_nmf(
    data: Union[MultimodalData, UnimodalData],
    batch: str = "Channel",
    n_components: int = 20,
    features: str = "highly_variable_features",
    space: str = "log",
    algo: str = "halsvar",
    mode: str = "online",
    tol: float = 1e-4,
    use_gpu: bool = False,
    lam: float = 5.0,
    fp_precision: str = "float",
    n_jobs: int = -1,
    random_state: int = 0,
    quantile_norm: bool = True,
) -> str:
    """Perform Integrative Nonnegative Matrix Factorization (iNMF) [Yang16]_ for data integration.

    The calculation uses `nmf-torch <https://github.com/lilab-bcb/nmf-torch>`_ .

    This function assumes that cells in each batch are adjacent to each other.
    In addition, it will scale each batch with L2 norm separately. The resulting Hs will also be scaled with L2 norm.
    If ``quantile_norm=True``, quantile normalization will be additionally performed.

    See [Welch19]_ and [Gao21]_ for preprocessing and normalization details.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    batch: ``str``, optional, default: ``"Channel"``.
        Which attribute in ``data.obs`` represents batches.

    n_components: ``int``, optional, default: ``20``.
        Number of iNMF components (factors) to compute.

    features: ``str``, optional, default: ``"highly_variable_features"``.
        Keyword in ``data.var`` to specify features used for integrative_nmf.

    space: ``str``, optional, default: ``log``.
        Choose from ``log`` and ``expression``. ``log`` works on log-transformed expression space; ``expression`` works on the original expression space (normalized by total UMIs).

    algo: ``str``, optional, default: ``halsvar``
        Choose from ``mu`` (Multiplicative Update), ``halsvar`` (a HALS variant that mimics ``bpp`` but is faster) and ``bpp`` (alternating non-negative least squares with the Block Principal Pivoting method).

    mode: ``str``, optional, default: ``online``
        Learning mode. Choose from ``batch`` and ``online``. Notice that ``online`` only works when ``beta=2.0``; for other beta losses, it switches back to the ``batch`` method.

    tol: ``float``, optional, default: ``1e-4``
        The tolerance used for the convergence check.

    use_gpu: ``bool``, optional, default: ``False``
        If ``True``, use GPU if available. Otherwise, use CPU only.

    lam: ``float``, optional, default: ``5.0``
        The coefficient for regularization terms. If ``0``, then no regularization will be performed.

    fp_precision: ``str``, optional, default: ``float``
        The numeric precision on the results. Choose from ``float`` and ``double``.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. ``-1`` refers to using all physical CPU cores.

    random_state: ``int``, optional, default: ``0``.
        Random seed to be set for reproducing result.

    quantile_norm: ``bool``, optional, default: ``True``.
        Perform quantile normalization as described in Gao et al. Nature Biotech 2021. Cluster refinement K=20; min_cells=20; quantiles = 50.

    Returns
    -------
    out_rep: ``str``
        The key in ``data.obsm`` referring to the embedding calculated by the integrative NMF algorithm. ``out_rep`` is ``"inmf"`` unless the batch key check fails, in which case ``"pca"`` is returned and no integration is performed.


    Update ``data.obsm``:

        * ``data.obsm["X_inmf"]``: Scaled and possibly quantile normalized iNMF coordinates.

        * ``data.obsm["H"]``: The concatenation of coordinate factor matrices of shape ``(n_cells, n_components)``.

    Update ``data.uns``:

        * ``data.uns["W"]``: The feature factor matrix of shape ``(n_HVFs, n_components)``.

        * ``data.uns["V"]``: The batch specific feature factor matrices as one tensor of shape ``(n_batches, n_components, n_HVFs)``.

        * ``data.uns["inmf_err"]``: The iNMF loss.

        * ``data.uns["inmf_features"]``: Record the features used to perform iNMF analysis.

    Examples
    --------
    >>> pg.integrative_nmf(data)
    """
    if not check_batch_key(data, batch, "Cannot apply integrative_nmf!"):
        return "pca"

    Xs = _select_and_scale_features(data,
                                    features=features,
                                    space=space,
                                    batch=batch)

    try:
        from nmf import integrative_nmf
    except ImportError as e:
        import sys
        logger.error(f"{e}\nNeed NMF-Torch! Try 'pip install nmf-torch'.")
        sys.exit(-1)

    n_jobs = eff_n_jobs(n_jobs)

    Hs, W, Vs, err = integrative_nmf(
        Xs,
        n_components=n_components,
        algo=algo,
        mode=mode,
        tol=tol,
        n_jobs=n_jobs,
        random_state=random_state,
        use_gpu=use_gpu,
        lam=lam,
        fp_precision=fp_precision,
    )

    # Implementation of algo 3, quantile normalization
    Hs_new = numbaList()
    csums = numbaList()
    ids_by_clusts = numbaList()

    nbatch = len(Hs)
    rg = np.random.default_rng(random_state)
    seeds = rg.integers(4294967295, size=nbatch)
    ref_batch = max_size = -1
    for i in range(nbatch):
        H_new = np.ascontiguousarray(Hs[i] / np.linalg.norm(Hs[i], axis=0),
                                     dtype=np.float32)  # Scale H
        Hs_new.append(H_new)  # Append scaled H

        if not quantile_norm:
            continue

        clusters = np.argmax(H_new, axis=1)  # Assign cluster
        indices, _ = calculate_nearest_neighbors(
            H_new, K=20, n_jobs=n_jobs, random_state=seeds[i])  # KNN with K=20
        clusters, csum = _refine_cluster(clusters, indices,
                                         n_components)  # Refine cluster
        csums.append(csum)
        ids_by_clusts.append(np.argsort(clusters, kind='stable'))

        if H_new.shape[0] > max_size:  # Find ref batch
            max_size = H_new.shape[0]
            ref_batch = i

    if quantile_norm:
        _quantile_norm(Hs_new, csums, ids_by_clusts, nbatch, ref_batch,
                       n_components)  # quantile normalization

    data.uns["inmf_features"] = features  # record which feature to use
    data.uns["W"] = np.ascontiguousarray(
        W.T, dtype=np.float32
    )  # cannot be varm because numbers of features are not the same
    data.uns["V"] = np.array(Vs)
    data.uns["inmf_err"] = err

    data.obsm["H"] = np.concatenate(Hs)
    data.obsm["X_inmf"] = np.concatenate(Hs_new)

    return "inmf"
Code Example #4
File: doublet_detection.py  Project: slowkow/pegasus
def _run_scrublet(data: Union[MultimodalData, UnimodalData],
                  name: Optional[str] = '',
                  expected_doublet_rate: Optional[float] = None,
                  sim_doublet_ratio: Optional[float] = 2.0,
                  n_prin_comps: Optional[int] = 30,
                  robust: Optional[bool] = False,
                  k: Optional[int] = None,
                  n_jobs: Optional[int] = -1,
                  random_state: Optional[int] = 0,
                  plot_hist: Optional[bool] = True) -> Union[None, "Figure"]:
    """Calculate doublet scores using Scrublet-like [Wolock18]_ strategy for the current data.X; determine a right threshold using Gaussian Mixture model.
       This function should be called after highly_variable_gene selection.

    Parameters
    -----------
    data: ``Union[MultimodalData, UnimodalData]`` object.
        Annotated data matrix with rows for cells and columns for genes. Data must be low quality cell and gene filtered and log-transformed. Assume 'raw.X' stores the raw count matrix.

    name: ``str``, optional, default: ``''``
        Name of the sample.

    expected_doublet_rate: ``float``, optional, default: ``None``
        The expected doublet rate for the experiment. By default, calculate the expected rate based on number of cells from the 10x multiplet rate table

    sim_doublet_ratio: ``float``, optional, default: ``2.0``
        The ratio between synthetic doublets and observed cells.

    n_prin_comps: ``int``, optional, default: ``30``
        Number of principal components.

    robust: ``bool``, optional, default: ``False``.
        If ``True``, use the exact ``'arpack'`` solver (when ``max(X.shape) > 500`` and ``n_prin_comps < 0.8 * min(X.shape)``) or the ``'full'`` solver instead of the default randomized SVD.

    k: ``int``, optional, default: ``None``
        Number of observed cell neighbors. If None, k = round(0.5 * sqrt(number of observed cells)). Total neighbors k_adj = round(k * (1.0 + sim_doublet_ratio)).

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all available threads.

    random_state: ``int``, optional, default: ``0``
        Random state for doublet simulation, PCA and approximate nearest neighbor search.

    plot_hist: ``bool``, optional, default: ``True``
        If True, plot diagnostic histogram.

    Returns
    --------
    ``None`` or a ``matplotlib Figure`` object if ``plot_hist == True``.

    Update ``data.obs``:
        * ``data.obs['doublet_score']``: The calculated doublet scores on cells.
        * ``data.obs['pred_dbl']``: Predicted doublets as True.

    Update ``data.uns``:
        * ``data.uns['doublet_threshold']``: Inferred doublet threshold; any score > threshold is identified as a neotypic doublet.

    Examples
    --------
    >>> pg.run_scrublet(data)
    """
    from pegasus.tools import calculate_nearest_neighbors
    from pegasus.cylib.fast_utils import simulate_doublets
    from sklearn.decomposition import PCA
    from scipy.stats import gaussian_kde

    if "highly_variable_features" not in data.var:
        raise ValueError(
            "_run_scrublet must be run after highly_variable_features is called!"
        )

    r = sim_doublet_ratio
    if expected_doublet_rate is None:
        expected_doublet_rate = _calc_expected_doublet_rate(data.shape[0])
    rho = expected_doublet_rate

    # subset the raw count matrix
    rawX = data.get_matrix("raw.X")
    obs_umis = rawX.sum(axis=1, dtype=np.int32).A1
    rawX = rawX[:, data.var["highly_variable_features"].values]
    # Simulate synthetic doublets
    sim_rawX, pair_idx = simulate_doublets(rawX, r, random_state)
    sim_umis = obs_umis[pair_idx].sum(axis=1, dtype=np.int32)

    # standardize and calculate PCA for rawX
    obsX = rawX.astype(np.float32).toarray()
    obsX /= obs_umis.reshape(-1, 1)  # normalize each cell

    m1 = obsX.mean(axis=0)  # calculate mean and std
    psum = np.multiply(obsX, obsX).sum(axis=0)
    std = ((psum - obsX.shape[0] * (m1**2)) / (obsX.shape[0] - 1.0))**0.5
    std[std == 0] = 1

    obsX -= m1  # standardize
    obsX /= std

    svd_solver = "auto" if not robust else (
        "arpack" if max(obsX.shape) > 500
        and n_prin_comps < 0.8 * min(obsX.shape) else "full")  # PCA
    pca = PCA(n_components=n_prin_comps,
              random_state=random_state,
              svd_solver=svd_solver)
    obs_pca = pca.fit_transform(obsX)

    # standardize and calculate PCA for sim_rawX
    simX = sim_rawX.astype(np.float32).toarray()
    simX /= sim_umis.reshape(-1, 1)  # normalize each cell

    simX -= m1  # standardize
    simX /= std

    sim_pca = pca.transform(simX)  # transform to PC coordinates

    # concatenate observed and simulated data
    pc_coords = np.vstack((obs_pca, sim_pca))
    is_doublet = np.repeat(np.array([0, 1], dtype=np.int32),
                           [obsX.shape[0], simX.shape[0]])

    # Calculate k nearest neighbors
    if k is None:
        k = int(round(0.5 * np.sqrt(obsX.shape[0])))
    k_adj = int(round(k * (1.0 + r)))
    indices, _ = calculate_nearest_neighbors(pc_coords,
                                             K=k_adj + 1,
                                             n_jobs=n_jobs)

    # Calculate scrublet-like doublet score
    k_d = is_doublet[indices].sum(axis=1)
    q = (k_d + 1.0) / (k_adj + 2.0)  # Equation 5
    doublet_scores = (q * rho / r) / (
        (1.0 - rho) - q * (1.0 - rho - rho / r))  # Equation 4
    obs_scores = doublet_scores[0:obsX.shape[0]]
    sim_scores = doublet_scores[obsX.shape[0]:]

    # Determine a scrublet score threshold
    # log transformed
    sim_scores_log = np.log(sim_scores)

    # Estimate KDE
    min_score = sim_scores_log.min()
    max_score = sim_scores_log.max()
    min_gap = np.diff(np.unique(np.sort(sim_scores_log))).min()
    from math import ceil
    n_gap = max(int(ceil((max_score - min_score) / min_gap)),
                200)  # minimum is 200
    gap = (max_score - min_score) / n_gap

    n_ext = 5
    min_score -= gap * n_ext
    max_score += gap * n_ext
    x = np.linspace(min_score, max_score,
                    n_gap + 1 + n_ext * 2)  # generate x coordinates
    kde = gaussian_kde(sim_scores_log)
    y = kde(x)

    # Find local maxima
    maxima, maxima_by_x, filtered_maxima = _find_local_maxima(y)
    assert maxima.size > 0
    curv = _calc_vec_f(_curvature, x.size, y, gap)  # calculate curvature

    if maxima.size >= 2:
        if maxima[0] < maxima[1]:
            start = maxima[0]
            end = maxima[1]
        else:
            start = maxima[1]
            end = maxima[0]
        pos = y[start + 1:end].argmin() + (start + 1)
    else:
        frac_right_thre = 0.42
        frac_left_thre = 0.4

        pos = -1
        for i in range(maxima_by_x.size):
            frac_right = (sim_scores_log >
                          x[maxima_by_x[i]]).sum() / sim_scores.size
            if frac_right < frac_right_thre:  # peak might represent a doublet peak, try to find a cutoff at the left side
                if i == 0:
                    peak_curv_value = _find_curv_minima_at_peak(
                        curv, maxima_by_x[i])
                    end = _find_pos_curv(curv, maxima_by_x[i] - 1, '-')
                    start = _find_pos_curv(
                        curv,
                        _find_curv_local_minima(curv, peak_curv_value,
                                                filtered_maxima, end - 1, '-')
                        + 1, '+')
                    assert start <= end
                    pos = curv[start:end + 1].argmax() + start
                else:
                    pos = y[maxima_by_x[i - 1] + 1:maxima_by_x[i]].argmin() + (
                        maxima_by_x[i - 1] + 1)

                frac_left = (sim_scores_log < x[pos]).sum() / sim_scores.size
                if frac_left < frac_left_thre:
                    pos = maxima_by_x[i]

                break

        if pos < 0:
            # peak represents singlet, find a cutoff at the right side
            peak_curv_value = _find_curv_minima_at_peak(curv, maxima_by_x[-1])
            start = _find_pos_curv(curv, maxima_by_x[-1] + 1, '+')
            end = _find_pos_curv(
                curv,
                _find_curv_local_minima(curv, peak_curv_value, filtered_maxima,
                                        start + 1, '+') - 1, '-')
            assert start <= end
            pos = curv[start:end + 1].argmax() + start

    threshold = np.exp(x[pos])

    data.obs["doublet_score"] = obs_scores.astype(np.float32)
    data.obs["pred_dbl"] = obs_scores > threshold
    data.uns["doublet_threshold"] = float(threshold)

    logger.info(
        f"Sample {name}: doublet threshold = {threshold:.4f}; total cells = {data.shape[0]}; neotypic doublet rate = {data.obs['pred_dbl'].sum() / data.shape[0]:.2%}"
    )
    fig = None
    if plot_hist:
        fig = _plot_hist(obs_scores, sim_scores, threshold, x, y, curv)
    return fig
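
This is a private helper; the docstring example reaches it through the public ``pg.run_scrublet`` call. A hedged sketch of that path, assuming raw counts are stored under ``'raw.X'`` and highly variable features have been selected first, as the code requires; the input file name is hypothetical:

import pegasus as pg

data = pg.read_input("raw_counts.h5")  # hypothetical raw count input
pg.qc_metrics(data)
pg.filter_data(data)
pg.log_norm(data)
pg.highly_variable_features(data)
pg.run_scrublet(data)  # fills data.obs["doublet_score"], data.obs["pred_dbl"], data.uns["doublet_threshold"]
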
Code Example #5
def _run_scrublet(
    data: Union[MultimodalData, UnimodalData],
    raw_mat_key: Optional[str] = 'counts',
    name: Optional[str] = '',
    expected_doublet_rate: Optional[float] = None,
    sim_doublet_ratio: Optional[float] = 2.0,
    n_prin_comps: Optional[int] = 30,
    k: Optional[int] = None,
    n_jobs: Optional[int] = -1,
    random_state: Optional[int] = 0,
    plot_hist: Optional[bool] = True,
    manual_correction: Optional[str] = None,
) -> Union[None, Figure]:
    """Calculate doublet scores using Scrublet-like [Wolock18]_ strategy for the current data.X; determine a right threshold based on the KDE curve.
       This function should be called after highly_variable_gene selection.

    Parameters
    -----------
    data: ``Union[MultimodalData, UnimodalData]`` object.
        Annotated data matrix with rows for cells and columns for genes. Data must be low quality cell and gene filtered and log-transformed.

    raw_mat_key: ``str``, optional, default: ``counts``
        Matrix key for the raw count matrix.

    name: ``str``, optional, default: ``''``
        Name of the sample.

    expected_doublet_rate: ``float``, optional, default: ``None``
        The expected doublet rate for the experiment. By default, calculate the expected rate based on number of cells from the 10x multiplet rate table

    sim_doublet_ratio: ``float``, optional, default: ``2.0``
        The ratio between synthetic doublets and observed cells.

    n_prin_comps: ``int``, optional, default: ``30``
        Number of principal components.

    k: ``int``, optional, default: ``None``
        Number of observed cell neighbors. If None, k = round(0.5 * sqrt(number of observed cells)). Total neighbors k_adj = round(k * (1.0 + sim_doublet_ratio)).

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all physical CPU cores.

    random_state: ``int``, optional, default: ``0``
        Random state for doublet simulation, PCA and approximate nearest neighbor search.

    plot_hist: ``bool``, optional, default: ``True``
        If True, plot diagnostic histograms. Each sample would have a figure consisting of 4 panels showing histograms of doublet scores for observed cells (panel 1, density in log scale), simulated doublets (panel 2, density in log scale), KDE plot (panel 3) and signed curvature plot (panel 4) of log doublet scores for simulated doublets.

    manual_correction: ``str``, optional, default: ``None``
        If present, use the human guidance provided in ``manual_correction`` to select the threshold. Currently only ``manual_correction='peak'`` is supported, which means cutting at the center of the peak.

    Returns
    --------
    ``None`` or a ``matplotlib Figure`` object if ``plot_hist == True``.

    Update ``data.obs``:
        * ``data.obs['doublet_score']``: The calculated doublet scores on cells.
        * ``data.obs['pred_dbl']``: Predicted doublets as True.

    Update ``data.uns``:
        * ``data.uns['doublet_threshold']``: Inferred doublet threshold; any score > threshold is identified as a neotypic doublet.

    Examples
    --------
    >>> pg.run_scrublet(data)
    """
    from pegasus.tools import calculate_nearest_neighbors, simulate_doublets
    from sklearn.decomposition import PCA
    from scipy.stats import gaussian_kde
    from sklearn.cluster import KMeans

    if "highly_variable_features" not in data.var:
        raise ValueError(
            "_run_scrublet must be run after highly_variable_features is called!"
        )

    r = sim_doublet_ratio
    if expected_doublet_rate is None:
        expected_doublet_rate = _calc_expected_doublet_rate(data.shape[0])
    rho = expected_doublet_rate

    # subset the raw count matrix
    rawX = data.get_matrix(raw_mat_key)
    obs_umis = rawX.sum(axis=1, dtype=np.int32).A1
    rawX = rawX[:, data.var["highly_variable_features"].values]
    # Simulate synthetic doublets
    sim_rawX, pair_idx = simulate_doublets(rawX, r, random_state)
    sim_umis = obs_umis[pair_idx].sum(axis=1, dtype=np.int32)

    # standardize and calculate PCA for rawX
    obsX = rawX.astype(np.float32).toarray()
    obsX /= obs_umis.reshape(-1, 1)  # normalize each cell

    m1 = obsX.mean(axis=0)  # calculate mean and std
    psum = np.multiply(obsX, obsX).sum(axis=0)
    std = ((psum - obsX.shape[0] * (m1**2)) / (obsX.shape[0] - 1.0))**0.5
    std[std == 0] = 1

    obsX -= m1  # standardize
    obsX /= std

    pca = PCA(n_components=n_prin_comps, random_state=random_state)
    n_jobs = eff_n_jobs(n_jobs)
    with threadpool_limits(limits=n_jobs):
        obs_pca = pca.fit_transform(obsX.astype(
            np.float64))  # float64 for reproducibility
        obs_pca = np.ascontiguousarray(obs_pca, dtype=np.float32)
        kmeans = KMeans(n_clusters=5, random_state=random_state).fit(obs_pca)

    # calculate in simulated distribution, expected percentage of embedded doublets
    data.obs["dbl_kmeans_"] = pd.Categorical(kmeans.labels_)
    _, freqs = np.unique(kmeans.labels_, return_counts=True)
    freqs = np.array(freqs) / sum(freqs)
    d_emb = (((1.0 - rho) * freqs + rho * (freqs**2))**2).sum()
    d_neo = 1.0 - d_emb

    # standardize and calculate PCA for sim_rawX
    simX = sim_rawX.astype(np.float32).toarray()
    simX /= sim_umis.reshape(-1, 1)  # normalize each cell

    simX -= m1  # standardize
    simX /= std

    sim_pca = pca.transform(simX)  # transform to PC coordinates
    sim_pca = np.ascontiguousarray(sim_pca, dtype=np.float32)

    # concatenate observed and simulated data
    pc_coords = np.vstack((obs_pca, sim_pca))
    is_doublet = np.repeat(np.array([0, 1], dtype=np.int32),
                           [obsX.shape[0], simX.shape[0]])

    # Calculate k nearest neighbors
    if k is None:
        k = int(round(0.5 * np.sqrt(obsX.shape[0])))
    k_adj = int(round(k * (1.0 + r)))
    indices, _ = calculate_nearest_neighbors(pc_coords,
                                             K=k_adj + 1,
                                             n_jobs=n_jobs)

    # Calculate scrublet-like doublet score
    k_d = is_doublet[indices].sum(axis=1)
    q = (k_d + 1.0) / (k_adj + 2.0)  # Equation 5
    doublet_scores = (q * rho / r) / (
        (1.0 - rho) - q * (1.0 - rho - rho / r))  # Equation 4
    obs_scores = doublet_scores[0:obsX.shape[0]]
    sim_scores = doublet_scores[obsX.shape[0]:]

    # Determine a scrublet score threshold
    # log transformed
    sim_scores_log = np.log(sim_scores)

    # Estimate KDE
    min_score = sim_scores_log.min()
    max_score = sim_scores_log.max()
    min_gap = np.diff(np.unique(np.sort(sim_scores_log))).min()
    from math import ceil
    n_gap = max(int(ceil((max_score - min_score) / min_gap)),
                200)  # minimum is 200
    gap = (max_score - min_score) / n_gap

    n_ext = 5
    min_score -= gap * n_ext
    max_score += gap * n_ext
    x = np.linspace(min_score, max_score,
                    n_gap + 1 + n_ext * 2)  # generate x coordinates
    kde = gaussian_kde(sim_scores_log)
    y = kde(x)

    # Find local maxima
    maxima, maxima_by_x, filtered_maxima = _find_local_maxima(y)
    assert maxima.size > 0
    curv = _calc_vec_f(_curvature, x.size, y, gap)  # calculate curvature

    x_theory = np.percentile(sim_scores_log, d_emb * 100.0 + 1e-6)
    threshold_theory = np.exp(x_theory)

    case_num = -1
    pos = -1
    if maxima.size >= 2:
        pos = _locate_cutoff_among_peaks_with_guide(x, y, maxima,
                                                    sim_scores_log, d_neo)
        case_num = 0
        d_pneo = (sim_scores_log > x[pos]).sum() / sim_scores_log.size
        if d_pneo < 0.1:  # < 10%, consider it as not a peak
            idx_ = maxima_by_x >= pos
            filtered_maxima = np.concatenate(
                (filtered_maxima, maxima_by_x[idx_]))
            maxima_by_x = maxima_by_x[~idx_]
            pos = -1
    if pos < 0:
        frac_right = (sim_scores_log >
                      x[maxima_by_x[-1]]).sum() / sim_scores.size
        if frac_right < 0.41 or (frac_right < 0.5
                                 and x_theory + 0.05 < x[maxima_by_x[-1]]):
            logger.debug(f"frac_right={frac_right}.")
            if maxima_by_x.size > 1:
                posvec = np.vectorize(
                    lambda i: y[maxima_by_x[i] + 1:maxima_by_x[i + 1]].argmin(
                    ) + (maxima_by_x[i] + 1))(range(maxima_by_x.size - 1))
                pos = posvec[np.argmin(np.abs(x[posvec] - x_theory))]
                case_num = 1
            else:
                pos = _find_cutoff_left_side(maxima_by_x[0], x, curv, x_theory)
                case_num = 2
        else:
            pos = _find_cutoff_right_side(maxima_by_x[-1], curv,
                                          filtered_maxima)
            case_num = 3
    threshold = np.exp(x[pos])

    threshold_auto = None
    if manual_correction is not None:
        assert case_num == 2
        threshold_auto = threshold
        threshold = np.exp(x[maxima_by_x[-1]])

    data.obs["doublet_score"] = obs_scores.astype(np.float32)
    data.obs["pred_dbl"] = obs_scores > threshold
    data.uns["doublet_threshold"] = float(threshold)

    neo_dbl_rate = data.obs['pred_dbl'].sum() / data.shape[0]
    neo_sim_dbl_rate = (sim_scores > threshold).sum() / sim_scores.size
    logger.info(
        f"Sample {name}: doublet threshold = {threshold:.4f}; total cells = {data.shape[0]}; neotypic doublet rate in simulation = {neo_sim_dbl_rate:.2%}; neotypic doublet rate = {neo_dbl_rate:.2%}."
    )

    fig = None
    if plot_hist:
        fig = _plot_hist(obs_scores,
                         sim_scores,
                         threshold,
                         threshold_theory,
                         x,
                         y,
                         curv,
                         threshold_auto=threshold_auto)
    return fig
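
A hedged follow-up sketch that only inspects the outputs documented in the Returns section above; it assumes the function has already populated ``data`` and that the returned figure, if any, was captured as ``fig``:

n_dbl = int(data.obs["pred_dbl"].sum())
print(f"doublet threshold = {data.uns['doublet_threshold']:.4f}; "
      f"predicted doublets = {n_dbl} ({n_dbl / data.shape[0]:.2%} of {data.shape[0]} cells)")
if fig is not None:
    fig.savefig("scrublet_diagnostics.png", dpi=150)  # hypothetical output file name
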
Code Example #6
def net_umap(
    data: MultimodalData,
    rep: str = "pca",
    n_jobs: int = -1,
    n_components: int = 2,
    n_neighbors: int = 15,
    min_dist: float = 0.5,
    spread: float = 1.0,
    densmap: bool = False,
    dens_lambda: float = 2.0,
    dens_frac: float = 0.3,
    dens_var_shift: float = 0.1,
    random_state: int = 0,
    select_frac: float = 0.1,
    select_K: int = 25,
    select_alpha: float = 1.0,
    full_speed: bool = False,
    net_alpha: float = 0.1,
    polish_learning_rate: float = 10.0,
    polish_n_epochs: int = 30,
    out_basis: str = "net_umap",
) -> None:
    """Calculate Net-UMAP embedding of cells.

    Net-UMAP approximates the UMAP embedding with a deep learning model to speed up the computation.

    Specifically, the deep model used is MLPRegressor_, the *scikit-learn* implementation of the Multi-layer Perceptron regressor.

    See [Li20]_ for details.

    .. _MLPRegressor: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all physical CPU cores.

    n_components: ``int``, optional, default: ``2``
        Dimension of calculated UMAP coordinates. By default, generate 2-dimensional data for 2D visualization.

    n_neighbors: ``int``, optional, default: ``15``
        Number of nearest neighbors considered during the computation.

    min_dist: ``float``, optional, default: ``0.5``
        The effective minimum distance between embedded data points.

    spread: ``float``, optional, default: ``1.0``
        The effective scale of embedded data points.

    densmap: ``bool``, optional, default: ``False``
        Whether the density-augmented objective of densMAP should be used for optimization, which will generate an embedding where
        local densities are encouraged to be correlated with those in the original space.

    dens_lambda: ``float``, optional, default: ``2.0``
        Controls the regularization weight of the density correlation term in densMAP. Only works when *densmap* is ``True``.
        Larger values prioritize density preservation over the UMAP objective, while values closer to 0 prioritize the UMAP objective.
        Notice that setting this parameter to ``0`` is equivalent to running the original UMAP algorithm.

    dens_frac: ``float``, optional, default: ``0.3``
        Controls the fraction of epochs (between 0 and 1) where the density-augmented objective is used in densMAP. Only works when
        *densmap* is ``True``.
        The first ``(1 - dens_frac)`` fraction of epochs optimize the original UMAP objective before introducing the density
        correlation term.

    dens_var_shift: ``float``, optional, default: ``0.1``
        A small constant added to the variance of local radii in the embedding when calculating the density correlation objective to
        prevent numerical instability from dividing by a small number. Only works when *densmap* is ``True``.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    select_frac: ``float``, optional, default: ``0.1``
        Down sampling fraction on the cells.

    select_K: ``int``, optional, default: ``25``
        Number of neighbors to be used to estimate local density for each data point for down sampling.

    select_alpha: ``float``, optional, default: ``1.0``
        Weight the down sample to be proportional to ``radius ** select_alpha``.

    full_speed: ``bool``, optional, default: ``False``
        * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
        * Otherwise, use only one thread to make sure results are reproducible.

    net_alpha: ``float``, optional, default: ``0.1``
        L2 penalty (regularization term) parameter of the deep regressor.

    polish_learning_rate: ``float``, optional, default: ``10.0``
        After running the deep regressor to predict new coordinates, use this learning rate to polish the coordinates with UMAP.

    polish_n_epochs: ``int``, optional, default: ``30``
        Number of epochs for the polishing UMAP run.

    out_basis: ``str``, optional, default: ``"net_umap"``
        Key name for calculated UMAP coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: Net UMAP coordinates of the data.

    Update ``data.obs``:
        * ``data.obs['ds_selected']``: Boolean array to indicate which cells are selected during the down sampling phase.

    Examples
    --------
    >>> pg.net_umap(data)
    """

    rep = update_rep(rep)
    n_jobs = eff_n_jobs(n_jobs)
    knn_indices, knn_dists = get_neighbors(data,
                                           K=select_K,
                                           rep=rep,
                                           n_jobs=n_jobs,
                                           random_state=random_state,
                                           full_speed=full_speed)

    selected = select_cells(
        knn_dists,
        select_frac,
        K=select_K,
        alpha=select_alpha,
        random_state=random_state,
    )
    X_full = X_from_rep(data, rep)
    X = X_full[selected, :]

    if data.shape[0] < n_neighbors:
        logger.warning(
            f"Warning: Number of samples = {data.shape[0]} < K = {n_neighbors}!\n Set K to {data.shape[0]}."
        )
        n_neighbors = data.shape[0]

    ds_indices_key = "ds_" + rep + "_knn_indices"  # ds refers to down-sampling
    ds_distances_key = "ds_" + rep + "_knn_distances"
    indices, distances = calculate_nearest_neighbors(
        X,
        K=n_neighbors,
        n_jobs=n_jobs,
        random_state=random_state,
        full_speed=full_speed,
    )
    data.uns[ds_indices_key] = indices
    data.uns[ds_distances_key] = distances

    knn_indices = np.insert(data.uns[ds_indices_key][:, 0:n_neighbors - 1],
                            0,
                            range(X.shape[0]),
                            axis=1)
    knn_dists = np.insert(data.uns[ds_distances_key][:, 0:n_neighbors - 1],
                          0,
                          0.0,
                          axis=1)

    X_umap = calc_umap(
        X,
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        spread=spread,
        densmap=densmap,
        dens_lambda=dens_lambda,
        dens_frac=dens_frac,
        dens_var_shift=dens_var_shift,
        random_state=random_state,
        knn_indices=knn_indices,
        knn_dists=knn_dists,
    )

    data.uns["X_" + out_basis + "_small"] = X_umap
    data.obs["ds_selected"] = selected

    Y_init = np.zeros((data.shape[0], n_components), dtype=np.float64)
    Y_init[selected, :] = X_umap
    Y_init[~selected, :] = net_train_and_predict(X,
                                                 X_umap,
                                                 X_full[~selected, :],
                                                 net_alpha,
                                                 n_jobs,
                                                 random_state,
                                                 verbose=True)

    data.obsm["X_" + out_basis + "_pred"] = Y_init

    knn_indices, knn_dists = get_neighbors(data,
                                           K=n_neighbors,
                                           rep=rep,
                                           n_jobs=n_jobs,
                                           random_state=random_state,
                                           full_speed=full_speed)
    knn_indices = np.insert(knn_indices[:, 0:n_neighbors - 1],
                            0,
                            range(data.shape[0]),
                            axis=1)
    knn_dists = np.insert(knn_dists[:, 0:n_neighbors - 1], 0, 0.0, axis=1)

    key = f"X_{out_basis}"
    data.obsm[key] = calc_umap(
        X_full,
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        spread=spread,
        densmap=densmap,
        dens_lambda=dens_lambda,
        dens_frac=dens_frac,
        dens_var_shift=dens_var_shift,
        random_state=random_state,
        init=Y_init,
        n_epochs=polish_n_epochs,
        learning_rate=polish_learning_rate,
        knn_indices=knn_indices,
        knn_dists=knn_dists,
    )
    data.register_attr(key, "basis")
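
A hedged usage sketch for this densMAP-capable version, assuming the ``pca`` representation exists on preprocessed data; the densMAP arguments shown simply restate the signature defaults, and ``out_basis`` is set only so both embeddings are kept side by side:

import pegasus as pg

pg.pca(data)
pg.net_umap(data)  # plain Net-UMAP, stored as data.obsm["X_net_umap"]
pg.net_umap(data, densmap=True, dens_lambda=2.0,
            out_basis="net_umap_dens")  # density-preserving variant, stored as data.obsm["X_net_umap_dens"]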