Ejemplo n.º 1
0
def log_norm(
    data: MultimodalData,
    norm_count: float = 1e5,
    backup_matrix: str = "raw.X",
) -> None:
    """Normalize the count matrix per cell and log-transform it in place.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Use current selected modality in data, which should contain one RNA expression matrix.

    norm_count: ``float``, optional, default: ``1e5``.
        Total counts of one cell after normalization.

    backup_matrix: ``str``, optional, default: ``raw.X``.
        Key under which the unnormalized count matrix is stored, usually the raw counts.

    Returns
    -------
    ``None``

    Update ``data.X`` with the count matrix after log-normalization, store the per-cell
    result of ``normalize_by_count`` in ``data.obs["scale"]`` and record ``norm_count``
    in ``data.uns["norm_count"]``. The original count matrix is kept as ``backup_matrix``.

    If ``backup_matrix`` already exists (i.e. normalization is being rerun), the backup
    is used as the source instead of ``data.X`` and a warning is logged.

    Examples
    --------
    >>> pg.log_norm(data)
    """
    if isinstance(data, MultimodalData):
        data = data.current_data()

    assert data.get_modality() == "rna"

    is_rerun = backup_matrix in data.list_keys()
    if is_rerun:
        # A backup already exists, so normalize from it rather than from data.X.
        source_matrix = data.get_matrix(backup_matrix)
    else:
        # First run: register the current matrix as the backup before replacing data.X.
        data.add_matrix(backup_matrix, data.X)
        source_matrix = data.X

    # astype produces a new float32 matrix, so the backup is not modified below.
    data.X = source_matrix.astype(np.float32)

    if is_rerun:
        logger.warning(
            "Rerun log-normalization. Use the raw counts in backup instead.")

    robust_flags = data.var["robust"].values
    # normalize_by_count is defined elsewhere; presumably it rewrites data.X in place
    # and the trailing True requests the log transform -- confirm against its source.
    data.obs["scale"] = normalize_by_count(data.X, robust_flags, norm_count,
                                           True)
    data.uns["norm_count"] = norm_count
def identify_robust_genes(data: MultimodalData,
                          percent_cells: float = 0.05) -> None:
    """Flag robust genes (HVG candidates) and drop genes detected in no cell.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Use current selected modality in data, which should contain one RNA expression matrix.
    percent_cells: ``float``, optional, default: ``0.05``
        Threshold expressed in percent: a gene is ``robust`` when it is detected in at
        least ``percent_cells`` % of cells.

    Returns
    -------
    ``None``

    Update ``data.var``:

        * ``n_cells``: Number of cells with a stored non-zero entry for each gene (sparse input only).
        * ``percent_cells``: ``n_cells`` as a percentage of all cells (sparse input only).
        * ``robust``: Boolean, ``percent_cells >= percent_cells`` threshold. For a dense ``data.X`` every gene is marked robust and no gene is removed.
        * ``highly_variable_features``: Boolean, initialised to the ``robust`` column.

    Examples
    --------
    >>> pg.identify_robust_genes(data, percent_cells = 0.05)
    """
    data = data.current_data() if isinstance(data, MultimodalData) else data

    prior_n = data.shape[1]

    if not issparse(data.X):
        # Dense matrices are not screened: every gene is accepted as robust.
        data.var["robust"] = True
    else:
        data.var["n_cells"] = data.X.getnnz(axis=0)
        # Remove genes with no stored non-zero entry before computing percentages.
        expressed_mask = data.var["n_cells"] > 0
        data._inplace_subset_var(expressed_mask)
        n_barcodes = data.shape[0]
        data.var["percent_cells"] = (data.var["n_cells"] / n_barcodes) * 100
        data.var["robust"] = data.var["percent_cells"] >= percent_cells

    # By default every robust gene starts out as a highly variable feature.
    data.var["highly_variable_features"] = data.var["robust"]
    logger.info(
        f"After filtration, {data.shape[1]}/{prior_n} genes are kept. Among {data.shape[1]} genes, {data.var['robust'].sum()} genes are robust."
    )
Ejemplo n.º 3
0
def log_norm(data: MultimodalData,
             norm_count: float = 1e5,
             backup_matrix: str = "raw.X") -> None:
    """Normalization, and then apply natural logarithm to the data.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Use current selected modality in data, which should contain one RNA expression matrix.

    norm_count: ``float``, optional, default: ``1e5``.
        Total counts of one cell after normalization.

    backup_matrix: ``str``, optional, default: ``raw.X``.
        The key name of the backup count matrix, usually the raw counts.

    Returns
    -------
    ``None``

    Update ``data.X`` with count matrix after log-normalization, store the value returned
    by ``normalize_by_count`` in ``data.obs["scale"]`` and record ``norm_count`` in
    ``data.uns["norm_count"]``. In addition, back up the original count matrix as
    ``backup_matrix``.

    In case of rerunning normalization while ``backup_matrix`` already exists, use
    ``backup_matrix`` instead of ``data.X`` as the source, so the backup is never
    overwritten with already-normalized values and the data is not normalized twice.

    Examples
    --------
    >>> pg.log_norm(data)
    """
    if isinstance(data, MultimodalData):
        data = data.current_data()

    assert data.get_modality() == "rna"

    if backup_matrix not in data.list_keys():
        # First run: keep the current matrix as backup, then work on a float32 copy.
        data.add_matrix(backup_matrix, data.X)
        data.X = data.X.astype(np.float32)  # force copy
    else:
        # Rerun: data.X already holds normalized values, so restart from the backup.
        import logging

        data.X = data.get_matrix(backup_matrix).astype(
            np.float32)  # force copy
        logging.getLogger(__name__).warning(
            "Rerun log-normalization. Use the raw counts in backup instead.")

    from pegasus.cylib.fast_utils import normalize_by_count
    data.obs["scale"] = normalize_by_count(data.X, data.var["robust"].values,
                                           norm_count, True)
    data.uns["norm_count"] = norm_count
def get_filter_stats(data: MultimodalData,
                     min_genes_before_filt: int = 100) -> pd.DataFrame:
    """Calculate filtration stats on cell barcodes.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Use current selected modality in data, which should contain one RNA expression matrix.
        ``data.obs`` must contain the columns ``n_genes`` and ``passed_qc``.
    min_genes_before_filt: ``int``, optional, default ``100``
        If raw data matrix is input, empty barcodes will dominate pre-filtration statistics. To avoid this, when ``data.obs["n_genes"]`` contains a zero, only barcodes with at least ``min_genes_before_filt`` genes are used for the pre-filtration statistics.

    Returns
    -------
    df_cells: ``pandas.DataFrame``
        Data frame of stats on cell filtration, one row per ``Channel``, sorted by the
        number of kept cells. Only those of the columns ``kept``, ``median_n_genes``,
        ``median_n_umis``, ``median_percent_mito``, ``filt``, ``total``,
        ``median_n_genes_before``, ``median_n_umis_before`` and
        ``median_percent_mito_before`` that could be computed are included.

    Notes
    -----
    If ``data.obs`` has no ``Channel`` column, an empty-string categorical ``Channel``
    column is added to ``data.obs`` as a side effect.

    Examples
    --------
    >>> df = pg.get_filter_stats(data)
    """

    # cell stats
    if isinstance(data, MultimodalData):
        data = data.current_data()

    if "Channel" not in data.obs:
        data.obs["Channel"] = pd.Categorical([""] * data.shape[0])

    df = data.obs[data.obs["n_genes"] >= min_genes_before_filt] if data.obs[
        "n_genes"].min() == 0 else data.obs
    gb1 = df.groupby("Channel")
    # numeric_only=True: data.obs may hold string/categorical columns, and since
    # pandas 2.0 the groupby median no longer drops them silently but raises.
    df_before = gb1.median(numeric_only=True)
    df_before = df_before.assign(total=gb1.size())
    df_before = df_before.rename(
        columns={
            "n_genes": "median_n_genes_before",
            "n_counts": "median_n_umis_before",
            "percent_mito": "median_percent_mito_before",
        })

    # focusing only on filtered cells
    gb2 = data.obs[data.obs["passed_qc"]].groupby("Channel")
    df_after = gb2.median(numeric_only=True)
    df_after = df_after.assign(kept=gb2.size())
    df_after = df_after.rename(
        columns={
            "n_genes": "median_n_genes",
            "n_counts": "median_n_umis",
            "percent_mito": "median_percent_mito",
        })

    df_cells = pd.concat((df_before, df_after), axis=1, sort=False)
    # Channels missing on one side of the concat get NaN; report them as 0.
    df_cells = df_cells.fillna(0)
    df_cells["kept"] = df_cells["kept"].astype(int)
    df_cells["filt"] = df_cells["total"] - df_cells["kept"]

    target_cols = np.array([
        "kept", "median_n_genes", "median_n_umis", "median_percent_mito",
        "filt", "total", "median_n_genes_before", "median_n_umis_before",
        "median_percent_mito_before"
    ])
    # Keep only the report columns that exist (e.g. percent_mito may be absent).
    target_cols = target_cols[np.isin(target_cols, df_cells.columns)]
    # Sort out of place: an in-place sort on a column-sliced frame can trigger
    # pandas' SettingWithCopyWarning.
    df_cells = df_cells[target_cols].sort_values("kept")

    return df_cells
def qc_metrics(
    data: MultimodalData,
    select_singlets: bool = False,
    remap_string: str = None,
    subset_string: str = None,
    min_genes: int = None,
    max_genes: int = None,
    min_umis: int = None,
    max_umis: int = None,
    mito_prefix: str = None,
    percent_mito: float = None,
) -> None:
    """Generate Quality Control (QC) metrics on the cell barcodes of the dataset.

    This is a thin wrapper: it resolves the current modality of ``data`` and forwards
    every option unchanged to ``calc_qc_filters``.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
       Use current selected modality in data, which should contain one RNA expression matrix.
    select_singlets: ``bool``, optional, default ``False``
        Whether to select only singlets.
    remap_string: ``str``, optional, default ``None``
        Remap singlet names using <remap_string>, which takes the format "new_name_i:old_name_1,old_name_2;new_name_ii:old_name_3;...". For example, if we hashed 5 libraries from 3 samples sample1_lib1, sample1_lib2, sample2_lib1, sample2_lib2 and sample3, we can remap them to 3 samples using this string: "sample1:sample1_lib1,sample1_lib2;sample2:sample2_lib1,sample2_lib2". In this way, the new singlet names will be in metadata field with key 'assignment', while the old names will be kept in metadata field with key 'assignment.orig'.
    subset_string: ``str``, optional, default ``None``
        When selecting singlets, only select the singlets listed in <subset_string>, which takes the format "name1,name2,...". Note that if --remap-singlets is specified, subsetting happens after remapping. For example, we can select only the singlets from sample 1 and 3 using "sample1,sample3".
    min_genes: ``int``, optional, default: ``None``
       Only keep cells with at least ``min_genes`` genes.
    max_genes: ``int``, optional, default: ``None``
       Only keep cells with less than ``max_genes`` genes.
    min_umis: ``int``, optional, default: ``None``
       Only keep cells with at least ``min_umis`` UMIs.
    max_umis: ``int``, optional, default: ``None``
       Only keep cells with less than ``max_umis`` UMIs.
    mito_prefix: ``str``, optional, default: ``None``
       Prefix for mitochondrial genes.
    percent_mito: ``float``, optional, default: ``None``
       Only keep cells with percent mitochondrial genes less than ``percent_mito`` % of total counts.

    Returns
    -------
    ``None``

    Update ``data.obs`` (performed by ``calc_qc_filters``):

        * ``n_genes``: Total number of genes for each cell.
        * ``n_counts``: Total number of counts for each cell.
        * ``percent_mito``: Percent of mitochondrial genes for each cell.
        * ``passed_qc``: Boolean type indicating if a cell passes the QC process based on the QC metrics.
        * ``demux_type``: this column might be deleted if select_singlets is on.

    Examples
    --------
    >>> pg.qc_metrics(data, min_genes=500, max_genes=6000, mito_prefix="MT-", percent_mito=10)
    """
    # Work on the currently selected modality when given the multimodal container.
    unimodal = data.current_data() if isinstance(data, MultimodalData) else data

    filter_options = {
        "select_singlets": select_singlets,
        "remap_string": remap_string,
        "subset_string": subset_string,
        "min_genes": min_genes,
        "max_genes": max_genes,
        "min_umis": min_umis,
        "max_umis": max_umis,
        "mito_prefix": mito_prefix,
        "percent_mito": percent_mito,
    }
    calc_qc_filters(unimodal, **filter_options)