def log_norm(
    data: MultimodalData,
    norm_count: float = 1e5,
    backup_matrix: str = "raw.X",
) -> None:
    """Normalize each cell to a fixed total count, then take the natural logarithm.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Use current selected modality in data, which should contain one RNA expression matrix.

    norm_count: ``float``, optional, default: ``1e5``.
        Total counts of one cell after normalization.

    backup_matrix: ``str``, optional, default: ``raw.X``.
        The key name of the backup count matrix, usually the raw counts.

    Returns
    -------
    ``None``

    Update ``data.X`` with count matrix after log-normalization. The original count matrix is
    kept under the key ``backup_matrix``. If that key already exists (i.e. normalization is being
    rerun), the matrix stored there is used as the source instead of ``data.X``.

    In addition, ``data.obs["scale"]`` receives the value returned by ``normalize_by_count`` and
    ``data.uns["norm_count"]`` records ``norm_count``.

    Examples
    --------
    >>> pg.log_norm(data)
    """
    if isinstance(data, MultimodalData):
        data = data.current_data()

    assert data.get_modality() == "rna"

    has_backup = backup_matrix in data.list_keys()
    if has_backup:
        # Rerun: data.X may already be normalized, so start again from the backup.
        # astype() returns a new matrix, so the backup itself is left untouched.
        data.X = data.get_matrix(backup_matrix).astype(np.float32)
        logger.warning("Rerun log-normalization. Use the raw counts in backup instead.")
    else:
        # First run: register the current matrix as the backup, then work on a float32 copy.
        data.add_matrix(backup_matrix, data.X)
        data.X = data.X.astype(np.float32)

    robust_mask = data.var["robust"].values
    data.obs["scale"] = normalize_by_count(data.X, robust_mask, norm_count, True)
    data.uns["norm_count"] = norm_count
def identify_robust_genes(data: MultimodalData, percent_cells: float = 0.05) -> None:
    """Flag robust genes (candidates for HVG selection) and drop genes detected in no cell.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Use current selected modality in data, which should contain one RNA expression matrix.

    percent_cells: ``float``, optional, default: ``0.05``
        Only assign genes to be ``robust`` that are expressed in at least ``percent_cells`` % of cells.

    Returns
    -------
    ``None``

    Update ``data.var``:

        * ``n_cells``: Total number of cells in which each gene is measured (sparse ``data.X`` only).
        * ``percent_cells``: Percent of cells in which each gene is measured (sparse ``data.X`` only).
        * ``robust``: Boolean type indicating if a gene is robust based on the QC metrics.
          When ``data.X`` is not sparse, every gene is marked robust and no gene is removed.
        * ``highly_variable_features``: Boolean type indicating if a gene is a highly variable feature.
          By default, set all robust genes as highly variable features.

    Examples
    --------
    >>> pg.identify_robust_genes(data, percent_cells = 0.05)
    """
    data = data.current_data() if isinstance(data, MultimodalData) else data

    prior_n = data.shape[1]

    if not issparse(data.X):
        data.var["robust"] = True
    else:
        # Number of stored entries per column = number of cells in which each gene is measured.
        data.var["n_cells"] = data.X.getnnz(axis=0)
        expressed = data.var["n_cells"] > 0
        data._inplace_subset_var(expressed)
        # Re-read data.var after subsetting; only the surviving genes remain.
        data.var["percent_cells"] = (data.var["n_cells"] / data.shape[0]) * 100
        passes_threshold = data.var["percent_cells"] >= percent_cells
        data.var["robust"] = passes_threshold

    # By default every robust gene is also treated as a highly variable feature.
    data.var["highly_variable_features"] = data.var["robust"]

    logger.info(
        f"After filtration, {data.shape[1]}/{prior_n} genes are kept. Among {data.shape[1]} genes, {data.var['robust'].sum()} genes are robust."
    )
def log_norm(data: MultimodalData, norm_count: float = 1e5, backup_matrix: str = "raw.X") -> None:
    """Normalization, and then apply natural logarithm to the data.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Use current selected modality in data, which should contain one RNA expression matrix.

    norm_count: ``float``, optional, default: ``1e5``.
        Total counts of one cell after normalization.

    backup_matrix: ``str``, optional, default: ``raw.X``.
        The key name of the backup count matrix, usually the raw counts.

    Returns
    -------
    ``None``

    Update ``data.X`` with count matrix after log-normalization. In addition, back up the original
    count matrix as ``backup_matrix``.

    In case of rerunning normalization while ``backup_matrix`` already exists, use ``backup_matrix``
    instead of ``data.X`` for normalization, so that already-normalized values are neither
    normalized a second time nor stored as the backup.

    ``data.obs["scale"]`` receives the value returned by ``normalize_by_count`` and
    ``data.uns["norm_count"]`` records ``norm_count``.

    Examples
    --------
    >>> pg.log_norm(data)
    """
    if isinstance(data, MultimodalData):
        data = data.current_data()

    assert data.get_modality() == "rna"

    if backup_matrix not in data.list_keys():
        # First run: keep the current matrix as the backup and work on a float32 copy.
        data.add_matrix(backup_matrix, data.X)
        data.X = data.X.astype(np.float32)  # force copy
    else:
        # The case of rerunning log_norm. Use backup matrix as source.
        data.X = data.get_matrix(backup_matrix).astype(np.float32)  # force copy
        logger.warning("Rerun log-normalization. Use the raw counts in backup instead.")

    # Imported at call time, as in the original implementation.
    from pegasus.cylib.fast_utils import normalize_by_count

    data.obs["scale"] = normalize_by_count(data.X, data.var["robust"].values, norm_count, True)
    data.uns["norm_count"] = norm_count
def get_filter_stats(data: MultimodalData, min_genes_before_filt: int = 100) -> pd.DataFrame:
    """Calculate filtration stats on cell barcodes.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Use current selected modality in data, which should contain one RNA expression matrix.
        ``data.obs`` must already contain ``n_genes`` and ``passed_qc``.

    min_genes_before_filt: ``int``, optional, default ``100``
        If raw data matrix is input, empty barcodes will dominate pre-filtration statistics.
        To avoid this, when the minimum of ``n_genes`` is 0, only barcodes with at least
        ``min_genes_before_filt`` genes are considered for the pre-filtration statistics.

    Returns
    -------
    df_cells: ``pandas.DataFrame``
        Data frame of stats on cell filtration, one row per ``Channel``, sorted by ``kept``.
        Columns (those available): ``kept``, ``median_n_genes``, ``median_n_umis``,
        ``median_percent_mito``, ``filt``, ``total``, ``median_n_genes_before``,
        ``median_n_umis_before``, ``median_percent_mito_before``.

    Notes
    -----
    If ``data.obs`` has no ``Channel`` column, an empty-string categorical ``Channel`` column
    is added to it.

    Examples
    --------
    >>> df = pg.get_filter_stats(data)
    """
    if isinstance(data, MultimodalData):
        data = data.current_data()

    if "Channel" not in data.obs:
        data.obs["Channel"] = pd.Categorical([""] * data.shape[0])

    # Pre-filtration view: drop near-empty barcodes only when the matrix looks raw
    # (i.e. some barcode has zero detected genes).
    if data.obs["n_genes"].min() == 0:
        df = data.obs[data.obs["n_genes"] >= min_genes_before_filt]
    else:
        df = data.obs

    gb1 = df.groupby("Channel")
    # numeric_only=True: data.obs may hold non-numeric columns, for which median()
    # raises on pandas >= 2.0 where numeric_only defaults to False.
    df_before = gb1.median(numeric_only=True)
    df_before = df_before.assign(total=gb1.size())
    df_before.rename(
        columns={
            "n_genes": "median_n_genes_before",
            "n_counts": "median_n_umis_before",
            "percent_mito": "median_percent_mito_before",
        },
        inplace=True,
    )

    # focusing only on filtered cells
    gb2 = data.obs[data.obs["passed_qc"]].groupby("Channel")
    df_after = gb2.median(numeric_only=True)
    df_after = df_after.assign(kept=gb2.size())
    df_after.rename(
        columns={
            "n_genes": "median_n_genes",
            "n_counts": "median_n_umis",
            "percent_mito": "median_percent_mito",
        },
        inplace=True,
    )

    df_cells = pd.concat((df_before, df_after), axis=1, sort=False)
    # Channels missing from one side of the concat get NaN; report them as 0.
    df_cells.fillna(0, inplace=True)
    df_cells["kept"] = df_cells["kept"].astype(int)
    df_cells["filt"] = df_cells["total"] - df_cells["kept"]

    target_cols = np.array([
        "kept", "median_n_genes", "median_n_umis", "median_percent_mito",
        "filt", "total", "median_n_genes_before", "median_n_umis_before",
        "median_percent_mito_before",
    ])
    target_cols = target_cols[np.isin(target_cols, df_cells.columns)]

    # Non-inplace sort: sorting a column-sliced frame in place can trigger
    # pandas' SettingWithCopyWarning.
    df_cells = df_cells[target_cols].sort_values("kept")

    return df_cells
def qc_metrics(
    data: MultimodalData,
    select_singlets: bool = False,
    remap_string: str = None,
    subset_string: str = None,
    min_genes: int = None,
    max_genes: int = None,
    min_umis: int = None,
    max_umis: int = None,
    mito_prefix: str = None,
    percent_mito: float = None,
) -> None:
    """Compute Quality Control (QC) metrics for the cell barcodes of the dataset.

    This is a thin wrapper: it resolves the current modality of ``data`` and forwards every
    option unchanged to ``calc_qc_filters``.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Use current selected modality in data, which should contain one RNA expression matrix.
    select_singlets: ``bool``, optional, default ``False``
        Whether to select only singlets.
    remap_string: ``str``, optional, default ``None``
        Remap singlet names using <remap_string>, where <remap_string> takes the format
        "new_name_i:old_name_1,old_name_2;new_name_ii:old_name_3;...". For example, if we hashed
        5 libraries from 3 samples sample1_lib1, sample1_lib2, sample2_lib1, sample2_lib2 and
        sample3, we can remap them to 3 samples using this string:
        "sample1:sample1_lib1,sample1_lib2;sample2:sample2_lib1,sample2_lib2". In this way, the
        new singlet names will be in metadata field with key 'assignment', while the old names
        will be kept in metadata field with key 'assignment.orig'.
    subset_string: ``str``, optional, default ``None``
        If select singlets, only select singlets in the <subset_string>, which takes the format
        "name1,name2,...". Note that if --remap-singlets is specified, subsetting happens after
        remapping. For example, we can only select singlets from sample 1 and 3 using
        "sample1,sample3".
    min_genes: ``int``, optional, default: ``None``
        Only keep cells with at least ``min_genes`` genes.
    max_genes: ``int``, optional, default: ``None``
        Only keep cells with less than ``max_genes`` genes.
    min_umis: ``int``, optional, default: ``None``
        Only keep cells with at least ``min_umis`` UMIs.
    max_umis: ``int``, optional, default: ``None``
        Only keep cells with less than ``max_umis`` UMIs.
    mito_prefix: ``str``, optional, default: ``None``
        Prefix for mitochondrial genes.
    percent_mito: ``float``, optional, default: ``None``
        Only keep cells with percent mitochondrial genes less than ``percent_mito`` % of total counts.

    Returns
    -------
    ``None``

    Update ``data.obs``:

        * ``n_genes``: Total number of genes for each cell.
        * ``n_counts``: Total number of counts for each cell.
        * ``percent_mito``: Percent of mitochondrial genes for each cell.
        * ``passed_qc``: Boolean type indicating if a cell passes the QC process based on the QC metrics.
        * ``demux_type``: this column might be deleted if select_singlets is on.

    Examples
    --------
    >>> pg.qc_metrics(data, min_genes=500, max_genes=6000, mito_prefix="MT-", percent_mito=10)
    """
    unidata = data.current_data() if isinstance(data, MultimodalData) else data

    filter_options = {
        "select_singlets": select_singlets,
        "remap_string": remap_string,
        "subset_string": subset_string,
        "min_genes": min_genes,
        "max_genes": max_genes,
        "min_umis": min_umis,
        "max_umis": max_umis,
        "mito_prefix": mito_prefix,
        "percent_mito": percent_mito,
    }
    calc_qc_filters(unidata, **filter_options)