def normalize_per_cell(data, counts_per_cell_after=None, counts_per_cell=None, key_n_counts=None,
                       max_proportion_per_cell=None, use_initial_size=True,
                       layers=['spliced', 'unspliced'], enforce=False, copy=False):
    """Normalize each cell by total counts over all genes.

    Parameters
    ----------
    data : :class:`~anndata.AnnData`, `np.ndarray`, `sp.sparse`
        The (annotated) data matrix of shape `n_obs` × `n_vars`.
        Rows correspond to cells and columns to genes.
    counts_per_cell_after : `float` or `None`, optional (default: `None`)
        If `None`, after normalization, each cell has a total count equal
        to the median of the *counts_per_cell* before normalization.
    counts_per_cell : `np.array`, optional (default: `None`)
        Precomputed counts per cell.
    key_n_counts : `str`, optional (default: `'n_counts'`)
        Name of the field in `adata.obs` where the total counts per cell are stored.
    max_proportion_per_cell : `int` (default: `None`)
        Exclude gene counts that account for more than a specific proportion
        of cell size, e.g. 0.05.
    use_initial_size : `bool` (default: `True`)
        Whether to use initial cell sizes or actual cell sizes.
    layers : `str` or `list` (default: `['spliced', 'unspliced']`)
        Keys for layers to be also considered for normalization.
    enforce : `bool` (default: `False`)
        Normalize even if the data already looks normalized.
    copy : `bool`, optional (default: `False`)
        If an :class:`~anndata.AnnData` is passed, determines whether a copy is returned.

    Returns
    -------
    Returns or updates `adata` with normalized version of the original `adata.X`,
    depending on `copy`.
    """
    adata = data.copy() if copy else data
    layers = adata.layers.keys() if layers == 'all' else [layers] if isinstance(layers, str) \
        else [layer for layer in layers if layer in adata.layers.keys()]
    layers = ['X'] + list(layers)
    modified_layers = []
    for layer in layers:
        X = adata.X if layer == 'X' else adata.layers[layer]
        if not_yet_normalized(X) or enforce:
            counts = counts_per_cell if counts_per_cell is not None \
                else get_initial_size(adata, layer) if use_initial_size else get_size(adata, layer)
            if max_proportion_per_cell is not None and (0 < max_proportion_per_cell < 1):
                counts = counts_per_cell_quantile(X, max_proportion_per_cell, counts)
            # equivalent to scanpy.pp.normalize_per_cell(X, counts_per_cell_after, counts)
            counts_after = np.median(counts) if counts_per_cell_after is None else counts_per_cell_after
            counts /= counts_after + (counts_after == 0)
            counts += counts == 0  # to avoid division by zero
            if issparse(X):
                sparsefuncs.inplace_row_scale(X, 1 / counts)
            else:
                X /= np.array(counts[:, None])
            modified_layers.append(layer)
    adata.obs['n_counts' if key_n_counts is None else key_n_counts] = get_size(adata)
    if len(modified_layers) > 0:
        logg.info('Normalized count data:', ', '.join(modified_layers) + '.')
    return adata if copy else None
def normalize_knn_graph(knn):
    """Normalize the knn graph so that each row sums to 1."""
    knn.setdiag(1)
    knn = knn.astype("float32")
    sparsefuncs.inplace_row_scale(knn, 1 / knn.sum(axis=1).A1)
    return knn
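# For reference, a minimal self-contained sketch of the same row-normalization step on a
# toy kNN adjacency. The imports and the 3x3 matrix below are illustrative assumptions,
# not part of the snippet above.
import numpy as np
import scipy.sparse as sp
from sklearn.utils import sparsefuncs

toy_knn = sp.csr_matrix(np.array([[0, 1, 1],
                                  [1, 0, 0],
                                  [0, 1, 0]], dtype="float32"))
toy_knn.setdiag(1)  # keep each cell as its own neighbor, as in normalize_knn_graph
sparsefuncs.inplace_row_scale(toy_knn, 1 / toy_knn.sum(axis=1).A1)
print(toy_knn.sum(axis=1).A1)  # -> [1. 1. 1.]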
def normalize_by_idf(matrix):
    numbcs_per_feature = matrix.get_numbcs_per_feature()
    scaling_factors_row = np.log(matrix.bcs_dim + 1) - np.log(1 + numbcs_per_feature)
    m = matrix.m.copy().astype(np.float64)
    sparsefuncs.inplace_row_scale(m, scaling_factors_row)
    return m
def _normalize_data(X, counts, after=None, copy=False):
    X = X.copy() if copy else X
    after = np.median(counts[counts > 0]) if after is None else after
    counts += counts == 0
    counts /= after
    if issparse(X):
        sparsefuncs.inplace_row_scale(X, 1 / counts)
    else:
        X /= counts[:, None]
    return X if copy else None
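# A small worked illustration of the median-based scaling that _normalize_data performs.
# The toy matrix and imports are assumptions for illustration only: after scaling, every
# row of X sums to the median of the original per-cell counts (here 3).
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.utils import sparsefuncs

X = csr_matrix(np.array([[1., 0.], [3., 0.], [5., 6.]]))
counts = np.asarray(X.sum(axis=1)).ravel()  # [1., 3., 11.]
after = np.median(counts[counts > 0])       # 3.0
counts = counts / after
sparsefuncs.inplace_row_scale(X, 1 / counts)
print(X.toarray().sum(axis=1))              # -> [3. 3. 3.]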
def normalize_per_cell(data, counts_per_cell_after=None, copy=False,
                       counts_per_cell=None, field_name_counts=None):
    """Normalize each cell.

    Normalize each cell by UMI count, so that every cell has the same total count.
    Similar functions are used, for example, by Cell Ranger [Zheng17],
    Seurat [Satija15], or SPRING [Weinreb17].

    Parameters
    ----------
    data : array_like, sparse or AnnData
        Data matrix. Rows correspond to cells and columns to genes.
    counts_per_cell_after : float or None (default: None)
        If None, after normalization, each cell has a total count equal to
        the median of the counts_per_cell before normalization.
    counts_per_cell : array (default: None)
        Precomputed counts per cell.
    copy : bool (default: False)
        Determines whether function operates inplace (default) or a copy is returned.

    Returns
    -------
    Returns or updates ``adata`` with normalized version of the original
    ``adata.X``, depending on `copy`.
    """
    if field_name_counts is None:
        field_name_counts = 'n_counts'
    if isinstance(data, AnnData):
        logg.m('normalizing by total count per cell', r=True)
        adata = data.copy() if copy else data
        cell_subset, counts_per_cell = filter_cells(adata.X, min_counts=1)
        adata.smp[field_name_counts] = counts_per_cell
        adata.inplace_subset_smp(cell_subset)
        normalize_per_cell(adata.X, counts_per_cell_after, copy,
                           counts_per_cell=counts_per_cell[cell_subset])
        logg.m('    finished', t=True, end=' ')
        logg.m('normalized adata.X and added', no_indent=True)
        logg.m('    "{}", counts per cell before normalization (adata.smp)'
               .format(field_name_counts), no_indent=True)
        return adata if copy else None
    # proceed with data matrix
    X = data.copy() if copy else data
    if counts_per_cell is None:
        cell_subset, counts_per_cell = filter_cells(X, min_counts=1)
        X = X[cell_subset]
        counts_per_cell = counts_per_cell[cell_subset]
    if counts_per_cell_after is None:
        counts_per_cell_after = np.median(counts_per_cell)
    counts_per_cell /= counts_per_cell_after
    if not issparse(X):
        X /= counts_per_cell[:, np.newaxis]
    else:
        sparsefuncs.inplace_row_scale(X, 1 / counts_per_cell)
    return X if copy else None
def _normalize_data(X, counts, after=None, copy=False):
    X = X.copy() if copy else X
    if issubclass(X.dtype.type, (int, np.integer)):
        X = X.astype(np.float32)  # TODO: Check if float64 should be used
    counts = np.asarray(counts)  # dask doesn't do medians
    after = np.median(counts[counts > 0], axis=0) if after is None else after
    counts += counts == 0
    counts = counts / after
    if issparse(X):
        sparsefuncs.inplace_row_scale(X, 1 / counts)
    else:
        np.divide(X, counts[:, None], out=X)
    return X
def test_inplace_row_scale():
    rng = np.random.RandomState(0)
    X = sp.rand(100, 200, 0.05)
    Xr = X.tocsr()
    Xc = X.tocsc()
    XA = X.toarray()
    scale = rng.rand(100)
    XA *= scale.reshape(-1, 1)

    inplace_row_scale(Xc, scale)
    inplace_row_scale(Xr, scale)
    assert_array_almost_equal(Xr.toarray(), Xc.toarray())
    assert_array_almost_equal(XA, Xc.toarray())
    assert_array_almost_equal(XA, Xr.toarray())
    assert_raises(TypeError, inplace_column_scale, X.tolil(), scale)

    X = X.astype(np.float32)
    scale = scale.astype(np.float32)
    Xr = X.tocsr()
    Xc = X.tocsc()
    XA = X.toarray()
    XA *= scale.reshape(-1, 1)
    inplace_row_scale(Xc, scale)
    inplace_row_scale(Xr, scale)
    assert_array_almost_equal(Xr.toarray(), Xc.toarray())
    assert_array_almost_equal(XA, Xc.toarray())
    assert_array_almost_equal(XA, Xr.toarray())
    assert_raises(TypeError, inplace_column_scale, X.tolil(), scale)
def ClusterSpecificGenes(adata, genes, obs):
    # Use obs = 'Type_num' for atlas and obs = 'Type_iGB' for all other time points
    all_var_genes = genes
    percentages = np.zeros((len(all_var_genes), len(adata.obs[obs].values.categories)))
    all_var_genes_index = []
    for i in all_var_genes:
        all_var_genes_index.append(np.where(adata.var.index.values == i)[0][0])
    clusters = list(adata.obs[obs].values.categories)
    for index, value in enumerate(clusters):
        cells_in_clust = adata.obs.index[adata.obs[obs].values == value]
        cells_in_clust_index = []
        for i in cells_in_clust:
            cells_in_clust_index.append(np.where(adata.obs.index.values == i)[0][0])
        percentages[:, index] = adata.layers['raw'][cells_in_clust_index, :][
            :, all_var_genes_index].getnnz(axis=0) / len(cells_in_clust_index)
    var_genes = []
    var_genes_index = []
    for i in range(len(all_var_genes_index)):
        if any(p > 0.3 for p in percentages[i, :]):
            var_genes.append(all_var_genes[i])
            var_genes_index.append(all_var_genes_index[i])
    X = adata.layers['raw'].copy()
    counts_per_cell = X.sum(1)
    counts_per_cell = np.ravel(counts_per_cell)
    counts = np.asarray(counts_per_cell)
    after = np.median(counts[counts > 0], axis=0)
    counts += counts == 0
    counts = counts / after
    sparsefuncs.inplace_row_scale(X, 1 / counts)
    E = np.zeros((len(var_genes), len(adata.obs[obs].values.categories)))
    for index, value in enumerate(clusters):
        cells_in_clust = adata.obs.index[adata.obs[obs].values == value]
        cells_in_clust_index = []
        for i in cells_in_clust:
            cells_in_clust_index.append(np.where(adata.obs.index.values == i)[0][0])
        E[:, index] = np.log(X[cells_in_clust_index, :][:, var_genes_index].mean(axis=0) + 1)
    a = np.zeros(len(var_genes))
    for i in range(len(a)):
        ranking_E = np.sort(E[i, :])
        a[i] = np.mean(ranking_E[-7:]) / np.mean(ranking_E[:7])
    to_return = list(np.array(var_genes)[a > 8])
    return to_return
def normalize_and_log_the_raw_matrix(adata, counts_per_cell_after=1e4):
    ''' '''
    if check_if_raw_matrix_is_logged(adata):
        print('normalize_and_log_the_raw_matrix:: matrix is already logged')
        return adata

    print('Normalize and log the raw matrix...')

    ft_varname = pmhc_scoring.get_feature_types_varname(adata)
    if ft_varname:
        ngenes = sum(adata.raw.var[ft_varname] != 'Antibody Capture')
    else:
        ngenes = adata.raw.shape[1]

    X_gex = adata.raw.X[:, :ngenes]
    X_ab = adata.raw.X[:, ngenes:]

    counts_per_cell = np.sum(X_gex, axis=1).A1  # A1 since X_gex is sparse
    assert np.min(counts_per_cell) > 0
    if np.median(counts_per_cell) < 100:
        print('WARNING normalize_and_log_the_raw_matrix: low median counts_per_cell.',
              np.median(counts_per_cell), '\n',
              'has the matrix already been log1p-ed???')
        exit()

    counts_per_cell /= counts_per_cell_after

    sparsefuncs.inplace_row_scale(X_gex, 1 / counts_per_cell)

    new_counts_per_cell = np.sum(X_gex, axis=1).A1  # A1 since X_gex is sparse
    assert min(new_counts_per_cell) > counts_per_cell_after - 1 and \
        max(new_counts_per_cell) < counts_per_cell_after + 1

    new_X = scipy.sparse.hstack([X_gex, X_ab], format="csr")
    np.log1p(new_X.data, out=new_X.data)

    adata_new = AnnData(X=new_X, obs=adata.obs, var=adata.raw.var)

    adata.raw = adata_new

    set_raw_matrix_is_logged_to_true(adata)

    # print(adata)
    return adata
def _normalize_data(X, counts, after=None, copy=False):
    X = X.copy() if copy else X
    if issubclass(X.dtype.type, (int, np.integer)):
        X = X.astype(np.float32)  # TODO: Check if float64 should be used
    if isinstance(counts, DaskArray):
        counts_greater_than_zero = counts[counts > 0].compute_chunk_sizes()
    else:
        counts_greater_than_zero = counts[counts > 0]

    after = np.median(counts_greater_than_zero, axis=0) if after is None else after
    counts += counts == 0
    counts = counts / after
    if issparse(X):
        sparsefuncs.inplace_row_scale(X, 1 / counts)
    elif isinstance(counts, np.ndarray):
        np.divide(X, counts[:, None], out=X)
    else:
        X = np.divide(X, counts[:, None])  # dask does not support kwarg "out"
    return X
def normalize_by_idf(matrix):
    numbcs_per_feature = matrix.get_numbcs_per_feature()
    scaling_factors_row = np.log(matrix.bcs_dim + 1) - np.log(1 + numbcs_per_feature)

    m = matrix.m.copy().astype(np.float64)
    sparsefuncs.inplace_row_scale(m, scaling_factors_row)

    # Extremely rare case (1 out of 1000s of samples tested):
    # either the scaling or the count may be zero for all features for some barcode.
    # This would zero out the entire barcode upon normalization, which leads to a null
    # projection as well. That is harmful to analysis code that depends on at least a
    # non-zero norm for each barcode (e.g. spherical clustering and normalized t-SNE).
    # We sprinkle in a small value that ensures an nnz for the all-zero barcode,
    # after finding such barcodes.

    # Find zeroed barcodes and assign an nnz to the first feature (these barcodes are
    # indistinguishable anyway). We run the very small risk of making such a barcode
    # similar to another barcode that is also nnz in the first feature only.
    zeroed = np.where(np.squeeze(np.asarray(m.sum(axis=0))) == 0)
    for bc_ix in zeroed:
        m[0, bc_ix] = 1e-15

    return m
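# For orientation, a minimal sketch of the IDF weighting above on a tiny features-by-barcodes
# matrix. The toy data and imports are illustrative assumptions: rarer features receive a
# larger log weight, applied row-wise (one weight per feature).
import numpy as np
import scipy.sparse as sp
from sklearn.utils import sparsefuncs

m = sp.csr_matrix(np.array([[1., 0., 1., 0.],    # feature present in 2 of 4 barcodes
                            [1., 1., 1., 1.]]))  # feature present in every barcode
n_barcodes = m.shape[1]
numbcs_per_feature = (m > 0).sum(axis=1).A1      # [2, 4]
idf = np.log(n_barcodes + 1) - np.log(1 + numbcs_per_feature)
sparsefuncs.inplace_row_scale(m, idf)            # rare feature weighted ~0.51, ubiquitous one 0.0
print(m.toarray())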
def fit(self, urm):
    """
    Train the recommender with a list of known interactions playlist - track
    :param urm: the user rating matrix
    """
    print('Training Top Pop Followers...')
    self.urm = urm

    # Remove duplicates
    self.urm.data = np.ones(len(self.urm.data))

    # Normalize
    self.followers = normalize(self.followers.reshape(self.followers.shape[0], -1),
                               norm='l2', axis=0).reshape(self.followers.shape)

    # Scale urm according to followers
    inplace_row_scale(urm, self.followers)

    self.track_weighted = urm.sum(axis=0)
    self.track_weighted = np.squeeze(np.asarray(self.track_weighted))
    self.popular_tracks = np.argsort(self.track_weighted)[::-1][:10000]
def __apply_tf__(icm):
    if tf_type == 'none' or tf_type == 'raw' or tf_type == '':
        pass
    elif tf_type == 'binary':
        icm.data = ones(len(icm.data))
    elif tf_type == 'tf_normal':
        skfun.inplace_row_scale(icm, 1 / nt_dupli)
    elif tf_type == 'tf_duplicates':
        skfun.inplace_row_scale(icm, 1 / nt)
    elif tf_type == 'tf_elduplicates':
        icm.data = ones(len(icm.data))
        skfun.inplace_row_scale(icm, 1 / nt)
    elif tf_type == 'log':
        icm.data += ones(len(icm.data))
        icm.data = log(icm.data)
    elif tf_type == 'double_k':
        max_per_playlist = np.maximum.reduceat(icm.data, icm.indptr[:-1])
        max_per_playlist[np.diff(icm.indptr) == 0] = 0
        skfun.inplace_row_scale(icm, k / max_per_playlist)
        icm.data = k + icm.data
    else:
        raise AttributeError("tf_type [" + tf_type + "] not found")
def kernels_from_velocyto_scvelo(
    X,
    X_embedding,
    V,
    indices,
    neg_cells_trick,
    xy_grid_nums,
    kernel="pearson",
    n_recurse_neighbors=2,
    max_neighs=None,
    transform="sqrt",
    use_neg_vals=True,
    correct_density=True,
):
    """Utility function for calculating the transition matrix and low-dimensional velocity
    embedding via the original Pearson correlation kernel (La Manno et al., 2018) or the
    cosine kernel from scVelo (Bergen et al., 2019)."""
    n = X.shape[0]
    if indices is not None:
        rows = []
        cols = []
        vals = []

    delta_X = np.zeros((n, X_embedding.shape[1]))
    for i in LoggerManager.progress_logger(
        range(n),
        progress_name=f"calculating transition matrix via {kernel} kernel with {transform} transform.",
    ):
        velocity = V[i, :]  # project V to pca space

        if velocity.sum() != 0:
            i_vals = get_iterative_indices(indices, i, n_recurse_neighbors, max_neighs)  # np.zeros((knn, 1))
            diff = X[i_vals, :] - X[i, :]

            if transform == "log":
                diff_velocity = np.sign(velocity) * np.log1p(np.abs(velocity))
                diff_rho = np.sign(diff) * np.log1p(np.abs(diff))
            elif transform == "logratio":
                hi_dim, hi_dim_t = X[i, :], X[i, :] + velocity
                log2hidim = np.log1p(np.abs(hi_dim))
                diff_velocity = np.log1p(np.abs(hi_dim_t)) - log2hidim
                diff_rho = np.log1p(np.abs(X[i_vals, :])) - np.log1p(np.abs(hi_dim))
            elif transform == "linear":
                diff_velocity = velocity
                diff_rho = diff
            elif transform == "sqrt":
                diff_velocity = np.sign(velocity) * np.sqrt(np.abs(velocity))
                diff_rho = np.sign(diff) * np.sqrt(np.abs(diff))

            if kernel == "pearson":
                vals_ = einsum_correlation(diff_rho, diff_velocity, type="pearson")
            elif kernel == "cosine":
                vals_ = einsum_correlation(diff_rho, diff_velocity, type="cosine")

            rows.extend([i] * len(i_vals))
            cols.extend(i_vals)
            vals.extend(vals_)

    vals = np.hstack(vals)
    vals[np.isnan(vals)] = 0
    G = sp.csr_matrix((vals, (rows, cols)), shape=(X_embedding.shape[0], X_embedding.shape[0]))
    G = split_velocity_graph(G, neg_cells_trick)

    if neg_cells_trick:
        G, G_ = G

    confidence, ub_confidence = G.max(1).A.flatten(), np.percentile(G.max(1).A.flatten(), 98)
    dig_p = np.clip(ub_confidence - confidence, 0, 1)
    G.setdiag(dig_p)

    T = np.expm1(G / 0.1)

    if neg_cells_trick:
        if use_neg_vals:
            T -= np.expm1(-G_ / 0.1)
        else:
            T += np.expm1(G_ / 0.1)
            T.data = T.data + 1

    # T = w * (~ direct_neighs).multiply(T) + (1 - w) * direct_neighs.multiply(T)

    # normalize so that each row sums up to 1
    sparsefuncs.inplace_row_scale(T, 1 / np.abs(T).sum(axis=1).A1)

    T.setdiag(0)
    T.eliminate_zeros()

    delta_X = projection_with_transition_matrix(n, T, X_embedding, correct_density)

    X_grid, V_grid, D = velocity_on_grid(
        X_embedding[:, :2],
        (X_embedding + delta_X)[:, :2],
        xy_grid_nums=xy_grid_nums,
    )

    return T, delta_X, X_grid, V_grid, D
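# Minimal sketch of the final row-normalization step in the kernel function above, on a toy
# 3x3 transition-weight matrix (imports and values are illustrative assumptions): each row is
# scaled to sum to 1, then the diagonal is zeroed and explicit zeros are dropped.
import numpy as np
import scipy.sparse as sp
from sklearn.utils import sparsefuncs

T = sp.csr_matrix(np.array([[0.0, 0.5, 0.3],
                            [0.2, 0.0, 1.0],
                            [0.4, 0.6, 0.0]]))
sparsefuncs.inplace_row_scale(T, 1 / abs(T).sum(axis=1).A1)
T.setdiag(0)
T.eliminate_zeros()
print(T.toarray())  # each row of off-diagonal transition probabilities sums to 1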
def normalize_per_cell(
    data: Union[AnnData, np.ndarray, spmatrix],
    counts_per_cell_after: Optional[float] = None,
    counts_per_cell: Optional[np.ndarray] = None,
    key_n_counts: str = 'n_counts',
    copy: bool = False,
    layers: Union[Literal['all'], Iterable[str]] = (),
    use_rep: Optional[Literal['after', 'X']] = None,
    min_counts: int = 1,
) -> Optional[AnnData]:
    """\
    Normalize total counts per cell.

    .. warning::
        .. deprecated:: 1.3.7
            Use :func:`~scanpy.pp.normalize_total` instead.
            The new function is equivalent to the present
            function, except that

            * the new function doesn't filter cells based on `min_counts`,
              use :func:`~scanpy.pp.filter_cells` if filtering is needed.
            * some arguments were renamed
            * `copy` is replaced by `inplace`

    Normalize each cell by total counts over all genes, so that every cell has
    the same total count after normalization. Similar functions are used, for
    example, by Seurat [Satija15]_, Cell Ranger [Zheng17]_ or SPRING [Weinreb17]_.

    Parameters
    ----------
    data
        The (annotated) data matrix of shape `n_obs` × `n_vars`.
        Rows correspond to cells and columns to genes.
    counts_per_cell_after
        If `None`, after normalization, each cell has a total count equal
        to the median of the *counts_per_cell* before normalization.
    counts_per_cell
        Precomputed counts per cell.
    key_n_counts
        Name of the field in `adata.obs` where the total counts per cell are stored.
    copy
        If an :class:`~anndata.AnnData` is passed, determines whether a copy is returned.
    min_counts
        Cells with counts less than `min_counts` are filtered out during normalization.

    Returns
    -------
    Returns or updates `adata` with normalized version of the original
    `adata.X`, depending on `copy`.

    Examples
    --------
    >>> import scanpy as sc
    >>> adata = AnnData(np.array([[1, 0], [3, 0], [5, 6]]))
    >>> print(adata.X.sum(axis=1))
    [ 1.  3. 11.]
    >>> sc.pp.normalize_per_cell(adata)
    >>> print(adata.obs)
    >>> print(adata.X.sum(axis=1))
       n_counts
    0       1.0
    1       3.0
    2      11.0
    [3. 3. 3.]
    >>> sc.pp.normalize_per_cell(
    >>>     adata, counts_per_cell_after=1,
    >>>     key_n_counts='n_counts2',
    >>> )
    >>> print(adata.obs)
    >>> print(adata.X.sum(axis=1))
       n_counts  n_counts2
    0       1.0        3.0
    1       3.0        3.0
    2      11.0        3.0
    [1. 1. 1.]
    """
    if isinstance(data, AnnData):
        start = logg.info('normalizing by total count per cell')
        adata = data.copy() if copy else data
        if counts_per_cell is None:
            cell_subset, counts_per_cell = materialize_as_ndarray(
                filter_cells(adata.X, min_counts=min_counts))
            adata.obs[key_n_counts] = counts_per_cell
            adata._inplace_subset_obs(cell_subset)
            counts_per_cell = counts_per_cell[cell_subset]
        normalize_per_cell(adata.X, counts_per_cell_after, counts_per_cell)
        layers = adata.layers.keys() if layers == 'all' else layers
        if use_rep == 'after':
            after = counts_per_cell_after
        elif use_rep == 'X':
            after = np.median(counts_per_cell[cell_subset])
        elif use_rep is None:
            after = None
        else:
            raise ValueError('use_rep should be "after", "X" or None')
        for layer in layers:
            subset, counts = filter_cells(adata.layers[layer], min_counts=min_counts)
            temp = normalize_per_cell(adata.layers[layer], after, counts, copy=True)
            adata.layers[layer] = temp
        logg.info(
            '    finished ({time_passed}): normalized adata.X and added'
            f'    {key_n_counts!r}, counts per cell before normalization (adata.obs)',
            time=start,
        )
        return adata if copy else None
    # proceed with data matrix
    X = data.copy() if copy else data
    if counts_per_cell is None:
        if copy == False:
            raise ValueError('Can only be run with copy=True')
        cell_subset, counts_per_cell = filter_cells(X, min_counts=min_counts)
        X = X[cell_subset]
        counts_per_cell = counts_per_cell[cell_subset]
    if counts_per_cell_after is None:
        counts_per_cell_after = np.median(counts_per_cell)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        counts_per_cell += counts_per_cell == 0
        counts_per_cell /= counts_per_cell_after
        if not issparse(X):
            X /= materialize_as_ndarray(counts_per_cell[:, np.newaxis])
        else:
            sparsefuncs.inplace_row_scale(X, 1 / counts_per_cell)
    return X if copy else None
def lambda_correction(
    adata: anndata.AnnData,
    lambda_key: str = "lambda",
    inplace: bool = True,
    copy: bool = False,
) -> Union[anndata.AnnData, None]:
    """Use lambda (cell-wise detection rate) to estimate the labelled RNA.

    Parameters
    ----------
    adata:
        adata object generated from dynast.
    lambda_key:
        The key to the cell-wise detection rate.
    inplace:
        Whether to update the layers inplace. If False, new layers that append
        '_corrected' to the existing names will be used to store the updated data.
    copy:
        Whether to copy the adata object or update the adata object inplace.

    Returns
    -------
    adata: :class:`~anndata.AnnData`
        A new or updated anndata object, based on the copy parameter, that is updated
        with Size_Factor, normalized expression values, X and reduced dimensions, etc.
    """
    logger = LoggerManager.gen_logger("dynamo-lambda_correction")
    logger.log_time()

    adata = copy_adata(adata) if copy else adata

    logger.info("apply detection rate correction to adata...", indent_level=1)

    if lambda_key not in adata.obs.keys():
        raise ValueError(
            f"the lambda_key {lambda_key} is not included in adata.obs! Please ensure you have calculated "
            "per-cell detection rate!"
        )

    logger.info("retrieving the cell-wise detection rate..", indent_level=1)
    detection_rate = adata.obs[lambda_key].values[:, None]

    logger.info("identify the data type..", indent_level=1)
    all_layers = adata.layers.keys()

    has_ul = np.any(["ul_" in i for i in all_layers])
    has_un = np.any(["un_" in i for i in all_layers])
    has_sl = np.any(["sl_" in i for i in all_layers])
    has_sn = np.any(["sn_" in i for i in all_layers])

    has_l = np.any(["_l_" in i for i in all_layers])
    has_n = np.any(["_n_" in i for i in all_layers])

    if has_ul and has_un and has_sl and has_sn:
        datatype = "splicing_labeling"
    elif has_l and has_n:
        datatype = "labeling"

    logger.info(f"the data type identified is {datatype}", indent_level=2)

    logger.info("retrieve relevant layers for detection rate correction", indent_level=1)
    if datatype == "splicing_labeling":
        layers, match_tot_layer = [], []
        for layer in all_layers:
            if "ul_" in layer:
                layers += [layer]
                match_tot_layer += ["unspliced"]
            elif "un_" in layer:
                layers += [layer]
                match_tot_layer += ["unspliced"]
            elif "sl_" in layer:
                layers += [layer]
                match_tot_layer += ["spliced"]
            elif "sn_" in layer:
                layers += [layer]
                match_tot_layer += ["spliced"]
            elif "spliced" in layer:
                layers += [layer]
            elif "unspliced" in layer:
                layers += [layer]

        if len(layers) != 6:
            raise ValueError(
                "the adata object has to include ul, un, sl, sn, unspliced, spliced, "
                "six relevant layers for splicing and labeling quantified datasets."
            )
    elif datatype == "labeling":
        layers, match_tot_layer = [], []
        for layer in all_layers:
            if "_l_" in layer:
                layers += [layer]
                match_tot_layer += ["total"]
            elif "_n_" in layer:
                layers += [layer]
                match_tot_layer += ["total"]
            elif "total" in layer:
                layers += [layer]

        if len(layers) != 3:
            raise ValueError(
                "the adata object has to include labeled, unlabeled, three relevant layers "
                "for labeling quantified datasets."
            )

    logger.info("detection rate correction starts", indent_level=1)
    for i, layer in enumerate(main_tqdm(layers, desc="iterating all relevant layers")):
        if i < len(match_tot_layer):
            cur_layer = adata.layers[layer] if inplace else adata.layers[layer].copy()
            cur_total = adata.layers[match_tot_layer[i]]

            # even layers are labeled RNA and odd layers unlabeled RNA
            if i % 2 == 0:
                # formula: min(L / lambda, (L + U)) from scNT-seq
                if issparse(cur_layer):
                    sparsefuncs.inplace_row_scale(cur_layer, 1 / detection_rate)
                else:
                    cur_layer /= detection_rate
                if inplace:
                    adata.layers[layer] = sparse_mimmax(cur_layer, cur_total)
                else:
                    adata.layers[layer + "_corrected"] = sparse_mimmax(cur_layer, cur_total)
            else:
                if inplace:
                    adata.layers[layer] = cur_total - adata.layers[layers[i - 1]]
                else:
                    adata.layers[layer + "_corrected"] = cur_total - adata.layers[layers[i - 1]]

    logger.finish_progress(progress_name="lambda_correction")

    if copy:
        return adata
    return None
def inplace_row_scale(self, scale):
    sparsefuncs.inplace_row_scale(self.value, scale)
    return self
def normalize_per_cell(data, counts_per_cell_after=None, counts_per_cell=None,
                       key_n_counts=None, copy=False):
    """Normalize total counts per cell.

    Normalize each cell by total counts over all genes, so that every cell has
    the same total count after normalization. Similar functions are used, for
    example, by Seurat [Satija15]_, Cell Ranger [Zheng17]_ or SPRING [Weinreb17]_.

    Parameters
    ----------
    data : :class:`~scanpy.api.AnnData`, `np.ndarray`, `sp.sparse`
        The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    counts_per_cell_after : `float` or `None`, optional (default: `None`)
        If `None`, after normalization, each cell has a total count equal
        to the median of the *counts_per_cell* before normalization.
    counts_per_cell : `np.array`, optional (default: `None`)
        Precomputed counts per cell.
    key_n_counts : `str`, optional (default: `'n_counts'`)
        Name of the field in `adata.obs` where the total counts per cell are stored.
    copy : `bool`, optional (default: `False`)
        If an :class:`~scanpy.api.AnnData` is passed, determines whether a copy
        is returned.

    Returns
    -------
    Returns or updates `adata` with normalized version of the original
    `adata.X`, depending on `copy`.

    Examples
    --------
    >>> adata = AnnData(
    >>>     data=np.array([[1, 0], [3, 0], [5, 6]]))
    >>> print(adata.X.sum(axis=1))
    [ 1.  3. 11.]
    >>> sc.pp.normalize_per_cell(adata)
    >>> print(adata.obs)
    >>> print(adata.X.sum(axis=1))
       n_counts
    0       1.0
    1       3.0
    2      11.0
    [3. 3. 3.]
    >>> sc.pp.normalize_per_cell(adata, counts_per_cell_after=1,
    >>>                          key_n_counts='n_counts2')
    >>> print(adata.obs)
    >>> print(adata.X.sum(axis=1))
       n_counts  n_counts2
    0       1.0        3.0
    1       3.0        3.0
    2      11.0        3.0
    [1. 1. 1.]
    """
    if key_n_counts is None:
        key_n_counts = 'n_counts'
    if isinstance(data, AnnData):
        logg.msg('normalizing by total count per cell', r=True)
        adata = data.copy() if copy else data
        cell_subset, counts_per_cell = filter_cells(adata.X, min_counts=1)
        adata.obs[key_n_counts] = counts_per_cell
        adata._inplace_subset_obs(cell_subset)
        normalize_per_cell(adata.X, counts_per_cell_after,
                           counts_per_cell=counts_per_cell[cell_subset])
        logg.msg('    finished', t=True, end=': ')
        logg.msg('normalized adata.X and added', no_indent=True)
        logg.msg('    \'{}\', counts per cell before normalization (adata.obs)'
                 .format(key_n_counts))
        return adata if copy else None
    # proceed with data matrix
    X = data.copy() if copy else data
    if counts_per_cell is None:
        cell_subset, counts_per_cell = filter_cells(X, min_counts=1)
        X = X[cell_subset]
        counts_per_cell = counts_per_cell[cell_subset]
    if counts_per_cell_after is None:
        counts_per_cell_after = np.median(counts_per_cell)
    counts_per_cell /= counts_per_cell_after
    if not issparse(X):
        X /= counts_per_cell[:, np.newaxis]
    else:
        sparsefuncs.inplace_row_scale(X, 1 / counts_per_cell)
    return X if copy else None
def SVRs(adata, filter_bool=None, layers='X', total_szfactor=None, min_expr_cells=2,
         min_expr_avg=0, max_expr_avg=20, svr_gamma=None, winsorize=False,
         winsor_perc=(1, 99.5), sort_inverse=False):
    """This function is modified from
    https://github.com/velocyto-team/velocyto.py/blob/master/velocyto/analysis.py

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        AnnData object.
    filter_bool: :class:`~numpy.ndarray` (default: None)
        A boolean array from the user to select cells for downstream analysis.
    layers: `str` (default: 'X')
        The layer(s) to be used for calculating the dispersion score via support vector
        regression (SVR). Default is X if there are no spliced layers.
    total_szfactor: `str` (default: `None`)
        The column name in the .obs attribute that corresponds to the size factor
        for the total mRNA.
    min_expr_cells: `int` (default: `2`)
        Minimum number of cells that express a gene for it to be considered in the fit.
    min_expr_avg: `int` (default: `0`)
        The minimum average of genes across cells accepted.
    max_expr_avg: `float` (default: `20`)
        The maximum average of genes across cells accepted before treating
        house-keeping genes/outliers for removal.
    svr_gamma: `float` or None (default: `None`)
        The gamma hyper-parameter of the SVR.
    winsorize: `bool` (default: `False`)
        Whether to winsorize the data for the cv vs mean model.
    winsor_perc: `tuple` (default: `(1, 99.5)`)
        The upper and lower bound of the winsorization.
    sort_inverse: `bool` (default: `False`)
        If True, sort genes from less noisy to more noisy (to use for size estimation,
        not for feature selection).

    Returns
    -------
    adata: :class:`~anndata.AnnData`
        An updated AnnData object with `log_m`, `log_cv`, `score` added to .var columns
        and `SVR` added to the uns attribute as a new key.
    """
    from sklearn.svm import SVR

    layers = get_layer_keys(adata, layers)

    for layer in layers:
        if layer == 'raw':
            CM = adata.X.copy() if adata.raw is None else adata.raw
            szfactors = adata.obs[layer + '_Size_Factor'][:, None] if adata.raw is not None \
                else adata.obs['Size_Factor'][:, None]
        elif layer == 'X':
            CM = adata.X.copy()
            szfactors = adata.obs['Size_Factor'][:, None]
        elif layer == 'protein':
            if 'protein' in adata.obsm_keys():
                CM = adata.obsm['protein'].copy()
                szfactors = adata.obs[layer + '_Size_Factor'][:, None]
            else:
                continue
        else:
            CM = adata.layers[layer].copy()
            szfactors = adata.obs[layer + '_Size_Factor'][:, None]

        if total_szfactor is not None and total_szfactor in adata.obs.keys():
            szfactors = adata.obs[total_szfactor][:, None]

        if issparse(CM):
            sparsefuncs.inplace_row_scale(CM, 1 / szfactors)
        else:
            CM /= szfactors

        if winsorize:
            if min_expr_cells <= ((100 - winsor_perc[1]) * CM.shape[0] * 0.01):
                min_expr_cells = int(np.ceil((100 - winsor_perc[1]) * CM.shape[1] * 0.01)) + 2

        detected_bool = np.array(((CM > 0).sum(0) > min_expr_cells) &
                                 (CM.mean(0) < max_expr_avg) &
                                 (CM.mean(0) > min_expr_avg)).flatten()

        if filter_bool is not None:
            detected_bool = filter_bool & detected_bool

        valid_CM = CM[:, detected_bool]
        if winsorize:
            down, up = np.percentile(valid_CM.A, winsor_perc, 0) if issparse(valid_CM) \
                else np.percentile(valid_CM, winsor_perc, 0)
            Sfw = np.clip(valid_CM.A, down[None, :], up[None, :]) if issparse(valid_CM) \
                else np.clip(valid_CM, down[None, :], up[None, :])
            mu = Sfw.mean(0)
            sigma = Sfw.std(0, ddof=1)
        else:
            mu = np.array(valid_CM.mean(0)).flatten()
            sigma = np.array(np.sqrt(valid_CM.multiply(valid_CM).mean(0).A1 - mu ** 2)).flatten() \
                if issparse(valid_CM) else valid_CM.std(0, ddof=1)

        cv = sigma / mu
        log_m = np.array(np.log2(mu)).flatten()
        log_cv = np.array(np.log2(cv)).flatten()

        if svr_gamma is None:
            svr_gamma = 150. / len(mu)
        # Fit the Support Vector Regression
        clf = SVR(gamma=svr_gamma)
        clf.fit(log_m[:, None], log_cv)
        fitted_fun = clf.predict
        ff = fitted_fun(log_m[:, None])
        score = log_cv - ff
        if sort_inverse:
            score = -score

        adata.var['log_m'], adata.var['log_cv'], adata.var['score'] = np.nan, np.nan, -np.inf
        adata.var.loc[detected_bool, 'log_m'], adata.var.loc[detected_bool, 'log_cv'], \
            adata.var.loc[detected_bool, 'score'] = (np.array(log_m).flatten(),
                                                     np.array(log_cv).flatten(),
                                                     np.array(score).flatten())

        key = "velocyto_SVR" if layer == 'raw' or layer == 'X' else layer + "_velocyto_SVR"
        adata.uns[key] = {"SVR": fitted_fun, "detected_bool": detected_bool}

    return adata
def normalize_per_cell(
    data,
    counts_per_cell_after=None,
    counts_per_cell=None,
    key_n_counts=None,
    max_proportion_per_cell=None,
    use_initial_size=True,
    layers=None,
    enforce=None,
    copy=False,
):
    """Normalize each cell by total counts over all genes.

    Parameters
    ----------
    data : :class:`~anndata.AnnData`, `np.ndarray`, `sp.sparse`
        The (annotated) data matrix of shape `n_obs` × `n_vars`.
        Rows correspond to cells and columns to genes.
    counts_per_cell_after : `float` or `None`, optional (default: `None`)
        If `None`, after normalization, each cell has a total count equal
        to the median of the *counts_per_cell* before normalization.
    counts_per_cell : `np.array`, optional (default: `None`)
        Precomputed counts per cell.
    key_n_counts : `str`, optional (default: `'n_counts'`)
        Name of the field in `adata.obs` where the total counts per cell are stored.
    max_proportion_per_cell : `int` (default: `None`)
        Exclude gene counts that account for more than a specific proportion
        of cell size, e.g. 0.05.
    use_initial_size : `bool` (default: `True`)
        Whether to use initial cell sizes or actual cell sizes.
    layers : `str` or `list` (default: `['spliced', 'unspliced']`)
        Keys for layers to be also considered for normalization.
    copy : `bool`, optional (default: `False`)
        If an :class:`~anndata.AnnData` is passed, determines whether a copy is returned.

    Returns
    -------
    Returns or updates `adata` with normalized counts.
    """
    adata = data.copy() if copy else data
    if layers is None:
        layers = ["spliced", "unspliced"]
    elif layers == "all":
        layers = adata.layers.keys()
    elif isinstance(layers, str):
        layers = [layers]
    layers = ["X"] + [layer for layer in layers if layer in adata.layers.keys()]
    modified_layers = []

    if isinstance(counts_per_cell, str):
        if counts_per_cell not in adata.obs.keys():
            _set_initial_size(adata, layers)
        counts_per_cell = (
            adata.obs[counts_per_cell].values if counts_per_cell in adata.obs.keys() else None
        )

    for layer in layers:
        check_if_valid_dtype(adata, layer)
        X = adata.X if layer == "X" else adata.layers[layer]

        if not_yet_normalized(X) or enforce:
            counts = (
                counts_per_cell
                if counts_per_cell is not None
                else _get_initial_size(adata, layer)
                if use_initial_size
                else _get_size(adata, layer)
            )
            if max_proportion_per_cell is not None and (0 < max_proportion_per_cell < 1):
                counts = counts_per_cell_quantile(X, max_proportion_per_cell, counts)
            # equivalent to sc.pp.normalize_per_cell(X, counts_per_cell_after, counts)
            counts_after = (
                np.median(counts) if counts_per_cell_after is None else counts_per_cell_after
            )

            counts_after += counts_after == 0
            counts = counts / counts_after
            counts += counts == 0  # to avoid division by zero

            if issparse(X):
                sparsefuncs.inplace_row_scale(X, 1 / counts)
            else:
                X /= np.array(counts[:, None])
            modified_layers.append(layer)
            if layer == "X" and "gene_count_corr" not in adata.var.keys() and X.shape[-1] > 3e3:
                try:
                    adata.var["gene_count_corr"] = np.round(
                        csr_vcorrcoef(X.T, np.ravel((X > 0).sum(1))), 4
                    )
                except Exception:
                    pass
        else:
            logg.warn(
                f"Did not normalize {layer} as it looks processed already. "
                "To enforce normalization, set `enforce=True`."
            )

    adata.obs["n_counts" if key_n_counts is None else key_n_counts] = _get_size(adata)
    if len(modified_layers) > 0:
        logg.info("Normalized count data:", f"{', '.join(modified_layers)}.")

    return adata if copy else None
def normalize_per_cell(data, counts_per_cell_after=None, counts_per_cell=None,
                       key_n_counts=None, copy=False, layers=[], use_rep=None,
                       min_counts=1):
    """Normalize total counts per cell.

    Normalize each cell by total counts over all genes, so that every cell has
    the same total count after normalization. Similar functions are used, for
    example, by Seurat [Satija15]_, Cell Ranger [Zheng17]_ or SPRING [Weinreb17]_.

    Parameters
    ----------
    data : :class:`~anndata.AnnData`, `np.ndarray`, `sp.sparse`
        The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    counts_per_cell_after : `float` or `None`, optional (default: `None`)
        If `None`, after normalization, each cell has a total count equal
        to the median of the *counts_per_cell* before normalization.
    counts_per_cell : `np.array`, optional (default: `None`)
        Precomputed counts per cell.
    key_n_counts : `str`, optional (default: `'n_counts'`)
        Name of the field in `adata.obs` where the total counts per cell are stored.
    copy : `bool`, optional (default: `False`)
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned.
    min_counts : `int`, optional (default: 1)
        Cells with counts less than `min_counts` are filtered out during normalization.

    Returns
    -------
    AnnData, `None`
        Returns or updates `adata` with normalized version of the original
        `adata.X`, depending on `copy`.

    Examples
    --------
    >>> adata = AnnData(
    >>>     data=np.array([[1, 0], [3, 0], [5, 6]]))
    >>> print(adata.X.sum(axis=1))
    [ 1.  3. 11.]
    >>> sc.pp.normalize_per_cell(adata)
    >>> print(adata.obs)
    >>> print(adata.X.sum(axis=1))
       n_counts
    0       1.0
    1       3.0
    2      11.0
    [3. 3. 3.]
    >>> sc.pp.normalize_per_cell(adata, counts_per_cell_after=1,
    >>>                          key_n_counts='n_counts2')
    >>> print(adata.obs)
    >>> print(adata.X.sum(axis=1))
       n_counts  n_counts2
    0       1.0        3.0
    1       3.0        3.0
    2      11.0        3.0
    [1. 1. 1.]
    """
    if key_n_counts is None:
        key_n_counts = 'n_counts'
    if isinstance(data, AnnData):
        logg.msg('normalizing by total count per cell', r=True)
        adata = data.copy() if copy else data
        cell_subset, counts_per_cell = materialize_as_ndarray(
            filter_cells(adata.X, min_counts=min_counts))
        adata.obs[key_n_counts] = counts_per_cell
        adata._inplace_subset_obs(cell_subset)
        normalize_per_cell(adata.X, counts_per_cell_after,
                           counts_per_cell=counts_per_cell[cell_subset])
        layers = adata.layers.keys() if layers == 'all' else layers
        if use_rep == 'after':
            after = counts_per_cell_after
        elif use_rep == 'X':
            after = np.median(counts_per_cell[cell_subset])
        elif use_rep is None:
            after = None
        else:
            raise ValueError('use_rep should be "after", "X" or None')
        for layer in layers:
            subset, counts = filter_cells(adata.layers[layer], min_counts=min_counts)
            temp = normalize_per_cell(adata.layers[layer], after, counts, copy=True)
            adata.layers[layer] = temp
        logg.msg('    finished', t=True, end=': ')
        logg.msg('normalized adata.X and added', no_indent=True)
        logg.msg('    \'{}\', counts per cell before normalization (adata.obs)'
                 .format(key_n_counts))
        return adata if copy else None
    # proceed with data matrix
    X = data.copy() if copy else data
    if counts_per_cell is None:
        if copy == False:
            raise ValueError('Can only be run with copy=True')
        cell_subset, counts_per_cell = filter_cells(X, min_counts=min_counts)
        X = X[cell_subset]
        counts_per_cell = counts_per_cell[cell_subset]
    if counts_per_cell_after is None:
        counts_per_cell_after = np.median(counts_per_cell)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        counts_per_cell += counts_per_cell == 0
        counts_per_cell /= counts_per_cell_after
        if not issparse(X):
            X /= materialize_as_ndarray(counts_per_cell[:, np.newaxis])
        else:
            sparsefuncs.inplace_row_scale(X, 1 / counts_per_cell)
    return X if copy else None