def _highly_variable_genes_seurat_v3(
    adata: AnnData,
    layer: Optional[str] = None,
    n_top_genes: int = 2000,
    batch_key: Optional[str] = None,
    span: Optional[float] = 0.3,
    subset: bool = False,
    inplace: bool = True,
) -> Optional[pd.DataFrame]:
    """\
    See `highly_variable_genes`.

    For further implementation details see
    https://www.overleaf.com/read/ckptrbgzzzpg

    Returns
    -------
    If `inplace` or `subset` is true, updates `.var` and returns `None`;
    otherwise returns the calculated metrics (:class:`~pd.DataFrame`).
    The following fields are produced

    highly_variable : bool
        boolean indicator of highly-variable genes
    **means**
        means per gene, computed over all cells
    **variances**
        variance per gene, computed over all cells
    **variances_norm**
        normalized variance per gene, averaged in the case of multiple
        batches
    highly_variable_rank : float
        Rank of the gene according to normalized variance, median rank in
        the case of multiple batches
    highly_variable_nbatches : int
        If batch_key is given, this denotes in how many batches genes are
        detected as HVG

    Raises
    ------
    ImportError
        If `skmisc` cannot be imported.
    ValueError
        If `n_top_genes` is `None`, or if `check_nonnegative_integers`
        rejects the expression matrix.
    """
    try:
        from skmisc.loess import loess
    except ImportError as err:
        raise ImportError(
            'Please install skmisc package via `pip install --user scikit-misc`'
        ) from err

    # The public wrapper defaults `n_top_genes` to None; fail with a clear
    # message instead of an opaque comparison error further down.
    if n_top_genes is None:
        raise ValueError(
            "`pp.highly_variable_genes` with `flavor='seurat_v3'` requires "
            "`n_top_genes` to be set.")

    X = adata.layers[layer] if layer is not None else adata.X
    if check_nonnegative_integers(X) is False:
        raise ValueError(
            "`pp.highly_variable_genes` with `flavor='seurat_v3'` expects "
            "raw count data.")

    # The reported `means` / `variances` describe the full data set. They are
    # computed here, once, so the per-batch statistics below cannot leak into
    # the output (previously the last batch's values were reported).
    overall_mean, overall_var = _get_mean_var(X)

    if batch_key is None:
        batch_info = pd.Categorical(np.zeros(adata.shape[0], dtype=int))
    else:
        batch_info = adata.obs[batch_key].values

    norm_gene_vars = []
    for b in np.unique(batch_info):
        ad = adata[batch_info == b]
        X_batch = ad.layers[layer] if layer is not None else ad.X

        mean, var = _get_mean_var(X_batch)
        not_const = var > 0
        estimat_var = np.zeros(adata.shape[1], dtype=np.float64)

        # loess fit of log10(variance) on log10(mean), non-constant genes only
        y = np.log10(var[not_const])
        x = np.log10(mean[not_const])
        model = loess(x, y, span=span, degree=2)
        model.fit()
        estimat_var[not_const] = model.outputs.fitted_values
        reg_std = np.sqrt(10**estimat_var)

        batch_counts = X_batch.astype(np.float64).copy()
        # clip large values as in Seurat
        N = np.sum(batch_info == b)
        vmax = np.sqrt(N)
        clip_val = reg_std * vmax + mean
        if sp_sparse.issparse(batch_counts):
            batch_counts = sp_sparse.csr_matrix(batch_counts)
            mask = batch_counts.data > clip_val[batch_counts.indices]
            batch_counts.data[mask] = clip_val[batch_counts.indices[mask]]

            squared_batch_counts_sum = np.array(
                batch_counts.power(2).sum(axis=0))
            batch_counts_sum = np.array(batch_counts.sum(axis=0))
        else:
            clip_val_broad = np.broadcast_to(clip_val, batch_counts.shape)
            np.putmask(
                batch_counts,
                batch_counts > clip_val_broad,
                clip_val_broad,
            )

            squared_batch_counts_sum = np.square(batch_counts).sum(axis=0)
            batch_counts_sum = batch_counts.sum(axis=0)

        norm_gene_var = (1 / ((N - 1) * np.square(reg_std))) * (
            (N * np.square(mean))
            + squared_batch_counts_sum
            - 2 * batch_counts_sum * mean)
        norm_gene_vars.append(norm_gene_var.reshape(1, -1))

    norm_gene_vars = np.concatenate(norm_gene_vars, axis=0)
    # argsort twice gives ranks, small rank means most variable
    ranked_norm_gene_vars = np.argsort(np.argsort(-norm_gene_vars, axis=1),
                                       axis=1)

    # this is done in SelectIntegrationFeatures() in Seurat v3
    ranked_norm_gene_vars = ranked_norm_gene_vars.astype(np.float32)
    num_batches_high_var = np.sum(
        (ranked_norm_gene_vars < n_top_genes).astype(int), axis=0)
    ranked_norm_gene_vars[ranked_norm_gene_vars >= n_top_genes] = np.nan
    ma_ranked = np.ma.masked_invalid(ranked_norm_gene_vars)
    median_ranked = np.ma.median(ma_ranked, axis=0).filled(np.nan)

    df = pd.DataFrame(index=np.array(adata.var_names))
    df['highly_variable_nbatches'] = num_batches_high_var
    df['highly_variable_rank'] = median_ranked
    df['variances_norm'] = np.mean(norm_gene_vars, axis=0)
    df['means'] = overall_mean
    df['variances'] = overall_var

    df.sort_values(
        ['highly_variable_rank', 'highly_variable_nbatches'],
        ascending=[True, False],
        na_position='last',
        inplace=True,
    )
    df['highly_variable'] = False
    # The index holds gene names, so the first `n_top_genes` rows must be
    # selected by position; an integer slice through `.loc` raises on
    # pandas >= 2.0.
    df.iloc[:int(n_top_genes), df.columns.get_loc('highly_variable')] = True
    # restore the original gene order
    df = df.loc[adata.var_names]

    if inplace or subset:
        adata.uns['hvg'] = {'flavor': 'seurat_v3'}
        logg.hint('added\n'
                  ' \'highly_variable\', boolean vector (adata.var)\n'
                  ' \'highly_variable_rank\', float vector (adata.var)\n'
                  ' \'means\', float vector (adata.var)\n'
                  ' \'variances\', float vector (adata.var)\n'
                  ' \'variances_norm\', float vector (adata.var)')
        adata.var['highly_variable'] = df['highly_variable'].values
        adata.var['highly_variable_rank'] = df['highly_variable_rank'].values
        adata.var['means'] = df['means'].values
        adata.var['variances'] = df['variances'].values
        adata.var['variances_norm'] = df['variances_norm'].values.astype(
            'float64', copy=False)
        if batch_key is not None:
            adata.var['highly_variable_nbatches'] = df[
                'highly_variable_nbatches'].values
        if subset:
            adata._inplace_subset_var(df['highly_variable'].values)
    else:
        if batch_key is None:
            df = df.drop(['highly_variable_nbatches'], axis=1)
        return df
def highly_variable_genes(
    adata: AnnData,
    layer: Optional[str] = None,
    n_top_genes: Optional[int] = None,
    min_disp: Optional[float] = 0.5,
    max_disp: Optional[float] = np.inf,
    min_mean: Optional[float] = 0.0125,
    max_mean: Optional[float] = 3,
    span: Optional[float] = 0.3,
    n_bins: int = 20,
    flavor: Literal['seurat', 'cell_ranger', 'seurat_v3'] = 'seurat',
    subset: bool = False,
    inplace: bool = True,
    batch_key: Optional[str] = None,
) -> Optional[pd.DataFrame]:
    """\
    Annotate highly variable genes [Satija15]_ [Zheng17]_ [Stuart19]_.

    Expects logarithmized data, except when `flavor='seurat_v3'` in which
    count data is expected.

    Depending on `flavor`, this reproduces the R-implementations of Seurat
    [Satija15]_, Cell Ranger [Zheng17]_, and Seurat v3 [Stuart19]_.

    For the dispersion-based methods ([Satija15]_ and [Zheng17]_), the
    normalized dispersion is obtained by scaling with the mean and standard
    deviation of the dispersions for genes falling into a given bin for mean
    expression of genes. This means that for each bin of mean expression,
    highly variable genes are selected.

    For [Stuart19]_, a normalized variance for each gene is computed. First,
    the data are standardized (i.e., z-score normalization per feature) with
    a regularized standard deviation. Next, the normalized variance is
    computed as the variance of each gene after the transformation. Genes are
    ranked by the normalized variance.

    Parameters
    ----------
    adata
        The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    layer
        If provided, use `adata.layers[layer]` for expression values instead
        of `adata.X`.
    n_top_genes
        Number of highly-variable genes to keep. Mandatory if
        `flavor='seurat_v3'`.
    min_mean
        If `n_top_genes` unequals `None`, this and all other cutoffs for the
        means and the normalized dispersions are ignored. Ignored if
        `flavor='seurat_v3'`.
    max_mean
        If `n_top_genes` unequals `None`, this and all other cutoffs for the
        means and the normalized dispersions are ignored. Ignored if
        `flavor='seurat_v3'`.
    min_disp
        If `n_top_genes` unequals `None`, this and all other cutoffs for the
        means and the normalized dispersions are ignored. Ignored if
        `flavor='seurat_v3'`.
    max_disp
        If `n_top_genes` unequals `None`, this and all other cutoffs for the
        means and the normalized dispersions are ignored. Ignored if
        `flavor='seurat_v3'`.
    span
        The fraction of the data (cells) used when estimating the variance in
        the loess model fit if `flavor='seurat_v3'`.
    n_bins
        Number of bins for binning the mean gene expression. Normalization is
        done with respect to each bin. If just a single gene falls into a bin,
        the normalized dispersion is artificially set to 1. You'll be informed
        about this if you set `settings.verbosity = 4`.
    flavor
        Choose the flavor for identifying highly variable genes. For the
        dispersion based methods in their default workflows, Seurat passes the
        cutoffs whereas Cell Ranger passes `n_top_genes`.
    subset
        Inplace subset to highly-variable genes if `True` otherwise merely
        indicate highly variable genes.
    inplace
        Whether to place calculated metrics in `.var` or return them.
    batch_key
        If specified, highly-variable genes are selected within each batch
        separately and merged. This simple process avoids the selection of
        batch-specific genes and acts as a lightweight batch correction
        method. For all flavors, genes are first sorted by how many batches
        they are a HVG. For dispersion-based flavors ties are broken by
        normalized dispersion. If `flavor = 'seurat_v3'`, ties are broken by
        the median (across batches) rank based on within-batch normalized
        variance.

    Returns
    -------
    Depending on `inplace` returns calculated metrics
    (:class:`~pandas.DataFrame`) or updates `.var` with the following fields.
    Nothing is returned when `inplace` or `subset` is true.

    highly_variable : bool
        boolean indicator of highly-variable genes
    **means**
        means per gene
    **dispersions**
        For dispersion-based flavors, dispersions per gene
    **dispersions_norm**
        For dispersion-based flavors, normalized dispersions per gene
    **variances**
        For `flavor='seurat_v3'`, variance per gene
    **variances_norm**
        For `flavor='seurat_v3'`, normalized variance per gene, averaged in
        the case of multiple batches
    highly_variable_rank : float
        For `flavor='seurat_v3'`, rank of the gene according to normalized
        variance, median rank in the case of multiple batches
    highly_variable_nbatches : int
        If batch_key is given, this denotes in how many batches genes are
        detected as HVG
    highly_variable_intersection : bool
        If batch_key is given, this denotes the genes that are highly variable
        in all batches

    Raises
    ------
    ValueError
        If `adata` is not an `AnnData` instance.

    Notes
    -----
    This function replaces :func:`~scanpy.pp.filter_genes_dispersion`.
    """
    if n_top_genes is not None and not all(
            m is None for m in [min_disp, max_disp, min_mean, max_mean]):
        logg.info('If you pass `n_top_genes`, all cutoffs are ignored.')

    start = logg.info('extracting highly variable genes')

    if not isinstance(adata, AnnData):
        raise ValueError(
            '`pp.highly_variable_genes` expects an `AnnData` argument, '
            'pass `inplace=False` if you want to return a `pd.DataFrame`.')

    # The Seurat v3 flavor has its own implementation and output columns.
    if flavor == 'seurat_v3':
        return _highly_variable_genes_seurat_v3(
            adata,
            layer=layer,
            n_top_genes=n_top_genes,
            batch_key=batch_key,
            span=span,
            subset=subset,
            inplace=inplace,
        )

    if batch_key is None:
        df = _highly_variable_genes_single_batch(
            adata,
            layer=layer,
            min_disp=min_disp,
            max_disp=max_disp,
            min_mean=min_mean,
            max_mean=max_mean,
            n_top_genes=n_top_genes,
            n_bins=n_bins,
            flavor=flavor,
        )
    else:
        sanitize_anndata(adata)
        batches = adata.obs[batch_key].cat.categories
        df = []
        gene_list = adata.var_names
        for batch in batches:
            adata_subset = adata[adata.obs[batch_key] == batch]

            # Filter to genes that are in the dataset
            with settings.verbosity.override(Verbosity.error):
                filt = filter_genes(adata_subset, min_cells=1,
                                    inplace=False)[0]

            adata_subset = adata_subset[:, filt]

            # `layer` is forwarded so the per-batch path uses the same
            # expression values as the single-batch path above.
            hvg = _highly_variable_genes_single_batch(
                adata_subset,
                layer=layer,
                min_disp=min_disp,
                max_disp=max_disp,
                min_mean=min_mean,
                max_mean=max_mean,
                n_top_genes=n_top_genes,
                n_bins=n_bins,
                flavor=flavor,
            )

            # Add 0 values for genes that were filtered out
            missing_hvg = pd.DataFrame(
                np.zeros((np.sum(~filt), len(hvg.columns))),
                columns=hvg.columns,
            )
            missing_hvg['highly_variable'] = missing_hvg[
                'highly_variable'].astype(bool)
            missing_hvg['gene'] = gene_list[~filt]
            hvg['gene'] = adata_subset.var_names.values
            # `DataFrame.append` no longer exists in pandas >= 2.0
            hvg = pd.concat([hvg, missing_hvg], ignore_index=True)

            # Order as before filtering
            idxs = np.concatenate((np.where(filt)[0], np.where(~filt)[0]))
            hvg = hvg.loc[np.argsort(idxs)]

            df.append(hvg)

        df = pd.concat(df, axis=0)
        df['highly_variable'] = df['highly_variable'].astype(int)
        df = df.groupby('gene').agg(
            dict(
                means=np.nanmean,
                dispersions=np.nanmean,
                dispersions_norm=np.nanmean,
                highly_variable=np.nansum,
            ))
        df.rename(columns=dict(highly_variable='highly_variable_nbatches'),
                  inplace=True)
        df['highly_variable_intersection'] = df[
            'highly_variable_nbatches'] == len(batches)

        if n_top_genes is not None:
            # sort genes by how often they selected as hvg within each batch
            # and break ties with normalized dispersion across batches
            df.sort_values(
                ['highly_variable_nbatches', 'dispersions_norm'],
                ascending=False,
                na_position='last',
                inplace=True,
            )
            df['highly_variable'] = False
            # Single positional assignment on the frame itself; the previous
            # chained `df.highly_variable.iloc[...] = True` may write to a
            # temporary copy.
            df.iloc[:n_top_genes,
                    df.columns.get_loc('highly_variable')] = True
            df = df.loc[adata.var_names]
        else:
            df = df.loc[adata.var_names]
            # NaN normalized dispersions count as 0 before thresholding
            df['dispersions_norm'] = df['dispersions_norm'].fillna(0)
            # similar to Seurat
            gene_subset = np.logical_and.reduce((
                df.means > min_mean,
                df.means < max_mean,
                df.dispersions_norm > min_disp,
                df.dispersions_norm < max_disp,
            ))
            df['highly_variable'] = gene_subset

    logg.info(' finished', time=start)

    if inplace or subset:
        adata.uns['hvg'] = {'flavor': flavor}
        logg.hint('added\n'
                  ' \'highly_variable\', boolean vector (adata.var)\n'
                  ' \'means\', float vector (adata.var)\n'
                  ' \'dispersions\', float vector (adata.var)\n'
                  ' \'dispersions_norm\', float vector (adata.var)')
        adata.var['highly_variable'] = df['highly_variable'].values
        adata.var['means'] = df['means'].values
        adata.var['dispersions'] = df['dispersions'].values
        adata.var['dispersions_norm'] = df['dispersions_norm'].values.astype(
            'float32', copy=False)
        if batch_key is not None:
            adata.var['highly_variable_nbatches'] = df[
                'highly_variable_nbatches'].values
            adata.var['highly_variable_intersection'] = df[
                'highly_variable_intersection'].values
        if subset:
            adata._inplace_subset_var(df['highly_variable'].values)
    else:
        return df
def load_query_data(
    cls,
    adata: AnnData,
    reference_model: Union[str, BaseModelClass],
    inplace_subset_query_vars: bool = False,
    use_gpu: Optional[Union[str, int, bool]] = None,
    unfrozen: bool = False,
    freeze_dropout: bool = False,
    freeze_expression: bool = True,
    freeze_decoder_first_layer: bool = True,
    freeze_batchnorm_encoder: bool = True,
    freeze_batchnorm_decoder: bool = False,
    freeze_classifier: bool = True,
):
    """
    Online update of a reference model with scArches algorithm [Lotfollahi21]_.

    Parameters
    ----------
    adata
        AnnData organized in the same way as data used to train model.
        It is not necessary to run setup_anndata,
        as AnnData is validated against the saved `scvi` setup dictionary.
    reference_model
        Either an already instantiated model of the same class, or a path to
        saved outputs for reference model.
    inplace_subset_query_vars
        Whether to subset and rearrange query vars inplace based on vars used
        to train reference model.
    use_gpu
        Load model on default GPU if available (if None or True),
        or index of GPU to use (if int), or name of GPU (if str), or use CPU
        (if False).
    unfrozen
        Override all other freeze options for a fully unfrozen model
    freeze_dropout
        Whether to freeze dropout during training
    freeze_expression
        Freeze neurons corresponding to expression in first layer
    freeze_decoder_first_layer
        Freeze neurons corresponding to first layer in decoder
    freeze_batchnorm_encoder
        Whether to freeze batchnorm weight and bias during training for
        encoder
    freeze_batchnorm_decoder
        Whether to freeze batchnorm weight and bias during training for
        decoder
    freeze_classifier
        Whether to freeze classifier completely. Only applies to `SCANVI`.

    Returns
    -------
    A new model of class `cls` set up on `adata`, carrying the reference
    weights, with `is_trained_` set to `False`.

    Raises
    ------
    ValueError
        If the saved registry names a different model class, or does not
        contain the original setup arguments.
    """
    use_gpu, device = parse_use_gpu_arg(use_gpu)

    # Collect the reference model's attributes, var names and weights, either
    # from disk or from the live object.
    if isinstance(reference_model, str):
        attr_dict, var_names, load_state_dict, _ = _load_saved_files(
            reference_model, load_adata=False, map_location=device)
    else:
        attr_dict = reference_model._get_user_attributes()
        attr_dict = {a[0]: a[1] for a in attr_dict if a[0][-1] == "_"}
        var_names = reference_model.adata.var_names
        load_state_dict = deepcopy(reference_model.module.state_dict())

    if inplace_subset_query_vars:
        logger.debug("Subsetting query vars to reference vars.")
        adata._inplace_subset_var(var_names)
    _validate_var_names(adata, var_names)

    # Two save formats are handled: a legacy setup dict or a registry.
    if "scvi_setup_dict_" in attr_dict:
        scvi_setup_dict = attr_dict.pop("scvi_setup_dict_")
        cls.register_manager(
            manager_from_setup_dict(cls,
                                    adata,
                                    scvi_setup_dict,
                                    extend_categories=True))
    else:
        registry = attr_dict.pop("registry_")
        if (_MODEL_NAME_KEY in registry
                and registry[_MODEL_NAME_KEY] != cls.__name__):
            raise ValueError(
                "It appears you are loading a model from a different class."
            )
        if _SETUP_KWARGS_KEY not in registry:
            raise ValueError(
                "Saved model does not contain original setup inputs. "
                "Cannot load the original setup.")
        cls.setup_anndata(adata,
                          source_registry=registry,
                          extend_categories=True,
                          **registry[_SETUP_KWARGS_KEY])

    model = _initialize_model(cls, adata, attr_dict)
    adata_manager = model.get_anndata_manager(adata, required=True)

    version_split = adata_manager.registry[
        _constants._SCVI_VERSION_KEY].split(".")
    # Compare version components as integers: as strings "10" < "8", which
    # would wrongly flag 0.10+ as older than 0.8. An unparsable version only
    # skips this advisory warning.
    try:
        major, minor = int(version_split[0]), int(version_split[1])
    except (IndexError, ValueError):
        major, minor = None, None
    if major == 0 and minor is not None and minor < 8:
        warnings.warn(
            "Query integration should be performed using models trained with version >= 0.8"
        )

    model.to_device(device)

    # model tweaking
    new_state_dict = model.module.state_dict()
    for key, load_ten in load_state_dict.items():
        new_ten = new_state_dict[key]
        if new_ten.size() == load_ten.size():
            continue
        # new categoricals changed size: keep the loaded values and append
        # the trailing slices of the freshly initialised tensor along the
        # last dimension
        else:
            dim_diff = new_ten.size()[-1] - load_ten.size()[-1]
            fixed_ten = torch.cat([load_ten, new_ten[..., -dim_diff:]],
                                  dim=-1)
            load_state_dict[key] = fixed_ten

    model.module.load_state_dict(load_state_dict)
    model.module.eval()

    _set_params_online_update(
        model.module,
        unfrozen=unfrozen,
        freeze_decoder_first_layer=freeze_decoder_first_layer,
        freeze_batchnorm_encoder=freeze_batchnorm_encoder,
        freeze_batchnorm_decoder=freeze_batchnorm_decoder,
        freeze_dropout=freeze_dropout,
        freeze_expression=freeze_expression,
        freeze_classifier=freeze_classifier,
    )
    model.is_trained_ = False

    return model
def load_query_data(
    cls,
    adata: AnnData,
    reference_model: Union[str, BaseModelClass],
    inplace_subset_query_vars: bool = False,
    use_gpu: bool = True,
    unfrozen: bool = False,
    freeze_dropout: bool = False,
    freeze_expression: bool = True,
    freeze_decoder_first_layer: bool = True,
    freeze_batchnorm_encoder: bool = True,
    freeze_batchnorm_decoder: bool = False,
    freeze_classifier: bool = True,
):
    """
    Online update of a reference model with scArches algorithm [Lotfollahi20]_.

    Parameters
    ----------
    adata
        AnnData organized in the same way as data used to train model.
        It is not necessary to run :func:`~scvi.data.setup_anndata`,
        as AnnData is validated against the saved `scvi` setup dictionary.
    reference_model
        Either an already instantiated model of the same class, or a path to
        saved outputs for reference model.
    inplace_subset_query_vars
        Whether to subset and rearrange query vars inplace based on vars used
        to train reference model.
    use_gpu
        Whether to load model on GPU.
    unfrozen
        Override all other freeze options for a fully unfrozen model
    freeze_dropout
        Whether to freeze dropout during training
    freeze_expression
        Freeze neurons corresponding to expression in first layer
    freeze_decoder_first_layer
        Freeze neurons corresponding to first layer in decoder
    freeze_batchnorm_encoder
        Whether to freeze batchnorm weight and bias during training for
        encoder
    freeze_batchnorm_decoder
        Whether to freeze batchnorm weight and bias during training for
        decoder
    freeze_classifier
        Whether to freeze classifier completely. Only applies to `SCANVI`.

    Returns
    -------
    A new model of class `cls` set up on `adata`, carrying the reference
    weights, with `is_trained_` set to `False`.
    """
    # Only use the GPU when it was requested and CUDA is available.
    use_gpu = use_gpu and torch.cuda.is_available()

    if isinstance(reference_model, str):
        map_location = torch.device("cuda") if use_gpu is True else None
        (
            scvi_setup_dict,
            attr_dict,
            var_names,
            load_state_dict,
            _,
        ) = _load_saved_files(reference_model,
                              load_adata=False,
                              map_location=map_location)
    else:
        attr_dict = reference_model._get_user_attributes()
        attr_dict = {a[0]: a[1] for a in attr_dict if a[0][-1] == "_"}
        scvi_setup_dict = attr_dict.pop("scvi_setup_dict_")
        var_names = reference_model.adata.var_names
        load_state_dict = reference_model.model.state_dict().copy()

    if inplace_subset_query_vars:
        logger.debug("Subsetting query vars to reference vars.")
        adata._inplace_subset_var(var_names)
    _validate_var_names(adata, var_names)

    # Compare (major, minor) numerically: as plain strings "0.10" < "0.8",
    # which would wrongly flag 0.10+ as too old. An unparsable version only
    # skips this advisory warning.
    version_parts = str(scvi_setup_dict["scvi_version"]).split(".")
    try:
        saved_version = (int(version_parts[0]), int(version_parts[1]))
    except (IndexError, ValueError):
        saved_version = None
    if saved_version is not None and saved_version < (0, 8):
        logger.warning(
            "Query integration should be performed using models trained with version >= 0.8"
        )

    transfer_anndata_setup(scvi_setup_dict, adata, extend_categories=True)

    model = _initialize_model(cls, adata, attr_dict, use_gpu)

    # set saved attrs for loaded model
    for attr, val in attr_dict.items():
        setattr(model, attr, val)

    if use_gpu:
        model.model.cuda()

    # model tweaking
    new_state_dict = model.model.state_dict()
    for key, load_ten in load_state_dict.items():
        new_ten = new_state_dict[key]
        if new_ten.size() == load_ten.size():
            continue
        # new categoricals changed size: keep the loaded values and append
        # the trailing slices of the freshly initialised tensor along the
        # last dimension
        else:
            dim_diff = new_ten.size()[-1] - load_ten.size()[-1]
            fixed_ten = torch.cat([load_ten, new_ten[..., -dim_diff:]],
                                  dim=-1)
            load_state_dict[key] = fixed_ten

    model.model.load_state_dict(load_state_dict)
    model.model.eval()

    _set_params_online_update(
        model.model,
        unfrozen=unfrozen,
        freeze_decoder_first_layer=freeze_decoder_first_layer,
        freeze_batchnorm_encoder=freeze_batchnorm_encoder,
        freeze_batchnorm_decoder=freeze_batchnorm_decoder,
        freeze_dropout=freeze_dropout,
        freeze_expression=freeze_expression,
        freeze_classifier=freeze_classifier,
    )
    model.is_trained_ = False

    return model
def _highly_variable_pearson_residuals(
    adata: AnnData,
    theta: float = 100,
    clip: Optional[float] = None,
    n_top_genes: int = 1000,
    batch_key: Optional[str] = None,
    chunksize: int = 1000,
    check_values: bool = True,
    layer: Optional[str] = None,
    subset: bool = False,
    inplace: bool = True,
) -> Optional[pd.DataFrame]:
    """\
    See `scanpy.experimental.pp.highly_variable_genes`.

    Returns
    -------
    If `inplace=True`, `adata.var` is updated with the following fields.
    Otherwise, returns the same fields as :class:`~pandas.DataFrame`.

    highly_variable : bool
        boolean indicator of highly-variable genes
    means : float
        means per gene
    variances : float
        variance per gene
    residual_variances : float
        Residual variance per gene. Averaged in the case of multiple batches.
    highly_variable_rank : float
        Rank of the gene according to residual variance, median rank in the
        case of multiple batches
    highly_variable_nbatches : int
        If `batch_key` given, denotes in how many batches genes are detected
        as HVG
    highly_variable_intersection : bool
        If `batch_key` given, denotes the genes that are highly variable in
        all batches

    Raises
    ------
    ValueError
        If `theta <= 0` or if `clip` is negative.
    """
    view_to_actual(adata)
    X = _get_obs_rep(adata, layer=layer)
    computed_on = layer if layer else 'adata.X'

    # Check for raw counts
    if check_values and (check_nonnegative_integers(X) is False):
        warnings.warn(
            "`flavor='pearson_residuals'` expects raw count data, but non-integers were found.",
            UserWarning,
        )

    # check theta
    if theta <= 0:
        # TODO: would "underdispersion" with negative theta make sense?
        # then only theta=0 were undefined..
        raise ValueError('Pearson residuals require theta > 0')

    # A user-supplied clip does not depend on the batch, so validate it once.
    if clip is not None and clip < 0:
        raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.")

    if batch_key is None:
        batch_info = np.zeros(adata.shape[0], dtype=int)
    else:
        batch_info = adata.obs[batch_key].values
    batch_labels = np.unique(batch_info)
    n_batches = len(batch_labels)

    # Get pearson residuals for each batch separately
    residual_gene_vars = []
    for batch in batch_labels:
        adata_subset_prefilter = adata[batch_info == batch]
        X_batch_prefilter = _get_obs_rep(adata_subset_prefilter, layer=layer)

        # Filter out zero genes
        with settings.verbosity.override(Verbosity.error):
            nonzero_genes = np.ravel(X_batch_prefilter.sum(axis=0)) != 0
        adata_subset = adata_subset_prefilter[:, nonzero_genes]
        X_batch = _get_obs_rep(adata_subset, layer=layer)

        # Prepare clipping. The default is derived from *this* batch's size
        # and kept in a separate local: overwriting `clip` itself would make
        # every later batch reuse the first batch's threshold.
        batch_clip = np.sqrt(X_batch.shape[0]) if clip is None else clip

        if sp_sparse.issparse(X_batch):
            sums_genes = np.sum(X_batch, axis=0)
            sums_cells = np.sum(X_batch, axis=1)
            sum_total = np.sum(sums_genes).squeeze()
        else:
            sums_genes = np.sum(X_batch, axis=0, keepdims=True)
            sums_cells = np.sum(X_batch, axis=1, keepdims=True)
            sum_total = np.sum(sums_genes)

        # Compute pearson residuals in chunks
        residual_gene_var = np.empty((X_batch.shape[1]))
        for start in np.arange(0, X_batch.shape[1], chunksize):
            stop = start + chunksize
            mu = np.array(sums_cells @ sums_genes[:, start:stop] / sum_total)
            chunk = X_batch[:, start:stop]
            # Densify when the container offers `.toarray()`; plain dense
            # arrays without that method are used as they are.
            if hasattr(chunk, 'toarray'):
                X_dense = chunk.toarray()
            else:
                X_dense = np.asarray(chunk)
            residuals = (X_dense - mu) / np.sqrt(mu + mu**2 / theta)
            residuals = np.clip(residuals, a_min=-batch_clip, a_max=batch_clip)
            residual_gene_var[start:stop] = np.var(residuals, axis=0)

        # Add 0 values for genes that were filtered out
        unmasked_residual_gene_var = np.zeros(len(nonzero_genes))
        unmasked_residual_gene_var[nonzero_genes] = residual_gene_var
        residual_gene_vars.append(unmasked_residual_gene_var.reshape(1, -1))

    residual_gene_vars = np.concatenate(residual_gene_vars, axis=0)

    # Get rank per gene within each batch
    # argsort twice gives ranks, small rank means most variable
    ranks_residual_var = np.argsort(np.argsort(-residual_gene_vars, axis=1),
                                    axis=1)
    ranks_residual_var = ranks_residual_var.astype(np.float32)
    # count in how many batches a genes was among the n_top_genes
    highly_variable_nbatches = np.sum(
        (ranks_residual_var < n_top_genes).astype(int), axis=0
    )
    # set non-top genes within each batch to nan
    ranks_residual_var[ranks_residual_var >= n_top_genes] = np.nan
    ranks_masked_array = np.ma.masked_invalid(ranks_residual_var)
    # Median rank across batches, ignoring batches in which gene was not
    # selected
    medianrank_residual_var = np.ma.median(ranks_masked_array,
                                           axis=0).filled(np.nan)

    means, variances = materialize_as_ndarray(_get_mean_var(X))
    df = pd.DataFrame.from_dict(
        dict(
            means=means,
            variances=variances,
            residual_variances=np.mean(residual_gene_vars, axis=0),
            highly_variable_rank=medianrank_residual_var,
            highly_variable_nbatches=highly_variable_nbatches.astype(np.int64),
            highly_variable_intersection=highly_variable_nbatches == n_batches,
        )
    )
    df = df.set_index(adata.var_names)

    # Sort genes by how often they selected as hvg within each batch and
    # break ties with median rank of residual variance across batches
    df.sort_values(
        ['highly_variable_nbatches', 'highly_variable_rank'],
        ascending=[False, True],
        na_position='last',
        inplace=True,
    )

    # flag the first `n_top_genes` rows of the sorted frame
    high_var = np.zeros(df.shape[0], dtype=bool)
    high_var[:n_top_genes] = True
    df['highly_variable'] = high_var
    # restore the original gene order
    df = df.loc[adata.var_names, :]

    if inplace:
        adata.uns['hvg'] = {'flavor': 'pearson_residuals',
                            'computed_on': computed_on}
        logg.hint(
            'added\n'
            ' \'highly_variable\', boolean vector (adata.var)\n'
            ' \'highly_variable_rank\', float vector (adata.var)\n'
            ' \'highly_variable_nbatches\', int vector (adata.var)\n'
            ' \'highly_variable_intersection\', boolean vector (adata.var)\n'
            ' \'means\', float vector (adata.var)\n'
            ' \'variances\', float vector (adata.var)\n'
            ' \'residual_variances\', float vector (adata.var)'
        )
        adata.var['means'] = df['means'].values
        adata.var['variances'] = df['variances'].values
        adata.var['residual_variances'] = df['residual_variances'].values
        adata.var['highly_variable_rank'] = df['highly_variable_rank'].values
        if batch_key is not None:
            adata.var['highly_variable_nbatches'] = df[
                'highly_variable_nbatches'
            ].values
            adata.var['highly_variable_intersection'] = df[
                'highly_variable_intersection'
            ].values
        adata.var['highly_variable'] = df['highly_variable'].values

        if subset:
            adata._inplace_subset_var(df['highly_variable'].values)

    else:
        if batch_key is None:
            df = df.drop(
                ['highly_variable_nbatches', 'highly_variable_intersection'],
                axis=1
            )
        if subset:
            df = df.iloc[df.highly_variable.values, :]

        return df
def highly_variable_features(
    adata: AnnData,
    min_disp: Optional[float] = None,
    max_disp: Optional[float] = None,
    min_mean: Optional[float] = None,
    max_mean: Optional[float] = None,
    n_top_features: Optional[int] = None,
    n_bins: int = 20,
    flavor: Literal['seurat', 'cell_ranger'] = 'seurat',
    subset: bool = False,
    inplace: bool = True,
    batch_key: Optional[str] = None,
) -> Optional[pd.DataFrame]:
    """\
    Annotate highly variable features [Satija15]_ [Zheng17]_.

    Expects logarithmized data.

    Depending on `flavor`, this reproduces the R-implementations of Seurat
    [Satija15]_ and Cell Ranger [Zheng17]_.

    The normalized dispersion is obtained by scaling with the mean and
    standard deviation of the dispersions for features falling into a given
    bin for mean expression of features. This means that for each bin of mean
    expression, highly variable features are selected.

    Parameters
    ----------
    adata
        The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to subjects and columns to features.
    min_mean
        If `n_top_features` unequals `None`, this and all other cutoffs for
        the means and the normalized dispersions are ignored. Default is
        0.0125.
    max_mean
        If `n_top_features` unequals `None`, this and all other cutoffs for
        the means and the normalized dispersions are ignored. Default is 3.
    min_disp
        If `n_top_features` unequals `None`, this and all other cutoffs for
        the means and the normalized dispersions are ignored. Default is 0.5.
    max_disp
        If `n_top_features` unequals `None`, this and all other cutoffs for
        the means and the normalized dispersions are ignored. Default is
        `np.inf`.
    n_top_features
        Number of highly-variable features to keep.
    n_bins
        Number of bins for binning the mean feature expression. Normalization
        is done with respect to each bin. If just a single feature falls into
        a bin, the normalized dispersion is artificially set to 1. You'll be
        informed about this if you set `settings.verbosity = 4`.
    flavor
        Choose the flavor for computing normalized dispersion. In their
        default workflows, Seurat passes the cutoffs whereas Cell Ranger
        passes `n_top_features`.
    subset
        Inplace subset to highly-variable features if `True` otherwise merely
        indicate highly variable features.
    inplace
        Whether to place calculated metrics in `.var` or return them.
    batch_key
        If specified, highly-variable features are selected within each batch
        separately and merged. This simple process avoids the selection of
        batch-specific features and acts as a lightweight batch correction
        method.

    Returns
    -------
    Depending on `inplace` returns calculated metrics
    (:class:`~pandas.DataFrame`) or updates `.var` with the following fields.
    Nothing is returned when `inplace` or `subset` is true.

    highly_variable : bool
        boolean indicator of highly-variable features
    **means**
        means per feature
    **dispersions**
        dispersions per feature
    **dispersions_norm**
        normalized dispersions per feature
    highly_variable_nbatches : int
        If batch_key is given, this denotes in how many batches features are
        detected as HVG
    highly_variable_intersection : bool
        If batch_key is given, this denotes the features that are highly
        variable in all batches

    Raises
    ------
    ValueError
        If `adata` is not an `AnnData` instance.

    Notes
    -----
    This function replaces :func:`~quanp.pp.filter_features_dispersion`.
    """
    if n_top_features is not None and not all(
            m is None for m in [min_disp, max_disp, min_mean, max_mean]):
        logg.info('If you pass `n_top_features`, all cutoffs are ignored.')

    # Fill in the documented defaults for cutoffs that were not given.
    if min_disp is None:
        min_disp = 0.5
    if min_mean is None:
        min_mean = 0.0125
    if max_mean is None:
        max_mean = 3
    if max_disp is None:
        max_disp = np.inf

    start = logg.info('extracting highly variable features')

    if not isinstance(adata, AnnData):
        raise ValueError(
            '`pp.highly_variable_features` expects an `AnnData` argument, '
            'pass `inplace=False` if you want to return a `pd.DataFrame`.')

    if batch_key is None:
        df = _highly_variable_features_single_batch(
            adata,
            min_disp=min_disp,
            max_disp=max_disp,
            min_mean=min_mean,
            max_mean=max_mean,
            n_top_features=n_top_features,
            n_bins=n_bins,
            flavor=flavor,
        )
    else:
        sanitize_anndata(adata)
        batches = adata.obs[batch_key].cat.categories
        df = []
        feature_list = adata.var_names
        for batch in batches:
            adata_subset = adata[adata.obs[batch_key] == batch]

            # Filter to features that are in the dataset
            with settings.verbosity.override(Verbosity.error):
                filt = filter_features(adata_subset, min_subjects=1,
                                       inplace=False)[0]

            adata_subset = adata_subset[:, filt]

            hvg = _highly_variable_features_single_batch(
                adata_subset,
                min_disp=min_disp,
                max_disp=max_disp,
                min_mean=min_mean,
                max_mean=max_mean,
                n_top_features=n_top_features,
                n_bins=n_bins,
                flavor=flavor,
            )

            # Add 0 values for features that were filtered out
            missing_hvg = pd.DataFrame(
                np.zeros((np.sum(~filt), len(hvg.columns))),
                columns=hvg.columns,
            )
            missing_hvg['highly_variable'] = missing_hvg[
                'highly_variable'].astype(bool)
            missing_hvg['feature'] = feature_list[~filt]
            hvg['feature'] = adata_subset.var_names.values
            # `DataFrame.append` no longer exists in pandas >= 2.0
            hvg = pd.concat([hvg, missing_hvg], ignore_index=True)

            # Order as before filtering
            idxs = np.concatenate((np.where(filt)[0], np.where(~filt)[0]))
            hvg = hvg.loc[np.argsort(idxs)]

            df.append(hvg)

        df = pd.concat(df, axis=0)
        df['highly_variable'] = df['highly_variable'].astype(int)
        df = df.groupby('feature').agg(
            dict(
                means=np.nanmean,
                dispersions=np.nanmean,
                dispersions_norm=np.nanmean,
                highly_variable=np.nansum,
            ))
        df.rename(columns=dict(highly_variable='highly_variable_nbatches'),
                  inplace=True)
        df['highly_variable_intersection'] = df[
            'highly_variable_nbatches'] == len(batches)

        if n_top_features is not None:
            # sort features by how often they selected as hvg within each
            # batch and break ties with normalized dispersion across batches
            df.sort_values(
                ['highly_variable_nbatches', 'dispersions_norm'],
                ascending=False,
                na_position='last',
                inplace=True,
            )
            df['highly_variable'] = False
            # The index holds feature names, so the first `n_top_features`
            # rows must be selected by position; an integer slice through
            # `.loc` raises on pandas >= 2.0.
            df.iloc[:n_top_features,
                    df.columns.get_loc('highly_variable')] = True
            df = df.loc[adata.var_names]
        else:
            df = df.loc[adata.var_names]
            # NaN normalized dispersions count as 0 before thresholding
            df['dispersions_norm'] = df['dispersions_norm'].fillna(0)
            # similar to Seurat
            feature_subset = np.logical_and.reduce((
                df.means > min_mean,
                df.means < max_mean,
                df.dispersions_norm > min_disp,
                df.dispersions_norm < max_disp,
            ))
            df['highly_variable'] = feature_subset

    logg.info(' finished', time=start)

    if inplace or subset:
        logg.hint('added\n'
                  ' \'highly_variable\', boolean vector (adata.var)\n'
                  ' \'means\', float vector (adata.var)\n'
                  ' \'dispersions\', float vector (adata.var)\n'
                  ' \'dispersions_norm\', float vector (adata.var)')
        adata.var['highly_variable'] = df['highly_variable'].values
        adata.var['means'] = df['means'].values
        adata.var['dispersions'] = df['dispersions'].values
        adata.var['dispersions_norm'] = df['dispersions_norm'].values.astype(
            'float32', copy=False)
        if batch_key is not None:
            adata.var['highly_variable_nbatches'] = df[
                'highly_variable_nbatches'].values
            adata.var['highly_variable_intersection'] = df[
                'highly_variable_intersection'].values
        if subset:
            adata._inplace_subset_var(df['highly_variable'].values)
    else:
        return df
def recipe_zheng17(
    adata: AnnData,
    n_top_genes: int = 1000,
    log: bool = True,
    plot: bool = False,
    copy: bool = False,
) -> Optional[AnnData]:
    """\
    Normalization and filtering as of [Zheng17]_.

    Reproduces the preprocessing of [Zheng17]_ – the Cell Ranger R Kit of 10x
    Genomics.

    Expects non-logarithmized data.
    If using logarithmized data, pass `log=False`.

    The recipe performs these steps, in this order:

    1. `pp.filter_genes(adata, min_counts=1)`
    2. `normalize_total(adata, key_added='n_counts_all')`
    3. `filter_genes_dispersion(adata.X, flavor='cell_ranger',
       n_top_genes=n_top_genes, log=False)`, optionally plotting the result
    4. in-place restriction of `adata` to the returned `gene_subset`
    5. `normalize_total(adata)` to renormalize after filtering
    6. `pp.log1p(adata)`, only if `log` is true
    7. `pp.scale(adata)`

    Parameters
    ----------
    adata
        Annotated data matrix.
    n_top_genes
        Number of genes to keep.
    log
        Take logarithm.
    plot
        Show a plot of the gene dispersion vs. mean relation.
    copy
        Return a copy of `adata` instead of updating it.

    Returns
    -------
    The processed copy of `adata` if `copy` is true; otherwise `None`, with
    the passed `adata` modified in place.
    """
    t_start = logg.info('running recipe zheng17')

    # Work on a private copy only when the caller asked for one.
    adata = adata.copy() if copy else adata

    # Gene-level count filter, then per-cell normalization; the per-cell
    # totals are stored under 'n_counts_all'.
    pp.filter_genes(adata, min_counts=1)
    normalize_total(adata, key_added='n_counts_all')

    # Dispersion-based gene selection on the normalized matrix.
    dispersion_result = filter_genes_dispersion(
        adata.X,
        flavor='cell_ranger',
        n_top_genes=n_top_genes,
        log=False,
    )

    if plot:
        # Imported lazily: plotting should not be imported at the top of
        # the file.
        from ..plotting import _preprocessing as dispersion_plots

        dispersion_plots.filter_genes_dispersion(dispersion_result, log=True)

    # In-place equivalent of `adata = adata[:, gene_mask]`.
    gene_mask = dispersion_result.gene_subset
    adata._inplace_subset_var(gene_mask)

    # Renormalize now that genes were removed.
    normalize_total(adata)

    if log:
        # Log transform: X = log(X + 1).
        pp.log1p(adata)

    pp.scale(adata)
    logg.info(' finished', time=t_start)

    if copy:
        return adata
    return None