def get_accessibility_estimates(
    self,
    adata: Optional[AnnData] = None,
    indices: Optional[Sequence[int]] = None,
    n_samples_overall: Optional[int] = None,
    region_indices: Optional[Sequence[int]] = None,
    transform_batch: Optional[Union[str, int]] = None,
    use_z_mean: bool = True,
    threshold: Optional[float] = None,
    normalize_cells: bool = False,
    normalize_regions: bool = False,
    batch_size: int = 128,
) -> Union[np.ndarray, csr_matrix]:
    """
    Impute the accessibility matrix from the module's generative output ``"p"``.

    Parameters
    ----------
    adata
        AnnData object passed through ``self._validate_anndata``.
    indices
        Indices of cells in adata to use. If `None`, all cells are used.
    n_samples_overall
        If given, `indices` is replaced by a draw of this many entries made
        with ``np.random.choice``.
    region_indices
        Column indices of the regions to keep. If `None`, all regions are kept.
    transform_batch
        Batch to condition on; converted with ``_get_batch_code_from_category``
        and only the first resulting code is used.
    use_z_mean
        Forwarded to the generative step as ``use_z_mean``.
    threshold
        Must be between 0 and 1. When set to a non-zero value, entries below
        it are replaced with 0 and a sparse matrix is returned.
    normalize_cells
        If True, multiply the estimates by ``inference_outputs["libsize_acc"]``.
    normalize_regions
        If True, multiply the estimates by the sigmoid of
        ``self.module.region_factors``.
    batch_size
        Minibatch size for data loading into model.

    Returns
    -------
    A :class:`~scipy.sparse.csr_matrix` when a non-zero `threshold` is given,
    otherwise a :class:`~numpy.ndarray`. Rows follow the data loader order.

    Raises
    ------
    ValueError
        If `threshold` is outside ``[0, 1]``.
    """
    # Fail fast: reject a bad threshold before touching the data or the RNG.
    if threshold is not None and (threshold < 0 or threshold > 1):
        raise ValueError("the provided threshold must be between 0 and 1")

    adata = self._validate_anndata(adata)
    if indices is None:
        indices = np.arange(adata.n_obs)
    if n_samples_overall is not None:
        indices = np.random.choice(indices, n_samples_overall)
    post = self._make_data_loader(adata=adata, indices=indices, batch_size=batch_size)
    transform_batch = _get_batch_code_from_category(adata, transform_batch)

    imputed = []
    # Inference only: with autograd enabled the decoder output requires grad
    # and the `.numpy()` conversions below would raise.
    with torch.no_grad():
        # Region factors do not depend on the minibatch; compute them once.
        region_scale = (
            torch.sigmoid(self.module.region_factors).cpu()
            if normalize_regions
            else None
        )
        for tensors in post:
            inference_outputs, generative_outputs = self.module.forward(
                tensors=tensors,
                get_generative_input_kwargs=dict(transform_batch=transform_batch[0]),
                generative_kwargs=dict(use_z_mean=use_z_mean),
                compute_loss=False,
            )
            p = generative_outputs["p"].cpu()
            if normalize_cells:
                p *= inference_outputs["libsize_acc"].cpu()
            if region_scale is not None:
                p *= region_scale
            if threshold:
                p[p < threshold] = 0
                p = csr_matrix(p.numpy())
            if region_indices is not None:
                p = p[:, region_indices]
            imputed.append(p)

    if threshold:
        # imputed is a list of csr_matrix objects
        return vstack(imputed, format="csr")
    # imputed is a list of tensors
    return torch.cat(imputed).numpy()
def get_accessibility_estimates(
    self,
    adata: Optional[AnnData] = None,
    indices: Optional[Sequence[int]] = None,
    region_indices: Optional[Sequence[int]] = None,
    transform_batch: Optional[Union[str, int]] = None,
    use_z_mean: bool = True,
    threshold: Optional[float] = None,
    normalize_cells: bool = False,
    normalize_regions: bool = False,
    batch_size: int = 128,
) -> Union[np.ndarray, csr_matrix]:
    """
    Impute the full accessibility matrix.

    Returns a matrix of accessibility probabilities for each cell and genomic region in the input
    (for return matrix A, A[i,j] is the probability that region j is accessible in cell i).

    Parameters
    ----------
    adata
        AnnData object that has been registered with scvi. If `None`, defaults to the AnnData object
        used to initialize the model.
    indices
        Indices of cells in adata to use. If `None`, all cells are used.
    region_indices
        Indices of regions to use. If `None`, all regions are used.
    transform_batch
        Batch to condition on. If transform_batch is:

        - None, then real observed batch is used
        - int, then batch transform_batch is used
    use_z_mean
        If True (default), use the distribution mean. Otherwise, sample from the distribution.
    threshold
        If provided, values below the threshold are replaced with 0 and a sparse matrix
        is returned instead. This is recommended for very large matrices. Must be between 0 and 1.
        A threshold of exactly 0 is treated like no threshold (dense output).
    normalize_cells
        Whether to reintroduce library size factors to scale the normalized probabilities.
        This makes the estimates closer to the input, but removes the library size correction.
        False by default.
    normalize_regions
        Whether to reintroduce region factors to scale the normalized probabilities. This makes
        the estimates closer to the input, but removes the region-level bias correction. False by
        default.
    batch_size
        Minibatch size for data loading into model

    Returns
    -------
    A :class:`~scipy.sparse.csr_matrix` when a non-zero `threshold` is given,
    otherwise a :class:`~numpy.ndarray`.

    Raises
    ------
    ValueError
        If `threshold` is outside ``[0, 1]``.
    """
    # Fail fast: reject a bad threshold before any data loading happens.
    if threshold is not None and (threshold < 0 or threshold > 1):
        raise ValueError("the provided threshold must be between 0 and 1")

    adata = self._validate_anndata(adata)
    post = self._make_data_loader(adata=adata, indices=indices, batch_size=batch_size)
    transform_batch = _get_batch_code_from_category(adata, transform_batch)

    imputed = []
    # Inference only: with autograd enabled the decoder output requires grad
    # and the `.numpy()` conversions below would raise.
    with torch.no_grad():
        # Region factors do not depend on the minibatch; compute them once.
        region_scale = (
            torch.sigmoid(self.module.region_factors).cpu()
            if normalize_regions
            else None
        )
        for tensors in post:
            inference_outputs, generative_outputs = self.module.forward(
                tensors=tensors,
                get_generative_input_kwargs=dict(transform_batch=transform_batch[0]),
                generative_kwargs=dict(use_z_mean=use_z_mean),
                compute_loss=False,
            )
            p = generative_outputs["p"].cpu()
            if normalize_cells:
                p *= inference_outputs["d"].cpu()
            if region_scale is not None:
                p *= region_scale
            if threshold:
                p[p < threshold] = 0
                p = csr_matrix(p.numpy())
            if region_indices is not None:
                p = p[:, region_indices]
            imputed.append(p)

    if threshold:
        # imputed is a list of csr_matrix objects
        return vstack(imputed, format="csr")
    # imputed is a list of tensors
    return torch.cat(imputed).numpy()
def get_normalized_expression(
    self,
    adata: Optional[AnnData] = None,
    indices: Optional[Sequence[int]] = None,
    n_samples_overall: Optional[int] = None,
    transform_batch: Optional[Sequence[Union[Number, str]]] = None,
    gene_list: Optional[Sequence[str]] = None,
    use_z_mean: bool = True,
    n_samples: int = 1,
    batch_size: Optional[int] = None,
    return_mean: bool = True,
) -> Union[np.ndarray, pd.DataFrame]:
    r"""
    Returns the normalized (decoded) gene expression.

    This is denoted as :math:`\rho_n` in the scVI paper.

    Parameters
    ----------
    adata
        AnnData object with equivalent structure to initial AnnData. If `None`, defaults to the
        AnnData object used to initialize the model.
    indices
        Indices of cells in adata to use. If `None`, all cells are used.
    n_samples_overall
        If given, `indices` is replaced by a draw of this many entries made
        with ``np.random.choice``.
    transform_batch
        Batch to condition on. If transform_batch is:

        - None, then real observed batch is used.
        - int, then batch transform_batch is used.

        When several batches are given, the outputs are averaged over them.
    gene_list
        Return frequencies of expression for a subset of genes.
        This can save memory when working with large datasets and few genes are
        of interest.
    use_z_mean
        If True, use the mean of the latent distribution, otherwise sample from it
    n_samples
        Number of posterior samples to use for estimation.
    batch_size
        Minibatch size for data loading into model.
    return_mean
        Whether to return the mean of the samples.

    Returns
    -------
    A :class:`~numpy.ndarray` built from the ``"px_scale"`` generative output.
    If `n_samples` > 1 and `return_mean` is False, minibatches are joined
    along axis ``-2`` and the leading axis is kept; otherwise the result has
    one row per cell.
    """
    adata = self._validate_anndata(adata)
    if indices is None:
        indices = np.arange(adata.n_obs)
    if n_samples_overall is not None:
        indices = np.random.choice(indices, n_samples_overall)
    scdl = self._make_data_loader(adata=adata, indices=indices, batch_size=batch_size)
    transform_batch = _get_batch_code_from_category(adata, transform_batch)

    if gene_list is None:
        gene_mask = slice(None)
    else:
        # Set lookup keeps mask construction linear in the number of genes.
        wanted_genes = set(gene_list)
        all_genes = _get_var_names_from_setup_anndata(adata)
        gene_mask = [gene in wanted_genes for gene in all_genes]

    exprs = []
    # Inference only: with autograd enabled the decoder output requires grad
    # and the `.numpy()` conversion below would raise.
    with torch.no_grad():
        for tensors in scdl:
            per_batch_exprs = []
            for batch in transform_batch:
                if batch is not None:
                    # Overwrite the observed batch with the requested one.
                    batch_indices = tensors[_CONSTANTS.BATCH_KEY]
                    tensors[_CONSTANTS.BATCH_KEY] = (
                        torch.ones_like(batch_indices) * batch
                    )
                _, generative_outputs = self.module.forward(
                    tensors=tensors,
                    inference_kwargs=dict(n_samples=n_samples),
                    generative_kwargs=dict(use_z_mean=use_z_mean),
                    compute_loss=False,
                )
                output = generative_outputs["px_scale"][..., gene_mask]
                per_batch_exprs.append(output.cpu().numpy())
            # stacked shape is (len(transform_batch) x batch_size x n_var);
            # average over the requested batches
            exprs.append(np.stack(per_batch_exprs).mean(0))

    if n_samples > 1:
        # The -2 axis correspond to cells.
        exprs = np.concatenate(exprs, axis=-2)
    else:
        exprs = np.concatenate(exprs, axis=0)

    if n_samples > 1 and return_mean:
        exprs = exprs.mean(0)
    return exprs
def get_feature_correlation_matrix(
    self,
    adata=None,
    indices=None,
    n_samples: int = 10,
    batch_size: int = 64,
    rna_size_factor: int = 1000,
    transform_batch: Optional[Sequence[Union[Number, str]]] = None,
    correlation_type: Literal["spearman", "pearson"] = "spearman",
    log_transform: bool = False,
) -> pd.DataFrame:
    """
    Generate gene-gene correlation matrix using scvi uncertainty and expression.

    Parameters
    ----------
    adata
        AnnData object with equivalent structure to initial AnnData. If `None`, defaults to the
        AnnData object used to initialize the model.
    indices
        Indices of cells in adata to use. If `None`, all cells are used.
    n_samples
        Number of posterior samples to use for estimation.
    batch_size
        Minibatch size for data loading into model. Defaults to 64.
    rna_size_factor
        size factor for RNA prior to sampling gamma distribution
    transform_batch
        Batches to condition on.
        If transform_batch is:

        - None, then real observed batch is used
        - int, then batch transform_batch is used
        - list of int, then values are averaged over provided batches.
    correlation_type
        One of "pearson", "spearman". Any value other than "pearson" is
        computed as Spearman.
    log_transform
        Whether to log transform denoised values prior to correlation calculation.

    Returns
    -------
    Correlation matrix as a DataFrame indexed (rows and columns) by the gene
    names followed by the protein names.
    """
    from scipy.stats import spearmanr

    adata = self._validate_anndata(adata)

    if not isinstance(transform_batch, IterableClass):
        transform_batch = [transform_batch]
    transform_batch = _get_batch_code_from_category(adata, transform_batch)

    corr_mats = []
    for b in transform_batch:
        # Forward `adata` and `indices` so the caller's cell selection is
        # honoured instead of being silently dropped.
        denoised_data = self._get_denoised_samples(
            adata=adata,
            indices=indices,
            n_samples=n_samples,
            batch_size=batch_size,
            rna_size_factor=rna_size_factor,
            transform_batch=b,
        )
        n_rows = denoised_data.shape[0]
        # Stack the posterior samples (last axis) on top of each other so
        # every sample contributes its own block of rows.
        flattened = np.zeros((n_rows * n_samples, denoised_data.shape[1]))
        for i in range(n_samples):
            flattened[n_rows * i : n_rows * (i + 1)] = denoised_data[:, :, i]
        # Truthiness check so numpy booleans are honoured as well.
        if log_transform:
            # First `self.n_genes` columns get a small offset before the log;
            # the remaining columns use log1p.
            flattened[:, : self.n_genes] = np.log(
                flattened[:, : self.n_genes] + 1e-8
            )
            flattened[:, self.n_genes :] = np.log1p(flattened[:, self.n_genes :])
        if correlation_type == "pearson":
            corr_matrix = np.corrcoef(flattened, rowvar=False)
        else:
            corr_matrix, _ = spearmanr(flattened, axis=0)
        corr_mats.append(corr_matrix)

    # Average the per-batch correlation matrices.
    corr_matrix = np.mean(np.stack(corr_mats), axis=0)
    var_names = _get_var_names_from_setup_anndata(adata)
    names = np.concatenate(
        [np.asarray(var_names), self.scvi_setup_dict_["protein_names"]]
    )
    return pd.DataFrame(corr_matrix, index=names, columns=names)
def get_protein_foreground_probability(
    self,
    adata: Optional[AnnData] = None,
    indices: Optional[Sequence[int]] = None,
    transform_batch: Optional[Sequence[Union[Number, str]]] = None,
    protein_list: Optional[Sequence[str]] = None,
    n_samples: int = 1,
    batch_size: Optional[int] = None,
    return_mean: bool = True,
    return_numpy: Optional[bool] = None,
):
    r"""
    Returns the foreground probability for proteins.

    This is denoted as :math:`(1 - \pi_{nt})` in the totalVI paper.

    Parameters
    ----------
    adata
        AnnData object with equivalent structure to initial AnnData. If `None`, defaults to the
        AnnData object used to initialize the model.
    indices
        Indices of cells in adata to use. If `None`, all cells are used.
    transform_batch
        Batch to condition on.
        If transform_batch is:

        - None, then real observed batch is used
        - int, then batch transform_batch is used
        - List[int], then average over batches in list
    protein_list
        Return protein expression for a subset of proteins.
        This can save memory when working with large datasets and few proteins are
        of interest.
    n_samples
        Number of posterior samples to use for estimation.
    batch_size
        Minibatch size for data loading into model.
    return_mean
        Whether to return the mean of the samples.
    return_numpy
        Return a :class:`~numpy.ndarray` instead of a :class:`~pandas.DataFrame`. DataFrame includes
        protein names as columns. A DataFrame is returned unless this is `True`; it is forced to
        `True` when `n_samples` > 1 and `return_mean` is False.

    Returns
    -------
    - **foreground_probability** - probability foreground for each protein

    If `n_samples` > 1 and `return_mean` is False, then the shape is `(cells, proteins, samples)`
    and the result is a :class:`~numpy.ndarray`. Otherwise, shape is `(cells, proteins)` and the
    return type is :class:`~pandas.DataFrame` unless `return_numpy` is True.
    """
    adata = self._validate_anndata(adata)
    post = self._make_data_loader(
        adata=adata, indices=indices, batch_size=batch_size
    )

    if protein_list is None:
        protein_mask = slice(None)
    else:
        # Set lookup keeps mask construction linear in the number of proteins.
        wanted_proteins = set(protein_list)
        all_proteins = self.scvi_setup_dict_["protein_names"]
        protein_mask = [p in wanted_proteins for p in all_proteins]

    if n_samples > 1 and return_mean is False:
        if return_numpy is False:
            warnings.warn(
                "return_numpy must be True if n_samples > 1 and return_mean is False, returning np.ndarray"
            )
        return_numpy = True
    if indices is None:
        indices = np.arange(adata.n_obs)

    if not isinstance(transform_batch, IterableClass):
        transform_batch = [transform_batch]
    transform_batch = _get_batch_code_from_category(adata, transform_batch)

    py_mixings = []
    # Inference only: with autograd enabled the outputs require grad and the
    # final `.numpy()` conversion would raise.
    with torch.no_grad():
        for tensors in post:
            y = tensors[_CONSTANTS.PROTEIN_EXP_KEY]
            py_mixing = torch.zeros_like(y[..., protein_mask])
            if n_samples > 1:
                py_mixing = torch.stack(n_samples * [py_mixing])
            for b in transform_batch:
                if b is not None:
                    # Overwrite the observed batch with the requested one.
                    batch_indices = tensors[_CONSTANTS.BATCH_KEY]
                    tensors[_CONSTANTS.BATCH_KEY] = torch.ones_like(batch_indices) * b
                _, generative_outputs = self.module.forward(
                    tensors=tensors,
                    inference_kwargs=dict(n_samples=n_samples),
                    compute_loss=False,
                )
                py_mixing += torch.sigmoid(generative_outputs["py_"]["mixing"])[
                    ..., protein_mask
                ].cpu()
            # Average over the requested batches.
            py_mixing /= len(transform_batch)
            py_mixings.append(py_mixing)

    if n_samples > 1:
        # concatenate along batch dimension -> result shape = (samples, cells, features)
        py_mixings = torch.cat(py_mixings, dim=1)
        # (cells, features, samples)
        py_mixings = py_mixings.permute(1, 2, 0)
    else:
        py_mixings = torch.cat(py_mixings, dim=0)

    if return_mean is True and n_samples > 1:
        py_mixings = torch.mean(py_mixings, dim=-1)

    py_mixings = py_mixings.cpu().numpy()

    # The accumulated value is the mixing (background) probability; the
    # foreground probability is its complement.
    if return_numpy is True:
        return 1 - py_mixings
    pro_names = self.scvi_setup_dict_["protein_names"]
    return pd.DataFrame(
        1 - py_mixings,
        columns=pro_names[protein_mask],
        index=adata.obs_names[indices],
    )
def get_normalized_expression(
    self,
    adata=None,
    indices=None,
    transform_batch: Optional[Sequence[Union[Number, str]]] = None,
    gene_list: Optional[Sequence[str]] = None,
    protein_list: Optional[Sequence[str]] = None,
    library_size: Optional[Union[float, Literal["latent"]]] = 1,
    n_samples: int = 1,
    sample_protein_mixing: bool = False,
    scale_protein: bool = False,
    include_protein_background: bool = False,
    batch_size: Optional[int] = None,
    return_mean: bool = True,
    return_numpy: Optional[bool] = None,
) -> Tuple[Union[np.ndarray, pd.DataFrame], Union[np.ndarray, pd.DataFrame]]:
    r"""
    Returns the normalized gene expression and protein expression.

    This is denoted as :math:`\rho_n` in the totalVI paper for genes, and
    :math:`(1-\pi_{nt})\alpha_{nt}\beta_{nt}` for proteins.

    Parameters
    ----------
    adata
        AnnData object with equivalent structure to initial AnnData. If `None`, defaults to the
        AnnData object used to initialize the model.
    indices
        Indices of cells in adata to use. If `None`, all cells are used.
    transform_batch
        Batch to condition on.
        If transform_batch is:

        - None, then real observed batch is used
        - int, then batch transform_batch is used
        - List[int], then average over batches in list
    gene_list
        Return frequencies of expression for a subset of genes.
        This can save memory when working with large datasets and few genes are
        of interest.
    protein_list
        Return protein expression for a subset of proteins.
        This can save memory when working with large datasets and few proteins are
        of interest.
    library_size
        If `"latent"`, the decoder's ``rate`` output is returned for genes; for any other value
        the ``scale`` output is returned. A numeric value is not applied as a multiplier by
        this method.
    n_samples
        Get sample scale from multiple samples.
    sample_protein_mixing
        Sample mixing bernoulli, setting background to zero
    scale_protein
        Make protein expression sum to 1
    include_protein_background
        Include background component for protein expression
    batch_size
        Minibatch size for data loading into model.
    return_mean
        Whether to return the mean of the samples.
    return_numpy
        Return a `np.ndarray` instead of a `pd.DataFrame`. Includes gene
        names as columns. DataFrames are returned unless this is `True`; it is forced to `True`
        when `n_samples` > 1 and `return_mean` is False.

    Returns
    -------
    - **gene_normalized_expression** - normalized expression for RNA
    - **protein_normalized_expression** - normalized expression for proteins

    If ``n_samples`` > 1 and ``return_mean`` is False, then the shape is ``(cells, features, samples)``.
    Otherwise, shape is ``(cells, features)``. Return type is ``pd.DataFrame`` unless ``return_numpy`` is True.
    """
    adata = self._validate_anndata(adata)
    post = self._make_data_loader(
        adata=adata, indices=indices, batch_size=batch_size
    )

    if gene_list is None:
        gene_mask = slice(None)
    else:
        # Set lookup keeps mask construction linear in the number of genes.
        wanted_genes = set(gene_list)
        all_genes = _get_var_names_from_setup_anndata(adata)
        gene_mask = [gene in wanted_genes for gene in all_genes]
    if protein_list is None:
        protein_mask = slice(None)
    else:
        wanted_proteins = set(protein_list)
        all_proteins = self.scvi_setup_dict_["protein_names"]
        protein_mask = [p in wanted_proteins for p in all_proteins]
    if indices is None:
        indices = np.arange(adata.n_obs)

    if n_samples > 1 and return_mean is False:
        if return_numpy is False:
            warnings.warn(
                "return_numpy must be True if n_samples > 1 and return_mean is False, returning np.ndarray"
            )
        return_numpy = True

    if not isinstance(transform_batch, IterableClass):
        transform_batch = [transform_batch]
    transform_batch = _get_batch_code_from_category(adata, transform_batch)

    scale_list_gene = []
    scale_list_pro = []

    # Inference only: with autograd enabled the outputs require grad and the
    # final `.numpy()` conversions would raise.
    with torch.no_grad():
        for tensors in post:
            x = tensors[_CONSTANTS.X_KEY]
            y = tensors[_CONSTANTS.PROTEIN_EXP_KEY]
            # Allocate the accumulators at the *masked* width. Masking inside
            # the batch loop would make the second batch add a full-width
            # tensor to a subset-width accumulator.
            px_scale = torch.zeros_like(x)[..., gene_mask]
            py_scale = torch.zeros_like(y)[..., protein_mask]
            if n_samples > 1:
                px_scale = torch.stack(n_samples * [px_scale])
                py_scale = torch.stack(n_samples * [py_scale])
            for b in transform_batch:
                if b is not None:
                    # Overwrite the observed batch with the requested one.
                    batch_indices = tensors[_CONSTANTS.BATCH_KEY]
                    tensors[_CONSTANTS.BATCH_KEY] = torch.ones_like(batch_indices) * b
                _, generative_outputs = self.module.forward(
                    tensors=tensors,
                    inference_kwargs=dict(n_samples=n_samples),
                    compute_loss=False,
                )
                # NOTE(review): a numeric `library_size` is never multiplied
                # in here; only the "latent" switch has an effect. Confirm
                # whether scaling was intended.
                gene_key = "rate" if library_size == "latent" else "scale"
                px_scale += generative_outputs["px_"][gene_key].cpu()[..., gene_mask]

                py_ = generative_outputs["py_"]
                # probability of background
                protein_mixing = torch.sigmoid(py_["mixing"].cpu())
                if sample_protein_mixing is True:
                    protein_mixing = torch.distributions.Bernoulli(
                        protein_mixing
                    ).sample()
                protein_val = py_["rate_fore"].cpu() * (1 - protein_mixing)
                if include_protein_background is True:
                    protein_val += py_["rate_back"].cpu() * protein_mixing

                if scale_protein is True:
                    # Normalise over all proteins before subsetting.
                    protein_val = torch.nn.functional.normalize(
                        protein_val, p=1, dim=-1
                    )
                py_scale += protein_val[..., protein_mask]
            # Average over the requested batches.
            px_scale /= len(transform_batch)
            py_scale /= len(transform_batch)
            scale_list_gene.append(px_scale)
            scale_list_pro.append(py_scale)

    if n_samples > 1:
        # concatenate along batch dimension -> result shape = (samples, cells, features)
        scale_list_gene = torch.cat(scale_list_gene, dim=1)
        scale_list_pro = torch.cat(scale_list_pro, dim=1)
        # (cells, features, samples)
        scale_list_gene = scale_list_gene.permute(1, 2, 0)
        scale_list_pro = scale_list_pro.permute(1, 2, 0)
    else:
        scale_list_gene = torch.cat(scale_list_gene, dim=0)
        scale_list_pro = torch.cat(scale_list_pro, dim=0)

    if return_mean is True and n_samples > 1:
        scale_list_gene = torch.mean(scale_list_gene, dim=-1)
        scale_list_pro = torch.mean(scale_list_pro, dim=-1)

    scale_list_gene = scale_list_gene.cpu().numpy()
    scale_list_pro = scale_list_pro.cpu().numpy()

    if return_numpy is None or return_numpy is False:
        gene_df = pd.DataFrame(
            scale_list_gene,
            columns=adata.var_names[gene_mask],
            index=adata.obs_names[indices],
        )
        pro_df = pd.DataFrame(
            scale_list_pro,
            columns=self.scvi_setup_dict_["protein_names"][protein_mask],
            index=adata.obs_names[indices],
        )
        return gene_df, pro_df
    return scale_list_gene, scale_list_pro
def get_accessibility_estimates(
    self,
    adata: Optional[AnnData] = None,
    indices: Optional[Sequence[int]] = None,
    n_samples_overall: Optional[int] = None,
    region_list: Optional[Sequence[str]] = None,
    transform_batch: Optional[Union[str, int]] = None,
    use_z_mean: bool = True,
    threshold: Optional[float] = None,
    normalize_cells: bool = False,
    normalize_regions: bool = False,
    batch_size: int = 128,
    return_numpy: bool = False,
) -> Union[pd.DataFrame, np.ndarray, csr_matrix]:
    """
    Impute the full accessibility matrix.

    Returns a matrix of accessibility probabilities for each cell and genomic region in the input
    (for return matrix A, A[i,j] is the probability that region j is accessible in cell i).

    Parameters
    ----------
    adata
        AnnData object that has been registered with scvi. If `None`, defaults to the AnnData object
        used to initialize the model.
    indices
        Indices of cells in adata to use. If `None`, all cells are used.
    n_samples_overall
        Number of samples to return in total; `indices` is replaced by a draw of this many
        entries made with ``np.random.choice``.
    region_list
        Return accessibility estimates for this subset of regions. If `None`, all regions are used.
        This can save memory when dealing with large datasets.
    transform_batch
        Batch to condition on. If transform_batch is:

        - None, then real observed batch is used
        - int, then batch transform_batch is used
    use_z_mean
        If True (default), use the distribution mean. Otherwise, sample from the distribution.
    threshold
        If provided, values below the threshold are replaced with 0 and a sparse matrix
        is returned instead. This is recommended for very large matrices. Must be between 0 and 1.
        A threshold of exactly 0 is treated like no threshold (dense output).
    normalize_cells
        Whether to reintroduce library size factors to scale the normalized probabilities.
        This makes the estimates closer to the input, but removes the library size correction.
        False by default.
    normalize_regions
        Whether to reintroduce region factors to scale the normalized probabilities. This makes
        the estimates closer to the input, but removes the region-level bias correction. False by
        default.
    batch_size
        Minibatch size for data loading into model
    return_numpy
        If `True` and `threshold=None`, return :class:`~numpy.ndarray`. If `True` and `threshold` is
        given, return :class:`~scipy.sparse.csr_matrix`. If `False`, return :class:`~pandas.DataFrame`.
        DataFrame includes regions names as columns.

    Raises
    ------
    ValueError
        If `threshold` is outside ``[0, 1]``.
    """
    # Fail fast: reject a bad threshold before touching the data or the RNG.
    if threshold is not None and (threshold < 0 or threshold > 1):
        raise ValueError("the provided threshold must be between 0 and 1")

    adata = self._validate_anndata(adata)
    if indices is None:
        indices = np.arange(adata.n_obs)
    if n_samples_overall is not None:
        indices = np.random.choice(indices, n_samples_overall)
    post = self._make_data_loader(adata=adata, indices=indices, batch_size=batch_size)
    transform_batch = _get_batch_code_from_category(adata, transform_batch)

    if region_list is None:
        region_mask = slice(None)
    else:
        # Set lookup keeps mask construction linear in the number of regions.
        wanted_regions = set(region_list)
        all_regions = _get_var_names_from_setup_anndata(adata)
        region_mask = [region in wanted_regions for region in all_regions]

    imputed = []
    # Inference only: with autograd enabled the decoder output requires grad
    # and the `.numpy()` conversions below would raise.
    with torch.no_grad():
        # Region factors do not depend on the minibatch; compute them once.
        region_scale = (
            torch.sigmoid(self.module.region_factors).cpu()
            if normalize_regions
            else None
        )
        for tensors in post:
            inference_outputs, generative_outputs = self.module.forward(
                tensors=tensors,
                get_generative_input_kwargs=dict(transform_batch=transform_batch[0]),
                generative_kwargs=dict(use_z_mean=use_z_mean),
                compute_loss=False,
            )
            p = generative_outputs["p"].cpu()
            if normalize_cells:
                p *= inference_outputs["d"].cpu()
            if region_scale is not None:
                p *= region_scale
            if threshold:
                p[p < threshold] = 0
                p = csr_matrix(p.numpy())
            if region_list is not None:
                p = p[:, region_mask]
            imputed.append(p)

    if threshold:
        # imputed is a list of csr_matrix objects
        imputed = vstack(imputed, format="csr")
    else:
        # imputed is a list of tensors
        imputed = torch.cat(imputed).numpy()

    if return_numpy:
        return imputed
    if threshold:
        return pd.DataFrame.sparse.from_spmatrix(
            imputed,
            index=adata.obs_names[indices],
            columns=adata.var_names[region_mask],
        )
    return pd.DataFrame(
        imputed,
        index=adata.obs_names[indices],
        columns=adata.var_names[region_mask],
    )
def get_normalized_expression(
    self,
    adata: Optional[AnnData] = None,
    indices: Optional[Sequence[int]] = None,
    transform_batch: Optional[Sequence[Union[str, int]]] = None,
    gene_list: Optional[Sequence[str]] = None,
    library_size: Union[float, Literal["latent"]] = 1,
    n_samples: int = 1,
    batch_size: Optional[int] = None,
    return_mean: bool = True,
    return_numpy: Optional[bool] = None,
) -> Union[np.ndarray, pd.DataFrame]:
    r"""
    Returns the normalized (decoded) gene expression.

    This is denoted as :math:`\rho_n` in the scVI paper.

    Parameters
    ----------
    adata
        AnnData object with equivalent structure to initial AnnData. If `None`, defaults to the
        AnnData object used to initialize the model.
    indices
        Indices of cells in adata to use. If `None`, all cells are used.
    transform_batch
        Batch to condition on. If `None`, it is passed to the model unchanged; otherwise it is
        first converted with ``_get_batch_code_from_category``.
    gene_list
        Return frequencies of expression for a subset of genes.
        This can save memory when working with large datasets and few genes are
        of interest.
    library_size
        Multiplier applied to the model's sampled scale. If set to `"latent"`, the model's
        sampled rate is used instead and no multiplier is applied.
    n_samples
        Number of posterior samples to use for estimation.
    batch_size
        Minibatch size for data loading into model.
    return_mean
        Whether to return the mean of the samples.
    return_numpy
        Return a :class:`~numpy.ndarray` instead of a :class:`~pandas.DataFrame`. DataFrame includes
        gene names as columns. A DataFrame is returned unless this is `True`; it is forced to `True`
        when `n_samples` > 1 and `return_mean` is False.

    Returns
    -------
    If `n_samples` > 1 and `return_mean` is False, a :class:`~numpy.ndarray` whose minibatches
    were joined along axis ``-2``. Otherwise one row per cell, as a :class:`~pandas.DataFrame`
    unless `return_numpy` is True.
    """
    adata = self._validate_anndata(adata)
    scdl = self._make_scvi_dl(adata=adata, indices=indices, batch_size=batch_size)

    if transform_batch is not None:
        transform_batch = _get_batch_code_from_category(adata, transform_batch)

    # Column selector: everything by default, a boolean list when a subset
    # of genes was requested.
    gene_mask = slice(None)
    if gene_list is not None:
        gene_mask = [
            name in gene_list for name in _get_var_names_from_setup_anndata(adata)
        ]

    # Un-averaged multi-sample output has an extra axis and cannot be put
    # into a DataFrame, so numpy output is forced.
    if n_samples > 1 and return_mean is False:
        if return_numpy is False:
            logger.warning(
                "return_numpy must be True if n_samples > 1 and return_mean is False, returning np.ndarray"
            )
        return_numpy = True

    if indices is None:
        indices = np.arange(adata.n_obs)

    use_latent = library_size == "latent"
    model_fn = (
        self.model.get_sample_rate if use_latent else self.model.get_sample_scale
    )
    scaling = 1 if use_latent else library_size

    chunks = []
    for tensors in scdl:
        decoded = model_fn(
            tensors[_CONSTANTS.X_KEY],
            batch_index=tensors[_CONSTANTS.BATCH_KEY],
            y=tensors[_CONSTANTS.LABELS_KEY],
            n_samples=n_samples,
            transform_batch=transform_batch,
        )
        chunks.append(np.array((decoded[..., gene_mask] * scaling).cpu()))

    # With several samples the cells live on the second-to-last axis.
    cell_axis = -2 if n_samples > 1 else 0
    exprs = np.concatenate(chunks, axis=cell_axis)

    if n_samples > 1 and return_mean:
        exprs = exprs.mean(0)

    if return_numpy is None or return_numpy is False:
        return pd.DataFrame(
            exprs,
            columns=adata.var_names[gene_mask],
            index=adata.obs_names[indices],
        )
    return exprs
def get_feature_correlation_matrix(
    self,
    adata: Optional[AnnData] = None,
    indices: Optional[Sequence[int]] = None,
    n_samples: int = 10,
    batch_size: int = 64,
    rna_size_factor: int = 1000,
    transform_batch: Optional[Sequence[Union[Number, str]]] = None,
    correlation_type: Literal["spearman", "pearson"] = "spearman",
) -> pd.DataFrame:
    """
    Generate gene-gene correlation matrix using scvi uncertainty and expression.

    Parameters
    ----------
    adata
        AnnData object with equivalent structure to initial AnnData. If `None`, defaults to the
        AnnData object used to initialize the model.
    indices
        Indices of cells in adata to use. If `None`, all cells are used.
    n_samples
        Number of posterior samples to use for estimation.
    batch_size
        Minibatch size for data loading into model. Defaults to 64.
    rna_size_factor
        size factor for RNA prior to sampling gamma distribution.
    transform_batch
        Batches to condition on.
        If transform_batch is:

        - None, then real observed batch is used.
        - int, then batch transform_batch is used.
        - list of int, then values are averaged over provided batches.
    correlation_type
        One of "pearson", "spearman".

    Returns
    -------
    Gene-gene correlation matrix

    Raises
    ------
    ValueError
        If `correlation_type` is neither "pearson" nor "spearman".
    """
    # Fail fast: reject an unknown correlation type before the expensive
    # posterior sampling instead of after it.
    if correlation_type not in ("pearson", "spearman"):
        raise ValueError(
            "Unknown correlation type. Choose one of 'spearman', 'pearson'."
        )

    from scipy.stats import spearmanr

    adata = self._validate_anndata(adata)

    transform_batch = _get_batch_code_from_category(adata, transform_batch)

    corr_mats = []
    for b in transform_batch:
        denoised_data = self._get_denoised_samples(
            adata=adata,
            indices=indices,
            n_samples=n_samples,
            batch_size=batch_size,
            rna_size_factor=rna_size_factor,
            transform_batch=b,
        )
        n_rows = denoised_data.shape[0]
        # Stack the posterior samples (last axis) on top of each other so
        # every sample contributes its own block of rows.
        flattened = np.zeros((n_rows * n_samples, denoised_data.shape[1]))
        for i in range(n_samples):
            flattened[n_rows * i : n_rows * (i + 1)] = denoised_data[:, :, i]
        if correlation_type == "pearson":
            corr_matrix = np.corrcoef(flattened, rowvar=False)
        else:
            corr_matrix, _ = spearmanr(flattened)
        corr_mats.append(corr_matrix)

    # Average the per-batch correlation matrices.
    corr_matrix = np.mean(np.stack(corr_mats), axis=0)
    var_names = _get_var_names_from_setup_anndata(adata)
    return pd.DataFrame(corr_matrix, index=var_names, columns=var_names)