def setup_anndata(
    cls,
    adata: AnnData,
    layer: Optional[str] = None,
    **kwargs,
):
    """
    %(summary)s.

    Parameters
    ----------
    %(param_layer)s
    """
    # Snapshot taken before any other local is defined, so ``locals()`` holds
    # only the call arguments at this point.
    setup_method_args = cls._get_setup_method_args(**locals())

    # Add an index for each cell (provided to pyro plate for correct minibatching).
    indices_obs_key = "_indices"
    adata.obs[indices_obs_key] = np.arange(adata.n_obs).astype("int64")

    manager = AnnDataManager(
        fields=[
            LayerField(REGISTRY_KEYS.X_KEY, layer, is_count_data=True),
            NumericalObsField(REGISTRY_KEYS.INDICES_KEY, indices_obs_key),
        ],
        setup_method_args=setup_method_args,
    )
    manager.register_fields(adata, **kwargs)
    cls.register_manager(manager)
def setup_anndata(
    cls,
    adata: AnnData,
    batch_key: Optional[str] = None,
    labels_key: Optional[str] = None,
    categorical_covariate_keys: Optional[List[str]] = None,
    continuous_covariate_keys: Optional[List[str]] = None,
    layer: Optional[str] = None,
    **kwargs,
):
    """
    %(summary)s.

    Parameters
    ----------
    %(param_batch_key)s
    %(param_labels_key)s
    %(param_layer)s
    %(param_cat_cov_keys)s
    %(param_cont_cov_keys)s
    """
    # Snapshot taken before any other local is defined, so ``locals()`` holds
    # only the call arguments at this point.
    setup_method_args = cls._get_setup_method_args(**locals())

    # One field per registry key: counts, batch, labels, then the joint
    # categorical / continuous covariates.
    count_field = LayerField(REGISTRY_KEYS.X_KEY, layer, is_count_data=True)
    batch_field = CategoricalObsField(REGISTRY_KEYS.BATCH_KEY, batch_key)
    labels_field = CategoricalObsField(REGISTRY_KEYS.LABELS_KEY, labels_key)
    cat_covs_field = CategoricalJointObsField(
        REGISTRY_KEYS.CAT_COVS_KEY, categorical_covariate_keys
    )
    cont_covs_field = NumericalJointObsField(
        REGISTRY_KEYS.CONT_COVS_KEY, continuous_covariate_keys
    )

    manager = AnnDataManager(
        fields=[
            count_field,
            batch_field,
            labels_field,
            cat_covs_field,
            cont_covs_field,
        ],
        setup_method_args=setup_method_args,
    )
    manager.register_fields(adata, **kwargs)
    cls.register_manager(manager)
def setup_anndata(
    cls,
    adata: AnnData,
    labels_key: Optional[str] = None,
    layer: Optional[str] = None,
    **kwargs,
):
    """
    %(summary)s.

    Parameters
    ----------
    %(param_labels_key)s
    %(param_layer)s
    """
    # Snapshot taken before any other local is defined, so ``locals()`` holds
    # only the call arguments at this point.
    setup_method_args = cls._get_setup_method_args(**locals())

    manager = AnnDataManager(
        fields=[
            LayerField(REGISTRY_KEYS.X_KEY, layer, is_count_data=True),
            CategoricalObsField(REGISTRY_KEYS.LABELS_KEY, labels_key),
        ],
        setup_method_args=setup_method_args,
    )
    manager.register_fields(adata, **kwargs)
    cls.register_manager(manager)
def generic_setup_adata_manager(
    adata: AnnData,
    batch_key: Optional[str] = None,
    labels_key: Optional[str] = None,
    categorical_covariate_keys: Optional[List[str]] = None,
    continuous_covariate_keys: Optional[List[str]] = None,
    layer: Optional[str] = None,
    protein_expression_obsm_key: Optional[str] = None,
    protein_names_uns_key: Optional[str] = None,
) -> AnnDataManager:
    """
    Build an ``AnnDataManager`` with the generic set of fields and register ``adata``.

    The manager always carries batch, count-layer, labels and joint
    categorical / continuous covariate fields. A protein expression field is
    appended only when ``protein_expression_obsm_key`` is given.

    Parameters
    ----------
    adata
        AnnData object whose fields are registered.
    batch_key
        Key passed to the batch ``CategoricalObsField``.
    labels_key
        Key passed to the labels ``CategoricalObsField``.
    categorical_covariate_keys
        Keys passed to the ``CategoricalJointObsField``.
    continuous_covariate_keys
        Keys passed to the ``NumericalJointObsField``.
    layer
        Layer passed to the count ``LayerField``.
    protein_expression_obsm_key
        Key passed to the ``ProteinObsmField``; if None, no protein field is added.
    protein_names_uns_key
        Passed as ``colnames_uns_key`` to the ``ProteinObsmField``.

    Returns
    -------
    type
        The ``AnnDataManager`` after ``register_fields(adata)`` has been called.
    """
    with_proteins = protein_expression_obsm_key is not None

    # The batch field is kept in a variable because the protein field needs
    # its ``attr_key``.
    batch_field = CategoricalObsField(REGISTRY_KEYS.BATCH_KEY, batch_key)
    fields = [
        batch_field,
        LayerField(REGISTRY_KEYS.X_KEY, layer, is_count_data=True),
        CategoricalObsField(REGISTRY_KEYS.LABELS_KEY, labels_key),
        CategoricalJointObsField(
            REGISTRY_KEYS.CAT_COVS_KEY, categorical_covariate_keys
        ),
        NumericalJointObsField(
            REGISTRY_KEYS.CONT_COVS_KEY, continuous_covariate_keys
        ),
    ]

    if with_proteins:
        protein_field = ProteinObsmField(
            REGISTRY_KEYS.PROTEIN_EXP_KEY,
            protein_expression_obsm_key,
            use_batch_mask=True,
            batch_key=batch_field.attr_key,
            colnames_uns_key=protein_names_uns_key,
            is_count_data=True,
        )
        fields.append(protein_field)

    manager = AnnDataManager(fields=fields)
    manager.register_fields(adata)
    return manager
def _create_indices_adata_manager(adata: AnnData) -> AnnDataManager:
    """
    Build and register an ``AnnDataManager`` with X, labels and per-cell index fields.

    Side effect: writes an int64 ``0..n_obs-1`` column to ``adata.obs["_indices"]``.

    Parameters
    ----------
    adata
        AnnData object to annotate and register.

    Returns
    -------
    type
        The ``AnnDataManager`` after ``register_fields(adata)`` has been called.
    """
    # Add an index for each cell (provided to pyro plate for correct minibatching).
    indices_obs_key = "_indices"
    adata.obs[indices_obs_key] = np.arange(adata.n_obs).astype("int64")

    manager = AnnDataManager(
        fields=[
            LayerField(REGISTRY_KEYS.X_KEY, None, is_count_data=True),
            CategoricalObsField(REGISTRY_KEYS.LABELS_KEY, None),
            NumericalObsField(REGISTRY_KEYS.INDICES_KEY, indices_obs_key),
        ]
    )
    manager.register_fields(adata)
    return manager
def setup_anndata(
    cls,
    adata: AnnData,
    **kwargs,
) -> Optional[AnnData]:
    """
    Register ``adata`` with a manager that only tracks the count matrix.

    Parameters
    ----------
    adata
        AnnData object to register.
    **kwargs
        Forwarded to ``AnnDataManager.register_fields``.
    """
    # Snapshot taken before any other local is defined, so ``locals()`` holds
    # only the call arguments at this point.
    setup_method_args = cls._get_setup_method_args(**locals())

    count_field = LayerField(REGISTRY_KEYS.X_KEY, None, is_count_data=True)
    manager = AnnDataManager(
        fields=[count_field],
        setup_method_args=setup_method_args,
    )
    manager.register_fields(adata, **kwargs)
    cls.register_manager(manager)
def __init__(
    self,
    adata_manager: AnnDataManager,
    unlabeled_category,
    train_size: float = 0.9,
    validation_size: Optional[float] = None,
    n_samples_per_label: Optional[int] = None,
    use_gpu: bool = False,
    **kwargs,
):
    """
    Store the split configuration and locate labeled / unlabeled cells.

    Parameters
    ----------
    adata_manager
        Manager whose labels state registry provides the original labels
        column of ``adata_manager.adata.obs``.
    unlabeled_category
        Label value marking a cell as unlabeled.
    train_size
        Stored as ``float(train_size)``.
    validation_size
        Stored unchanged.
    n_samples_per_label
        Stored unchanged.
    use_gpu
        Stored unchanged.
    **kwargs
        Stored as ``self.data_loader_kwargs``.
    """
    super().__init__()
    self.adata_manager = adata_manager
    self.unlabeled_category = unlabeled_category
    self.train_size = float(train_size)
    self.validation_size = validation_size
    self.n_samples_per_label = n_samples_per_label
    self.data_loader_kwargs = kwargs
    self.use_gpu = use_gpu

    # Labels are read from the original obs column, not the encoded one, so
    # they can be compared directly against ``unlabeled_category``.
    original_key = adata_manager.get_state_registry(
        REGISTRY_KEYS.LABELS_KEY
    ).original_key
    labels = np.asarray(adata_manager.adata.obs[original_key]).ravel()

    self._unlabeled_indices = np.argwhere(labels == unlabeled_category).ravel()
    self._labeled_indices = np.argwhere(labels != unlabeled_category).ravel()
def cite_seq_raw_counts_properties(
    adata_manager: AnnDataManager,
    idx1: Union[List[int], np.ndarray],
    idx2: Union[List[int], np.ndarray],
) -> Dict[str, np.ndarray]:
    """
    Computes and returns some statistics on the raw counts of two sub-populations.

    Gene statistics come from ``scrna_raw_counts_properties``; protein
    statistics are computed here and appended after the gene entries.

    Parameters
    ----------
    adata_manager
        :class:`~scvi.data.anndata.AnnDataManager` object setup with :class:`~scvi.model.TOTALVI`.
    idx1
        subset of indices describing the first population.
    idx2
        subset of indices describing the second population.

    Returns
    -------
    type
        Dict of ``np.ndarray`` containing, by pair (one for each sub-population),
        mean expression per gene, proportion of non-zero expression per gene, mean of normalized expression.
        Normalized means are NaN for the protein entries.
    """
    gp = scrna_raw_counts_properties(adata_manager, idx1, idx2)

    # Single registry lookup (the original fetched this twice).
    protein_exp = adata_manager.get_from_registry(REGISTRY_KEYS.PROTEIN_EXP_KEY)

    # No normalized mean is computed for proteins; pad those slots with NaN.
    nan = np.full(adata_manager.summary_stats.n_proteins, np.nan)

    mean1_pro = np.asarray(protein_exp[idx1].mean(0))
    mean2_pro = np.asarray(protein_exp[idx2].mean(0))
    nonz1_pro = np.asarray((protein_exp[idx1] > 0).mean(0))
    nonz2_pro = np.asarray((protein_exp[idx2] > 0).mean(0))

    properties = dict(
        raw_mean1=np.concatenate([gp["raw_mean1"], mean1_pro]),
        raw_mean2=np.concatenate([gp["raw_mean2"], mean2_pro]),
        non_zeros_proportion1=np.concatenate(
            [gp["non_zeros_proportion1"], nonz1_pro]
        ),
        non_zeros_proportion2=np.concatenate(
            [gp["non_zeros_proportion2"], nonz2_pro]
        ),
        raw_normalized_mean1=np.concatenate([gp["raw_normalized_mean1"], nan]),
        raw_normalized_mean2=np.concatenate([gp["raw_normalized_mean2"], nan]),
    )
    return properties
def setup_anndata(
    cls,
    adata: AnnData,
    **kwargs,
) -> Optional[AnnData]:
    """
    Register ``adata`` with X, labels and per-cell index fields.

    Side effect: writes an int64 ``0..n_obs-1`` column to ``adata.obs["_indices"]``.

    Parameters
    ----------
    adata
        AnnData object to annotate and register.
    **kwargs
        Forwarded to ``AnnDataManager.register_fields``.
    """
    # Snapshot taken before any other local is defined, so ``locals()`` holds
    # only the call arguments at this point.
    setup_method_args = cls._get_setup_method_args(**locals())

    # Add an index for each cell (provided to pyro plate for correct minibatching).
    indices_obs_key = "_indices"
    adata.obs[indices_obs_key] = np.arange(adata.n_obs).astype("int64")

    manager = AnnDataManager(
        fields=[
            LayerField(REGISTRY_KEYS.X_KEY, None, is_count_data=True),
            CategoricalObsField(REGISTRY_KEYS.LABELS_KEY, None),
            NumericalObsField(REGISTRY_KEYS.INDICES_KEY, indices_obs_key),
        ],
        setup_method_args=setup_method_args,
    )
    manager.register_fields(adata, **kwargs)
    cls.register_manager(manager)
def _init_library_size(
    adata_manager: AnnDataManager, n_batch: int
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Computes and returns library size.

    Parameters
    ----------
    adata_manager
        :class:`~scvi.data.anndata.AnnDataManager` object setup with :class:`~scvi.model.SCVI`.
    n_batch
        Number of batches.

    Returns
    -------
    type
        Tuple of two 1 x n_batch ``np.ndarray`` containing the means and variances
        of library size in each batch in adata.

        If a certain batch is not present in the adata, the mean defaults to 0,
        and the variance defaults to 1. These defaults are arbitrary placeholders which
        should not be used in any downstream computation.
    """
    data = adata_manager.get_from_registry(REGISTRY_KEYS.X_KEY)
    batch_indices = adata_manager.get_from_registry(REGISTRY_KEYS.BATCH_KEY)

    # Placeholders for batches that never appear in ``batch_indices``.
    library_log_means = np.zeros(n_batch)
    library_log_vars = np.ones(n_batch)

    for i_batch in np.unique(batch_indices):
        idx_batch = np.squeeze(batch_indices == i_batch)
        # h5ad requires integer indexing arrays.
        batch_data = data[idx_batch.nonzero()[0]]
        sum_counts = batch_data.sum(axis=1)

        # Masked log: zero-count cells are masked rather than producing -inf.
        masked_log_sum = np.ma.log(sum_counts)
        if np.ma.is_masked(masked_log_sum):
            warnings.warn(
                "This dataset has some empty cells, this might fail inference. "
                "Data should be filtered with `scanpy.pp.filter_cells()`"
            )

        # Masked (empty-cell) entries contribute a log count of 0.
        log_counts = masked_log_sum.filled(0)
        library_log_means[i_batch] = np.mean(log_counts).astype(np.float32)
        library_log_vars[i_batch] = np.var(log_counts).astype(np.float32)

    return library_log_means.reshape(1, -1), library_log_vars.reshape(1, -1)
def create_doublets(
    cls,
    adata_manager: AnnDataManager,
    doublet_ratio: int,
    indices: Optional[Sequence[int]] = None,
    seed: int = 1,
) -> AnnData:
    """Simulate doublets.

    Each simulated doublet is the sum of the counts of two randomly drawn
    parent cells (drawn with replacement).

    Parameters
    ----------
    adata_manager
        Manager of an AnnData object setup with setup_anndata.
    doublet_ratio
        Ratio of generated doublets to produce relative to number of
        cells in adata or length of indices, if not `None`.
    indices
        Indices of cells in adata to use. If `None`, all cells are used.
    seed
        Seed for reproducibility

    Returns
    -------
    type
        AnnData holding the simulated doublets, with ``var_names`` copied
        from the source adata and ``obs_names`` of the form ``sim_doublet_<i>``.
        If the counts were registered from a layer, the same layer is
        populated on the returned object.
    """
    adata = adata_manager.adata
    n_obs = adata.n_obs if indices is None else len(indices)
    num_doublets = doublet_ratio * n_obs

    # counts can be in many locations, this uses where it was registered in setup
    x = adata_manager.get_from_registry(REGISTRY_KEYS.X_KEY)
    if indices is not None:
        x = x[indices]

    # Draw two parent positions (relative to the possibly subsetted ``x``)
    # per doublet.
    random_state = np.random.RandomState(seed=seed)
    parent_inds = random_state.choice(n_obs, size=(num_doublets, 2))
    doublets = x[parent_inds[:, 0]] + x[parent_inds[:, 1]]

    doublets_ad = AnnData(doublets)
    doublets_ad.var_names = adata.var_names
    doublets_ad.obs_names = [f"sim_doublet_{i}" for i in range(num_doublets)]

    # if adata setup with a layer, need to add layer to doublets adata
    layer = adata_manager.data_registry[REGISTRY_KEYS.X_KEY].attr_key
    if layer is not None:
        doublets_ad.layers[layer] = doublets

    return doublets_ad
def _get_batch_code_from_category(
    adata_manager: AnnDataManager, category: Sequence[Union[Number, str]]
):
    """
    Translate batch categories into their positions in the batch categorical mapping.

    Parameters
    ----------
    adata_manager
        Manager whose batch state registry provides ``categorical_mapping``.
    category
        One category or an iterable of categories. A string or any
        non-iterable value is treated as a single category. ``None`` entries
        are passed through unchanged.

    Returns
    -------
    type
        List with, for each requested category, the index of its first match
        in the mapping (or ``None`` for ``None`` entries).

    Raises
    ------
    ValueError
        If a category is not found in the mapping.
    """
    # Strings are iterable but must be handled as one category, not per character.
    if isinstance(category, str) or not isinstance(category, IterableClass):
        category = [category]

    mapping = adata_manager.get_state_registry(
        REGISTRY_KEYS.BATCH_KEY
    ).categorical_mapping

    codes = []
    for cat in category:
        if cat is None:
            codes.append(None)
            continue
        if cat not in mapping:
            raise ValueError('"{}" not a valid batch category.'.format(cat))
        codes.append(np.where(mapping == cat)[0][0])
    return codes
def __init__(
    self,
    adata_manager: AnnDataManager,
    unlabeled_category: str,
    n_samples_per_label: Optional[int] = None,
    indices: Optional[List[int]] = None,
    shuffle: bool = False,
    batch_size: int = 128,
    data_and_attributes: Optional[dict] = None,
    drop_last: Union[bool, int] = False,
    **data_loader_kwargs,
):
    """
    Set up a loader over all selected cells plus a subsample of the labeled ones.

    Parameters
    ----------
    adata_manager
        Manager providing the AnnData object and the labels state registry.
    unlabeled_category
        Label value marking a cell as unlabeled; it gets no entry in
        ``self.labeled_locs``.
    n_samples_per_label
        Stored on the instance before ``self.subsample_labels()`` is called.
    indices
        Cell indices to load from. If `None`, all cells are used. Lists are
        converted to ``np.ndarray``.
    shuffle
        Forwarded to the parent constructor.
    batch_size
        Forwarded to the parent constructor.
    data_and_attributes
        Forwarded to the parent constructor.
    drop_last
        Forwarded to the parent constructor.
    **data_loader_kwargs
        Forwarded to the parent constructor.
    """
    adata = adata_manager.adata
    if indices is None:
        indices = np.arange(adata.n_obs)
    # Coerce to ndarray: the per-label lookup below fancy-indexes ``indices``
    # with an integer array, which a plain list does not support.
    indices = np.asarray(indices)
    self.indices = indices

    # NOTE: with nothing to load, initialization stops here and the parent
    # constructor is never called (behavior kept from the original).
    if len(indices) == 0:
        return None

    self.n_samples_per_label = n_samples_per_label

    labels_obs_key = adata_manager.get_state_registry(
        REGISTRY_KEYS.LABELS_KEY
    ).original_key
    labels = np.asarray(adata.obs[labels_obs_key]).ravel()

    # Labels of the selected cells only; hoisted out of the per-label loop.
    selected_labels = labels[indices]

    # save a nested list of the indices per labeled category
    self.labeled_locs = [
        indices[np.where(selected_labels == label)[0]]
        for label in np.unique(labels)
        if label != unlabeled_category
    ]

    labelled_idx = self.subsample_labels()

    super().__init__(
        adata_manager=adata_manager,
        indices_list=[indices, labelled_idx],
        shuffle=shuffle,
        batch_size=batch_size,
        data_and_attributes=data_and_attributes,
        drop_last=drop_last,
        **data_loader_kwargs,
    )
def scatac_raw_counts_properties(
    adata_manager: AnnDataManager,
    idx1: Union[List[int], np.ndarray],
    idx2: Union[List[int], np.ndarray],
    var_idx: Optional[Union[List[int], np.ndarray]] = None,
) -> Dict[str, np.ndarray]:
    """
    Computes and returns some statistics on the raw counts of two sub-populations.

    For each variable, the statistic is the fraction of cells in the
    sub-population with a strictly positive count.

    Parameters
    ----------
    adata_manager
        :class:`~scvi.data.anndata.AnnDataManager` object setup with :class:`~scvi.model.SCVI`.
    idx1
        subset of indices describing the first population.
    idx2
        subset of indices describing the second population.
    var_idx
        subset of variables to extract properties from. if None, all variables are used.

    Returns
    -------
    type
        Dict of ``np.ndarray`` with keys ``emp_mean1``, ``emp_mean2`` and
        ``emp_effect`` (the difference ``emp_mean1 - emp_mean2``).
    """
    counts = adata_manager.get_from_registry(REGISTRY_KEYS.X_KEY)

    def _positive_fraction(cell_idx):
        # Rows first, then the optional column subset, then the per-variable
        # share of entries that are > 0.
        subset = counts[cell_idx]
        if var_idx is not None:
            subset = subset[:, var_idx]
        return np.asarray((subset > 0).mean(axis=0)).ravel()

    mean1 = _positive_fraction(idx1)
    mean2 = _positive_fraction(idx2)
    return dict(emp_mean1=mean1, emp_mean2=mean2, emp_effect=(mean1 - mean2))
def scrna_raw_counts_properties(
    adata_manager: AnnDataManager,
    idx1: Union[List[int], np.ndarray],
    idx2: Union[List[int], np.ndarray],
    var_idx: Optional[Union[List[int], np.ndarray]] = None,
) -> Dict[str, np.ndarray]:
    """
    Computes and returns some statistics on the raw counts of two sub-populations.

    Side effect: the per-cell scaling factor is cached in
    ``adata.obs["_scvi_raw_norm_scaling"]`` on first use and read back from
    there on later calls.

    Parameters
    ----------
    adata_manager
        :class:`~scvi.data.anndata.AnnDataManager` object setup with :class:`~scvi.model.SCVI`.
    idx1
        subset of indices describing the first population.
    idx2
        subset of indices describing the second population.
    var_idx
        subset of variables to extract properties from. if None, all variables are used.

    Returns
    -------
    type
        Dict of ``np.ndarray`` containing, by pair (one for each sub-population),
        mean expression per gene, proportion of non-zero expression per gene, mean of normalized expression.
    """
    adata = adata_manager.adata
    data = adata_manager.get_from_registry(REGISTRY_KEYS.X_KEY)

    data1, data2 = data[idx1], data[idx2]
    if var_idx is not None:
        data1, data2 = data1[:, var_idx], data2[:, var_idx]

    def _column_mean(matrix):
        # Per-variable mean, flattened to 1-D.
        return np.asarray(matrix.mean(axis=0)).ravel()

    mean1 = _column_mean(data1)
    mean2 = _column_mean(data2)
    nonz1 = _column_mean(data1 != 0)
    nonz2 = _column_mean(data2 != 0)

    key = "_scvi_raw_norm_scaling"
    if key in adata.obs.keys():
        scaling_factor = adata.obs[key].to_numpy().ravel().reshape(-1, 1)
    else:
        # 1e4 / (total counts per cell), computed over ALL variables and kept
        # as a column vector so it broadcasts across variables.
        scaling_factor = 1 / np.asarray(data.sum(axis=1)).ravel().reshape(-1, 1)
        scaling_factor *= 1e4
        adata.obs[key] = scaling_factor.ravel()

    # Sparse matrices need ``.multiply`` for element-wise scaling.
    if isinstance(data, sp_sparse.spmatrix):
        norm_data1 = data1.multiply(scaling_factor[idx1])
        norm_data2 = data2.multiply(scaling_factor[idx2])
    else:
        norm_data1 = data1 * scaling_factor[idx1]
        norm_data2 = data2 * scaling_factor[idx2]

    return dict(
        raw_mean1=mean1,
        raw_mean2=mean2,
        non_zeros_proportion1=nonz1,
        non_zeros_proportion2=nonz2,
        raw_normalized_mean1=_column_mean(norm_data1),
        raw_normalized_mean2=_column_mean(norm_data2),
    )
def setup_anndata(
    cls,
    adata: AnnData,
    protein_expression_obsm_key: str,
    protein_names_uns_key: Optional[str] = None,
    batch_key: Optional[str] = None,
    layer: Optional[str] = None,
    size_factor_key: Optional[str] = None,
    categorical_covariate_keys: Optional[List[str]] = None,
    continuous_covariate_keys: Optional[List[str]] = None,
    **kwargs,
) -> Optional[AnnData]:
    """
    %(summary)s.

    Parameters
    ----------
    %(param_adata)s
    protein_expression_obsm_key
        key in `adata.obsm` for protein expression data.
    protein_names_uns_key
        key in `adata.uns` for protein names. If None, will use the column names of `adata.obsm[protein_expression_obsm_key]`
        if it is a DataFrame, else will assign sequential names to proteins.
    %(param_batch_key)s
    %(param_layer)s
    %(param_size_factor_key)s
    %(param_cat_cov_keys)s
    %(param_cont_cov_keys)s
    %(param_copy)s

    Returns
    -------
    %(returns)s
    """
    # Snapshot taken before any other local is defined, so ``locals()`` holds
    # only the call arguments at this point.
    setup_method_args = cls._get_setup_method_args(**locals())

    # The batch field is kept in a variable because the protein field needs
    # its ``attr_key`` for the batch mask.
    batch_field = CategoricalObsField(REGISTRY_KEYS.BATCH_KEY, batch_key)

    count_field = LayerField(REGISTRY_KEYS.X_KEY, layer, is_count_data=True)
    # Default labels field for compatibility with TOTALVAE
    labels_field = CategoricalObsField(REGISTRY_KEYS.LABELS_KEY, None)
    size_factor_field = NumericalObsField(
        REGISTRY_KEYS.SIZE_FACTOR_KEY, size_factor_key, required=False
    )
    cat_covs_field = CategoricalJointObsField(
        REGISTRY_KEYS.CAT_COVS_KEY, categorical_covariate_keys
    )
    cont_covs_field = NumericalJointObsField(
        REGISTRY_KEYS.CONT_COVS_KEY, continuous_covariate_keys
    )
    protein_field = ProteinObsmField(
        REGISTRY_KEYS.PROTEIN_EXP_KEY,
        protein_expression_obsm_key,
        use_batch_mask=True,
        batch_key=batch_field.attr_key,
        colnames_uns_key=protein_names_uns_key,
        is_count_data=True,
    )

    manager = AnnDataManager(
        fields=[
            count_field,
            labels_field,
            batch_field,
            size_factor_field,
            cat_covs_field,
            cont_covs_field,
            protein_field,
        ],
        setup_method_args=setup_method_args,
    )
    manager.register_fields(adata, **kwargs)
    cls.register_manager(manager)