def test_dataframe_reserved_columns(tmp_path, diskfmt):
    reserved = ("_index", "__categories")
    adata_pth = tmp_path / f"adata.{diskfmt}"
    orig = ad.AnnData(X=np.ones((5, 5)))

    for colname in reserved:
        to_write = orig.copy()
        to_write.obs[colname] = np.ones(5)
        with pytest.raises(ValueError) as e:
            getattr(to_write, f"write_{diskfmt}")(adata_pth)
        assert colname in str(e.value)

    for colname in reserved:
        to_write = orig.copy()
        to_write.varm["df"] = pd.DataFrame({colname: list("aabcd")},
                                           index=to_write.var_names)
        with pytest.raises(ValueError) as e:
            getattr(to_write, f"write_{diskfmt}")(adata_pth)
        assert colname in str(e.value)

def __prep_predict_data(self, test_data):
    missing = set(self.genes).difference(test_data.var_names)
    if len(missing) > 0:
        # Pad genes absent from the test data with zero counts, then reorder
        # the columns to match the training gene order. `missing` is a set,
        # which pandas no longer accepts as an Index, so sort it first.
        data = pd.concat([
            pd.DataFrame(test_data.X, index=test_data.obs_names,
                         columns=test_data.var_names),
            pd.DataFrame(0, index=test_data.obs_names, columns=sorted(missing)),
        ], axis=1)
        data = data[list(self.genes)]
        data_sc = anndata.AnnData(X=data.to_numpy())
        data_sc.var_names = data.columns
        data_sc.obs_names = data.index
        return data_sc
    else:
        return test_data

def main():
    options = get_options()
    if not options.inverse_transform:
        loader = np.load(options.sparse)
        # The .npz may store the matrix in CSR or CSC layout; try CSR first
        # and fall back to CSC if the buffers don't match the stored shape.
        try:
            count_matrix = sp.csr_matrix(
                (loader['data'], loader['indices'], loader['indptr']),
                shape=loader['shape'])
        except ValueError:
            count_matrix = sp.csc_matrix(
                (loader['data'], loader['indices'], loader['indptr']),
                shape=loader['shape'])
        bc_list = loader['bc_list']
        if count_matrix.shape[1] == len(bc_list):
            # we probably have data in regions by cells; we need it the other way
            count_matrix = sp.csr_matrix(count_matrix.T)
        if options.keep_all:
            mask = np.ones(count_matrix.shape[1], dtype=bool)
        else:
            mask = np.array(np.sum(count_matrix, axis=0) > 0).ravel()
        if options.peaks_file:
            regions = []
            for line in open(options.peaks_file):
                t = line.split()
                chrom, start, end = t[:3]
                r_id = "%s:%s-%s" % (chrom, start, end)
                regions.append(r_id)
            regions = np.array(regions)
        else:
            regions = np.arange(count_matrix.shape[1])
        regions = regions[mask]
        count_matrix = count_matrix[:, mask]
        n_cells = pd.DataFrame(
            np.array(np.sum(count_matrix > 0, axis=0)).ravel(),
            index=regions, columns=['n_cells'])
        n_regions = pd.DataFrame(
            np.array(np.sum(count_matrix > 0, axis=1)).ravel(),
            index=bc_list, columns=['n_regions'])
        adata = anndata.AnnData(count_matrix, obs=n_regions, var=n_cells)
        adata.write(options.anndata)
    else:
        adata = anndata.read(options.anndata)
        count_matrix = adata.X
        bc_list = np.array(adata.obs.index)
        np.savez(options.sparse, data=count_matrix.data,
                 indices=count_matrix.indices, indptr=count_matrix.indptr,
                 shape=count_matrix.shape, bc_list=bc_list)

def read_expression_from_archive(archive: ZipFile) -> anndata.AnnData:
    info = archive.infolist()
    assert len(info) == 3
    mtx_data_info = next(i for i in info if i.filename.endswith(".mtx"))
    mtx_rows_info = next(i for i in info if i.filename.endswith(".mtx_rows"))
    mtx_cols_info = next(i for i in info if i.filename.endswith(".mtx_cols"))
    with archive.open(mtx_data_info, "r") as f:
        expr = read_mtx_from_stream(f)
    with archive.open(mtx_rows_info, "r") as f:
        # TODO: check what other values this file's columns could hold
        varname = pd.read_csv(f, sep="\t", header=None)[1]
    with archive.open(mtx_cols_info, "r") as f:
        obsname = pd.read_csv(f, sep="\t", header=None).iloc[:, 0]
    adata = anndata.AnnData(expr)
    adata.var_names = varname
    adata.obs_names = obsname
    return adata

def test_simple():
    tree_data, tree_clusters = phate.tree.gen_dla(n_branch=3)
    phate_operator = phate.PHATE(k=15, t=100)
    tree_phate = phate_operator.fit_transform(tree_data)
    assert tree_phate.shape == (tree_data.shape[0], 2)
    clusters = phate.cluster.kmeans(phate_operator, k=3)
    assert np.issubdtype(clusters.dtype, int)
    assert len(clusters.shape) == 1
    assert len(clusters) == tree_data.shape[0]
    # PHATE should accept its own graph, a graphtools graph, a pygsp graph,
    # and an AnnData object as input.
    phate_operator.fit(phate_operator.graph)
    G = graphtools.Graph(phate_operator.graph.kernel,
                         precomputed='affinity', use_pygsp=True)
    phate_operator.fit(G)
    G = pygsp.graphs.Graph(G.W)
    phate_operator.fit(G)
    phate_operator.fit(anndata.AnnData(tree_data))

def _compute_neighbors(self):
    # nearest neighbors graph
    adata = anndata.AnnData(
        X=None,
        obs=pd.DataFrame([], index=[f'obs{i}' for i in range(self.n_obs)]),
        var=pd.DataFrame([], index=[f'var{i}' for i in range(self.n_pcs)]))
    adata.obsm['X_pca'] = self.X  # here neighbors should only use the PCs
    self._neighbors = Neighbors(adata=adata)
    self._neighbors.compute_neighbors(
        n_neighbors=self.n_neighbors, knn=True, n_pcs=self.n_pcs,
        use_rep='X_pca', method='umap', metric=self.knn_metric,
        random_state=self.random_state)

def main(args):
    tmap_model = wot.tmap.TransportMapModel.from_directory(args.tmap)
    cell_sets_matrix = wot.io.read_sets(args.cell_set)
    cell_sets = wot.io.convert_binary_dataset_to_dict(cell_sets_matrix)
    populations = tmap_model.population_from_cell_sets(cell_sets, at_time=args.day)
    timepoints, census = tmap_model.ancestor_census(cell_sets_matrix, *populations)
    obs = pd.DataFrame(index=timepoints)
    for i in range(len(census)):
        res = anndata.AnnData(census[i], obs, cell_sets_matrix.var)
        wot.io.write_dataset(res, args.out + '_' + populations[i].name,
                             output_format='txt')

def get_cellbench():
    protocols = ['10x', 'CELseq2', 'Dropseq']
    adatas = []
    for protocol in protocols:
        counts = pd.read_csv('data/CellBench/{}_counts.csv'.format(protocol),
                             index_col=0).T
        # Drop duplicated gene columns, keeping the first occurrence.
        counts = counts.loc[:, ~counts.columns.duplicated()]
        meta = pd.read_csv('data/CellBench/{}_meta.csv'.format(protocol),
                           index_col=0)
        counts, meta = preprocessing.remove_doublets(counts, meta)
        counts, meta = preprocessing.clean_counts(
            counts, meta, FILTER_MIN_GENES, FILTER_MIN_READS, FILTER_MIN_DETECTED)
        adatas.append(anndata.AnnData(X=counts.values, obs=meta,
                                      var=pd.DataFrame(index=counts.columns)))
    # Concatenate on the intersection of genes, tagging each cell with its protocol.
    adata = anndata.AnnData.concatenate(
        *adatas, join='inner', batch_key='protocol', batch_categories=protocols)
    return adata

def test_modify_view_component(matrix_type, mapping_name):
    adata = ad.AnnData(
        np.zeros((10, 10)),
        **{mapping_name: {"m": matrix_type(asarray(sparse.random(10, 10)))}},
    )
    init_hash = joblib.hash(adata)

    subset = adata[:5, :][:, :5]
    assert subset.isview
    m = getattr(subset, mapping_name)["m"]
    # Writing to a view triggers copy-on-write: the subset becomes a real
    # AnnData while the parent object is left untouched.
    m[0, 0] = 100
    assert not subset.isview
    assert getattr(subset, mapping_name)["m"][0, 0] == 100

    assert init_hash == joblib.hash(adata)

def adata():
    return ad.AnnData(
        np.zeros((20, 10)),
        obs=pd.DataFrame(
            {"obs_key": list(ascii_letters[:20])},
            index=[f"cell{i}" for i in range(20)],
        ),
        var=pd.DataFrame(
            {"var_key": np.arange(10)}, index=[f"gene{i}" for i in range(10)]
        ),
        varm={"varm_key": np.zeros((10, 20))},
        obsm={"obsm_key": np.zeros((20, 20))},
        layers={"layers_key": np.zeros((20, 10))},
        obsp={"obsp_key": np.zeros((20, 20))},
        varp={"varp_key": np.zeros((10, 10))},
        uns={"uns_key": dict(zip("abc", range(3)))},
    )

def calculateExpressionRatio(adata, clusterby):
    """
    Compute, gene by gene, the fraction of cells in each cluster of adata
    that express the gene.

    adata:
        must contain a .raw attribute
    clusterby:
        a column name in adata.obs
    """
    transformAdataRawToAd = lambda adata: anndata.AnnData(
        X=adata.raw.X, obs=adata.obs, var=adata.raw.var)
    rawAd = transformAdataRawToAd(adata)
    expressionOrNotdf = (rawAd.to_df() > 0).astype(int)
    expressionOrNotdf[clusterby] = rawAd.obs[clusterby]
    expressionRatioDf = expressionOrNotdf.groupby(clusterby).agg(
        "sum") / expressionOrNotdf.groupby(clusterby).agg("count")
    return expressionRatioDf

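# A minimal usage sketch for calculateExpressionRatio above (toy data, all
# names hypothetical): build a small AnnData with a `.raw` slot and a
# cluster label, then compute the per-cluster expression ratios.
def example_calculateExpressionRatio():
    import anndata
    import numpy as np
    import pandas as pd

    adata = anndata.AnnData(
        X=np.random.poisson(1.0, size=(6, 4)).astype(np.float32),
        obs=pd.DataFrame({"cluster": ["a", "a", "a", "b", "b", "b"]},
                         index=[f"cell{i}" for i in range(6)]),
    )
    adata.raw = adata  # the function reads counts from `.raw`
    ratio_df = calculateExpressionRatio(adata, "cluster")
    print(ratio_df)  # rows: clusters, columns: genes, values in [0, 1]
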
def predict(self, adata, encoder_labels, decoder_labels, return_adata=True):
    """
    Predicts cells in the primary space for the given condition labels.

    # Parameters
        adata: `~anndata.AnnData`
            Annotated data matrix in the primary space.
        encoder_labels: numpy nd-array
            `numpy nd-array` of labels to be fed as the encoder's condition array.
        decoder_labels: numpy nd-array
            `numpy nd-array` of labels to be fed as the decoder's condition array.
        return_adata: boolean
            if `True`, the predictions are returned as an `anndata` object.

    # Returns
        output: `~anndata.AnnData` or numpy nd-array
            predicted cells in the primary space.

    # Example
    ```python
    import scanpy as sc
    import scgen
    train_data = sc.read("train_kang.h5ad")
    validation_data = sc.read("./data/validation.h5ad")
    network = scgen.CVAE(train_data=train_data, use_validation=True,
                         validation_data=validation_data,
                         model_path="./saved_models/",
                         conditions={"ctrl": "control", "stim": "stimulated"})
    network.train(n_epochs=20)
    prediction = network.predict(adata=train_data,
                                 encoder_labels=encoder_labels,
                                 decoder_labels=decoder_labels)
    ```
    """
    adata = remove_sparsity(adata)
    encoder_labels = to_categorical(encoder_labels, num_classes=self.n_conditions)
    decoder_labels = to_categorical(decoder_labels, num_classes=self.n_conditions)
    reconstructed = self.cvae_model.predict(
        [adata.X, encoder_labels, decoder_labels])[0]
    reconstructed = np.nan_to_num(reconstructed)
    if return_adata:
        output = anndata.AnnData(X=reconstructed)
        output.obs = adata.obs.copy(deep=True)
        output.var_names = adata.var_names
    else:
        output = reconstructed
    return output

def ttest(filename: str):
    import anndata
    import diffxpy.api as de

    df = pandas.read_csv(filename, header=0, index_col=0).transpose()
    groupings = find_groupings(df, default_group='arpc')
    LOGGER.info(
        "group, filename=%s, groups=%d, groupings=%s",
        filename, len(groupings),
        [(x[0], y[0]) for x, y in groupings],
    )
    for grouping in groupings:
        tag_1, indices_1 = grouping[0]
        tag_2, indices_2 = grouping[1]
        indices_1 = set(indices_1)
        indices_2 = set(indices_2)
        # Keep only the samples that belong to one of the two groups.
        remained_df = df.drop(index=[
            x for x in df.index
            if (x not in indices_1 and x not in indices_2)
        ])
        data = anndata.AnnData(remained_df)
        new_grouping = [
            f"1-{tag_1}" if x in indices_1 else f"2-{tag_2}"
            for x in data.obs.index.tolist()
        ]
        test = de.test.t_test(data, new_grouping)
        summary_output_file = get_output_filename(
            filename, f".{tag_1}_{tag_2}.out.csv")
        test.summary().to_csv(summary_output_file)
        LOGGER.info("summary saved, output=%s", summary_output_file)
        volcano_output_file = get_output_filename(
            filename, f".{tag_1}_{tag_2}.volcano.jpg")
        test.plot_volcano(
            corrected_pval=True,
            alpha=0.05,
            size=20,
            show=False,
            save=volcano_output_file,
        )
        LOGGER.info("volcano saved, output=%s", volcano_output_file)

def clusterGini(adataSC, **kwargs):
    """
    Cluster cells based on Gini index genes.

    Params
    ------
    adataSC: AnnData
        The annotated data matrix of shape `n_obs` x `n_vars`.
        Rows correspond to cells and columns to genes.
    neighbors: int, optional (Default: 5)
        The size of the local neighborhood used for manifold approximation.
        Larger values give a more global view of the manifold, smaller
        values preserve more local structure. For rare cell identification
        this value should be in the range 2 to 15; 5 is recommended.
    resolution: float, optional (Default: 0.1)
        A parameter controlling the coarseness of the clustering. Higher
        values lead to more clusters.
    method: string, optional (Default: 'leiden')
        Either 'louvain' or 'leiden'.

    Returns
    -------
    AnnData of the scaled Gini genes with the cluster labels; the labels
    are also stored in `adataSC.obs['rare']`.
    """
    cluster_neighbors = kwargs.get('neighbors', 5)
    cluster_resolution = kwargs.get('resolution', 0.1)
    cluster_method = kwargs.get('method', "leiden")
    if cluster_method not in ("louvain", "leiden"):
        raise SystemExit(
            "Only leiden or louvain cluster method is allowed in this step.")
    adataGini = adataSC[:, adataSC.var['gini']]
    scaleMatrix = arctanTransform(adataGini.X)
    adataScaleGini = anndata.AnnData(X=scaleMatrix)
    # Calculate the neighbor graph and cluster it.
    sc.pp.neighbors(adataScaleGini, use_rep='X', n_neighbors=cluster_neighbors)
    if cluster_method == "louvain":
        sc.tl.louvain(adataScaleGini, resolution=cluster_resolution)
        giniClust = adataScaleGini.obs['louvain'].values.tolist()
    else:
        sc.tl.leiden(adataScaleGini, resolution=cluster_resolution)
        giniClust = adataScaleGini.obs['leiden'].values.tolist()
    adataSC.obs['rare'] = giniClust
    return adataScaleGini

def to_anndata(
        self,
        cell_properties: Union[bool, Sequence[str]] = False,
        cell_channel_properties: Union[bool, Sequence[str]] = False,
        x_cell_channel_property: Optional[str] = None
) -> 'anndata.AnnData':
    """Returns an :class:`anndata.AnnData` representation of the current instance

    :param cell_properties: list of cell properties (e.g. regionprops) to
        include; set to ``True`` to include all
    :param cell_channel_properties: list of cell channel properties (e.g.
        intensity values) to include; set to ``True`` to include all
    :param x_cell_channel_property: cell channel property to use for the main
        AnnData data matrix (X)
    :return: AnnData object, in which cell channel properties (e.g. intensity
        values) are stored as layers and cell properties (e.g. regionprops)
        are stored as observations
    """
    if anndata is None:
        raise RuntimeError('anndata is not installed')
    obs_data = None
    if cell_properties:
        cell_property_dataset = self.to_dataset(cell_properties=cell_properties)
        obs_data = utils.to_table(
            xr.concat(cell_property_dataset.data_vars.values(), 'property'))
    layers = {}
    if cell_channel_properties:
        cell_channel_property_dataset = self.to_dataset(
            cell_channel_properties=cell_channel_properties)
        layers = {
            property_name: da.values
            for property_name, da in cell_channel_property_dataset.data_vars.items()
        }
    return anndata.AnnData(
        X=getattr(self, x_cell_channel_property).values
        if x_cell_channel_property is not None else None,
        obs=pd.DataFrame(
            index=pd.Index(data=self.cell_ids.astype(str), name='cell'),
            data=obs_data),
        var=pd.DataFrame(
            index=pd.Index(data=self.channel_names, name='channel')),
        layers=layers or None,
        shape=(self.num_cells, self.num_channels)
        if x_cell_channel_property is None else None,
    )

def test_readwrite_loom(typ, tmp_path):
    X = typ(X_list)
    adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict)
    adata_src.obsm['X_a'] = np.zeros((adata_src.n_obs, 2))
    adata_src.varm['X_b'] = np.zeros((adata_src.n_vars, 3))
    adata_src.write_loom(tmp_path / 'test.loom', write_obsm_varm=True)

    adata = ad.read_loom(tmp_path / 'test.loom',
                         sparse=typ is csr_matrix, cleanup=True)
    if isinstance(X, np.ndarray):
        assert np.allclose(adata.X, X)
    else:
        # TODO: this should not be necessary
        assert np.allclose(adata.X.toarray(), X.toarray())
    assert 'X_a' in adata.obsm_keys() and adata.obsm['X_a'].shape[1] == 2
    assert 'X_b' in adata.varm_keys() and adata.varm['X_b'].shape[1] == 3
    # as we called with `cleanup=True`
    assert 'oanno1b' in adata.uns['loom-obs']
    assert 'vanno2' in adata.uns['loom-var']

def predict(self, adata, target_label, condition_key, cell_type_key,
            cell_type_to_predict, source_condition, target_condition):
    adata = remove_sparsity(adata)
    cell_type_adata = adata[adata.obs[cell_type_key] == cell_type_to_predict]
    source_adata = cell_type_adata[
        cell_type_adata.obs[condition_key] == source_condition]
    # Feed the source-condition cells through the model with the target
    # condition label to predict their target-condition expression.
    y_test = np.zeros(source_adata.shape[0]) + target_label
    real_loader = Loader(source_adata.X, labels=y_test, shuffle=False)
    pred = self.model_backend.get_reconstruction(real_loader)
    pred = np.nan_to_num(pred[0])
    pred_adata = anndata.AnnData(X=pred)
    pred_adata.obs[condition_key] = f"{cell_type_to_predict}_pred_{target_condition}"
    pred_adata.var_names = adata.var_names
    return pred_adata

def read_grp(path, feature_ids=None):
    # Read a GRP gene-set file: one element per line; blank lines and lines
    # starting with '#' or '>' are skipped. Elements are lower-cased.
    elements = []
    with open(path) as fp:
        for line in fp:
            line = line.strip()
            if line == '' or line[0] == '#' or line[0] == '>':
                continue
            elements.append(line.lower())
    if feature_ids is None:
        feature_ids = list(sorted(elements))
    # Binary membership vector over feature_ids.
    x = np.zeros((len(feature_ids), 1), dtype=np.int8)
    for i in range(len(feature_ids)):
        if feature_ids[i] in elements:
            x[i, 0] = 1
    set_name, _ = get_filename_and_extension(os.path.basename(path))
    obs = pd.DataFrame(index=feature_ids)
    var = pd.DataFrame(index=[set_name])
    return anndata.AnnData(X=x, obs=obs, var=var)

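# A minimal usage sketch for read_grp above (hypothetical file and ids):
# write a tiny .grp gene-set file and read it back. Since elements are
# lower-cased on read, feature_ids should be lowercase to match.
def example_read_grp(tmp_path):
    grp_path = os.path.join(tmp_path, "example.grp")
    with open(grp_path, "w") as fp:
        fp.write("# demo gene set\nGeneA\nGeneB\n")
    gene_set = read_grp(grp_path, feature_ids=["genea", "geneb", "genec"])
    print(gene_set.X.ravel())  # [1 1 0]
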
def louvain_clustering(self, res=1):
    if not self.analysis_performed:
        print("Please run the SAM analysis first using 'run' after "
              "loading the data.")
    else:
        import anndata
        import scanpy as sc  # scanpy.api was removed in recent scanpy releases

        adata = anndata.AnnData(self.D, var={'genes': self.gene_names},
                                obs={'cells': self.cell_names})
        adata.obsm['X_pca'] = self.wPCA_data
        # Build the neighbor graph on the weighted PCA representation.
        sc.pp.neighbors(adata, n_neighbors=self.k, metric='correlation',
                        method='umap')
        sc.tl.louvain(adata, resolution=res)
        self.cluster_labels = adata.obs['louvain'].values.astype('int')
        self.output_vars['louvain_cluster_labels'] = self.cluster_labels

def my_read_hdf(path):
    # Read a 10x Genomics HDF5 file (CellRanger v2 layout, GRCh38 genome).
    # Materialize the datasets before the file is closed; the deprecated
    # h5py `.value` accessor is replaced with slicing.
    with h5py.File(path, 'r') as f:
        group_GRCh38 = f['/GRCh38']
        cell_names = group_GRCh38['barcodes'][:]
        mat_shape = group_GRCh38['shape'][:]
        indptr = np.array(group_GRCh38['indptr'])
        indices = np.array(group_GRCh38['indices'])
        data = group_GRCh38['data'][:]
        genes = group_GRCh38['genes'][:]
    # The file stores genes x cells in CSC layout; reading the same buffers
    # as CSR with the transposed shape yields cells x genes.
    X = csr_matrix((data, indices, indptr),
                   shape=(mat_shape[1], mat_shape[0])).toarray()
    adata = anndata.AnnData(X,
                            pd.DataFrame(index=cell_names),
                            pd.DataFrame(index=genes),
                            dtype=X.dtype.name)
    return adata

def cluster_features(features: pd.DataFrame, like=None):
    """Calculate leiden clustering of features.

    Specify a filter of features using `like`.
    """
    # filter features
    if like is not None:
        features = features.filter(like=like)
    # create temporary adata to calculate the clustering
    adata = ad.AnnData(features)
    # important - feature values are not scaled, so need to scale them before PCA
    sc.pp.scale(adata)
    # calculate leiden clustering
    sc.pp.pca(adata, n_comps=min(10, features.shape[1] - 1))
    sc.pp.neighbors(adata)
    sc.tl.leiden(adata)
    return adata.obs["leiden"]

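# A minimal usage sketch for cluster_features above (hypothetical feature
# table): cluster only the columns whose names contain "texture". Requires
# scanpy with a leiden backend installed.
def example_cluster_features():
    import numpy as np
    import pandas as pd

    features = pd.DataFrame(
        np.random.rand(50, 12),
        columns=[f"texture_{i}" for i in range(6)]
                + [f"shape_{i}" for i in range(6)],
    )
    labels = cluster_features(features, like="texture")
    print(labels.value_counts())
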
def test_double_index():
    X = np.array(X_list)
    adata = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict, dtype='int32')
    adata.filename = './test.h5ad'
    from pytest import raises
    with raises(ValueError):
        # no view of view of backed object currently
        adata[:2][:, 0]
    # close backing file
    adata.write()

def merge_datasets(*args):
    datasets = list(args)
    merged_x = np.concatenate([d.X for d in datasets])
    row_columns = set(datasets[0].obs.columns)
    if not all(set(d.obs.columns) == row_columns for d in datasets):
        raise ValueError(
            "Unable to merge: incompatible metadata between datasets")
    merged_row_meta = pd.concat([d.obs for d in datasets], sort=True)
    if merged_row_meta.index.duplicated().any():
        raise ValueError(
            "Unable to merge: duplicate rows between datasets, cannot lose information"
        )
    col_index = datasets[0].var.index
    if not all(d.var.index.equals(col_index) for d in datasets):
        raise ValueError(
            "Unable to merge: incompatible genes between datasets")
    merged_col_meta = datasets[0].var
    return anndata.AnnData(merged_x, merged_row_meta, merged_col_meta)

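# A minimal usage sketch for merge_datasets above (toy data): merge two
# AnnData objects that share the same genes but have distinct cell names.
def example_merge_datasets():
    import anndata
    import numpy as np
    import pandas as pd

    a = anndata.AnnData(np.ones((2, 3), dtype=np.float32),
                        obs=pd.DataFrame(index=["c1", "c2"]))
    b = anndata.AnnData(np.zeros((2, 3), dtype=np.float32),
                        obs=pd.DataFrame(index=["c3", "c4"]))
    merged = merge_datasets(a, b)
    print(merged.shape)  # (4, 3)
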
def test_readwrite_roundtrip(typ, tmp_path, diskfmt, diskfmt2):
    tmpdir = Path(tmp_path)
    pth1 = tmpdir / f"first.{diskfmt}"
    write1 = lambda x: getattr(x, f"write_{diskfmt}")(pth1)
    read1 = lambda: getattr(ad, f"read_{diskfmt}")(pth1)
    pth2 = tmpdir / f"second.{diskfmt2}"
    write2 = lambda x: getattr(x, f"write_{diskfmt2}")(pth2)
    read2 = lambda: getattr(ad, f"read_{diskfmt2}")(pth2)

    adata1 = ad.AnnData(typ(X_list), obs=obs_dict, var=var_dict, uns=uns_dict)
    write1(adata1)
    adata2 = read1()
    write2(adata2)
    adata3 = read2()

    assert_equal(adata2, adata1)
    assert_equal(adata3, adata1)

def creatAnndataFromDf(df, **layerInfoDt):
    """
    Convert a dataframe into an AnnData object.

    df:
        the main expression matrix; rows are barcodes, columns are features
    layerInfoDt:
        keys are layer names, values are matrices; each matrix must have the
        same dimensions as `df`, with rows as barcodes and columns as features
    """
    transformedAd = anndata.AnnData(
        X=df.values,
        obs=pd.DataFrame(index=df.index),
        var=pd.DataFrame(index=df.columns),
    )
    for layerName, layerMtx in layerInfoDt.items():
        transformedAd.layers[layerName] = layerMtx

    return transformedAd

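# A minimal usage sketch for creatAnndataFromDf above (toy data): one
# expression matrix plus a same-shaped "spliced" layer (the layer name is
# arbitrary).
def example_creatAnndataFromDf():
    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        np.ones((3, 2), dtype=np.float32),
        index=["cell0", "cell1", "cell2"],
        columns=["geneA", "geneB"],
    )
    adata = creatAnndataFromDf(df, spliced=np.zeros((3, 2), dtype=np.float32))
    print(adata)  # AnnData with n_obs=3, n_vars=2 and layer 'spliced'
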
def trajectories(self, populations):
    """
    Computes a trajectory for each population.

    Parameters
    ----------
    self : wot.TransportMapModel
        The TransportMapModel used to find ancestors and descendants of the population
    populations : list of wot.Population
        The target populations, such as ones from self.population_from_cell_sets.
        The populations must be from the same time.

    Returns
    -------
    trajectories : anndata.AnnData
        Rows: all cells. Columns: population index. Entry (i, j) is the
        probability that cell i is an ancestor/descendant of population j.
    """
    wot.tmap.unique_timepoint(*populations)  # check for unique timepoint
    trajectories = []
    populations = Population.copy(*populations, normalize=True, add_missing=False)
    population_names = [p.name for p in populations]
    initial_populations = populations

    def update(head, populations_to_update):
        idx = 0 if head else len(trajectories)
        trajectories.insert(
            idx, np.array([pop.p for pop in populations_to_update]).T)

    update(True, populations)
    # Pull the populations back to earlier timepoints (ancestors) ...
    while self.can_pull_back(*populations):
        populations = self.pull_back(*populations, as_list=True)
        update(True, populations)
    # ... then push the initial populations forward (descendants).
    populations = initial_populations
    while self.can_push_forward(*populations):
        populations = self.push_forward(*populations, as_list=True)
        update(False, populations)

    return anndata.AnnData(X=np.concatenate(trajectories),
                           obs=self.meta.copy(),
                           var=pd.DataFrame(index=population_names))

def from_scanpy_dir(path, cell_type_identifier, covariate_key):
    """
    Creates a compositional analysis data set from all scanpy data sets in a directory.

    To use this function, all data sets need to have one common column in
    adata.obs that contains the cell type assignment. Also, the covariates
    need to be stored under the same key in adata.uns.

    Usage:
    data = from_scanpy_dir("./path/to/directory",
                           cell_type_identifier="Louvain",
                           covariate_key="covariates")

    Parameters
    ----------
    path -- str
        path to directory
    cell_type_identifier -- str
        column name in adata.obs that specifies the cell types
    covariate_key -- str
        key for adata.uns, where the covariate values are stored

    Returns
    -------
    data -- CompositionalData object
        A compositional analysis data set
    """
    count_data = pd.DataFrame()
    covariate_data = pd.DataFrame()
    filenames = os.listdir(path)
    for f in filenames:
        # Filenames from os.listdir are relative to `path`.
        adata = ad.read_h5ad(os.path.join(path, f))
        cell_counts, covs = from_scanpy(adata, cell_type_identifier, covariate_key)
        count_data = count_data.append(cell_counts, ignore_index=True)
        covariate_data = covariate_data.append(pd.Series(covs), ignore_index=True)

    # Replace NaNs (cell types absent from a sample) with zero counts
    count_data = count_data.fillna(0)
    covariate_data = covariate_data.fillna(0)

    return ad.AnnData(X=count_data.values,
                      var=count_data.sum(axis=0).rename("n_cells").to_frame(),
                      obs=covariate_data)

def to_mmd_layer(self, adata, encoder_labels, feed_fake=0, return_adata=True):
    """
    Map `adata` to the MMD layer of the trVAE network. This function
    computes the output activation of the MMD layer in trVAE.

    # Parameters
        adata: `~anndata.AnnData`
            Annotated data matrix to be mapped to the MMD latent space.
            `adata.X` has to be in shape [n_obs, n_vars].
        encoder_labels: numpy nd-array
            `numpy nd-array` of labels to be fed as the encoder's condition array.
        feed_fake: int
            if `feed_fake` is non-negative, every entry of `decoder_labels`
            is set to `feed_fake`; otherwise `decoder_labels` is identical
            to `encoder_labels`.
        return_adata: boolean
            if `True`, the output is an `anndata` object; otherwise the raw
            latent matrix is returned.

    # Returns
        output: `~anndata.AnnData`
            `anndata` object containing the MMD latent space encoding of `adata`
    """
    if feed_fake >= 0:
        decoder_labels = np.zeros(shape=encoder_labels.shape) + feed_fake
    else:
        decoder_labels = encoder_labels
    encoder_labels = to_categorical(encoder_labels, num_classes=self.n_conditions)
    decoder_labels = to_categorical(decoder_labels, num_classes=self.n_conditions)

    adata = remove_sparsity(adata)

    x = [adata.X, encoder_labels, decoder_labels]
    mmd_latent = self.cvae_model.predict(x)[1]
    mmd_latent = np.nan_to_num(mmd_latent)
    if return_adata:
        output = anndata.AnnData(X=mmd_latent)
        output.obs = adata.obs.copy(deep=True)
    else:
        output = mmd_latent

    return output

def convert_to_same_format(data, target_data, columns=None, prevent_sparse=False):
    """Convert data to the same format as the target data."""
    # create new data object
    if scprep.utils.is_sparse_dataframe(target_data):
        if prevent_sparse:
            data = pd.DataFrame(data)
        else:
            data = scprep.utils.SparseDataFrame(data)
        pandas = True
    elif isinstance(target_data, pd.DataFrame):
        data = pd.DataFrame(data)
        pandas = True
    elif is_anndata(target_data):
        data = anndata.AnnData(data)
        pandas = False
    else:
        # nothing to do
        return data
    # retrieve column names
    target_columns = target_data.columns if pandas else target_data.var
    # subset column names
    try:
        if columns is not None:
            if pandas:
                target_columns = target_columns[columns]
            else:
                target_columns = target_columns.iloc[columns]
    except (KeyError, IndexError, ValueError):
        # keep the original column names
        if pandas:
            target_columns = columns
        else:
            target_columns = pd.DataFrame(index=columns)
    # set column names on the new data object
    if pandas:
        data.columns = target_columns
        data.index = target_data.index
    else:
        data.var = target_columns
        data.obs = target_data.obs
    return data

def generate_normal_uncorrelated(N, D, K, n_total, noise_std_true=1):
    """
    Scenario 1: Normally distributed, independent covariates

    Parameters
    ----------
    N -- int
        Number of samples
    D -- int
        Number of covariates
    K -- int
        Number of cell types
    n_total -- list
        Number of individual cells per sample
    noise_std_true -- float
        noise level; 0 means no noise

    Returns
    -------
    data
        Anndata object
    """
    # Generate random composition parameters
    b_true = np.random.normal(0, 1, size=K).astype(np.float32)  # bias (alpha)
    w_true = np.random.normal(0, 1, size=(D, K)).astype(np.float32)  # weights (beta)

    # Generate random covariate matrix
    x = np.random.normal(0, 1, size=(N, D)).astype(np.float32)
    noise = noise_std_true * np.random.randn(N, 1).astype(np.float32)

    # Generate y
    y = np.zeros([N, K], dtype=np.float32)
    for i in range(N):
        # Concentration should sum to 1 for each sample
        concentration = softmax(x[i, :].T @ w_true + b_true + noise[i, :]).astype(np.float32)
        y[i, :] = np.random.multinomial(n_total[i], concentration).astype(np.float32)

    x_names = ["x_" + str(n) for n in range(x.shape[1])]
    x_df = pd.DataFrame(x, columns=x_names)

    data = ad.AnnData(X=y, obs=x_df, uns={"b_true": b_true, "w_true": w_true})
    return data

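# A minimal usage sketch for generate_normal_uncorrelated above
# (hypothetical sizes): simulate 20 samples with 2 covariates, 5 cell
# types, and 1000 cells per sample.
def example_generate_normal_uncorrelated():
    n_samples = 20
    sim = generate_normal_uncorrelated(
        N=n_samples, D=2, K=5, n_total=[1000] * n_samples, noise_std_true=1.0)
    print(sim.X.shape)        # (20, 5) cell-type counts per sample
    print(sim.uns["b_true"])  # ground-truth intercepts (alpha)
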