def _read(filename, backed=False, sheet=None, ext=None, delimiter=None,
          first_column_names=None, backup_url=None, cache=False,
          suppress_cache_warning=False):
    if ext is not None and ext not in avail_exts:
        raise ValueError('Please provide one of the available extensions.\n'
                         + avail_exts)
    else:
        ext = is_valid_filename(filename, return_ext=True)
    is_present = check_datafile_present_and_download(filename,
                                                     backup_url=backup_url)
    if not is_present:
        logg.msg('... did not find original file', filename)
    # read hdf5 files
    if ext in {'h5', 'h5ad'}:
        if sheet is None:
            return read_h5ad(filename, backed=backed)
        else:
            logg.msg('reading sheet', sheet, 'from file', filename, v=4)
            return read_hdf(filename, sheet)
    # read other file types
    filename_cache = (settings.cachedir + filename.lstrip(
        './').replace('/', '-').replace('.' + ext, '.h5ad'))
    if cache and os.path.exists(filename_cache):
        logg.info('... reading from cache file', filename_cache)
        adata = read_h5ad(filename_cache, backed=False)
    else:
        if not is_present:
            raise FileNotFoundError('Did not find file {}.'.format(filename))
        logg.msg('reading', filename, v=4)
        if not cache and not suppress_cache_warning:
            logg.hint('This might be very slow. Consider passing `cache=True`, '
                      'which enables much faster reading from a cache file.')
        # do the actual reading
        if ext == 'xlsx' or ext == 'xls':
            if sheet is None:
                raise ValueError(
                    'Provide `sheet` parameter when reading \'.xlsx\' files.')
            else:
                adata = read_excel(filename, sheet)
        elif ext == 'mtx':
            adata = read_mtx(filename)
        elif ext == 'csv':
            adata = read_csv(filename, first_column_names=first_column_names)
        elif ext in {'txt', 'tab', 'data', 'tsv'}:
            if ext == 'data':
                logg.msg('... assuming \'.data\' means tab or white-space '
                         'separated text file', v=3)
                logg.hint('change this by passing `ext` to sc.read')
            adata = read_text(filename, delimiter, first_column_names)
        elif ext == 'soft.gz':
            adata = _read_softgz(filename)
        else:
            raise ValueError('Unknown extension {}.'.format(ext))
        if cache:
            logg.info('... writing an', settings.file_format_data,
                      'cache file to speed up reading next time')
            if not os.path.exists(os.path.dirname(filename_cache)):
                os.makedirs(os.path.dirname(filename_cache))
            # write for faster reading when calling the next time
            adata.write(filename_cache)
    return adata
def setUp(self):
    self.adata = ad.read_csv(input_file)  # regular anndata
    input_file_zarr = tmp_dir()
    # write as zarr, so we can read using an RDD
    self.adata.write_zarr(input_file_zarr, chunks=(2, 5))
    self.adata_rdd = AnnDataRdd.from_zarr(self.sc, input_file_zarr)
def load_file(filepath):
    if filepath == 'default' or filepath == 'datasets/user_uploaded/default':
        filepath = join_root("../datasets/default.csv")
    elif filepath == 'test':
        filepath = join_root('../../datasets/server/testdataset.h5ad')
    dataset = os.path.basename(filepath)
    dataset = os.path.splitext(dataset)[0]
    try:
        if filepath[-4:] == 'h5ad':
            adata = anndata.read_h5ad(filepath)
        if filepath[-3:] == 'csv':
            # TODO remove transpose
            adata = anndata.read_csv(filepath).T
        if filepath[-4:] == 'xlsx':
            adata = anndata.read_excel(filepath)
        if filepath[-3:] == 'mtx':
            adata = anndata.read_mtx(filepath)
        if filepath[-3:] == 'txt' or filepath[-3:] == 'tab' \
                or filepath[-4:] == 'data':
            adata = anndata.read_text(filepath)
        if filepath[-2:] == 'h5':
            adata = anndata.read_hdf(filepath)
        if filepath[-4:] == 'loom':
            adata = anndata.read_loom(filepath)
    except Exception as e:
        print(str(e))
        raise IncorrectFileFormat(
            "File does not exist or file format is incorrect.")
    adata.uns['dataset'] = dataset
    return adata
def from_csv(cls, sc, csv_file, chunk_size):
    """
    Read a CSV file as an anndata object (for the metadata) and with the data
    matrix (X) as an RDD of numpy arrays.

    *Note* the anndata object currently also stores the data matrix, which is
    redundant and won't scale. This should be improved, possibly by changing
    anndata.
    """
    adata = ad.read_csv(csv_file)
    return cls._from_anndata(sc, adata, chunk_size,
                             read_chunk_csv(csv_file, chunk_size))
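# A minimal usage sketch for the classmethod above. It assumes `AnnDataRdd`
# from the surrounding project is importable and that `chunk_size` is a
# (rows, cols) tuple as suggested by the zarr chunks used elsewhere; both
# the file name and the chunk shape are illustrative assumptions.
from pyspark import SparkContext

sc = SparkContext(appName="anndata-rdd-example")  # Spark context for the RDD
adata_rdd = AnnDataRdd.from_csv(sc, "counts.csv", chunk_size=(100, 100))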
def load_mouseAllen_10X_data(data_dir, ind=None, celltype_gran=0, curate=False):
    '''Load mouse Allen cortex data from 10X reads

    @ind: individual sample -> 49 mice originally, 3 remained after filtering on male
    @celltype_gran: major cell types (0) or sub-cell types (1)
    @curate: whether to curate to the same name as mouse FC, mouse pFC, mouse protocol
    '''
    adata = anndata.read_csv(data_dir+os.sep+'AllenBrain/filtered_matrix.csv')
    adata.var["gene_symboles"] = adata.var.index
    adata.var_names_make_unique(join="-")
    adata.obs['barcode'] = adata.obs.index
    adata.obs_names_make_unique(join="-")

    ## load metadata
    meta_df = pd.read_csv(data_dir+os.sep+'AllenBrain/filtered_metadata.csv',
                          index_col=0)
    meta_df['external_donor_name_label'] = \
        meta_df['external_donor_name_label'].astype(str)
    sample_info = pd.read_csv(data_dir+os.sep+'AllenBrain/sample_info.csv',
                              index_col=0)  ## used to filter on sex
    merged_meta_df = meta_df.merge(sample_info, how='left',
                                   left_on="external_donor_name_label",
                                   right_on="external_donor_name")
    ## merge anndata with metadata
    adata.obs = adata.obs.merge(merged_meta_df, left_index=True,
                                right_on="sample_name", how='left')
    adata.obs.index = adata.obs["barcode"]
    adata.obs.index.name = None
    adata = adata[adata.obs['sex_label'] == 'M']  ## keep only male samples

    if 0 == celltype_gran:
        if curate:
            adata_obs = adata.obs
            adata_obs['cell.type'].replace(
                ['L2 IT HATA', 'L2 IT RSP-ACA', 'L2/3 IT APr', 'L2/3 IT CTX',
                 'L2/3 IT PPP', 'L3 RSP-ACA', 'L4/5 IT CTX', 'L5 IT CTX',
                 'L5 NP CTX', 'L5 PT CTX', 'L5 PT RSP-ACA', 'L5/6 IT CTX',
                 'L6 CT CTX', 'L6 Car3', 'L6 IT CTX', 'L6 NP CT CTX',
                 'L6b CTX', 'NP PPP', 'NP SUB', 'CA2-IG-FC', 'L6b RHP',
                 'L6 IT RHP', 'L2/3 IT ENTl', 'L2 IT RSPv', 'L5 IT TPE-ENT',
                 'L2/3 IT TPE', 'L2 IT ProS'],
                'Neuron', inplace=True)  ## 27 types
            adata_obs['cell.type'].replace(
                ['Sncg', 'Pvalb', 'Pvalb Vipr2', 'Sst', 'Sst Chodl', 'Vip',
                 'Lamp5', 'Lamp5 Lhx6', 'Pax6', 'Ndnf HPF'],
                'Interneuron', inplace=True)  ## 10 types
            adata_obs['cell.type'].replace(['Astro'], 'Astrocytes', inplace=True)
            adata_obs['cell.type'].replace(['Endo'], 'Endothelial', inplace=True)
            adata_obs['cell.type'].replace(['Micro'], 'Microglia', inplace=True)
            adata_obs['cell.type'].replace(['OPC'], 'Polydendrocytes', inplace=True)
            adata_obs['cell.type'].replace(['Oligo'], 'Oligodendrocytes', inplace=True)
            adata_obs['cell.type'].replace(['Peri'], 'Pericytes', inplace=True)
            adata.obs = adata_obs

            ## the 'CR', 'PVM', 'SMC', 'VLMC' cell types are absent from all
            ## other databases, therefore not included
            remained_idx = ~adata.obs['cell.type'].isin(['CR', 'PVM', 'SMC', 'VLMC'])
            adata = adata[remained_idx]

    if 1 == celltype_gran:  ## for sub-celltypes
        adata.obs.rename(columns={"cell.type": "major_celltype"}, inplace=True)
        adata.obs.rename(columns={"cell_type_alias_label": "cell.type"}, inplace=True)

    if ind is not None:
        ind_cells = adata.obs[
            adata.obs["external_donor_name_label"].isin(ind.split('_'))].index
        adata = adata[ind_cells]

    return adata
def load_mouseAllen_SS_data(data_dir, ind=None, celltype_gran=0, curate=False):
    '''Load mouse Allen cortex data from Smart-seq exon reads

    @ind: individual sample -> 529 mice originally, 19 mice after filtering
    @celltype_gran: major cell types (0) or sub-cell types (1)
    @curate: whether to curate to the same name as mouse FC, mouse pFC, mouse protocol
    '''
    adata = anndata.read_csv(data_dir+os.sep+'AllenBrain/filtered_SS_matrix.csv')
    adata.var["gene_symboles"] = adata.var.index
    adata.var_names_make_unique(join="-")
    adata.obs['barcode'] = adata.obs.index
    adata.obs_names_make_unique(join="-")

    ## load metadata
    meta_df = pd.read_csv(data_dir+os.sep+'AllenBrain/filtered_SS_metadata.csv',
                          index_col=0)
    ## merge anndata with metadata
    adata.obs = adata.obs.merge(meta_df, left_on='barcode', right_on='barcode',
                                how='left')
    adata.obs.index = adata.obs["barcode"]
    adata.obs.index.name = None
    adata.obs = adata.obs[['barcode', 'donor_label', 'region_label',
                           'joint_region_label', 'label', 'cluster_label',
                           'ss_cluster_label']]

    if 0 == celltype_gran:
        adata.obs.rename(columns={'label': 'cell.type'}, inplace=True)
        if curate:
            adata_obs = adata.obs
            adata_obs['cell.type'].replace(
                ['L2/3 IT CTX', 'L2/3 IT RSP', 'L2/3 IT RSPv', 'L2 IT ENTl',
                 'L4/5 IT CTX', 'L4 RSP-ACA', 'L5/6 IT CTX', 'L5 IT CTX',
                 'L5 IT TPE-ENT', 'L5 NP CT CTX ', 'L5 NP CTX', 'L5 PT CTX',
                 'L6b CTX', 'L6 CT CTX', 'L6 IT CTX', 'CA1', 'CA1-ProS',
                 'CA2-IG-FC', 'CA3', 'CT SUB', 'Car3', 'DG', 'L2 IT ENTm',
                 'L2 IT PAR', 'L2/3 IT APr', 'L2/3 IT ENTl', 'L2/3 IT HATA',
                 'L2/3 IT PPP', 'L3 IT ENTl', 'L3 IT ENTm', 'L5 PPP',
                 'L6 IT ENTl', 'L6b/CT ENT', 'NP PPP', 'NP SUB', 'SUB',
                 'SUB-ProS'],
                'Neuron', inplace=True)
            adata_obs['cell.type'].replace(
                ['Lamp5', 'Lamp5 Lhx6', 'Pax6', 'Pvalb', 'Sncg', 'Sst',
                 'Sst Chodl', 'Vip'],
                'Interneuron', inplace=True)
            adata_obs['cell.type'].replace(['Astro'], 'Astrocytes', inplace=True)
            adata_obs['cell.type'].replace(['Endo'], 'Endothelial', inplace=True)
            adata_obs['cell.type'].replace(['Micro-PVM'], 'Microglia', inplace=True)
            adata_obs['cell.type'].replace(['Oligo'], 'Oligodendrocytes', inplace=True)
            adata_obs['cell.type'].replace(['SMC-Peri'], 'Pericytes', inplace=True)
            adata.obs = adata_obs

            ## the 'CR', 'Meis2', 'Meis2 HPF', 'Ndnf HPF', 'VLMC' cell types are
            ## absent from all other databases, therefore not included
            ## The polydendrocytes (OPCs) are missing
            remained_idx = ~adata.obs['cell.type'].isin(
                ['CR', 'Meis2', 'Meis2 HPF', 'Ndnf HPF', 'VLMC'])
            adata = adata[remained_idx]

    if 1 == celltype_gran:  ## for sub-celltypes
        adata.obs.rename(columns={"cluster_label": "cell.type"}, inplace=True)

    if ind is not None:
        ind_cells = adata.obs[adata.obs["donor_label"].isin(ind.split('_'))].index
        adata = adata[ind_cells]

    return adata
def _load_csv(
    path_to_file: str,
    gene_by_cell: bool = False,
    delimiter: str = ",",
    first_column_names: bool = None,
):
    logger.info("Loading dataset from {}".format(path_to_file))
    adata = anndata.read_csv(
        path_to_file, delimiter=delimiter, first_column_names=first_column_names
    )
    if gene_by_cell:
        # transpose the whole AnnData object so that obs are cells and var are genes
        adata = adata.T
    logger.info("Finished loading dataset")
    return adata
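# A minimal usage sketch for `_load_csv` above. The file name and the claim
# that the matrix on disk is stored genes-by-cells are assumptions for the
# example; `gene_by_cell=True` transposes it into the cells-by-genes layout
# that AnnData expects.
adata = _load_csv("expression_genes_by_cells.csv", gene_by_cell=True,
                  delimiter=",", first_column_names=True)
print(adata.shape)  # (n_cells, n_genes) after the transpose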
def test_anndata():
    try:
        anndata
    except NameError:
        # anndata not installed
        return
    scdata = anndata.read_csv("../data/test_data.csv")
    fast_magic_operator = magic.MAGIC(t='auto', a=None, k=10)
    sc_magic = fast_magic_operator.fit_transform(scdata, genes="all_genes")
    assert np.all(sc_magic.var_names == scdata.var_names)
    assert np.all(sc_magic.obs_names == scdata.obs_names)
    sc_magic = fast_magic_operator.fit_transform(scdata, genes=['VIM', 'ZEB1'])
    assert np.all(sc_magic.var_names.values == np.array(['VIM', 'ZEB1']))
    assert np.all(sc_magic.obs_names == scdata.obs_names)
def test_anndata():
    try:
        anndata
    except NameError:
        # anndata not installed
        return
    scdata = anndata.read_csv(data_path)
    fast_magic_operator = magic.MAGIC(
        t="auto", solver="approximate", decay=None, knn=10, verbose=False
    )
    sc_magic = fast_magic_operator.fit_transform(scdata, genes="all_genes")
    assert np.all(sc_magic.var_names == scdata.var_names)
    assert np.all(sc_magic.obs_names == scdata.obs_names)
    sc_magic = fast_magic_operator.fit_transform(scdata, genes=["VIM", "ZEB1"])
    assert np.all(sc_magic.var_names.values == np.array(["VIM", "ZEB1"]))
    assert np.all(sc_magic.obs_names == scdata.obs_names)
def load_file(filepath):
    t_flag = False
    if filepath == 'default' or filepath == 'datasets/user_uploaded/default':
        filepath = join_root("../datasets/default.csv")
        t_flag = True
    elif filepath == 'test':
        filepath = join_root('../../datasets/server/testdataset.h5ad')
    dataset = os.path.basename(filepath)
    dataset = os.path.splitext(dataset)[0]
    try:
        if filepath[-4:] == 'h5ad':
            adata = anndata.read_h5ad(filepath)
        if filepath[-3:] == 'csv':
            adata = anndata.read_csv(filepath)
            if t_flag:
                adata = adata.T
        if filepath[-4:] == 'xlsx':
            adata = anndata.read_excel(filepath)
        if filepath[-3:] == 'mtx':
            adata = anndata.read_mtx(filepath)
        if filepath[-3:] == 'txt' or filepath[-3:] == 'tab' \
                or filepath[-4:] == 'data':
            adata = anndata.read_text(filepath)
        if filepath[-2:] == 'h5':
            adata = anndata.read_hdf(filepath)
        if filepath[-4:] == 'loom':
            adata = anndata.read_loom(filepath)
    except Exception as e:
        print(str(e))
        raise IncorrectFileFormat(
            "File does not exist or file format is incorrect.")

    # Make sure cluster names are in proper format
    if 'cluster_names' in adata.uns:
        adata.uns['cluster_names'] = bidict(adata.uns['cluster_names'])
        for key in list(adata.uns['cluster_names'].keys()):
            adata.uns['cluster_names'][int(key)] = \
                adata.uns['cluster_names'].pop(key, None)

    adata.uns['dataset'] = dataset
    return adata
def ReadData(feature, phen):
    RawData = ad.read_csv(feature, delimiter='\t', first_column_names=True)
    phen = pd.read_csv(phen, sep='\t', header=0, index_col=0)
    phen.index = [str(i) for i in phen.index]
    RawData.obs['phen'] = phen.loc[RawData.obs.index, :]
    return RawData
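# A hedged usage sketch for `ReadData` above, assuming two tab-delimited files:
# a feature matrix whose first column holds sample names (hence
# `first_column_names=True`) and a phenotype table indexed by the same sample
# names. The file names are hypothetical.
raw = ReadData("features.tsv", "phenotypes.tsv")
print(raw.obs.head())  # phenotype annotations aligned to the samples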
def read_one_chunk(chunk_index):
    adata = ad.read_csv(csv_file)
    return read_adata_chunk(adata, chunk_size, chunk_index)
def _read(
    filename: Path,
    backed=None,
    sheet=None,
    ext=None,
    delimiter=None,
    first_column_names=None,
    backup_url=None,
    cache=False,
    cache_compression=None,
    suppress_cache_warning=False,
    **kwargs,
):
    if ext is not None and ext not in avail_exts:
        raise ValueError('Please provide one of the available extensions.\n'
                         f'{avail_exts}')
    else:
        ext = is_valid_filename(filename, return_ext=True)
    is_present = _check_datafile_present_and_download(
        filename,
        backup_url=backup_url,
    )
    if not is_present:
        logg.debug(f'... did not find original file {filename}')
    # read hdf5 files
    if ext in {'h5', 'h5ad'}:
        if sheet is None:
            return read_h5ad(filename, backed=backed)
        else:
            logg.debug(f'reading sheet {sheet} from file {filename}')
            return read_hdf(filename, sheet)
    # read other file types
    path_cache = settings.cachedir / _slugify(filename).replace(
        '.' + ext, '.h5ad')  # type: Path
    if path_cache.suffix in {'.gz', '.bz2'}:
        path_cache = path_cache.with_suffix('')
    if cache and path_cache.is_file():
        logg.info(f'... reading from cache file {path_cache}')
        return read_h5ad(path_cache)

    if not is_present:
        raise FileNotFoundError(f'Did not find file {filename}.')
    logg.debug(f'reading {filename}')
    if not cache and not suppress_cache_warning:
        logg.hint('This might be very slow. Consider passing `cache=True`, '
                  'which enables much faster reading from a cache file.')
    # do the actual reading
    if ext == 'xlsx' or ext == 'xls':
        if sheet is None:
            raise ValueError(
                "Provide `sheet` parameter when reading '.xlsx' files.")
        else:
            adata = read_excel(filename, sheet)
    elif ext in {'mtx', 'mtx.gz'}:
        adata = read_mtx(filename)
    elif ext == 'csv':
        adata = read_csv(filename, first_column_names=first_column_names)
    elif ext in {'txt', 'tab', 'data', 'tsv'}:
        if ext == 'data':
            logg.hint(
                "... assuming '.data' means tab or white-space "
                'separated text file',
            )
            logg.hint('change this by passing `ext` to sc.read')
        adata = read_text(filename, delimiter, first_column_names)
    elif ext == 'soft.gz':
        adata = _read_softgz(filename)
    elif ext == 'loom':
        adata = read_loom(filename=filename, **kwargs)
    else:
        raise ValueError(f'Unknown extension {ext}.')
    if cache:
        logg.info(f'... writing an {settings.file_format_data} '
                  'cache file to speedup reading next time')
        if cache_compression is _empty:
            cache_compression = settings.cache_compression
        if not path_cache.parent.is_dir():
            path_cache.parent.mkdir(parents=True)
        # write for faster reading when calling the next time
        adata.write(path_cache, compression=cache_compression)
    return adata
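# The function above appears to be scanpy's internal dispatcher; in user code
# the same path is normally reached through the public reader. A minimal
# sketch, assuming a standard scanpy install and a hypothetical counts.csv
# next to the script:
import scanpy as sc

adata = sc.read('counts.csv', cache=True)  # first call parses the CSV and writes an .h5ad cache
adata = sc.read('counts.csv', cache=True)  # later calls load the much faster cache file instead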
def get_clusters_et_al(path, size=5, filter_ncounts=False, filter_mito=False,
                       reload_file=False):
    f = path
    ext = os.path.splitext(f)[-1]
    if not reload_file and (ext == '.h5ad'
                            or os.path.exists(path.replace(ext, '.h5ad'))):
        results_file = path.replace(ext, '.h5ad')
        adata = ad.read_h5ad(results_file)
    elif ext == '.csv' or os.path.exists(path.replace(ext, '.csv')):
        results_file = path.replace(ext, '.h5ad')
        path = path.replace(ext, '.csv')
        adata = ad.read_csv(path).transpose()
        sc.pp.filter_genes(adata, min_cells=3)
        sc.pp.filter_cells(adata, min_genes=200)
        if filter_ncounts:
            if isinstance(filter_ncounts, bool):
                adata.obs['n_counts'] = adata.X.sum(axis=1)
                fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4))
                ax1.hist(adata.obs.n_genes, bins=100,
                         range=(0, np.percentile(adata.obs.n_genes, 99)))
                ax1.set_xlabel('Number of counts')
                ax1.set_ylabel('Number of cells')
                ax2.hist(adata.obs.n_genes, bins=100, cumulative=True,
                         density=True,
                         range=(0, np.percentile(adata.obs.n_genes, 99)))
                ax2.set_xlabel('Number of counts')
                ax2.set_ylabel('Ratio of cells')
                ax2.grid(True, axis='both')
                fig.tight_layout()
                plt.show()
                th_ncount = input(
                    'Please enter the threshold value for the maximum number of counts: '
                )
                th_ncount = get_threshold(th_ncount, adata.obs.n_genes)
                while not is_number(th_ncount):
                    th_ncount = input('Please enter a numeric value: ')
                th_ncount = float(th_ncount)
            else:
                th_ncount = filter_ncounts
            fig, ax = plt.subplots(1, 1)
            filter_tab_ncounts = adata.obs.n_genes < th_ncount
            ax.hist([
                adata.obs.n_genes[filter_tab_ncounts],
                adata.obs.n_genes[filter_tab_ncounts == False]
            ],
                    color=['k', 'r'],
                    label=['kept', 'removed'],
                    bins=100,
                    histtype='barstacked',
                    range=(0, np.percentile(adata.obs.n_genes, 99)))
            ax.set_xlabel('Number of counts')
            ax.set_ylabel('Number of cells')
            ax.legend()
            plt.show()
        else:
            filter_tab_ncounts = np.ones(adata.shape[0], dtype=bool)
        if filter_mito:
            if isinstance(filter_mito, bool):
                mito_genes = (adata.var_names.str.startswith('mt-')
                              | adata.var_names.str.startswith('Mt-')
                              | adata.var_names.str.startswith('MT-'))
                adata.obs['percent_mito'] = np.sum(
                    adata[:, mito_genes].X, axis=1) / np.sum(adata.X, axis=1)
                fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4))
                ax1.hist(adata.obs.percent_mito, bins=100,
                         range=(0, np.percentile(adata.obs.percent_mito, 99)))
                ax1.set_xlabel('Percent of mito expression')
                ax1.set_ylabel('Number of cells')
                ax2.hist(adata.obs.percent_mito, bins=100, cumulative=True,
                         density=True,
                         range=(0, np.percentile(adata.obs.percent_mito, 99)))
                ax2.set_xlabel('Percent of mito expression')
                ax2.set_ylabel('Ratio of cells')
                ax2.grid(True, axis='both')
                fig.tight_layout()
                plt.show()
                th_mito = input(
                    'Please enter the threshold value for the maximum percent of mito expression: '
                )
                th_mito = get_threshold(th_mito, adata.obs.percent_mito)
                while not is_number(th_mito):
                    th_mito = input('Please enter a numeric value: ')
                th_mito = float(th_mito)
            else:
                th_mito = filter_mito
            plt.close(fig)
            filter_tab_mito = adata.obs.percent_mito < th_mito
            fig, ax = plt.subplots(1, 1)
            ax.hist([
                adata.obs.percent_mito[filter_tab_mito],
                adata.obs.percent_mito[filter_tab_mito == False]
            ],
                    color=['k', 'r'],
                    label=['kept', 'removed'],
                    bins=100,
                    histtype='barstacked',
                    range=(0, np.percentile(adata.obs.percent_mito, 99)))
            ax.set_xlabel('Number of counts')
            ax.set_ylabel('Number of cells')
            ax.legend()
            plt.show()
        else:
            filter_tab_mito = np.ones(adata.shape[0], dtype=bool)
        final_filt = np.ones(adata.shape[0], dtype=bool)
        if filter_ncounts:
            final_filt[filter_tab_ncounts == False] = False
        if filter_mito:
            final_filt[filter_tab_mito == False] = False
        both = np.sum((filter_tab_ncounts.astype(int) + filter_tab_mito) == 0)
        diff = filter_tab_ncounts.astype(int) - filter_tab_mito
        nc = np.sum(diff == -1)
        mito = np.sum(diff == 1)
        pie_values = [np.sum(final_filt), nc, both, mito]
        fig, ax = plt.subplots()
        ax.pie(pie_values,
               labels=['Kept', 'ncounts', 'both', 'mito'],
               shadow=False,
               startangle=90)
        ax.axis('equal')
        adata = adata[final_filt, :]
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)
        sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3,
                                    min_disp=0.5)
        adata.raw = adata
        adata = adata[:, adata.var.highly_variable]
        sc.pp.scale(adata, max_value=10)
        sc.tl.pca(adata, svd_solver='arpack')
        sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
        sc.tl.umap(adata)
        sc.tl.leiden(adata, .25)
        adata.write(results_file)
    else:
        print('Can only work with .csv or .h5ad files (you gave {})'.format(
            path))
        return
    # sc.pl.highly_variable_genes(adata.raw)
    sc.pl.pca(adata, color=['Bmp2', 'Sox9', 'Sox17'], size=size)
    sc.pl.pca_variance_ratio(adata, log=True)
    fig = sc.pl.umap(adata, color=['Bmp2', 'Sox9', 'Wnt3'], size=size,
                     show=False, return_fig=True)
    fig.set_figwidth(20)
    fig.set_figheight(6)
    c_name = os.path.splitext(
        os.path.split('data/' + results_file)[-1])[0] + '.0.25'
    fig = sc.pl.umap(adata, color=['leiden', 'Bmp2', 'Sox9'], size=size,
                     show=False, return_fig=True)
    fig.set_figwidth(20)
    fig.set_figheight(5)
    fig.savefig('figures/Umap.{:s}.pdf'.format(c_name))
    sc.tl.rank_genes_groups(adata, 'leiden', method='t-test')
    sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, show=True,
                            return_fig=True,
                            save='.t-test.{:s}.pdf'.format(c_name))
    sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
    sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, show=True,
                            return_fig=True,
                            save='.wilcoxon.{:s}.pdf'.format(c_name))
    return adata
def load_FC_adata(data_dir, devstage=None, ind=None, treatment=None,
                  celltype_gran=0, curate=False):
    '''Loading data from mouse frontal cortex along with metadata information

    @devstage: P21 or Adult
    @ind: individual sample prediction: P21Sample1-3, PFCSample1-12
    @treatment: Saline or Cocaine
    @celltype_gran: major cell types or sub-cell types
    '''
    ## load data; set genes and cells
    adata = anndata.read_csv(
        data_dir + os.sep + "MouseFC_GSE124952/GSE124952_expression_matrix.csv").T
    adata.var["gene_symbols"] = adata.var.index
    adata.var_names_make_unique(join="-")
    adata.obs['barcode'] = adata.obs.index
    adata.obs_names_make_unique(join="-")

    ## load metadata
    meta_df = pd.read_csv(data_dir + os.sep +
                          "MouseFC_GSE124952/GSE124952_meta_data.csv",
                          index_col=0)
    adata.obs = adata.obs.merge(meta_df, left_index=True, right_index=True)

    if 0 == celltype_gran:
        adata.obs.rename(columns={"CellType": "cell.type"}, inplace=True)
        ## change cell type names
        if curate:  ## if curation is needed for cross dataset prediction
            adata_obs = adata.obs
            #adata_obs["cell.type"].replace(['Oligo', 'NF Oligo'], 'Oligodendrocytes', inplace=True)
            #adata_obs["cell.type"].replace(['OPC'], 'Polydendrocytes', inplace=True)
            adata_obs["cell.type"].replace(['Oligo'], 'Oligodendrocytes', inplace=True)
            adata_obs["cell.type"].replace(['OPC', 'NF Oligo'], 'Polydendrocytes', inplace=True)
            adata_obs["cell.type"].replace(['Astro'], 'Astrocytes', inplace=True)
            adata_obs["cell.type"].replace(['Excitatory'], 'Neuron', inplace=True)
            adata_obs["cell.type"].replace(['Inhibitory'], 'Interneuron', inplace=True)
            adata_obs["cell.type"].replace(['Endo'], 'Endothelial', inplace=True)
            ## Microglia stays as Microglia
            adata.obs = adata_obs
    elif 1 == celltype_gran:
        adata.obs.rename(columns={"L2_clusters": "cell.type"}, inplace=True)

    ## subset individuals
    if ind is not None:
        ind_cells = adata.obs[adata.obs['Sample'].isin(ind.split('_'))].index
        adata = adata[ind_cells]

    if devstage is not None:
        dev_cells = adata.obs[adata.obs["DevStage"] == devstage].index
        adata = adata[dev_cells]

    if treatment is not None:
        treat_cells = adata.obs[adata.obs["treatment"] == treatment].index
        adata = adata[treat_cells]

    return adata
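# A usage sketch for `load_FC_adata` above, grounded in its docstring:
# `devstage` is 'P21' or 'Adult', `ind` is an underscore-joined list of sample
# names (P21Sample1-3, PFCSample1-12), and `curate=True` maps cell-type labels
# to the shared vocabulary. The data directory and the exact sample names are
# illustrative assumptions.
adata = load_FC_adata('/data/mouse_cortex', devstage='Adult',
                      ind='PFCSample1_PFCSample2', treatment='Saline',
                      celltype_gran=0, curate=True)
print(adata.obs['cell.type'].value_counts())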
def test_read_csv():
    adata = ad.read_csv(HERE / 'adata.csv')
    assert adata.obs_names.tolist() == ['r1', 'r2', 'r3']
    assert adata.var_names.tolist() == ['c1', 'c2']
    assert adata.X.tolist() == X_list
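# The test above implies the on-disk layout that anndata.read_csv expects: the
# first row holds variable (column) names and the first column holds
# observation (row) names. A hypothetical adata.csv matching the asserted names
# would look like the sketch below; the numeric values of X_list are not shown
# in this snippet, so the numbers here are placeholders.
#
#     ,c1,c2
#     r1,1.0,2.0
#     r2,3.0,4.0
#     r3,5.0,6.0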
import sys

sys.path.append(".")  # Adds higher directory to python modules path.

import anndata
import pycogaps
import scipy.io
import scipy.sparse
import numpy as np
from PyCoGAPS import *

# placeholder until we have anndata samples
# maybe also read files into an anndata object?
path = './data/GIST.csv'
prm = pycogaps.GapsParameters(path)
adata = anndata.read_csv(path)
adataX = adata.X
if scipy.sparse.issparse(adataX):
    adataX = adataX.toarray()

# create Matrix object from anndata X
matrix = pycogaps.Matrix(adataX)
result = pycogaps.runCogapsFromMatrix(matrix, prm)

# convert Amean and Pmean results to numpy arrays
Amean = toNumpy(result.Amean)
Pmean = toNumpy(result.Pmean)

# anndata labels
def test_read_csv():
    adata = ad.read_csv(HERE / "adata.csv")
    assert adata.obs_names.tolist() == ["r1", "r2", "r3"]
    assert adata.var_names.tolist() == ["c1", "c2"]
    assert adata.X.tolist() == X_list
def _load_input_file(path):
    if path[-5:] == '.h5ad':
        adata = anndata.read_h5ad(path)
    elif path[-4:] == '.csv':
        adata = anndata.read_csv(path)
    else:
        # avoid returning an undefined variable for unsupported extensions
        raise ValueError('Unsupported input file extension: {}'.format(path))
    return adata