def test_readloom_deprecations(tmp_path):
    loom_pth = tmp_path / "test.loom"
    adata_src = gen_adata((5, 10), obsm_types=[np.ndarray], varm_types=[np.ndarray])
    adata_src.write_loom(loom_pth, write_obsm_varm=True)

    # obsm_names -> obsm_mapping
    obsm_mapping = {"df": adata_src.obs.columns}
    with pytest.warns(FutureWarning):
        depr_result = ad.read_loom(loom_pth, obsm_names=obsm_mapping)
    actual_result = ad.read_loom(loom_pth, obsm_mapping=obsm_mapping)
    assert_equal(actual_result, depr_result)
    with pytest.raises(ValueError, match="ambiguous"):
        ad.read_loom(loom_pth, obsm_mapping=obsm_mapping, obsm_names=obsm_mapping)

    # varm_names -> varm_mapping
    varm_mapping = {"df": adata_src.var.columns}
    with pytest.warns(FutureWarning):
        depr_result = ad.read_loom(loom_pth, varm_names=varm_mapping)
    actual_result = ad.read_loom(loom_pth, varm_mapping=varm_mapping)
    assert_equal(actual_result, depr_result)
    with pytest.raises(ValueError, match="ambiguous"):
        ad.read_loom(loom_pth, varm_mapping=varm_mapping, varm_names=varm_mapping)

    # positional -> keyword
    with pytest.warns(FutureWarning, match="sparse"):
        depr_result = ad.read_loom(loom_pth, True)
    actual_result = ad.read_loom(loom_pth, sparse=True)
    assert type(depr_result.X) == type(actual_result.X)
def loom_to_csv(args) -> None:
    """
    Convert a velocyto loom file to csv

    :param args: tuple of (loom_path, output directory)
    :return: None
    """
    loom_path, output = args

    if not os.path.exists(output):
        os.makedirs(output)

    logger.info("Loading from {0}".format(loom_path))
    data = anndata.read_loom(os.path.abspath(loom_path))

    logger.info("Feature of {0}".format(os.path.basename(loom_path)))
    data.var.to_csv(os.path.join(output, "var.csv.gz"))

    logger.info("Barcode of {0}".format(os.path.basename(loom_path)))
    data.obs.to_csv(os.path.join(output, "obs.csv.gz"))

    for i in ["matrix", "ambiguous", "spliced", "unspliced"]:
        logger.info("{1} of {0}".format(os.path.basename(loom_path), i))
        temp = pd.DataFrame(data.layers[i].todense())
        temp.columns = data.var.index
        temp["index"] = data.obs.index
        temp = temp.melt(id_vars="index")
        temp = temp.loc[temp["value"] > 0, :]
        temp.to_csv(os.path.join(output, "{0}.csv.gz".format(i)))
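# Hypothetical usage sketch for loom_to_csv above. The single tuple argument
# mirrors the (loom_path, output) unpacking in the function, e.g. when mapped
# over a pool of workers; "sample.loom" and "out_csv" are placeholders, and
# `os`, `pandas as pd`, `anndata`, and a configured `logger` are assumed in scope.
loom_to_csv(("sample.loom", "out_csv"))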
def test_readwrite_loom(typ, obsm_names, varm_names, tmp_path):
    X = typ(X_list)
    adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict)
    adata_src.obsm["X_a"] = np.zeros((adata_src.n_obs, 2))
    adata_src.varm["X_b"] = np.zeros((adata_src.n_vars, 3))
    adata_src.write_loom(tmp_path / "test.loom", write_obsm_varm=True)

    adata = ad.read_loom(
        tmp_path / "test.loom",
        sparse=typ is csr_matrix,
        obsm_names=obsm_names,
        varm_names=varm_names,
        cleanup=True,
    )
    if isinstance(X, np.ndarray):
        assert np.allclose(adata.X, X)
    else:
        # TODO: this should not be necessary
        assert np.allclose(adata.X.toarray(), X.toarray())
    assert "X_a" in adata.obsm_keys() and adata.obsm["X_a"].shape[1] == 2
    assert "X_b" in adata.varm_keys() and adata.varm["X_b"].shape[1] == 3

    # as we called with `cleanup=True`
    assert "oanno1b" in adata.uns["loom-obs"]
    assert "vanno2" in adata.uns["loom-var"]

    for k, v in obsm_names.items():
        assert k in adata.obsm_keys() and adata.obsm[k].shape[1] == len(v)
    for k, v in varm_names.items():
        assert k in adata.varm_keys() and adata.varm[k].shape[1] == len(v)
def run(args):
    """Compile an AnnData object from a loom file"""
    # Parse options...
    options = args
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error

    # take options
    h5_fname = options.ifile
    out_fname = options.ofile

    # read
    adata = ad.read_loom(h5_fname, sparse=True)

    # add n_counts and n_kmers to adata.obs
    adata.obs['n_counts'] = np.sum(adata.X, axis=1).A1
    adata.obs['n_kmers'] = np.sum(adata.X > 0, axis=1).A1

    # add n_cells to adata.var
    adata.var['n_cells'] = np.sum(adata.X > 0, axis=0).A1

    # save adata to h5ad
    adata.write_h5ad(filename=out_fname)
    return
def read_adata(path, spatial_directory=None, use_raw=False):
    if path.lower().endswith(".loom"):
        adata = anndata.read_loom(path)
    elif path.lower().endswith(".zarr"):
        adata = anndata.read_zarr(path)
    else:
        adata = anndata.read(path)
    if "module" in adata.uns:
        adata.uns[ADATA_MODULE_UNS_KEY] = anndata.AnnData(
            X=adata.uns["module"]["X"], var=adata.uns["module"]["var"]
        )
    if use_raw and adata.raw is not None and adata.shape[0] == adata.raw.shape[0]:
        logger.info("Using adata.raw")
        adata = anndata.AnnData(
            X=adata.raw.X, var=adata.raw.var, obs=adata.obs, obsm=adata.obsm, uns=adata.uns
        )
    if spatial_directory is not None:
        if not add_spatial(adata, spatial_directory):
            logger.info("No spatial data found in {}".format(spatial_directory))
    for field in categorical_fields_convert:
        if field in adata.obs and not pd.api.types.is_categorical_dtype(adata.obs[field]):
            logger.info("Converting {} to categorical".format(field))
            adata.obs[field] = adata.obs[field].astype(str).astype("category")
    return adata
def read_adata(self, path):
    path_lc = path.lower()
    if path_lc.endswith('.loom'):
        return anndata.read_loom(path)
    elif path_lc.endswith('.zarr'):
        return anndata.read_zarr(path)
    elif path_lc.endswith('.tsv'):
        return read_star_fusion_file(path)
    elif path_lc.endswith('.rds'):  # Seurat, convert to h5ad
        h5_file = path + '.h5ad'
        import os
        if not os.path.exists(h5_file) or abs(os.path.getmtime(h5_file) - os.path.getmtime(path)) > 0.00001:
            import subprocess
            import pkg_resources
            import shutil
            print('Converting Seurat object')
            if os.path.exists(h5_file):
                os.remove(h5_file)
            subprocess.check_call(
                ['Rscript', pkg_resources.resource_filename("cirrocumulus", 'seurat2h5ad.R'), path, h5_file])
            shutil.copystat(path, h5_file)
        adata = anndata.read(h5_file, backed=self.backed)
        if adata.raw is not None and adata.shape[0] == adata.raw.shape[0]:
            print('Using adata.raw')
            adata = anndata.AnnData(X=adata.raw.X, var=adata.raw.var, obs=adata.obs,
                                    obsm=adata.obsm, uns=adata.uns)
        return adata
    return anndata.read(path, backed=self.backed)
def read_adata(path, spatial_directory=None, use_raw=False):
    if path.lower().endswith('.loom'):
        adata = anndata.read_loom(path)
    elif path.lower().endswith('.zarr'):
        adata = anndata.read_zarr(path)
    else:
        adata = anndata.read(path)
    if 'module' in adata.uns:
        adata.uns[ADATA_MODULE_UNS_KEY] = anndata.AnnData(
            X=adata.uns['module']['X'], var=adata.uns['module']['var'])
    if use_raw and adata.raw is not None and adata.shape[0] == adata.raw.shape[0]:
        logger.info('Using adata.raw')
        adata = anndata.AnnData(X=adata.raw.X, var=adata.raw.var, obs=adata.obs,
                                obsm=adata.obsm, uns=adata.uns)
    if spatial_directory is not None:
        if not add_spatial(adata, spatial_directory):
            logger.info('No spatial data found in {}'.format(spatial_directory))
    for field in categorical_fields_convert:
        if field in adata.obs and not pd.api.types.is_categorical_dtype(adata.obs[field]):
            logger.info('Converting {} to categorical'.format(field))
            adata.obs[field] = adata.obs[field].astype(str).astype('category')
    return adata
def importLoom(inFname):
    " load a loom file with anndata and fix up the obsm attributes "
    import pandas as pd
    import anndata

    ad = anndata.read_loom(inFname)

    coordKeyList = (["_tSNE1", "_tSNE2"], ["_X", "_Y"], ["UMAP1", "UMAP2"],
                    ['Main_cluster_umap_1', 'Main_cluster_umap_2'])

    obsKeys = getObsKeys(ad)

    foundCoords = False
    for coordKeys in coordKeyList:
        if coordKeys[0] in obsKeys and coordKeys[1] in obsKeys:
            logging.debug("Found %s in anndata.obs, moving these fields into obsm" % repr(coordKeys))
            newObj = pd.concat([ad.obs[coordKeys[0]], ad.obs[coordKeys[1]]], axis=1)
            ad.obsm["tsne"] = newObj
            del ad.obs[coordKeys[0]]
            del ad.obs[coordKeys[1]]
            foundCoords = True
            break

    if not foundCoords:
        # logging.warn is a deprecated alias of logging.warning
        logging.warning("Did not find any keys like %s in anndata.obs, cannot import coordinates" % repr(coordKeyList))
    return ad
def load_file(filepath):
    if filepath == 'default' or filepath == 'datasets/user_uploaded/default':
        filepath = join_root("../datasets/default.csv")
    elif filepath == 'test':
        filepath = join_root('../../datasets/server/testdataset.h5ad')

    dataset = os.path.basename(filepath)
    dataset = os.path.splitext(dataset)[0]

    try:
        if filepath[-4:] == 'h5ad':
            adata = anndata.read_h5ad(filepath)
        if filepath[-3:] == 'csv':
            # TODO remove transpose
            adata = anndata.read_csv(filepath).T
        if filepath[-4:] == 'xlsx':
            adata = anndata.read_excel(filepath)
        if filepath[-3:] == 'mtx':
            adata = anndata.read_mtx(filepath)
        if filepath[-3:] == 'txt' or filepath[-3:] == 'tab' or filepath[-4:] == 'data':
            adata = anndata.read_text(filepath)
        if filepath[-2:] == 'h5':
            adata = anndata.read_hdf(filepath)
        if filepath[-4:] == 'loom':
            adata = anndata.read_loom(filepath)
    except Exception as e:
        print(str(e))
        raise IncorrectFileFormat("File does not exist or file format is incorrect.")

    adata.uns['dataset'] = dataset
    return adata
def read_adata(self, filesystem, path):
    path_lc = path.lower()
    path_lc = path_lc.rstrip('/')
    if path_lc.endswith('.loom'):
        adata = anndata.read_loom(filesystem.open(path))
    elif path_lc.endswith('.zarr'):
        adata = anndata.read_zarr(filesystem.get_mapper(path))
    elif path_lc.endswith('.tsv'):
        adata = read_star_fusion_file(filesystem.open(path))
    elif path_lc.endswith('.rds'):  # Seurat, convert to h5ad
        h5_file = path + '.h5ad'
        import os
        if not os.path.exists(h5_file) or abs(os.path.getmtime(h5_file) - os.path.getmtime(path)) > 0.00001:
            import subprocess
            import pkg_resources
            import shutil
            print('Converting Seurat object')
            if os.path.exists(h5_file):
                os.remove(h5_file)
            subprocess.check_call(
                ['Rscript', pkg_resources.resource_filename("cirrocumulus", 'seurat2h5ad.R'), path, h5_file])
            shutil.copystat(path, h5_file)
        adata = anndata.read_h5ad(h5_file, backed='r' if self.backed else None)
        if adata.raw is not None and adata.shape[0] == adata.raw.shape[0]:
            print('Using adata.raw')
            adata = anndata.AnnData(X=adata.raw.X, var=adata.raw.var, obs=adata.obs,
                                    obsm=adata.obsm, uns=adata.uns)
    else:
        adata = anndata.read_h5ad(filesystem.open(path), backed='r' if self.backed else None)
    if 'module' in adata.uns:
        adata.uns[ADATA_MODULE_UNS_KEY] = anndata.AnnData(X=adata.uns['module']['X'],
                                                          var=adata.uns['module']['var'])
    return adata
def get_adata(url, filename=None):
    """Download example data to a local folder.

    Parameters
    ----------
    url: str
        URL to download the data from.
    filename: str, optional
        Name of the local file to save to; defaults to the base name of the URL.

    Returns
    -------
    adata: :class:`~anndata.AnnData`
        an AnnData object.
    """
    filename = ntpath.basename(url) if filename is None else filename
    filename = "./data/" + filename

    if not os.path.exists(filename):
        if not os.path.exists("./data/"):
            os.mkdir("data")
        urlretrieve(url, filename)  # download the data

    if Path(filename).suffixes[-1][1:] == "loom":
        adata = read_loom(filename=filename)
    elif Path(filename).suffixes[-1][1:] == "h5ad":
        adata = read_h5ad(filename=filename)

    adata.var_names_make_unique()
    return adata
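# Hypothetical usage sketch for get_adata above; the URL is a placeholder,
# not a real dataset location. The file is cached under ./data/ on first
# call, and the reader is picked from the ".loom" suffix.
adata = get_adata("https://example.com/datasets/sample.loom")
print(adata.shape)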
def read_dataset(path, obs=None, var=None, obs_filter=None, var_filter=None, **keywords):
    """
    Read h5ad, loom, mtx, zarr, and txt formatted files

    Parameters
    ----------
    path: str
        File name of data file.
    obs: {str, pd.DataFrame}
        Path to obs data file or a data frame
    var: {str, pd.DataFrame}
        Path to var data file or a data frame
    obs_filter: {str, pd.DataFrame}
        File with one id per line, name of a boolean field in obs, or a list of ids
    var_filter: {str, pd.DataFrame}
        File with one id per line, name of a boolean field in var, or a list of ids

    Returns
    -------
    Annotated data matrix.
    """
    _, ext = os.path.splitext(str(path).lower())
    if ext == '.txt':
        df = pd.read_csv(path, engine='python', header=0, sep=None, index_col=0)
        adata = anndata.AnnData(X=df.values, obs=pd.DataFrame(index=df.index),
                                var=pd.DataFrame(index=df.columns))
    elif ext == '.h5ad':
        adata = anndata.read(path)
    elif ext == '.loom':
        adata = anndata.read_loom(path)
    elif ext == '.mtx':
        adata = anndata.read_mtx(path)
    elif ext == '.zarr':
        adata = anndata.read_zarr(path)
    else:
        raise ValueError('Unknown file format: {}'.format(ext))

    def get_df(meta):
        if not isinstance(meta, pd.DataFrame):
            tmp_path = None
            if meta.startswith('gs://'):
                tmp_path = download_gs_url(meta)
                meta = tmp_path
            meta = pd.read_csv(meta, sep=None, index_col='id', engine='python')
            if tmp_path is not None:
                os.remove(tmp_path)
        return meta

    if obs is not None:
        if not isinstance(obs, list) and not isinstance(obs, tuple):
            obs = [obs]
        for item in obs:
            adata.obs = adata.obs.join(get_df(item))
    if var is not None:
        if not isinstance(var, list) and not isinstance(var, tuple):
            var = [var]
        for item in var:
            adata.var = adata.var.join(get_df(item))

    return filter_adata(adata, obs_filter=obs_filter, var_filter=var_filter)
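# Hypothetical usage sketch for read_dataset above; the file names are
# placeholders. Joins per-cell annotations from a csv (indexed by an "id"
# column) onto obs, then keeps only cells whose boolean obs field "pass_qc"
# is True via obs_filter.
adata = read_dataset('counts.loom', obs='cell_metadata.csv', obs_filter='pass_qc')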
def test_readwrite_loom(tmp_path):
    loom_path = tmp_path / "test.loom"
    adata = AnnData(X=X, layers=dict(L=L.copy()))
    adata.write_loom(loom_path)
    adata_read = read_loom(loom_path, X_name="")
    assert adata.layers.keys() == adata_read.layers.keys()
    assert (adata.layers["L"] == adata_read.layers["L"]).all()
def test_readwrite_loom(tmp_path):
    loom_path = Path(tmp_path / 'test.loom')
    adata = ad.AnnData(X=X, layers={'L': L.copy()})
    adata.write_loom(loom_path)
    adata_read = ad.read_loom(loom_path, X_name='')
    assert adata.layers.keys() == adata_read.layers.keys()
    assert (adata.layers['L'] == adata_read.layers['L']).all()
def run_Seurat(datasets, task, task_adata, method_name, log_dir, args):
    method_key = '{}_aligned'.format(method_name)
    with tempfile.TemporaryDirectory() as tmp_dir:
        working_dir = Path(tmp_dir)
        print("saving data for Seurat")
        #task_adata.write('_tmp_adata_for_seurat.h5ad')
        if args.input_space == 'PCA':
            df = pd.DataFrame(task_adata.obsm['PCA'], index=task_adata.obs.index)
        else:
            df = task_adata.to_df()
        print(df.shape)
        #print(df.index)
        #print(df.columns)
        count_file = working_dir / '_tmp_counts.csv'
        df.T.to_csv(count_file)
        metadata_file = working_dir / '_tmp_meta.csv'
        task_adata.obs.to_csv(metadata_file)
        loom_result_file = working_dir / '_tmp_adata_for_seurat.loom'

        # Run seurat
        #cmd = "C:\\Users\\samir\\Anaconda3\\envs\\seuratV3\\Scripts\\Rscript.exe seurat_align.R {}".format(task.batch_key)
        seurat_env_path = Path(args.seurat_env_path)
        if platform.system() == 'Windows':
            bin_path = seurat_env_path / 'Library' / 'mingw-w64' / 'bin'
            rscript_path = seurat_env_path / 'Scripts' / 'Rscript.exe'
            cmd = 'set PATH={};%PATH% && {} seurat_align.R {} {} {} {} {}'.format(
                bin_path, rscript_path, task.batch_key, args.seurat_dims,
                count_file, metadata_file, loom_result_file)
            cmd = cmd.split()
        else:
            bin_path = seurat_env_path / 'bin'
            rscript_path = bin_path / 'Rscript'
            cmd = 'PATH="{}:$PATH" {} seurat_align.R {} {} {} {} {}'.format(
                bin_path, rscript_path, task.batch_key, args.seurat_dims,
                count_file, metadata_file, loom_result_file)
        #cmd = '{} seurat_align.R {} {} {} {} {}'.format(rscript_path, task.batch_key, args.seurat_dims, count_file, metadata_file, loom_result_file)
        #cmd = r"set PATH=C:\Users\samir\Anaconda3\envs\seuratV3\Library\mingw-w64\bin;%PATH% && C:\Users\samir\Anaconda3\envs\seuratV3\Scripts\Rscript.exe seurat_align.R {}".format(task.batch_key)
        print('Running command: {}'.format(cmd))
        try:
            t0 = datetime.datetime.now()
            console_output = subprocess.run(cmd, shell=True, check=True,
                                            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
            t1 = datetime.datetime.now()
            time_str = pretty_tdelta(t1 - t0)
            print(f'took: {time_str}')
            with open(log_dir / 'fit_time.txt', 'w') as f:
                f.write(time_str + '\n')
            console_output = console_output.stdout.decode('UTF-8')
            print('Finished running')
            print(console_output)
            aligned_adata = anndata.read_loom(loom_result_file)
            print('done loading loom')
            print(aligned_adata.shape)
            #print(type(aligned_adata.X))
            # print(aligned_adata.obs.columns)
            # print(aligned_adata.obsm.keys())
            # print('todense...')
            task_adata.obsm[method_key] = aligned_adata.X.todense()
            # print(task_adata.obsm[method_key][:5, :])
        except subprocess.CalledProcessError as e:
            print("RUNNING SEURAT FAILED")
            print(e.stdout.decode('UTF-8'))
def test_readwrite_loom():
    adata = ad.AnnData(X=X, layers={'L': L.copy()})
    adata.write_loom('test.loom')
    adata_read = ad.read_loom('test.loom', X_name='')
    assert adata.layers.keys() == adata_read.layers.keys()
    assert (adata.layers['L'] == adata_read.layers['L']).all()
    os.remove('test.loom')
def test_readwrite_loom(typ, tmp_path):
    X = typ(X_list)
    adata = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict)
    adata.write_loom(tmp_path / 'test.loom')
    adata = ad.read_loom(tmp_path / 'test.loom', sparse=typ is csr_matrix)
    if isinstance(X, np.ndarray):
        assert np.allclose(adata.X, X)
    else:
        # TODO: this should not be necessary
        assert np.allclose(adata.X.toarray(), X.toarray())
def test_readwrite_loom():
    for i, typ in enumerate([np.array, csr_matrix]):
        X = typ(X_list)
        adata = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict)
        adata.write_loom('./test.loom')
        adata = ad.read_loom('./test.loom', sparse=(i == 1))
        if isinstance(X, np.ndarray):
            assert np.allclose(adata.X, X)
        else:
            # TODO: this should not be necessary
            assert np.allclose(adata.X.toarray(), X.toarray())
def test_kazu_new():
    dir_path = '../data/MCF10A_exp1/'
    exp1_cbc_gbc_frame = pd.read_csv(os.path.join(dir_path, 'CBC_GBC_summary.txt'), delimiter='\t')
    exp2_cbc_gbc_frame = pd.read_csv(os.path.join(dir_path, 'CBC_GBC.txt'), delimiter='\t')

    kazu_onedrive_exp1_cbc2gbc_mapping = get_mapping(exp1_cbc_gbc_frame)
    kazu_onedrive_exp1_cbcs = list(kazu_onedrive_exp1_cbc2gbc_mapping.keys())
    kazu_onedrive_exp2_cbc2gbc_mapping = get_mapping(exp2_cbc_gbc_frame)
    kazu_onedrive_exp2_cbcs = list(kazu_onedrive_exp2_cbc2gbc_mapping.keys())

    ann_obj = anndata.read_loom(os.path.join(dir_path, 'possorted_genome_bam_RIG79.loom'))
    adata_cbc_codes = [x[x.find(':') + 1:] for x in list(ann_obj.obs_names)]
    # ann_obj = anndata.read_h5ad(os.path.join(dir_path, 'adata.h5ad'))
    # adata_cbc_codes = list(ann_obj.obs_names)

    kazu_onedrive_exp1_barcodes = pd.read_csv(os.path.join(dir_path, 'barcodes.tsv'),
                                              header=None, delimiter='\t')[0]
    kazu_onedrive_exp1_barcodes = [s[:-2] for s in kazu_onedrive_exp1_barcodes]
    whitelist_barcodes = pd.read_csv(os.path.join(dir_path, '10xv2_whitelist.txt'),
                                     header=None, delimiter='\t')[0]

    print('sample outputs:')
    print(sorted(list(adata_cbc_codes))[:10])
    print(sorted(kazu_onedrive_exp1_cbcs)[:10])
    print(sorted(kazu_onedrive_exp1_barcodes)[:10])

    print('total obs (cbc) in ann data:', len(adata_cbc_codes))
    print('len of barcode onedrive file:', len(kazu_onedrive_exp1_barcodes))
    print('len of mapping onedrive file:', len(kazu_onedrive_exp1_cbcs))
    print('len of whitelist:', len(whitelist_barcodes))
    print('matched #cell barcodes between adata and kazu onedrive exp1:',
          len(set(adata_cbc_codes) & set(kazu_onedrive_exp1_cbcs)))
    print('matched #cell barcodes between adata and kazu onedrive exp2:',
          len(set(adata_cbc_codes) & set(kazu_onedrive_exp2_cbcs)))
    print('matched #cell barcodes between kazu onedrive barcodes and adata:',
          len(set(adata_cbc_codes) & set(kazu_onedrive_exp1_barcodes)))
    print('matched #cell barcodes between kazu onedrive barcodes and kazu onedrive CBC_GBC mapping:',
          len(set(kazu_onedrive_exp1_cbcs) & set(kazu_onedrive_exp1_barcodes)))
    print('matched #cell barcodes between whitelist barcodes and annData:',
          len(set(whitelist_barcodes) & set(adata_cbc_codes)))
    print('matched #cell barcodes between whitelist barcodes and kazu onedrive barcodes:',
          len(set(whitelist_barcodes) & set(kazu_onedrive_exp1_barcodes)))
    print('matched #cell barcodes between whitelist barcodes and kazu onedrive CBC_GBC mapping:',
          len(set(kazu_onedrive_exp1_cbcs) & set(whitelist_barcodes)))
def read_adata(self, filesystem, path):
    path_lc = path.lower()
    path_lc = path_lc.rstrip("/")
    if path_lc.endswith(".loom"):
        adata = anndata.read_loom(filesystem.open(path))
    elif path_lc.endswith(".zarr"):
        adata = anndata.read_zarr(filesystem.get_mapper(path))
    elif path_lc.endswith(".tsv"):
        adata = read_star_fusion_file(filesystem.open(path))
    elif path_lc.endswith(".rds"):  # Seurat, convert to h5ad
        h5_file = path + ".h5ad"
        import os
        if (not os.path.exists(h5_file)
                or abs(os.path.getmtime(h5_file) - os.path.getmtime(path)) > 0.00001):
            import shutil
            import subprocess
            import pkg_resources
            print("Converting Seurat object")
            if os.path.exists(h5_file):
                os.remove(h5_file)
            subprocess.check_call([
                "Rscript",
                pkg_resources.resource_filename("cirrocumulus", "seurat2h5ad.R"),
                path,
                h5_file,
            ])
            shutil.copystat(path, h5_file)
        adata = anndata.read_h5ad(h5_file, backed="r" if self.backed else None)
        if adata.raw is not None and adata.shape[0] == adata.raw.shape[0]:
            print("Using adata.raw")
            adata = anndata.AnnData(X=adata.raw.X, var=adata.raw.var, obs=adata.obs,
                                    obsm=adata.obsm, uns=adata.uns)
    else:
        if self.backed:
            adata = anndata.read_h5ad(path, backed="r")
        else:
            adata = anndata.read_h5ad(filesystem.open(path))
    if "module" in adata.uns:
        adata.uns[ADATA_MODULE_UNS_KEY] = anndata.AnnData(
            X=adata.uns["module"]["X"], var=adata.uns["module"]["var"])
    return adata
def test_readwrite_loom(typ, tmp_path):
    X = typ(X_list)
    adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict)
    adata_src.obsm['X_a'] = np.zeros((adata_src.n_obs, 2))
    adata_src.varm['X_b'] = np.zeros((adata_src.n_vars, 3))
    adata_src.write_loom(tmp_path / 'test.loom', write_obsm_varm=True)

    adata = ad.read_loom(tmp_path / 'test.loom', sparse=typ is csr_matrix)
    if isinstance(X, np.ndarray):
        assert np.allclose(adata.X, X)
    else:
        # TODO: this should not be necessary
        assert np.allclose(adata.X.toarray(), X.toarray())
    assert 'X_a' in adata.obsm_keys() and adata.obsm['X_a'].shape[1] == 2
    assert 'X_b' in adata.varm_keys() and adata.varm['X_b'].shape[1] == 3
def load_file(filepath):
    t_flag = False
    if filepath == 'default' or filepath == 'datasets/user_uploaded/default':
        filepath = join_root("../datasets/default.csv")
        t_flag = True
    elif filepath == 'test':
        filepath = join_root('../../datasets/server/testdataset.h5ad')

    dataset = os.path.basename(filepath)
    dataset = os.path.splitext(dataset)[0]

    try:
        if filepath[-4:] == 'h5ad':
            adata = anndata.read_h5ad(filepath)
        if filepath[-3:] == 'csv':
            adata = anndata.read_csv(filepath)
            if t_flag:
                adata = adata.T
        if filepath[-4:] == 'xlsx':
            adata = anndata.read_excel(filepath)
        if filepath[-3:] == 'mtx':
            adata = anndata.read_mtx(filepath)
        if filepath[-3:] == 'txt' or filepath[-3:] == 'tab' or filepath[-4:] == 'data':
            adata = anndata.read_text(filepath)
        if filepath[-2:] == 'h5':
            adata = anndata.read_hdf(filepath)
        if filepath[-4:] == 'loom':
            adata = anndata.read_loom(filepath)
    except Exception as e:
        print(str(e))
        raise IncorrectFileFormat("File does not exist or file format is incorrect.")

    # Make sure cluster names are in proper format
    if 'cluster_names' in adata.uns:
        adata.uns['cluster_names'] = bidict(adata.uns['cluster_names'])
        for key in list(adata.uns['cluster_names'].keys()):
            adata.uns['cluster_names'][int(key)] = \
                adata.uns['cluster_names'].pop(key, None)

    adata.uns['dataset'] = dataset
    return adata
def fetch_anndata(path, from_gcs):
    """Reads the input data and turns it into an anndata.AnnData object."""
    _, ext = os.path.splitext(path)

    # AnnData is based on HDF5 and doesn't have GCS file handlers,
    # so we have to locally copy the file before reading it.
    if from_gcs:
        with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
            tmp_path = tmp_file.name
        tf.io.gfile.copy(path, tmp_path, overwrite=True)
        path = tmp_path

    if ext == '.h5ad':
        adata = anndata.read_h5ad(path)
    elif ext == '.loom':
        adata = anndata.read_loom(path)
    else:
        raise app.UsageError('Only supports loom and h5ad files.')

    return adata
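# Hypothetical usage sketch for fetch_anndata above; the gs:// path is a
# placeholder. With from_gcs=True the file is first staged to a local temp
# file because anndata cannot read HDF5 directly from GCS.
adata = fetch_anndata('gs://my-bucket/data/sample.loom', from_gcs=True)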
def get_data(dataset):
    """Download scycle example dataset

    Parameters
    ----------
    dataset: str
        dataset is a string with the name of the dataset to be downloaded.
        Must be one of: 'CHLA9' or 'sc200_CCLE'
    """
    #-- Get cache location
    cache_dir = os.path.dirname(os.path.realpath(__file__))

    #-- Check if cached, download otherwise
    #------ CHLA9
    if dataset == 'CHLA9':
        fname = cache_dir + '/chla9.h5ad'
        if 'chla9.h5ad' not in os.listdir(cache_dir):
            print('-- Downloading CHLA9 data from Xfer...')
            _download_scdata(url_chla9, fname)
            print('-- Download concluded.')
    elif dataset == 'sc200_CCLE':
        fname = cache_dir + '/sc200_ccle.h5ad'
        if 'sc200_ccle.h5ad' not in os.listdir(cache_dir):
            print('-- Downloading sc200_ccle data from Xfer...')
            _download_scdata(url_sc200, fname)
            print('-- Download concluded.')
    else:
        print("Dataset not in list of supported datasets. Must be one of:" + ', '.join(datasets))
        return None

    #-- Load from cache
    print('-- Loading data from cache...')
    if len(re.findall('loom$', fname)) > 0:
        scdata = anndata.read_loom(fname)
        scdata.var_names_make_unique()
        print('Done.')
    elif len(re.findall('h5ad$', fname)) > 0:
        scdata = anndata.read_h5ad(fname)
    return scdata
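# Usage sketch for get_data above, using one of the two dataset names the
# function documents; downloads on the first call, then loads from the
# module's cache directory on subsequent calls.
scdata = get_data('CHLA9')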
def read_adata(path, backed=False, spatial_directory=None, use_raw=False):
    import anndata
    adata = (anndata.read_loom(path) if path.lower().endswith('.loom')
             else anndata.read(path, backed=backed))
    if use_raw and adata.raw is not None and adata.shape[0] == adata.raw.shape[0]:
        logger.info('Using adata.raw')
        adata = anndata.AnnData(X=adata.raw.X, var=adata.raw.var, obs=adata.obs,
                                obsm=adata.obsm, uns=adata.uns)
    if spatial_directory is not None:
        if not add_spatial(adata, spatial_directory):
            print('No spatial data found in {}'.format(spatial_directory))
    if not backed:
        if scipy.sparse.issparse(adata.X) and scipy.sparse.isspmatrix_csr(adata.X):
            adata.X = adata.X.tocsc()

    def fix_column_names(df):
        rename = {}
        for c in df.columns:
            if c.find(' ') != -1:
                rename[c] = c.replace(' ', '_')
        return df.rename(rename, axis=1) if len(rename) > 0 else df

    adata.obs = fix_column_names(adata.obs)
    adata.var = fix_column_names(adata.var)
    for field in categorical_fields_convert:
        if field in adata.obs and not pd.api.types.is_categorical_dtype(adata.obs[field]):
            logger.info('Converting {} to categorical'.format(field))
            adata.obs[field] = adata.obs[field].astype('category')
    # copy the keys first, since obsm is mutated inside the loop
    for key in list(adata.obsm.keys()):
        if key.find(' ') != -1:
            new_key = key.replace(' ', '_')
            adata.obsm[new_key] = adata.obsm[key]
            del adata.obsm[key]
    return adata
def _read(filename, backed=False, sheet=None, ext=None, delimiter=None,
          first_column_names=None, backup_url=None, cache=False,
          suppress_cache_warning=False, **kwargs):
    if ext is not None and ext not in avail_exts:
        raise ValueError('Please provide one of the available extensions.\n'
                         + avail_exts)
    else:
        ext = is_valid_filename(filename, return_ext=True)
    is_present = check_datafile_present_and_download(filename, backup_url=backup_url)
    if not is_present:
        logg.msg('... did not find original file', filename)
    # read hdf5 files
    if ext in {'h5', 'h5ad'}:
        if sheet is None:
            return read_h5ad(filename, backed=backed)
        else:
            logg.msg('reading sheet', sheet, 'from file', filename, v=4)
            return read_hdf(filename, sheet)
    # read other file types
    filename_cache = (settings.cachedir
                      + filename.lstrip('./').replace('/', '-').replace('.' + ext, '.h5ad'))
    if filename_cache.endswith('.gz'):
        filename_cache = filename_cache[:-3]
    if filename_cache.endswith('.bz2'):
        filename_cache = filename_cache[:-4]
    if cache and os.path.exists(filename_cache):
        logg.info('... reading from cache file', filename_cache)
        adata = read_h5ad(filename_cache, backed=False)
    else:
        if not is_present:
            raise FileNotFoundError('Did not find file {}.'.format(filename))
        logg.msg('reading', filename, v=4)
        if not cache and not suppress_cache_warning:
            logg.hint('This might be very slow. Consider passing `cache=True`, '
                      'which enables much faster reading from a cache file.')
        # do the actual reading
        if ext == 'xlsx' or ext == 'xls':
            if sheet is None:
                raise ValueError('Provide `sheet` parameter when reading \'.xlsx\' files.')
            else:
                adata = read_excel(filename, sheet)
        elif ext in {'mtx', 'mtx.gz'}:
            adata = read_mtx(filename)
        elif ext == 'csv':
            adata = read_csv(filename, first_column_names=first_column_names)
        elif ext in {'txt', 'tab', 'data', 'tsv'}:
            if ext == 'data':
                logg.msg('... assuming \'.data\' means tab or white-space '
                         'separated text file', v=3)
                logg.hint('change this by passing `ext` to sc.read')
            adata = read_text(filename, delimiter, first_column_names)
        elif ext == 'soft.gz':
            adata = _read_softgz(filename)
        elif ext == 'loom':
            adata = read_loom(filename=filename, **kwargs)
        else:
            raise ValueError('Unknown extension {}.'.format(ext))
        if cache:
            logg.info('... writing an', settings.file_format_data,
                      'cache file to speedup reading next time')
            if not os.path.exists(os.path.dirname(filename_cache)):
                os.makedirs(os.path.dirname(filename_cache))
            # write for faster reading when calling the next time
            adata.write(filename_cache)
    return adata
import sys

import numpy as np
import pandas as pd
import anndata
import loompy
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.insert(0, '/home/fabio/university/postdoc/northstar/build/lib')
import northstar


if __name__ == '__main__':

    print('Load GBM data')
    fdn_gbm = '../data/GBM_data_and_metadata/'
    fn_loom_gbm = fdn_gbm + 'GBM_data.loom'
    adata_gbm = anndata.read_loom(fn_loom_gbm, sparse=False)
    adata_gbm.var_names = adata_gbm.var['GeneName']
    adata_gbm.X = 1e6 * (adata_gbm.X.T / adata_gbm.X.sum(axis=1)).T
    adata_gbm.obs['CellType'] = adata_gbm.obs['Cell_type']

    print('Load Darmanis atlas')
    af = northstar.AtlasFetcher()
    adata_dmnf = af.fetch_atlas('Darmanis_2015_nofetal', kind='subsample')

    print('Load Velmeshev autism atlas landmarks')
    fdn = '../data/Autism/'
    fn_loom = fdn + 'subsample.loom'
    adata = anndata.read_loom(fn_loom, sparse=False)
    adata.X = adata.X * 100
    adata.var_names = adata.var['GeneName']
    adata.obs['CellType'] = adata.obs['cluster']
def _read(
    filename: Path,
    backed=None,
    sheet=None,
    ext=None,
    delimiter=None,
    first_column_names=None,
    backup_url=None,
    cache=False,
    cache_compression=None,
    suppress_cache_warning=False,
    **kwargs,
):
    if ext is not None and ext not in avail_exts:
        raise ValueError('Please provide one of the available extensions.\n'
                         f'{avail_exts}')
    else:
        ext = is_valid_filename(filename, return_ext=True)
    is_present = _check_datafile_present_and_download(
        filename,
        backup_url=backup_url,
    )
    if not is_present:
        logg.debug(f'... did not find original file {filename}')
    # read hdf5 files
    if ext in {'h5', 'h5ad'}:
        if sheet is None:
            return read_h5ad(filename, backed=backed)
        else:
            logg.debug(f'reading sheet {sheet} from file {filename}')
            return read_hdf(filename, sheet)
    # read other file types
    path_cache = settings.cachedir / _slugify(filename).replace(
        '.' + ext, '.h5ad')  # type: Path
    if path_cache.suffix in {'.gz', '.bz2'}:
        path_cache = path_cache.with_suffix('')
    if cache and path_cache.is_file():
        logg.info(f'... reading from cache file {path_cache}')
        return read_h5ad(path_cache)

    if not is_present:
        raise FileNotFoundError(f'Did not find file {filename}.')
    logg.debug(f'reading {filename}')
    if not cache and not suppress_cache_warning:
        logg.hint('This might be very slow. Consider passing `cache=True`, '
                  'which enables much faster reading from a cache file.')
    # do the actual reading
    if ext == 'xlsx' or ext == 'xls':
        if sheet is None:
            raise ValueError("Provide `sheet` parameter when reading '.xlsx' files.")
        else:
            adata = read_excel(filename, sheet)
    elif ext in {'mtx', 'mtx.gz'}:
        adata = read_mtx(filename)
    elif ext == 'csv':
        adata = read_csv(filename, first_column_names=first_column_names)
    elif ext in {'txt', 'tab', 'data', 'tsv'}:
        if ext == 'data':
            logg.hint("... assuming '.data' means tab or white-space "
                      'separated text file')
            logg.hint('change this by passing `ext` to sc.read')
        adata = read_text(filename, delimiter, first_column_names)
    elif ext == 'soft.gz':
        adata = _read_softgz(filename)
    elif ext == 'loom':
        adata = read_loom(filename=filename, **kwargs)
    else:
        raise ValueError(f'Unknown extension {ext}.')
    if cache:
        logg.info(f'... writing an {settings.file_format_data} '
                  'cache file to speedup reading next time')
        if cache_compression is _empty:
            cache_compression = settings.cache_compression
        if not path_cache.parent.is_dir():
            path_cache.parent.mkdir(parents=True)
        # write for faster reading when calling the next time
        adata.write(path_cache, compression=cache_compression)
    return adata
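# Hypothetical usage sketch: _read above backs scanpy's public sc.read(),
# which dispatches on the file extension; per the loom branch above, extra
# keyword arguments such as sparse are forwarded to read_loom for ".loom"
# files. The file name is a placeholder.
import scanpy as sc
adata = sc.read('sample.loom', sparse=True, cache=False)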
import sys

import numpy as np
import pandas as pd
import anndata
import loompy
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.insert(0, '/home/fabio/university/postdoc/northstar/build/lib')
import northstar


if __name__ == '__main__':

    print('Load autism data (subsampled)')
    fdn = '../data/Autism/'
    fn_loom = fdn + 'subsample_control.loom'
    adata = anndata.read_loom(fn_loom, sparse=False)
    adata.X = adata.X * 100
    adata.var_names = adata.var['GeneName']
    adata.obs['CellType'] = adata.obs['cluster']
    adata.obs['CellType'].replace(
        {
            'L2/3': 'Neuron',
            'L5/6': 'Neuron',
            'L4': 'Neuron',
            'L5/6-CC': 'Neuron',
            'IN-VIP': 'Neuron',
            'IN-PV': 'Neuron',
            'IN-SV2C': 'Neuron',
            'IN-SST': 'Neuron',
            'Neu-NRGN-I': 'Neuron',
            'Neu-NRGN-II': 'Neuron',
def read_loom_to_anndata(ds_file: Path):
    """Reads a dataset in the loom format into the AnnData format."""
    adata = anndata.read_loom(ds_file)
    return adata