def _read(filename, backed=False, sheet=None, ext=None, delimiter=None,
          first_column_names=None, backup_url=None, cache=False,
          suppress_cache_warning=False):
    if ext is not None and ext not in avail_exts:
        raise ValueError('Please provide one of the available extensions.\n'
                         f'{avail_exts}')
    else:
        ext = is_valid_filename(filename, return_ext=True)
    is_present = check_datafile_present_and_download(filename,
                                                     backup_url=backup_url)
    if not is_present:
        logg.msg('... did not find original file', filename)
    # read hdf5 files
    if ext in {'h5', 'h5ad'}:
        if sheet is None:
            return read_h5ad(filename, backed=backed)
        else:
            logg.msg('reading sheet', sheet, 'from file', filename, v=4)
            return read_hdf(filename, sheet)
    # read other file types
    filename_cache = (settings.cachedir + filename.lstrip('./')
                      .replace('/', '-').replace('.' + ext, '.h5ad'))
    if cache and os.path.exists(filename_cache):
        logg.info('... reading from cache file', filename_cache)
        adata = read_h5ad(filename_cache, backed=False)
    else:
        if not is_present:
            raise FileNotFoundError('Did not find file {}.'.format(filename))
        logg.msg('reading', filename, v=4)
        if not cache and not suppress_cache_warning:
            logg.hint('This might be very slow. Consider passing `cache=True`, '
                      'which enables much faster reading from a cache file.')
        # do the actual reading
        if ext == 'xlsx' or ext == 'xls':
            if sheet is None:
                raise ValueError(
                    'Provide `sheet` parameter when reading \'.xlsx\' files.')
            else:
                adata = read_excel(filename, sheet)
        elif ext == 'mtx':
            adata = read_mtx(filename)
        elif ext == 'csv':
            adata = read_csv(filename, first_column_names=first_column_names)
        elif ext in {'txt', 'tab', 'data', 'tsv'}:
            if ext == 'data':
                logg.msg('... assuming \'.data\' means tab or white-space '
                         'separated text file', v=3)
                logg.hint('change this by passing `ext` to sc.read')
            adata = read_text(filename, delimiter, first_column_names)
        elif ext == 'soft.gz':
            adata = _read_softgz(filename)
        else:
            raise ValueError('Unknown extension {}.'.format(ext))
        if cache:
            logg.info('... writing an', settings.file_format_data,
                      'cache file to speedup reading next time')
            if not os.path.exists(os.path.dirname(filename_cache)):
                os.makedirs(os.path.dirname(filename_cache))
            # write for faster reading when calling the next time
            adata.write(filename_cache)
    return adata
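# Hedged usage sketch for the dispatcher above: in scanpy this logic backs the
# public sc.read. The file paths below are hypothetical, assuming scanpy is
# installed.
import scanpy as sc

adata = sc.read('data/pbmc_counts.csv', cache=True)  # writes an .h5ad cache on first read
adata = sc.read('data/pbmc_counts.txt', delimiter='\t', first_column_names=True)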
def load_file(filepath):
    if filepath == 'default' or filepath == 'datasets/user_uploaded/default':
        filepath = join_root("../datasets/default.csv")
    elif filepath == 'test':
        filepath = join_root('../../datasets/server/testdataset.h5ad')
    dataset = os.path.basename(filepath)
    dataset = os.path.splitext(dataset)[0]
    try:
        if filepath[-4:] == 'h5ad':
            adata = anndata.read_h5ad(filepath)
        if filepath[-3:] == 'csv':
            # TODO remove transpose
            adata = anndata.read_csv(filepath).T
        if filepath[-4:] == 'xlsx':
            adata = anndata.read_excel(filepath)
        if filepath[-3:] == 'mtx':
            adata = anndata.read_mtx(filepath)
        if filepath[-3:] == 'txt' or filepath[-3:] == 'tab' \
                or filepath[-4:] == 'data':
            adata = anndata.read_text(filepath)
        if filepath[-2:] == 'h5':
            adata = anndata.read_hdf(filepath)
        if filepath[-4:] == 'loom':
            adata = anndata.read_loom(filepath)
    except Exception as e:
        print(str(e))
        raise IncorrectFileFormat(
            "File does not exist or file format is incorrect.")
    adata.uns['dataset'] = dataset
    return adata
def load_data(path, dtype='dge'):
    if dtype == 'dge':
        dataset = ad.read_text(path)
    elif dtype == '10x':
        dataset = sc.read_10x_mtx(path, var_names='gene_symbols', cache=True)
    else:
        # guard against returning an unbound name for unrecognized dtypes
        raise ValueError(f"Unknown dtype: {dtype!r}")
    return dataset
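# Hypothetical usage of load_data above: 'dge' expects a dense text count
# matrix readable by anndata.read_text, '10x' a CellRanger mtx directory for
# sc.read_10x_mtx. Both paths are made up for illustration.
adata_dge = load_data('counts_dge.txt', dtype='dge')
adata_10x = load_data('filtered_feature_bc_matrix/', dtype='10x')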
def load_matrix(self):
    if self.raw_filename[-5:] == '.h5ad':
        adata = sc.read_h5ad(
            os.path.join(self.data_root, self.raw_filename))
    elif self.raw_filename[-4:] == '.tsv':
        adata = ad.read_text(os.path.join(self.data_root, self.raw_filename),
                             delimiter='\t', first_column_names=True,
                             dtype='int')
    else:
        raise ImportError("Input format error!")
    return adata
def load_file(filepath):
    t_flag = False
    if filepath == 'default' or filepath == 'datasets/user_uploaded/default':
        filepath = join_root("../datasets/default.csv")
        t_flag = True
    elif filepath == 'test':
        filepath = join_root('../../datasets/server/testdataset.h5ad')
    dataset = os.path.basename(filepath)
    dataset = os.path.splitext(dataset)[0]
    try:
        if filepath[-4:] == 'h5ad':
            adata = anndata.read_h5ad(filepath)
        if filepath[-3:] == 'csv':
            adata = anndata.read_csv(filepath)
            if t_flag:
                adata = adata.T
        if filepath[-4:] == 'xlsx':
            adata = anndata.read_excel(filepath)
        if filepath[-3:] == 'mtx':
            adata = anndata.read_mtx(filepath)
        if filepath[-3:] == 'txt' or filepath[-3:] == 'tab' \
                or filepath[-4:] == 'data':
            adata = anndata.read_text(filepath)
        if filepath[-2:] == 'h5':
            adata = anndata.read_hdf(filepath)
        if filepath[-4:] == 'loom':
            adata = anndata.read_loom(filepath)
    except Exception as e:
        print(str(e))
        raise IncorrectFileFormat(
            "File does not exist or file format is incorrect.")
    # Make sure cluster names are in proper format
    if 'cluster_names' in adata.uns:
        adata.uns['cluster_names'] = bidict(adata.uns['cluster_names'])
        for key in list(adata.uns['cluster_names'].keys()):
            adata.uns['cluster_names'][int(key)] = \
                adata.uns['cluster_names'].pop(key, None)
    adata.uns['dataset'] = dataset
    return adata
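# Sketch of how load_file above might be exercised; 'default' and 'test' are
# the two special-cased inputs, anything else is dispatched on its extension.
# The uploaded filename is hypothetical.
adata = load_file('test')                        # resolves to the bundled .h5ad
adata = load_file('datasets/user_uploaded/sample.loom')
print(adata.uns['dataset'])                      # basename without extension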
def test_read_tsv_iter():
    with (HERE / "adata-comments.tsv").open() as f:
        adata = ad.read_text(f, "\t")
    assert adata.obs_names.tolist() == ["r1", "r2", "r3"]
    assert adata.var_names.tolist() == ["c1", "c2"]
    assert adata.X.tolist() == X_list
def test_read_tsv_strpath():
    adata = ad.read_text(str(HERE / "adata-comments.tsv"), "\t")
    assert adata.obs_names.tolist() == ["r1", "r2", "r3"]
    assert adata.var_names.tolist() == ["c1", "c2"]
    assert adata.X.tolist() == X_list
def _read(
    filename: Path,
    backed=None,
    sheet=None,
    ext=None,
    delimiter=None,
    first_column_names=None,
    backup_url=None,
    cache=False,
    cache_compression=None,
    suppress_cache_warning=False,
    **kwargs,
):
    if ext is not None and ext not in avail_exts:
        raise ValueError('Please provide one of the available extensions.\n'
                         f'{avail_exts}')
    else:
        ext = is_valid_filename(filename, return_ext=True)
    is_present = _check_datafile_present_and_download(
        filename,
        backup_url=backup_url,
    )
    if not is_present:
        logg.debug(f'... did not find original file {filename}')
    # read hdf5 files
    if ext in {'h5', 'h5ad'}:
        if sheet is None:
            return read_h5ad(filename, backed=backed)
        else:
            logg.debug(f'reading sheet {sheet} from file {filename}')
            return read_hdf(filename, sheet)
    # read other file types
    path_cache = settings.cachedir / _slugify(filename).replace(
        '.' + ext, '.h5ad')  # type: Path
    if path_cache.suffix in {'.gz', '.bz2'}:
        path_cache = path_cache.with_suffix('')
    if cache and path_cache.is_file():
        logg.info(f'... reading from cache file {path_cache}')
        return read_h5ad(path_cache)

    if not is_present:
        raise FileNotFoundError(f'Did not find file {filename}.')
    logg.debug(f'reading {filename}')
    if not cache and not suppress_cache_warning:
        logg.hint('This might be very slow. Consider passing `cache=True`, '
                  'which enables much faster reading from a cache file.')
    # do the actual reading
    if ext == 'xlsx' or ext == 'xls':
        if sheet is None:
            raise ValueError(
                "Provide `sheet` parameter when reading '.xlsx' files.")
        else:
            adata = read_excel(filename, sheet)
    elif ext in {'mtx', 'mtx.gz'}:
        adata = read_mtx(filename)
    elif ext == 'csv':
        adata = read_csv(filename, first_column_names=first_column_names)
    elif ext in {'txt', 'tab', 'data', 'tsv'}:
        if ext == 'data':
            logg.hint(
                "... assuming '.data' means tab or white-space "
                'separated text file',
            )
            logg.hint('change this by passing `ext` to sc.read')
        adata = read_text(filename, delimiter, first_column_names)
    elif ext == 'soft.gz':
        adata = _read_softgz(filename)
    elif ext == 'loom':
        adata = read_loom(filename=filename, **kwargs)
    else:
        raise ValueError(f'Unknown extension {ext}.')
    if cache:
        logg.info(f'... writing an {settings.file_format_data} '
                  'cache file to speedup reading next time')
        if cache_compression is _empty:
            cache_compression = settings.cache_compression
        if not path_cache.parent.is_dir():
            path_cache.parent.mkdir(parents=True)
        # write for faster reading when calling the next time
        adata.write(path_cache, compression=cache_compression)
    return adata
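# Hedged sketch of the caching path in the newer _read above, via the public
# sc.read wrapper; the filename is hypothetical.
import scanpy as sc

# First call parses the text file and writes a compressed .h5ad cache;
# subsequent calls hit the cache and return much faster.
adata = sc.read('data/counts.tsv', cache=True, cache_compression='gzip')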
def load_matrix(self):
    adata = ad.read_text(os.path.join(self.data_root, self.raw_filename),
                         delimiter='\t', first_column_names=True,
                         dtype='int')
    return adata
def test_read_tsv_iter():
    with (HERE / 'adata-comments.tsv').open() as f:
        adata = ad.read_text(f, '\t')
    assert adata.obs_names.tolist() == ['r1', 'r2', 'r3']
    assert adata.var_names.tolist() == ['c1', 'c2']
    assert adata.X.tolist() == X_list
def test_read_tsv_strpath():
    adata = ad.read_text(str(HERE / 'adata-comments.tsv'), '\t')
    assert adata.obs_names.tolist() == ['r1', 'r2', 'r3']
    assert adata.var_names.tolist() == ['c1', 'c2']
    assert adata.X.tolist() == X_list
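# The read_text tests above assume a small tab-separated fixture shaped like
# the sketch below (values shown as ... since the real adata-comments.tsv
# contents aren't reproduced here): leading comment lines are skipped, the
# header row supplies var_names, and the first column supplies obs_names.
#
#   # a comment line
#   c1    c2
#   r1    ...    ...
#   r2    ...    ...
#   r3    ...    ...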
def load_shareseq_data(tissue: str, dirname: str, mode: str = "RNA") -> AnnData:
    """Load the SHAREseq data"""
    assert os.path.isdir(dirname)
    atac_fname_dict = {
        "skin": [
            "GSM4156597_skin.late.anagen.barcodes.txt.gz",
            "GSM4156597_skin.late.anagen.counts.txt.gz",
            "GSM4156597_skin.late.anagen.peaks.bed.gz",
        ],
        "brain": [
            "GSM4156599_brain.barcodes.txt.gz",
            "GSM4156599_brain.counts.txt.gz",
            "GSM4156599_brain.peaks.bed.gz",
        ],
        "lung": [
            "GSM4156600_lung.barcodes.txt.gz",
            "GSM4156600_lung.counts.txt.gz",
            "GSM4156600_lung.peaks.bed.gz",
        ],
    }
    rna_fname_dict = {
        "skin": "GSM4156608_skin.late.anagen.rna.counts.txt.gz",
        "brain": "GSM4156610_brain.rna.counts.txt.gz",
        "lung": "GSM4156611_lung.rna.counts.txt.gz",
    }
    assert atac_fname_dict.keys() == rna_fname_dict.keys()
    assert tissue in atac_fname_dict.keys(), f"Unrecognized tissue: {tissue}"
    atac_barcodes_fname, atac_counts_fname, atac_peaks_fname = atac_fname_dict[tissue]
    assert "barcodes" in atac_barcodes_fname  # Check fnames are unpacked correctly
    assert "counts" in atac_counts_fname
    assert "peaks" in atac_peaks_fname
    atac_cell_barcodes = pd.read_csv(
        os.path.join(dirname, atac_barcodes_fname),
        delimiter="\t",
        index_col=0,
        header=None,
    )
    atac_cell_barcodes.index = [
        i.replace(",", ".") for i in atac_cell_barcodes.index
    ]
    # Load in RNA data
    if mode == "RNA":
        retval = ad.read_text(os.path.join(dirname, rna_fname_dict[tissue])).T
        # Ensure that we return a sparse matrix as the underlying datatype
        retval.X = scipy.sparse.csr_matrix(retval.X)
        # Fix formatting of obs names where commas were used for periods
        retval.obs.index = [i.replace(",", ".") for i in retval.obs.index]
        intersected_barcodes = [
            bc for bc in retval.obs_names
            if bc in set(atac_cell_barcodes.index)
        ]
        assert intersected_barcodes, f"No common barcodes between RNA/ATAC for {tissue}"
        logging.info(
            f"RNA {tissue} intersects {len(intersected_barcodes)}/{len(retval.obs_names)} barcodes with ATAC"
        )
        retval = retval[intersected_barcodes]
    elif mode == "ATAC":
        # Load in ATAC data
        # read_mtx automatically gives us a sparse matrix
        retval = ad.read_mtx(os.path.join(dirname, atac_counts_fname)).T
        # Attach metadata
        retval.obs = atac_cell_barcodes
        atac_peaks = pd.read_csv(
            os.path.join(dirname, atac_peaks_fname),
            delimiter="\t",
            header=None,
            names=["chrom", "start", "end"],
        )
        atac_peaks.index = [
            f"{c}:{s}-{e}" for _i, c, s, e in atac_peaks.itertuples()
        ]
        retval.var = atac_peaks
    else:
        raise ValueError("mode must be either RNA or ATAC")
    assert isinstance(retval.X, scipy.sparse.csr_matrix)
    return retval
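# Hypothetical call into load_shareseq_data above, assuming the GSM*.txt.gz /
# *.bed.gz files referenced in the dicts have been downloaded into share_dir.
share_dir = "/path/to/shareseq"              # made-up location
rna = load_shareseq_data("skin", share_dir, mode="RNA")
atac = load_shareseq_data("skin", share_dir, mode="ATAC")
print(rna.shape, atac.shape)                 # cells x genes, cells x peaks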
df_csv = pd.read_csv(file_csv, index_col=0)
df_csv = df_csv.T
colnames = [x.replace('-', ':', 1) for x in df_csv.columns]
df_csv.columns = colnames
file_count_tsv = os.path.join(path_data_root, 'counts.tsv')
df_csv.to_csv(file_count_tsv, sep='\t')
file_meta_csv = os.path.join(path_human_brain,
                             'GSM5289636_s3atac.hg38.metadata.csv')
df_meta_csv = pd.read_csv(file_meta_csv, index_col=0)
df_meta_csv = df_meta_csv.loc[:, ['cellID', 'celltype']]
file_meta_tsv = os.path.join(path_data_root, 'metadata.tsv')
df_meta_csv.to_csv(file_meta_tsv, sep='\t')

adata = ad.read_text(file_count_tsv, delimiter='\t',
                     first_column_names=True, dtype='int')
df_meta = pd.read_csv(file_meta_tsv, sep='\t', index_col=0)
adata.obs['celltype'] = df_meta.loc[adata.obs.index, 'celltype']

print(np.max(adata.X))
if np.max(adata.X) > 1:
    epi.pp.binarize(adata)
    print(np.max(adata.X))
epi.pp.filter_cells(adata, min_features=1)
epi.pp.filter_features(adata, min_cells=1)

# QC
adata.obs['log_nb_features'] = [np.log10(x) for x in adata.obs['nb_features']]
epi.pl.violin(adata, ['nb_features'])
epi.pl.violin(adata, ['log_nb_features'])
epi.pp.coverage_cells(adata,
#     cell_type_column='cell_ontology_class',
#     n_cells=nct,
# )
# asub2.to_df().to_csv(
#     '../data/for_scmap/TBS_kidney_newdata_subsample_100_counts_rep_{:}.tsv'.format(rep),
#     sep='\t', index=True)
# asub2.obs[['CellType']].to_csv(
#     '../data/for_scmap/TBS_kidney_newdata_subsample_100_metadata_rep_{:}.tsv'.format(rep),
#     sep='\t', index=True)

print('Include different subset of cell types in atlas')
nat = [14, 15, 16, 17]
csts = adata.obs['cell_ontology_class'].value_counts()
asub = anndata.read_text(
    '../data/for_scmap/TBS_kidney_atlas_subsample_20_counts.tsv',
    delimiter='\t',
)
asub.obs['CellType'] = pd.read_csv(
    '../data/for_scmap/TBS_kidney_atlas_subsample_20_metadata.tsv',
    sep='\t', index_col=0)
for na in nat:
    csti = csts.index[:na]
    idx = asub.obs['CellType'].isin(csti).values.nonzero()[0]
    asubr = asub[idx]
    asubr.to_df().to_csv(
        '../data/for_scmap/TBS_kidney_atlas_subsample_20_counts_na_{:}.tsv'
        .format(na),
        sep='\t',