def test_read_10x_mtx():
    """Smoke-test ``sc.read_10x_mtx`` on the two sample directories under ROOT.

    Each directory is read with gene symbols as variable names and caching
    enabled; nothing is asserted beyond the reads not raising.
    """
    sample_layouts = (
        ('1.2.0', 'filtered_gene_bc_matrices', 'hg19_chr21'),
        ('3.0.0', 'filtered_feature_bc_matrix'),
    )
    for path_parts in sample_layouts:
        sample_dir = os.path.join(ROOT, *path_parts)
        sc.read_10x_mtx(sample_dir, var_names='gene_symbols', cache=True)
def create_scanpy_adata_basic(self, assay="counts", sample_key=None):
    """Read the filtered matrices with scanpy and return the result untouched.

    The directory comes from ``self.filtered_matrices()`` and is read with
    ``make_unique=True``.  ``assay`` and ``sample_key`` are accepted but not
    used by this method.
    """
    matrix_dir = self.filtered_matrices()
    return sc.read_10x_mtx(matrix_dir, make_unique=True)
def readData(self, countsFile=""):
    """Load a counts matrix into ``self.data`` and run ``self.preprocess()``.

    ``countsFile`` may be:

    * a directory holding ``matrix.mtx``, ``barcodes.tsv`` and ``genes.tsv``
      (read with ``sc.read_10x_mtx``).  Any ``*.gz`` member is decompressed
      with ``gunzip`` first, and ``features.tsv`` is renamed to ``genes.tsv``;
    * an ``.h5ad`` file (read with ``sc.read``);
    * any other file, which is read with ``sc.read_csv`` and transposed.

    When ``countsFile`` is empty the previously stored ``self.CountsFile`` is
    used; otherwise ``self.CountsFile`` is updated to the new path.

    Returns:
        ``""`` when no path is available, when the directory lacks one of the
        three required files, or when the path does not exist.  ``None`` on
        success.
    """
    if countsFile == "":
        countsFile = self.CountsFile
    if countsFile == "":
        print("please input counts file path")
        return ""
    self.CountsFile = countsFile
    datapath = self.CountsFile

    if os.path.isdir(datapath):
        gz_paths = [
            os.path.join(datapath, entry)
            for entry in sorted(os.listdir(datapath))
            if entry.endswith(".gz")
        ]
        if gz_paths:
            # Argument list instead of a shell string: paths containing
            # spaces or shell metacharacters are passed through verbatim.
            try:
                subprocess.run(
                    ["gunzip"] + gz_paths,
                    stdin=subprocess.DEVNULL,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,
                )
            except OSError as err:
                # Best effort, as before: a failed decompression is caught by
                # the required-files check below.
                print("gunzip could not be run: {}".format(err))

        # Newer outputs name the gene table features.tsv; the reader call
        # below is only made once a genes.tsv is present.
        features_path = os.path.join(datapath, "features.tsv")
        if os.path.isfile(features_path):
            os.rename(features_path, os.path.join(datapath, "genes.tsv"))

        present = set(os.listdir(datapath))
        required = {"matrix.mtx", "barcodes.tsv", "genes.tsv"}
        if not required <= present:
            print("input data is not correct")
            return ""

        adata = sc.read_10x_mtx(datapath, var_names='gene_symbols')
        self.data = adata
        self.preprocess()
    elif os.path.isfile(datapath):
        if datapath.endswith(".h5ad"):
            adata = sc.read(datapath)
        else:
            # Delimited text is transposed after loading.
            adata = sc.read_csv(datapath)
            adata = adata.T
        self.data = adata
        self.preprocess()
    else:
        print("file or dir not exists")
        return ""
def create_scanpy_adata(self, sce, fast_load=True, assay="counts", high_var=False, subset=None):
    """Load the filtered matrices, restrict them to ``sce`` and compute embeddings.

    Steps, in order:

    1. Read ``self.filtered_matrices()`` with ``sc.read_10x_mtx``, make names
       unique and subset to highly variable genes (``cell_ranger`` flavor).
    2. For every symbol in ``sce.rowData["hgnc_symbol"]`` that is also in
       ``subset``, try alternative spellings until one is found in
       ``adata.var.index``: ``.`` replaced by ``-``, then the last ``-``
       replaced by ``.``, then everything from the first ``.`` dropped.
    3. Select those genes as columns and ``sce.colData["Barcode"]`` as rows.
    4. When ``high_var`` is true, keep only the genes flagged by a second
       ``highly_variable_genes`` call (``n_top_genes=1000``, ``n_bins=100``).
    5. Run PCA, neighbors, UMAP and t-SNE, each with ``copy=True``.

    Args:
        sce: object exposing ``colData["Barcode"]`` and
            ``rowData["hgnc_symbol"]``.
        fast_load: unused.
        assay: unused.
        high_var: apply the extra highly-variable-gene filter of step 4.
        subset: container of symbols to keep; ``None`` keeps every symbol
            of ``sce``.

    Returns:
        The object returned by the final ``sc.tl.tsne(..., copy=True)`` call.

    Raises:
        AssertionError: when the selected genes do not match the converted
            symbols, or when the highly-variable result has an unexpected
            length.
    """
    barcodes = sce.colData["Barcode"]
    _transcripts = sce.rowData["hgnc_symbol"]
    adata = sc.read_10x_mtx(self.filtered_matrices(), make_unique=True)
    adata.var_names_make_unique()
    adata.obs_names_make_unique()
    print(adata.X)
    sc.pp.highly_variable_genes(adata, flavor="cell_ranger", subset=True)

    transcripts = []
    # Identity test on purpose: array-like containers evaluate ``== None``
    # element-wise, which cannot be used as a plain condition.
    if subset is None:
        subset = _transcripts
    for symbol in _transcripts:
        if symbol not in subset:
            continue
        if symbol not in adata.var.index:
            # First fallback: dots written as hyphens.
            symbol = symbol.replace(".", "-")
            if symbol not in adata.var.index:
                # Second fallback: turn the last hyphen back into a dot.
                symbol = symbol.split("-")
                symbol = "-".join(symbol[:-1]) + ".{}".format(symbol[-1])
                if symbol not in adata.var.index:
                    # Last resort: keep only the part before the first dot.
                    symbol = symbol.split(".")[0]
        transcripts.append(symbol)

    adata.barcodes = pandas.read_csv(
        os.path.join(self.filtered_matrices(), 'barcodes.tsv'), header=None
    )[0]
    adata = adata[:, transcripts]
    assert set(adata.var.index) == set(transcripts), "Issues with symbol conversion."
    adata = adata[barcodes, :]
    adata.var_names_make_unique()

    if high_var:
        var_transcripts = sc.pp.highly_variable_genes(
            adata, flavor="cell_ranger", inplace=False, n_top_genes=1000, n_bins=100
        )
        assert len(var_transcripts) == len(adata.var.index)
        # Keep the gene names whose first result field is true.
        var_transcripts = [
            x[0] for x in zip(adata.var.index, var_transcripts) if x[1][0] == True
        ]
        adata = adata[:, var_transcripts]

    adata = sc.tl.pca(adata, copy=True)
    adata = sc.pp.neighbors(adata, copy=True)
    adata = sc.tl.umap(adata, copy=True)
    adata = sc.tl.tsne(adata, copy=True)
    return adata
def zheng():
    """Prepare the Zheng dataset.

    Source: "Massively parallel digital transcriptional profiling of single
    cells" by Zheng GX, et al., Nature Communications, 2017.

    Reads the filtered matrices, attaches the bulk labels as
    ``obs["bulk_labels"]``, hands the object to ``pr.read.process_clusts``,
    writes it out as ``.h5ad``, then creates random folds with
    ``pr.performance.FoldTester`` and saves them.
    """
    dataset = sc.read_10x_mtx("data/zheng/filtered_matrices_mex/hg19/")

    # One label row per cell; the file has no header line.
    label_table = pd.read_csv("data/zheng/zheng17_bulk_lables.txt", header=None)
    dataset.obs["bulk_labels"] = label_table.values
    pr.read.process_clusts(dataset, "bulk_labels")
    sc.write("data/zheng/fresh_68k_bulk_labels.h5ad", dataset)

    fold_tester = pr.performance.FoldTester(dataset)
    fold_tester.makefolds(random=True)
    fold_tester.savefolds("output/zheng_folds.npz")
def create_scanpy_adata(self, sce, fast_load=True, assay="counts", high_var=False, subset=None):
    """Read the filtered matrices and keep only the cells listed in ``sce``.

    Rows are selected by ``sce.colData["Barcode"]``.  The barcode file next
    to the matrices is attached to the loaded object as ``barcodes`` before
    the selection.  ``fast_load``, ``assay``, ``high_var`` and ``subset`` are
    accepted but not used here.
    """
    wanted_cells = sce.colData["Barcode"]
    # Looked up but not used further; kept so a missing column still raises.
    _symbols = sce.rowData["hgnc_symbol"]

    loaded = sc.read_10x_mtx(self.filtered_matrices(), make_unique=True)
    loaded.var_names_make_unique()
    loaded.obs_names_make_unique()

    barcode_file = os.path.join(self.filtered_matrices(), 'barcodes.tsv')
    barcode_table = pandas.read_csv(barcode_file, header=None)
    loaded.barcodes = barcode_table[0]

    return loaded[wanted_cells, :]
def get_genes(self, sce):
    """Return the gene symbols of ``sce`` spelled as in the loaded matrix.

    The matrix is read from ``self.filtered_h5()`` and, if that fails, from
    ``self.filtered_matrices()``.  Each symbol of
    ``sce.rowData["hgnc_symbol"]`` that is missing from ``adata.var.index``
    is rewritten step by step: ``.`` replaced by ``-``, then the last ``-``
    replaced by ``.``, then everything from the first ``.`` dropped.

    NOTE(review): the loop previously iterated over the empty result list
    and the unconverted input was returned, so the conversion never ran.
    Callers now receive a list of converted symbols - confirm none depended
    on the raw ``rowData`` object.

    Returns:
        list: one converted symbol per input symbol, in input order.
    """
    _transcripts = sce.rowData["hgnc_symbol"]
    try:
        adata = sc.read_10x_h5(self.filtered_h5(), genome=config.build)
    except Exception:
        # Deliberate best-effort fallback to the matrix directory.
        adata = sc.read_10x_mtx(self.filtered_matrices())

    transcripts = []
    for symbol in _transcripts:
        if symbol not in adata.var.index:
            # First fallback: dots written as hyphens.
            symbol = symbol.replace(".", "-")
            if symbol not in adata.var.index:
                # Second fallback: turn the last hyphen back into a dot.
                symbol = symbol.split("-")
                symbol = "-".join(symbol[:-1]) + ".{}".format(symbol[-1])
                if symbol not in adata.var.index:
                    # Last resort: keep only the part before the first dot.
                    symbol = symbol.split(".")[0]
        transcripts.append(symbol)
    return transcripts
def gene_map(self, sce, original=False):
    """Map between the symbols of ``sce`` and their spelling in the loaded matrix.

    The matrix is read from ``self.filtered_h5()`` and, if that fails, from
    ``self.filtered_matrices()``.  Each symbol of ``sce.rowData["Symbol"]``
    that is missing from ``adata.var.index`` is rewritten step by step:
    ``.`` replaced by ``-``, then the last ``-`` replaced by ``.``, then
    everything from the first ``.`` dropped.

    Args:
        sce: object exposing ``rowData["Symbol"]``.
        original: when true the result maps ``sce`` symbol -> converted
            symbol; otherwise converted symbol -> ``sce`` symbol.

    NOTE(review): the flag used to be overwritten inside the loop by the
    current symbol, so the first direction was returned regardless of the
    argument.  Confirm callers relying on the default expect the second.

    Returns:
        dict: the mapping in the requested direction.
    """
    _transcripts = sce.rowData["Symbol"]
    try:
        adata = sc.read_10x_h5(self.filtered_h5(), genome=config.build)
    except Exception:
        # Deliberate best-effort fallback to the matrix directory.
        adata = sc.read_10x_mtx(self.filtered_matrices())

    transcripts = {}
    for source_symbol in _transcripts:
        # Separate name so the ``original`` flag is not clobbered.
        symbol = source_symbol
        if symbol not in adata.var.index:
            # First fallback: dots written as hyphens.
            symbol = symbol.replace(".", "-")
            if symbol not in adata.var.index:
                # Second fallback: turn the last hyphen back into a dot.
                symbol = symbol.split("-")
                symbol = "-".join(symbol[:-1]) + ".{}".format(symbol[-1])
                if symbol not in adata.var.index:
                    # Last resort: keep only the part before the first dot.
                    symbol = symbol.split(".")[0]
        if original:
            transcripts[source_symbol] = symbol
        else:
            transcripts[symbol] = source_symbol
    return transcripts
def load_data(data):
    """Read ``data`` with scanpy and apply the name options from the enclosing scope.

    A file must end in ``.h5ad`` or ``.loom``; a directory is read with
    ``sc.read_10x_mtx`` after a trailing path separator is ensured.
    ``set_obs_names`` / ``set_var_names`` (when not empty) replace the
    respective index with the named column, ``make_obs_names_unique`` /
    ``make_var_names_unique`` de-duplicate the indices, and a warning is
    echoed for any index that is still not unique.

    Raises:
        click.FileError: unsupported extension, or path is neither a file
            nor a directory.
        click.UsageError: a requested obs/var column does not exist.
    """
    if isfile(data):
        _, suffix = splitext(data)
        if suffix == ".h5ad":
            adata = sc.read_h5ad(data)
        elif suffix == ".loom":
            adata = sc.read_loom(data)
        else:
            raise click.FileError(
                data, hint="does not have a valid extension [.h5ad | .loom]")
    elif isdir(data):
        directory = data if data.endswith(sep) else data + sep
        adata = sc.read_10x_mtx(directory)
    else:
        raise click.FileError(data, hint="not a valid file or path")

    # Optional re-indexing of observations by an existing obs column.
    if set_obs_names != "":
        if set_obs_names not in adata.obs_keys():
            raise click.UsageError(
                f"obs {set_obs_names} not found, options are: {adata.obs_keys()}"
            )
        adata.obs_names = adata.obs[set_obs_names]

    # Optional re-indexing of variables by an existing var column.
    if set_var_names != "":
        if set_var_names not in adata.var_keys():
            raise click.UsageError(
                f"var {set_var_names} not found, options are: {adata.var_keys()}"
            )
        adata.var_names = adata.var[set_var_names]

    if make_obs_names_unique:
        adata.obs_names_make_unique()
    if make_var_names_unique:
        adata.var_names_make_unique()

    obs_unique = adata._obs.index.is_unique
    if not obs_unique:
        click.echo("Warning: obs index is not unique")
    var_unique = adata._var.index.is_unique
    if not var_unique:
        click.echo("Warning: var index is not unique")

    return adata
def getAnnData_10x_mtx(input_file):
    """Return the result of ``sc.read_10x_mtx`` for ``input_file``."""
    return sc.read_10x_mtx(input_file)
print("The run time for all resolution is:", get_time() - time_start) print("After training, the information of adata is:\n", adata) return data if __name__ == '__main__': import argparse parser = argparse.ArgumentParser( description='just for simple test train.py', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--use_GPU', default=True, type=bool) args = parser.parse_args() print(args) #test for pbmc adata = sc.read_10x_mtx("../datasets/pbmc", var_names="gene_symbols", cache=True) sc.pp.filter_cells(adata, min_genes=200) sc.pp.filter_genes(adata, min_cells=3) mito_genes = adata.var_names.str.startswith('MT-') adata.obs['percent_mito'] = np.sum(adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1 adata.obs['n_counts'] = adata.X.sum(axis=1).A1 adata = adata[adata.obs['n_genes'] < 2500, :] adata = adata[adata.obs['percent_mito'] < 0.05, :] sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4) sc.pp.log1p(adata) sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
def build(pyfit): celltypes = pickle.load(open(pyfit,"rb")) adata = sc.read_10x_mtx(tenx_path,var_names='gene_symbols') rho = get_rho(rho_path) adata =
args = parser.parse_args() # load dataset optimizer1 = Adam(amsgrad=True) optimizer2 = 'adadelta' # data_mat = h5py.File(args.data_file) # x = np.array(data_mat['X']) # y = np.array(data_mat['Y']) # # preprocessing scRNA-seq read counts matrix # adata = sc.AnnData(x) # adata.obs['Group'] = y adata = sc.read_10x_mtx(args.data_file, var_names='gene_symbols', cache=True) adata = read_dataset(adata, transpose=False, test_split=False, copy=True) adata = normalize(adata, size_factors=True, normalize_input=True, logtrans_input=True) y = None input_size = adata.n_vars print("X:", type(adata.X), adata.X.shape) # print("y:", type(adata.X), adata.X.shape) # print(y.shape) x_sd = adata.X.std(0)
# Script: merge a 10x count matrix with a loom file and run the scvelo
# velocity workflow, saving the embedding plots.
import scvelo as scv
scv.settings.set_figure_params('scvelo')
# NOTE(review): `scanpy.api` is the legacy import path - confirm it exists in
# the installed scanpy version.
import scanpy.api as sc
# Figures are saved to figdir instead of being shown.
sc.settings.autoshow=False
sc.settings.autosave=True
sc.settings.figdir='/scrapp2/mtschmitz/data/Exonic/fig'

# Load the count matrix and the loom file, both with caching enabled.
adata = sc.read_10x_mtx('/scrapp2/mtschmitz/data/Exonic/E40_motor_Out/outs/filtered_gene_bc_matrices/refdata-celranger-mmul8-toplevel/', cache=True)
ldata = scv.read('/scrapp2/mtschmitz/data/Exonic/E40_motor_Out_velocyto/possorted_genome_bam_RWRQ2.loom', cache=True)
# Variable names are made unique on both inputs before merging, and once
# more on the merged object.
adata.var_names_make_unique()
ldata.var_names_make_unique()
adata = scv.utils.merge(adata, ldata)
adata.var_names_make_unique()

# The print calls below are progress markers for each stage.
print('norm')
scv.pp.filter_genes(adata)
scv.pp.normalize_per_cell(adata)
scv.pp.filter_genes_dispersion(adata)
scv.pp.log1p(adata)
print(adata)

print('moment')
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)

print('velo')
scv.tl.umap(adata)
scv.tl.velocity(adata)

print('graph')
scv.tl.velocity_graph(adata)
scv.tl.velocity_embedding(adata, basis='umap')
# Three renderings of the velocity embedding on the UMAP basis, each saved
# under its own suffix.
scv.pl.velocity_embedding(adata, basis='umap',save='Embed')
scv.pl.velocity_embedding_grid(adata, basis='umap',save='Grid')
scv.pl.velocity_embedding_stream(adata, basis='umap',save='stream')

sc.tl.leiden(adata)
def cli(dataset, engine, format, layout, recipe, output, sparse, plotting):
    """
    Hi! This is a tool for preprocessing data for use with cellxgene.
    """
    # The backend is selected before scanpy is imported.
    import matplotlib
    matplotlib.use('Agg')
    import scanpy.api as sc
    import pandas as pd
    import numpy as np

    # scanpy settings
    sc.settings.verbosity = 2
    sc.settings.autosave = True

    # data loading: one reader per supported format
    adata = None
    if format == 'h5ad':
        adata = sc.read_h5ad(dataset)
    elif format == '10x_mtx':
        adata = sc.read_10x_mtx(dataset)
    elif format == 'loom':
        adata = sc.read_loom(dataset, sparse=bool(sparse))
    adata.var_names_make_unique()

    # Sparse input must not be zero-centered; dense input uses the defaults.
    centering = {'zero_center': False} if sparse else {}

    # run a recipe if requested, otherwise filter and scale by hand
    if recipe == 'seurat':
        sc.pp.recipe_seurat(adata)
    elif recipe == 'zheng17':
        sc.pp.recipe_zheng17(adata)
    else:
        sc.pp.filter_cells(adata, min_genes=5)
        sc.pp.filter_genes(adata, min_cells=25)
        sc.pp.scale(adata, **centering)

    # dimensionality reduction
    sc.pp.pca(adata, svd_solver='arpack', **centering)

    # neighbors and clustering
    sc.pp.neighbors(adata)
    sc.tl.louvain(adata)

    # layout and plotting: the palette depends on the number of clusters
    cluster_count = len(np.unique(adata.obs['louvain'].values))
    palette = 'tab10' if cluster_count < 10 else 'tab20'

    if layout in ('umap', 'umap+tsne'):
        sc.tl.umap(adata)
        if plotting:
            sc.pl.umap(adata, color='louvain', palette=palette, save='_louvain')

    if layout in ('tsne', 'umap+tsne'):
        sc.tl.tsne(adata)
        if plotting:
            sc.pl.tsne(adata, color='louvain', palette=palette, save='_louvain')

    # show the structure
    print('data structure...')
    print(adata)

    # saving file
    if output != '':
        print('saving output...')
        adata.write(output)
def create_scanpy_adata_basic(self, assay="counts", sample_key=None):
    """Read the filtered matrices with scanpy and return the result untouched.

    The directory comes from ``self.filtered_matrices()`` and is read with
    ``make_unique=True``.  ``assay`` and ``sample_key`` are accepted but not
    used by this method.
    """
    matrix_dir = self.filtered_matrices()
    return sc.read_10x_mtx(matrix_dir, make_unique=True)
chdir(CWD)

####### main
# One row per lane; the `lanes` column supplies the sample names used below.
meta = pd.read_csv('./liver_metadata.csv',header=0)

sc.settings.verbosity = 1  # verbosity: errors (0), warnings (1), info (2), hints (3)
scorenames = ['scrublet_score','scrublet_cluster_score','bh_pval']
# exist_ok: the reads below are cached for re-runs, so a re-run must not fail
# just because the output directory is already there.
os.makedirs('scrublet-scores', exist_ok=True)

###
for sample in meta.lanes.unique():
    #import data
    adata_sample = sc.read_10x_mtx('Liver/'+sample+'/filtered', cache=True)
    #rename cells to SAMPLE_BARCODE, cleaving the trailing -1
    adata_sample.obs_names = [sample+'_'+i.split('-')[0] for i in adata_sample.obs_names]

    #set up and run Scrublet
    scrub = scr.Scrublet(adata_sample.X)
    doublet_scores, predicted_doublets = scrub.scrub_doublets(verbose=False)
    adata_sample.obs['scrublet_score'] = doublet_scores

    #overcluster prep. run turbo basic scanpy pipeline
    sc.pp.filter_genes(adata_sample, min_cells=3)
    sc.pp.normalize_per_cell(adata_sample, counts_per_cell_after=1e4)
    sc.pp.log1p(adata_sample)
    sc.pp.highly_variable_genes(adata_sample, min_mean=0.0125, max_mean=3, min_disp=0.5)
    # Keep only the genes flagged as highly variable before scaling and PCA.
    adata_sample = adata_sample[:, adata_sample.var['highly_variable']]
    sc.pp.scale(adata_sample, max_value=10)
    sc.tl.pca(adata_sample, svd_solver='arpack')
    sc.pp.neighbors(adata_sample)