def test_read_10x_h5():
    """Smoke-test reading both CellRanger v2 and v3 HDF5 matrix layouts."""
    cases = (
        ('1.2.0', 'filtered_gene_bc_matrices_h5.h5', 'hg19_chr21'),
        ('3.0.0', 'filtered_feature_bc_matrix.h5', 'GRCh38_chr21'),
    )
    for version, fname, genome in cases:
        sc.read_10x_h5(os.path.join(ROOT, version, fname), genome=genome)
def tsne_cluster(path, njobs):
    """Run the Zheng17 recipe, t-SNE and Louvain clustering on a 10x HDF5 file.

    Parameters
    ----------
    path : str
        Path to a CellRanger ``.h5`` matrix.
    njobs : int
        Number of parallel jobs for t-SNE.
    """
    adata = sc.read_10x_h5(path)
    sc.pp.recipe_zheng17(adata)
    # BUG FIX: the body referenced the undefined name ``n_jobs``
    # (NameError at runtime); the parameter is called ``njobs``.
    sc.tl.tsne(adata, n_jobs=njobs)
    sc.tl.louvain(adata)
    sc.pl.tsne(adata, color='louvain_groups', save=True, right_margin=2)
    adata.write('one_million.h5ad')
def read_raw_file(self):
    """Read the raw single-cell data file and store it densely on ``self.sc_raw``.

    Supports 10x HDF5 (``.h5``) and AnnData (``.h5ad``) inputs, inferred
    from the file extension. Gene names are made unique and a sparse
    matrix is converted to a dense array before storing.
    """
    print("reading single cell data from {}".format(self.raw_file))
    file_format = self.raw_file.split('.')[-1]

    # Dispatch on extension; unknown formats are rejected up front.
    readers = {'h5': sc.read_10x_h5, 'h5ad': sc.read}
    if file_format not in readers:
        raise ValueError('Reading [ %s ] failed, the inferred file '
                         'format [ %s ] is not supported. Please convert '
                         'your file to either h5 or h5ad format.'
                         % (self.raw_file, file_format))
    andata = readers[file_format](self.raw_file)

    # appends -1 -2... to the name of genes that already exist
    andata.var_names_make_unique()
    if sp_sparse.issparse(andata.X):
        andata.X = andata.X.toarray()
    self.sc_raw = andata
def convert_from_10xh5(path, genome):
    """Yield one dense DataFrame (cells x genes) read from a 10x HDF5 file."""
    adata = sc.read_10x_h5(path, genome)
    adata.var_names_make_unique()
    dense = adata.X.todense()
    frame = pandas.DataFrame(dense, index=adata.obs_names, columns=adata.var_names)
    yield frame
def adataToTrainState(filename, refname, num_genes=8000):
    """Load a 10x HDF5 matrix and prepare it for training.

    Filters low-gene cells, annotates mitochondrial/ribosomal fractions
    and total counts, then selects up to ``num_genes`` variable genes.
    """
    adata = sc.read_10x_h5(filename, refname)
    # Dataset name is the third-from-last path component of the file path.
    adata.name = filename.split(os.sep)[-3]
    sc.tl.addCleanObsNames(adata)
    sc.pp.filter_cells(adata, min_genes=10)

    mito_symbols = {
        'ND1', 'ND2', 'ND4L', 'ND4', 'ND5', 'ND6',
        'ATP6', 'ATP8', 'CYTB', 'COX1', 'COX2', 'COX3',
    }
    mito_genes = [g for g in adata.var_names if g in mito_symbols]
    #mito_genes = [name for name in adata.var_names if name.startswith(('MTND','MTCO','MTATP','MTCYB','MTRNR','MTTR',))]
    ribo_genes = [g for g in adata.var_names if g.startswith(('RPS', 'RPL'))]

    total_counts = np.sum(adata.X, axis=1).A1
    adata.obs['percent_mito'] = np.sum(adata[:, mito_genes].X, axis=1).A1 / total_counts
    adata.obs['percent_ribo'] = np.sum(adata[:, ribo_genes].X, axis=1).A1 / total_counts
    # add the total counts per cell as observations-annotation to adata
    adata.obs['n_counts'] = total_counts
    scanpy.api.pp.filter_genes_dispersion(
        adata,
        n_top_genes=min(np.sum(np.sum(adata.X, axis=0) > 0), num_genes))
    return adata
def adataToTrainStateAggregate(fileList, refname, num_genes=3000):
    """Load and concatenate several 10x HDF5 matrices for training.

    Each file becomes one batch; cells are filtered, mitochondrial and
    ribosomal fractions annotated, variable genes selected, and the
    concatenated result returned sorted by total counts (descending).
    """
    adatas = []
    batch_categories = []
    for filename in fileList:
        a = sc.read_10x_h5(filename, refname)
        sc.pp.filter_cells(a, min_genes=30)
        # Dataset name is the third-from-last path component of the file path.
        a.name = filename.split(os.sep)[-3]
        a.obs['dataset'] = a.name
        batch_categories.append(a.name)
        adatas.append(a)
    adata = sc.AnnData.concatenate(*adatas, batch_categories=batch_categories)

    mito_symbols = {
        'ND1', 'ND2', 'ND4L', 'ND4', 'ND5', 'ND6',
        'ATP6', 'ATP8', 'CYTB', 'COX1', 'COX2', 'COX3',
    }
    mito_genes = [g for g in adata.var_names if g in mito_symbols]
    #mito_genes = [name for name in adata.var_names if name.startswith(('MTND','MTCO','MTATP','MTCYB','MTRNR','MTTR',))]
    ribo_genes = [g for g in adata.var_names if g.startswith(('RPS', 'RPL'))]

    total_counts = np.sum(adata.X, axis=1).A1
    adata.obs['percent_mito'] = np.sum(adata[:, mito_genes].X, axis=1).A1 / total_counts
    adata.obs['percent_ribo'] = np.sum(adata[:, ribo_genes].X, axis=1).A1 / total_counts
    # add the total counts per cell as observations-annotation to adata
    adata.obs['n_counts'] = total_counts
    scanpy.api.pp.filter_genes_dispersion(
        adata,
        n_top_genes=min(np.sum(np.sum(adata.X, axis=0) > 0), num_genes))
    return adata[adata.obs.n_counts.argsort()[::-1], :]
def load_10x_1_3mil():
    """Load the 10x Genomics 1.3M mouse brain neurons dataset.

    https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.3.0/1M_neurons
    """
    path = ('/data/martin/single_cell/10x_1.3mil_mice_brain/'
            '1M_neurons_filtered_gene_bc_matrices_h5.h5')
    return sc.read_10x_h5(path)
def convert(
    input, output, chunk_size=16 * 1024 * 1024, genome=None, overwrite=False
):
    """Convert between single-cell matrix formats.

    Supported conversions: ``.h5``/``.loom`` -> ``.zarr`` or ``.h5ad``,
    and ``.h5ad`` -> ``.zarr``.

    Parameters
    ----------
    input, output : str
        Source and destination paths; formats are inferred from extensions.
    chunk_size : int
        Target chunk size in bytes when re-chunking an ``.h5ad`` to Zarr.
    genome : str, optional
        Genome group inside a 10x HDF5 file; auto-detected when the file
        has exactly one top-level group.
    overwrite : bool
        Overwrite an existing Zarr group.
    """
    input_path, input_ext = splitext(input)
    output_path, output_ext = splitext(output)
    print('converting: %s to %s' % (input, output))
    if input_ext == '.h5' or input_ext == '.loom':
        if output_ext == '.zarr':
            # Convert 10x (HDF5) to Zarr
            # BUG FIX: open read-only and close the handle via a context
            # manager (it was previously leaked).
            with h5py.File(input, 'r') as source:
                zarr.tree(source)
                store = zarr.DirectoryStore(output)
                dest = zarr.group(store=store, overwrite=overwrite)
                # following fails if without_attrs=False (the default), possibly related to https://github.com/h5py/h5py/issues/973
                zarr.copy_all(source, dest, log=sys.stdout, without_attrs=True)
                zarr.tree(dest)
        elif output_ext == '.h5ad':
            if not genome:
                with h5py.File(input, 'r') as f:
                    keys = list(f.keys())
                if len(keys) == 1:
                    genome = keys[0]
                else:
                    raise Exception(
                        'Set --genome flag when converting from 10x HDF5 (.h5) to Anndata HDF5 (.h5ad); top-level groups in file %s: %s'
                        % (input, ','.join(keys))
                    )
            adata = read_10x_h5(input, genome=genome)
            # TODO: respect overwrite flag
            adata.write(output)
    elif input_ext == '.h5ad':
        adata = read_h5ad(input, backed='r')
        (r, c) = adata.shape
        # BUG FIX: use floor division — '/' is true division in Python 3,
        # which made ``chunks``/``chunk_size`` floats, breaking the '%d'
        # formatting and the Zarr chunk shape below.
        chunks = (getsize(input) - 1) // chunk_size + 1
        chunk_size = (r - 1) // chunks + 1
        if output_ext == '.zarr':
            print('converting %s (%dx%d) to %s in %d chunks (%d rows each)'
                  % (input, r, c, output, chunks, chunk_size))
            # TODO: respect overwrite flag
            adata.write_zarr(
                make_store(output),
                chunks=(chunk_size, c)
            )
        else:
            raise Exception('Unrecognized output extension: %s' % output_ext)
    else:
        raise Exception('Unrecognized input extension: %s' % input_ext)
def basic_analysis(filename):
    """Quick clustering pipeline on a 10x HDF5 file (RAPIDS neighbors).

    Subsamples to 10% of cells, applies the Zheng17 recipe, builds a
    GPU-accelerated neighbor graph, clusters with Louvain, embeds with
    UMAP, ranks marker genes, then writes results and figures.
    """
    data = sc.read_10x_h5(filename)
    sc.pp.subsample(data, fraction=0.1)
    sc.pp.recipe_zheng17(data)
    sc.pp.neighbors(data, method='rapids')
    sc.tl.louvain(data)
    sc.tl.umap(data)
    sc.tl.rank_genes_groups(data, 'louvain')
    data.write('./write/result_130K_gpu_neighbors.h5ad')
    # plotting
    sc.pl.umap(data, color='louvain', save='_130K_gpu_neighbors.png')
    sc.pl.rank_genes_groups(data, save='_130K_gpu_neighbors.pdf')
def basic_analysis(filename):
    """Subsample, cluster and embed a 10x dataset using RAPIDS neighbors.

    Writes the processed AnnData plus a UMAP figure and a ranked-genes
    figure to fixed output paths.
    """
    dataset = sc.read_10x_h5(filename)
    sc.pp.subsample(dataset, fraction=0.1)
    sc.pp.recipe_zheng17(dataset)
    sc.pp.neighbors(dataset, method="rapids")
    sc.tl.louvain(dataset)
    sc.tl.umap(dataset)
    sc.tl.rank_genes_groups(dataset, "louvain")
    dataset.write("./write/result_130K_gpu_neighbors.h5ad")
    # plotting
    sc.pl.umap(dataset, color="louvain", save="_130K_gpu_neighbors.png")
    sc.pl.rank_genes_groups(dataset, save="_130K_gpu_neighbors.pdf")
def basic_analysis(filename):
    """Full clustering + PAGA pipeline on a 10x HDF5 file.

    Runs the Zheng17 recipe, neighbor graph, Louvain, PAGA and UMAP,
    ranks marker genes, saves the AnnData and plots the results.
    """
    data = sc.read_10x_h5(filename)
    sc.pp.recipe_zheng17(data)
    sc.pp.neighbors(data)
    sc.tl.louvain(data)
    sc.tl.paga(data)
    sc.tl.umap(data)
    sc.tl.rank_genes_groups(data, "louvain")
    data.write("./write/result.h5ad")
    # plotting
    sc.pl.paga(data)
    sc.pl.umap(data, color="louvain")
    sc.pl.rank_genes_groups(data, save=".pdf")
def get_genes(self, sce):
    """Normalise the SCE's hgnc symbols against the AnnData var index.

    For every symbol in ``sce.rowData["hgnc_symbol"]``, tries a sequence
    of rewrites ('.' -> '-', last '-' segment -> '.' suffix, strip '.'
    suffix) until the symbol is found in ``adata.var.index``, and returns
    the list of resulting symbols.
    """
    _transcripts = sce.rowData["hgnc_symbol"]
    try:
        adata = sc.read_10x_h5(self.filtered_h5(), genome=config.build)
    except Exception:
        adata = sc.read_10x_mtx(self.filtered_matrices())
    transcripts = []
    # BUG FIX: the loop previously iterated over the just-initialised
    # (empty) ``transcripts`` list and the function returned the raw
    # ``_transcripts``, so the normalisation below never ran.
    for symbol in _transcripts:
        if symbol not in adata.var.index:
            symbol = symbol.replace(".", "-")
        if symbol not in adata.var.index:
            symbol = symbol.split("-")
            symbol = "-".join(symbol[:-1]) + ".{}".format(symbol[-1])
        if symbol not in adata.var.index:
            symbol = symbol.split(".")[0]
        transcripts.append(symbol)
    return transcripts
def read_anndata(input, genome=None):
    """Read an AnnData object from ``.h5`` (10x), ``.h5ad`` or ``.loom`` input.

    Parameters
    ----------
    input : str
        Path to the input file; format inferred from the extension.
    genome : str, optional
        Genome group inside a 10x HDF5 file; auto-detected when the file
        has exactly one top-level group.

    Raises
    ------
    Exception
        If the genome is ambiguous or the extension is unrecognized.
    """
    _, input_ext = splitext(input)
    if input_ext == ".h5":
        if not genome:
            # BUG FIX: close the HDF5 handle via a context manager
            # instead of leaking it.
            with File(input, "r") as f:
                keys = list(f.keys())
            if len(keys) == 1:
                genome = keys[0]
            else:
                raise Exception(
                    "Set --genome flag when converting from 10x HDF5 (.h5) to Anndata HDF5 (.h5ad); top-level groups in file %s: %s"
                    % (input, ",".join(keys)))
        return read_10x_h5(input, genome=genome)
    elif input_ext == ".h5ad":
        return read_h5ad(input)
    elif input_ext == ".loom":
        # reads the whole dataset in memory!
        return read_loom(input)
    else:
        raise Exception("Unrecognized input extension: %s" % input_ext)
def gene_map(self, sce, original=False):
    """Build a mapping between SCE symbols and AnnData var names.

    Parameters
    ----------
    sce
        Object exposing ``rowData["Symbol"]``.
    original : bool
        When True, key the mapping by the original symbol
        (original -> normalised); otherwise key by the normalised symbol
        (normalised -> original).
    """
    _transcripts = sce.rowData["Symbol"]
    try:
        adata = sc.read_10x_h5(self.filtered_h5(), genome=config.build)
    except Exception:
        adata = sc.read_10x_mtx(self.filtered_matrices())
    transcripts = {}
    for symbol in _transcripts:
        # BUG FIX: the loop previously rebound the ``original`` parameter
        # (``original = symbol``), so the keyword flag was ignored and the
        # mapping direction was always original -> normalised.
        source_symbol = symbol
        if symbol not in adata.var.index:
            symbol = symbol.replace(".", "-")
        if symbol not in adata.var.index:
            symbol = symbol.split("-")
            symbol = "-".join(symbol[:-1]) + ".{}".format(symbol[-1])
        if symbol not in adata.var.index:
            symbol = symbol.split(".")[0]
        if original:
            transcripts[source_symbol] = symbol
        else:
            transcripts[symbol] = source_symbol
    return transcripts
#!/usr/bin/env python3 import scanpy.api as sc # Read 10x dataset adata = sc.read_10x_h5("neuron_10k_v3_filtered_feature_bc_matrix.h5") # Make variable names (in this case the genes) unique adata.var_names_make_unique() sc.tl.pca(adata) sc.pl.pca(adata) sc.pp.filter_genes(adata, min_counts=1) sc.pp.normalize_per_cell(adata, key_n_counts='n_counts_all') filter_result = sc.pp.filter_genes_dispersion(adata.X, flavor='cell_ranger', n_top_genes=1000, log=False) adata = adata[:, filter_result.gene_subset] sc.pp.normalize_per_cell(adata) sc.pp.log1p(adata) sc.pp.scale(adata) sc.tl.pca(adata, n_comps=50) sc.pl.pca(adata)
def getAnnData_10x_h5(input_file):
    """Read a 10x HDF5 file for the GRCh38 genome and return the AnnData."""
    return sc.read_10x_h5(input_file, "GRCh38")
# coding: utf-8
# Convert the ICA cord blood and bone marrow 10x HDF5 matrices to .h5ad.

# In[1]:

import glob  # BUG FIX: glob.glob is used below but glob was never imported

import pandas as pd
import scanpy.api as sc

# In[ ]:

ica_cord_blood = sc.read_10x_h5(
    glob.glob(
        '/projects/sysbio/users/cellAtlas/data/human/immune_census/ica_cord_blood_h5.h5'
    )[0], "GRCh38")
ica_cord_blood.write(
    '/projects/sysbio/users/cellAtlas/scanpyObjects/ica_cord_blood.h5ad')

# In[ ]:

ica_bone_marrow = sc.read_10x_h5(
    glob.glob(
        '/projects/sysbio/users/cellAtlas/data/human/immune_census/ica_bone_marrow_h5.h5'
    )[0], "GRCh38")
ica_bone_marrow.write(
    '/projects/sysbio/users/cellAtlas/scanpyObjects/ica_bone_marrow.h5ad')
import numpy as np
import pandas as pd  # BUG FIX: pd.read_csv is used below but pandas was never imported
import pickle
import sys
sys.path.append('/gpfs01/berens/user/dkobak/FIt-SNE')
from fast_tsne import fast_tsne

# LOAD AND PREPROCESS THE DATA
import scanpy.api as sc
sc.settings.verbosity = 2

# Data file is from here
# https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.3.0/1M_neurons
adata = sc.read_10x_h5('big-data/10x/1M_neurons_filtered_gene_bc_matrices_h5.h5')
sc.pp.recipe_zheng17(adata)
X = np.copy(adata.X)
# Center, PCA via SVD with a deterministic sign convention, keep top 50 PCs
X = X - X.mean(axis=0)
U, s, V = np.linalg.svd(X, full_matrices=False)
U[:, np.sum(V,axis=1)<0] *= -1
X = np.dot(U, np.diag(s))
X = X[:, np.argsort(s)[::-1]][:,:50]
pickle.dump(X, open('big-pickles/10x-pca.pickle', 'wb'))

# load cluster labels
# https://github.com/theislab/scanpy_usage/blob/master/170522_visualizing_one_million_cells/results/louvain.csv.gz
clusters = pd.read_csv('data/10x-1mln-scanpy-louvain.csv.gz', header=None).values[:,1].astype(int)
# Load an orangutan organoid 10x matrix and attach RNA velocity estimates.
import matplotlib
matplotlib.use('agg')  # plotting backend compatible with screen
import scanpy.api as sc
import pandas as pd
import numpy as np
import logging
import os
import velocyto as vcy

logging.basicConfig(level=logging.INFO)
logging.info("Start")
sc.settings.verbosity = 2
sc.settings.autosave = True  # save figures, do not show them
sc.settings.set_figure_params(dpi=300)  # set sufficiently high resolution for saving

# NOTE(review): ``inputfile`` and ``velocityFile`` are assigned but never
# used below — the read calls use separate hard-coded paths; confirm which
# paths are intended.
inputfile = os.path.expanduser('/ye/yelabstore2/mtschmitz/seq/AlignedOrangutanOrganoid/Exonic/orangutanorganoid_Out/outs')
# Run louvain clustering on true gene expression values
velocityFile = os.path.expanduser('/ye/yelabstore2/mtschmitz/seq/AlignedOrangutanOrganoid/Exonic/orangutanorganoid_Out/velocyto/orangutanorganoid_Out.loom')
# Read the filtered matrix against the Pabe2 CellRanger reference
adata = sc.read_10x_h5('/home/mt/code/data/AlignedOrangutanOrganoid/Exonic/orangutanorganoid_Out/outs/filtered_gene_bc_matrices_h5.h5','refdata-celranger-Pabe2-toplevel')
# Attach velocity estimates computed from the velocyto .loom file
adata=sc.tl.rna_velocity(adata,os.path.expanduser('~/code/data/AlignedOrangutanOrganoid/Exonic/orangutanorganoid_Out/velocyto/orangutanorganoid_Out.loom'))
import glob  # BUG FIX: glob.glob is used below but glob was never imported

import pandas as pd
import h5py
import scanpy.api as sc
import collections
import scipy.sparse as sp_sparse
import numpy as np
import tables

# In[169]:

# pick one data set here, but run code through for both
pick_one_dataset = "ica_cord_blood"
pick_one_dataset = "ica_bone_marrow"

scanpy_object = sc.read_10x_h5(
    glob.glob(
        '/projects/sysbio/users/cellAtlas/data/primary/human/immune_census/'
        + pick_one_dataset + '_h5.h5')[0], "GRCh38")
print(scanpy_object.shape)

# # Explore the sample IDs

# In[170]:

# Donor ID is encoded in sample IDs
donor_id = [
    y.split("Manton")[1]
    for y in [x.split("_")[0] for x in scanpy_object.obs.index]
]
#print(set(donor_id))
import numpy as np
import pickle
import sys
sys.path.append('/gpfs01/berens/user/dkobak/FIt-SNE')
from fast_tsne import fast_tsne

# LOAD AND PREPROCESS THE DATA
import scanpy.api as sc
sc.settings.verbosity = 2
adata = sc.read_10x_h5('1M_neurons_filtered_gene_bc_matrices_h5.h5')
sc.pp.recipe_zheng17(adata)
X = np.copy(adata.X)
# Center, PCA via SVD with a deterministic sign convention, keep top 50 PCs
X = X - X.mean(axis=0)
U, s, V = np.linalg.svd(X, full_matrices=False)
U[:, np.sum(V,axis=1)<0] *= -1
X = np.dot(U, np.diag(s))
X = X[:, np.argsort(s)[::-1]][:,:50]
pickle.dump(X, open('pca-scanpy.pickle', 'wb'))

# CLUSTERING
# NOTE(review): this section re-reads and re-processes the same file from
# scratch, duplicating the work above — presumably separate notebook cells.
import scanpy.api as sc
sc.settings.verbosity = 2
adata = sc.read_10x_h5('1M_neurons_filtered_gene_bc_matrices_h5.h5')
sc.pp.recipe_zheng17(adata)
#!/Users/cmdb/miniconda3/bin/python import scanpy.api as sc import sys import matplotlib sc.settings.autoshow = False adata = sc.read_10x_h5(sys.argv[1]) adata.var_names_make_unique() sc.tl.pca(adata, svd_solver='auto') sc.pl.pca(adata, save="PCA_adata.png", title="PCA Before Zheng Pre-processing") # sc.tl.pca(adata, n_comps=50) # var_names1 = adata.var_names # for i in var_names1: # print(adata.var_names[i]) # if adata.var_names[str(var_names1[i])]: # print(adata.var_names[i]) # sc.pl.pca(adata, save="PCA_adata.png") # sc.tl.tsne(adata) # sc.pl.tsne(adata, save="tSNE_adata.png") # sc.pp.neighbors(adata) # sc.tl.umap(adata) # sc.pl.umap(adata, save="uMAP_adata.png")
filename = "home/1M_neurons_matrix_subsampled_25K.h5" elif len(sys.argv) > 1: # default output file filename = sys.argv[1] if len(sys.argv) == 3: verbose = int(sys.argv[2]) start = datetime.datetime.now() sc.settings.verbosity = verbose # verbosity: errors (0), warnings (1), info (2), hints (3) sc.logging.print_versions() print("Reading, making names unique and filtering:") now = datetime.datetime.now() adata = sc.read_10x_h5(filename) adata.var_names_make_unique() # this is unnecessary if using 'gene_ids' sc.pp.filter_cells(adata, min_genes=200) sc.pp.filter_genes(adata, min_cells=3) print("Reading, making names unique and filtering Time:", datetime.datetime.now() - now) print("Log Normalizing Data") now = datetime.datetime.now() sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e3) sc.pp.log1p(adata) print("Log Normalizing Data Time:", datetime.datetime.now() - now) adata.raw = adata print("Finding Variable Features")
# Load all samples of one dataset from CellRanger output, de-duplicate
# gene symbols, concatenate, and join per-sample metadata onto the cells.
from scio import concatenate, check_obs, check_var

DATASET = "vanderburg_01"
OBS_DATA = "tables/{}_samples.csv".format(DATASET)
OUTPUT_DIR = "./"

# Sample sheet: one row per sample; the 'samples' column drives file lookup
obs = pd.read_csv(OBS_DATA)
dataset_samples = obs["samples"].values
# sample[1:] drops the first character of each sample ID to build the path
# — NOTE(review): presumably a known prefix character; confirm against the sheet.
filenames = [
    "data/cellranger/{}_GEX/outs/raw_feature_bc_matrix.h5".format(sample[1:])
    for sample in dataset_samples
]
adatas = [sc.read_10x_h5(filename, genome="GRCh38") for filename in filenames]
adatas2 = []
for adata, sample in zip(adatas, dataset_samples):
    # Remove duplicated gene symbols before concatenation
    duplicated = adata.var_names.duplicated()
    print("Removing {} gene symbols because they are duplicated".format(
        sum(duplicated)))
    adata = adata[:, ~duplicated].copy()
    adata.obs['samples'] = sample
    adatas2.append(adata)
adata = concatenate(adatas2, merge_var_cols=["gene_ids"])
# Attach the per-sample metadata to every cell of that sample
adata.obs = adata.obs.join(obs.set_index("samples"), on="samples", how="left")
adata.obs["dataset"] = DATASET