def test_read_10x_h5():
    """Smoke test: read the 1.2.0 and 3.0.0 HDF5 files with an explicit genome.

    Nothing is asserted; the test passes when both reads complete without
    raising.
    """
    cases = (
        ('1.2.0', 'filtered_gene_bc_matrices_h5.h5', 'hg19_chr21'),
        ('3.0.0', 'filtered_feature_bc_matrix.h5', 'GRCh38_chr21'),
    )
    for version_dir, file_name, genome_name in cases:
        sc.read_10x_h5(os.path.join(ROOT, version_dir, file_name), genome=genome_name)
def test_error_missing_genome():
    """Asking for a genome that is not in the file must raise ``ValueError``.

    The error text has to match the per-file pattern below (``hg19_chr21``
    for the 1.2.0 file, ``GRCh38_chr21`` for the 3.0.0 file).
    """
    cases = (
        (ROOT / '1.2.0' / 'filtered_gene_bc_matrices_h5.h5', r".*hg19_chr21.*"),
        (ROOT / '3.0.0' / 'filtered_feature_bc_matrix.h5', r".*GRCh38_chr21.*"),
    )
    for h5_path, message_pattern in cases:
        with pytest.raises(ValueError, match=message_pattern):
            sc.read_10x_h5(h5_path, genome="not a genome")
def test_read_10x_h5_v1():
    """Reading the 1.2.0 file with and without ``genome='hg19_chr21'`` must agree."""
    h5_file = os.path.join(ROOT, '1.2.0', 'filtered_gene_bc_matrices_h5.h5')
    with_genome = sc.read_10x_h5(h5_file, genome='hg19_chr21')
    without_genome = sc.read_10x_h5(h5_file)
    assert_anndata_equal(with_genome, without_genome)
def test_read_10x_h5():
    """Reading the 3.0.0 file with and without ``genome='GRCh38_chr21'`` must agree."""
    h5_file = os.path.join(ROOT, '3.0.0', 'filtered_feature_bc_matrix.h5')
    with_genome = sc.read_10x_h5(h5_file, genome='GRCh38_chr21')
    without_genome = sc.read_10x_h5(h5_file)
    assert_anndata_equal(with_genome, without_genome)
def test_read_visium():
    """Reading the Visium file with and without ``genome='GRCh38'`` must agree."""
    visium_h5 = ROOT / 'visium' / 'V1_Human_Heart_subsampled.h5'
    with_genome = sc.read_10x_h5(visium_h5, genome='GRCh38')
    without_genome = sc.read_10x_h5(visium_h5)
    assert_anndata_equal(with_genome, without_genome)
def test_read_10x_h5():
    """Reading the 3.0.0 file with and without ``genome='GRCh38_chr21'`` must agree."""
    v3_h5 = ROOT / '3.0.0' / 'filtered_feature_bc_matrix.h5'
    with_genome = sc.read_10x_h5(v3_h5, genome='GRCh38_chr21')
    without_genome = sc.read_10x_h5(v3_h5)
    assert_anndata_equal(with_genome, without_genome)
def test_read_10x_h5_v1():
    """Reading the 1.2.0 file with and without ``genome='hg19_chr21'`` must agree."""
    legacy_h5 = ROOT / '1.2.0' / 'filtered_gene_bc_matrices_h5.h5'
    with_genome = sc.read_10x_h5(legacy_h5, genome='hg19_chr21')
    without_genome = sc.read_10x_h5(legacy_h5)
    assert_anndata_equal(with_genome, without_genome)
def read_files(mousefile, humanfile):
    """Read the mouse and (optionally) the human 10x HDF5 matrix.

    Parameters
    ----------
    mousefile
        Path handed to ``sc.read_10x_h5``; always read.
    humanfile
        Path handed to ``sc.read_10x_h5``, or ``None`` to skip the human data.

    Returns
    -------
    tuple
        ``(mouse, human)`` where ``human`` is ``None`` when ``humanfile`` is
        ``None``.
    """
    mouse = sc.read_10x_h5(mousefile)
    # Identity test instead of `!= None`: the latter dispatches to the
    # argument's own `__ne__`, which path-like or array-like objects may
    # override.
    human = sc.read_10x_h5(humanfile) if humanfile is not None else None
    return (mouse, human)
def test_error_10x_h5_legacy(tmp_path):
    """A file holding two genome groups must be read with an explicit genome.

    The ``hg19_chr21`` group of the 1.2.0 file is copied twice (once under
    the name ``hg19_chr21_copy``) into a fresh HDF5 file. Reading that file
    without ``genome`` has to raise ``ValueError``; naming one of the groups
    has to succeed.
    """
    single_genome_h5 = ROOT / '1.2.0' / 'filtered_gene_bc_matrices_h5.h5'
    two_genome_h5 = tmp_path / "two_genomes.h5"

    with h5py.File(single_genome_h5, "r") as src, h5py.File(two_genome_h5, "w") as dst:
        src.copy("hg19_chr21", dst)
        src.copy("hg19_chr21", dst, name="hg19_chr21_copy")

    with pytest.raises(ValueError):
        sc.read_10x_h5(two_genome_h5)

    sc.read_10x_h5(two_genome_h5, genome="hg19_chr21_copy")
def check_inputs(mousefile, humanfile, cutoff_top, cutoff_bottom):
    """Validate the cutoffs and check that the input matrices can be read.

    Make sure you have two matrices: one from mouse, one from human.
    mouse should be aligned to mm10hg19 and human should be hg38.

    Parameters
    ----------
    mousefile
        Path handed to ``sc.read_10x_h5``.
    humanfile
        Path handed to ``sc.read_10x_h5``, or ``None`` to skip the check.
    cutoff_top, cutoff_bottom
        Numbers that must lie in the closed interval ``[0, 1]``.

    Raises
    ------
    NameError
        If either cutoff is outside ``[0, 1]`` (exception type kept for
        backward compatibility with existing callers).

    Notes
    -----
    A failed read is only reported with ``print``; it does not raise.
    """
    for cutoff in (cutoff_top, cutoff_bottom):
        if cutoff > 1 or cutoff < 0:
            # The bounds are inclusive, so the message says so.
            raise NameError('cutoff has to be within 0 <= cutoff <= 1')

    try:
        sc.read_10x_h5(mousefile)
        if humanfile is not None:
            sc.read_10x_h5(humanfile)
    except Exception:
        # `Exception` rather than a bare `except:` so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit are not swallowed.
        print('the files are not readable as h5.')
def main(argv):
    """Convert a 10x HDF5 file to an AnnData file.

    Recognised options in ``argv``:

    * ``-h``: print usage and exit.
    * ``-i`` / ``--ifile``: input file, read with ``sc.read_10x_h5``.
    * ``-o`` / ``--ofile``: output file, written with ``adata.write``.

    An unparsable command line prints usage and exits with status 2.
    Variable names are made unique before writing.
    """
    source_path = ''
    target_path = ''

    try:
        parsed_options, _ = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print('test.py -i <inputfile.h5> -o <outputfile.h5ad>')
        sys.exit(2)

    for flag, value in parsed_options:
        if flag == '-h':
            print('test.py -i <inputfile> -o <outputfile>')
            sys.exit()
        if flag in ("-i", "--ifile"):
            source_path = value
        elif flag in ("-o", "--ofile"):
            target_path = value

    print('Input file is "', source_path)
    print('Output file is "', target_path)

    adata = sc.read_10x_h5(source_path)
    adata.var_names_make_unique()
    adata.write(target_path)
def read_cellranger(fn, args, rm_zero_cells=True, add_sample_id=True, **kw):
    """Read cellranger results into an AnnData object indexed by gene ID.

    Assumes the Sample_ID may be extracted from the cellranger output dirname,
    e.g ` ... /Sample_ID/outs/filtered_feature_bc_matrix.h5 `

    Parameters
    ----------
    fn
        String path. A name ending in ``.h5`` is read with
        ``sc.read_10x_h5``; anything else is treated as a file inside a
        matrix directory, and that directory is read with
        ``sc.read_10x_mtx``.
    args
        Object whose ``gex_only`` attribute is forwarded to
        ``sc.read_10x_mtx`` (only accessed in the non-``.h5`` branch).
    rm_zero_cells
        Not referenced in the body.
    add_sample_id
        When true, add a ``sample_id`` column to ``obs`` and append
        ``-<sample_id>`` to every observation name.
    **kw
        Not referenced in the body.

    Returns
    -------
    The AnnData object, with ``var`` carrying a ``gene_ids`` column and, in
    the ``.h5`` branch, a ``gene_symbols`` column.
    """
    if fn.endswith('.h5'):
        dirname = os.path.dirname(fn)
        data = sc.read_10x_h5(fn)
        # Keep the names the reader produced as `gene_symbols`, then re-index
        # the variables by the `gene_ids` column.
        data.var['gene_symbols'] = list(data.var_names)
        data.var_names = list(data.var['gene_ids'])
    else:
        mtx_dir = os.path.dirname(fn)
        # One level above the matrix directory, so that `dirname` plays the
        # same role as in the .h5 branch.
        dirname = os.path.dirname(mtx_dir)
        data = sc.read_10x_mtx(mtx_dir, gex_only=args.gex_only, var_names='gene_ids')
        data.var['gene_ids'] = list(data.var_names)
    if add_sample_id:
        # Drop everything after the first '-' in each barcode, but only adopt
        # the shortened barcodes when they are still unique.
        barcodes = [b.split('-')[0] for b in data.obs.index]
        if len(barcodes) == len(set(barcodes)):
            data.obs_names = barcodes
        # The sample ID is the name of the directory that contains `dirname`.
        sample_id = os.path.basename(os.path.dirname(dirname))
        data.obs['sample_id'] = sample_id
        data.obs['sample_id'] = data.obs['sample_id'].astype('category')
        data.obs_names = [i + '-' + sample_id for i in data.obs_names]
    return data
def read_sample(
    sample_meta,
    min_genes=200,
    min_cells=5,
):
    """Load one sample's filtered 10x matrix, filter it and annotate it.

    Parameters
    ----------
    sample_meta
        Metadata record for the sample. It is accessed by attribute
        (``Sample``, ``__dir``, ``__filter_cells``), with ``in`` and with
        ``.items()`` -- presumably a pandas Series (one row of a sample
        sheet); verify against the caller.
    min_genes
        Forwarded to ``sc.pp.filter_cells``.
    min_cells
        Forwarded to ``sc.pp.filter_genes``.

    Returns
    -------
    The AnnData object with a ``counts`` layer and one ``obs`` column per
    metadata key that does not start with ``__``.
    """
    sample = sample_meta.Sample
    sample_dir = sample_meta.__dir
    ds = sc.read_10x_h5(
        os.path.join(sample_dir, sample, "outs", "filtered_feature_bc_matrix.h5"))
    ds.var_names = rename_genes(ds.var_names)
    ds.var_names_make_unique(join=".")
    # Raw strings: "\d" in a plain literal is an invalid escape sequence that
    # newer Python versions warn about. The pattern itself is unchanged.
    ds.obs_names = sample + "_" + ds.obs_names.str.replace(r"-\d$", "", regex=True)
    sc.pp.filter_cells(ds, min_genes=min_genes)
    sc.pp.filter_genes(ds, min_cells=min_cells)

    # Optional whitelist of cells: applied only when the metadata provides a
    # string (presumably a CSV path) under `__filter_cells`.
    if "__filter_cells" in sample_meta and isinstance(sample_meta.__filter_cells, str):
        sample_cells = pd.read_csv(sample_meta.__filter_cells, index_col=0)
        sample_cells.index = sample_cells.index.str.replace(r"-\d+$", "", regex=True)
        ds = ds[ds.obs_names.isin(sample_cells.index), :]

    # NOTE(review): no explicit copy is taken here; confirm whether the
    # `counts` layer may share memory with `X` for later in-place operations.
    ds.layers["counts"] = ds.X

    # `.items()` instead of `.iteritems()`, which was removed in pandas 2.0.
    for k, v in sample_meta.items():
        if k.startswith("__"):
            # Double-underscore keys are control fields, not annotations.
            continue
        ds.obs[k] = v
    return ds
def run(args):
    """Compile an AnnData object from a 10X h5 file and save it as h5ad.

    Reads ``args.ifile`` with ``sc.read_10x_h5``, adds per-observation
    ``n_counts`` / ``n_kmers`` and per-variable ``n_cells`` summaries, and
    writes the result to ``args.ofile``.
    """
    options = args
    # Read from the options object as before; none of these is used below.
    info, warn, debug, error = options.info, options.warn, options.debug, options.error
    h5_fname, out_fname = options.ifile, options.ofile

    adata = sc.read_10x_h5(h5_fname)

    # Row-wise summaries go to obs.
    adata.obs['n_counts'] = np.sum(adata.X, axis=1).A1
    adata.obs['n_kmers'] = np.sum(adata.X > 0, axis=1).A1
    # Column-wise summary goes to var.
    adata.var['n_cells'] = np.sum(adata.X > 0, axis=0).A1

    adata.write_h5ad(filename=out_fname)
def run_scrublet(tenx_h5, doublet_rate=0.06, npca=40, save_to=None):
    """Score doublets with Scrublet and save the results under a prefix.

    Parameters
    ----------
    tenx_h5
        String path. A name ending in ``.h5`` is read with
        ``sc.read_10x_h5``; anything else is treated as a directory
        containing ``matrix.mtx.gz`` and ``barcodes.tsv.gz``.
    doublet_rate
        Passed to Scrublet as ``expected_doublet_rate``.
    npca
        Passed to ``scrub_doublets`` as ``n_prin_comps``.
    save_to
        Required path prefix. ``doublets.csv``, ``doublet_hist.pdf`` and
        ``threshold.txt`` are appended to it verbatim.

    Raises
    ------
    ValueError
        If ``save_to`` is empty or ``None``.

    Notes
    -----
    An existing ``<save_to>threshold.txt`` is left untouched.
    """
    if not save_to:
        raise ValueError("Please, specify prefix path where to save results to")

    if tenx_h5.endswith(".h5"):
        ds = sc.read_10x_h5(tenx_h5)
        counts_matrix = ds.X.tocsc().astype(np.longlong)
        obs = ds.obs.reset_index()
        obs.columns = ["0"]
    else:
        # Context managers so the gzip handles are closed even if parsing fails.
        with gzip.open(tenx_h5 + '/matrix.mtx.gz') as matrix_file:
            counts_matrix = scipy.io.mmread(matrix_file).T.tocsc()
        with gzip.open(tenx_h5 + '/barcodes.tsv.gz') as barcodes_file:
            obs = pd.read_table(barcodes_file, header=None)

    scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=doublet_rate)
    doublet_scores, doublets = scrub.scrub_doublets(
        min_counts=2,
        min_cells=3,
        min_gene_variability_pctl=85,
        n_prin_comps=npca,
    )

    save_dir = os.path.dirname(save_to)
    # A prefix with no directory part gives '', which os.makedirs rejects.
    # exist_ok also removes the check-then-create race.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    obs['doublet'] = doublet_scores
    obs.to_csv(save_to + 'doublets.csv')
    scrub.plot_histogram()
    plt.savefig(save_to + 'doublet_hist.pdf')

    threshold_path = save_to + 'threshold.txt'
    if not os.path.exists(threshold_path):
        with open(threshold_path, 'w') as f:
            f.write(str(scrub.threshold_))
def read_10x_data(input_file, format_type='10x_h5', backed=None, transpose=False, sparse=False):
    """Read an expression matrix in one of several formats into AnnData.

    Parameters
    ----------
    input_file
        Path handed to the scanpy reader selected by ``format_type``.
    format_type
        One of ``'10x_h5'``, ``'10x_mtx'``, ``'10x_h5ad'``, ``'10x_csv'``
        or ``'10x_txt'`` (tab-delimited text read with ``sc.read_csv``).
    backed
        Forwarded to ``sc.read_h5ad``; only used for ``'10x_h5ad'``.
    transpose
        Transpose the matrix after reading.
    sparse
        Convert ``X`` to a float32 ``csr_matrix``.

    Returns
    -------
    AnnData with unique variable and observation names.

    Raises
    ------
    ValueError
        If ``format_type`` is not one of the supported values.
    """
    if format_type == '10x_h5':
        adata = sc.read_10x_h5(input_file)
    elif format_type == '10x_mtx':
        adata = sc.read_10x_mtx(input_file)
    elif format_type == '10x_h5ad':
        adata = sc.read_h5ad(input_file, backed=backed)
    elif format_type == "10x_csv":
        adata = sc.read_csv(input_file)
    elif format_type == "10x_txt":
        adata = sc.read_csv(input_file, delimiter="\t")
    else:
        # The old message named a parameter that does not exist (`format`)
        # and listed only two of the five accepted values.
        supported = ('10x_h5', '10x_mtx', '10x_h5ad', '10x_csv', '10x_txt')
        raise ValueError(
            f"`format_type` needs to be one of {supported}, got {format_type!r}")

    if transpose:
        adata = adata.transpose()
    if sparse:
        adata.X = csr_matrix(adata.X, dtype='float32')

    adata.var_names_make_unique()
    adata.obs_names_make_unique()
    return adata
def read_adata(
        gex_data,  # filename
        gex_data_type  # string describing file type
):
    ''' Split this out so that other code can use it. Read GEX data

    Parameters
    ----------
    gex_data
        Path handed to the matching scanpy reader.
    gex_data_type
        One of ``'h5ad'``, ``'10x_mtx'``, ``'10x_h5'`` or ``'loom'``.
        ``'10x_h5'`` files are read with ``gex_only=True``.

    Returns
    -------
    An AnnData object that is not a view (views are copied).

    Raises
    ------
    SystemExit
        With status 1 when ``gex_data_type`` is not recognised.
    '''
    print('reading:', gex_data, 'of type', gex_data_type)

    if gex_data_type == 'h5ad':
        adata = sc.read_h5ad(gex_data)
    elif gex_data_type == '10x_mtx':
        adata = sc.read_10x_mtx(gex_data)
    elif gex_data_type == '10x_h5':
        adata = sc.read_10x_h5(gex_data, gex_only=True)
    elif gex_data_type == 'loom':
        adata = sc.read_loom(gex_data)
    else:
        print('unrecognized gex_data_type:', gex_data_type,
              "should be one of ['h5ad', '10x_mtx', '10x_h5', 'loom']")
        # `exit()` is injected by the `site` module and is not guaranteed to
        # exist (e.g. `python -S`); it also reported success (status 0).
        # Raising SystemExit directly needs no import and signals failure.
        raise SystemExit(1)

    # NOTE(review): newer anndata releases appear to name this attribute
    # `is_view`; confirm against the pinned version.
    if adata.isview:
        # Materialise a view into a standalone object before returning it.
        adata = adata.copy()
    return adata
def test_read_10x(tmp_path, mtx_path, h5_path, prefix):
    """The mtx and h5 readers must give equal AnnData objects that can be written.

    With a non-``None`` ``prefix`` the matrix files are first copied to a
    temporary directory under prefixed names, so the ``prefix`` argument of
    ``sc.read_10x_mtx`` is exercised too.
    """
    if prefix is not None:
        # Recreate the matrix directory with "<prefix><name>" file names.
        unprefixed_dir = mtx_path
        mtx_path = tmp_path / "filtered_gene_bc_matrices_prefix"
        mtx_path.mkdir()
        for entry in unprefixed_dir.iterdir():
            if not entry.is_file():
                continue
            shutil.copyfile(entry, mtx_path / f"{prefix}{entry.name}")

    from_mtx = sc.read_10x_mtx(mtx_path, var_names="gene_symbols", prefix=prefix)
    from_h5 = sc.read_10x_h5(h5_path)

    # For the 3.0.0 data the genome column is dropped from the h5 result
    # before the two objects are compared.
    if "3.0.0" in str(h5_path):
        from_h5.var.drop(columns="genome", inplace=True)

    assert_anndata_equal(from_mtx, from_h5)

    # Both objects must survive a write/read round trip and still agree.
    mtx_written = tmp_path / "from_mtx.h5ad"
    h5_written = tmp_path / "from_h5.h5ad"
    from_mtx.write(mtx_written)
    from_h5.write(h5_written)
    assert_anndata_equal(sc.read_h5ad(mtx_written), sc.read_h5ad(h5_written))
def _load_expression(self, clustering, tsne, diffexp):
    """Load the 10x expression matrix and attach cluster, t-SNE and marker data.

    Parameters
    ----------
    clustering
        Values stored as ``ad.obs["cluster"]``.
    tsne
        Object whose ``.values`` are stored as ``ad.obsm["X_tsne"]``.
    diffexp
        Table with at least the columns ``p_adj``, ``log2_fc``, ``GeneID``
        and ``Cluster`` -- presumably a pandas DataFrame; verify against
        the caller.

    Returns
    -------
    The annotated AnnData object.

    Raises
    ------
    ScelVisException
        If neither expected HDF5 file exists in ``self.args.indir``.
    """
    expression_file_v3 = os.path.join(self.args.indir, "filtered_feature_bc_matrix.h5")
    expression_file_v2 = os.path.join(self.args.indir, "filtered_gene_bc_matrices_h5.h5")
    # Prefer the v3 file name and fall back to the v2 one.
    if os.path.isfile(expression_file_v3):
        expression_file = expression_file_v3
    elif os.path.isfile(expression_file_v2):
        expression_file = expression_file_v2
    else:
        raise ScelVisException("cannot find expression file at %s" % self.args.indir)
    logger.info("Reading gene expression from %s", expression_file)
    # The read runs with anndata's logger set to WARN (presumably restored by
    # `with_log_level` afterwards -- confirm in that helper).
    with with_log_level(anndata.utils.logger, logging.WARN):
        ad = sc.read_10x_h5(expression_file)
    ad.var_names_make_unique()
    logger.info("Combining meta data")
    ad.obs["cluster"] = clustering
    # Per-observation totals and number of non-zero entries.
    ad.obs["n_counts"] = ad.X.sum(1).A1
    ad.obs["n_genes"] = (ad.X > 0).sum(1).A1
    logger.info("Adding coordinates")
    ad.obsm["X_tsne"] = tsne.values
    logger.info("Saving top %d markers per cluster", self.args.nmarkers)
    # Keep rows with p_adj < 0.05 and positive log2 fold change, drop GeneID,
    # order by cluster then p_adj, and take the first `nmarkers` per cluster.
    markers = (diffexp[(diffexp["p_adj"] < 0.05) & (diffexp["log2_fc"] > 0)].drop(
        "GeneID", axis=1).sort_values([
            "Cluster", "p_adj"
        ]).groupby("Cluster").head(self.args.nmarkers))
    # Each remaining column is stored separately as uns["marker_<column>"].
    for col in markers.columns:
        ad.uns["marker_" + col] = markers[col].values
    return ad
def main():
    """Export an expression matrix as a tab-separated features-by-observations table.

    The input (``.h5ad`` or 10x ``.h5``, chosen by file extension) is loaded,
    turned into a DataFrame, transposed, re-indexed through the gene-name map
    returned by ``utils.read_gtf_gene_symbol_to_id`` and written to
    ``args.output_table_txt``.
    """
    args = build_parser().parse_args()

    input_ext = os.path.splitext(args.input)[1]
    if input_ext == ".h5ad":
        x = ad.read_h5ad(args.input)
    elif input_ext == ".h5":
        x = sc.read_10x_h5(args.input)
    else:
        raise ValueError(f"Unrecognized file extension: {args.input}")
    logging.info(f"Read input: {x}")

    logging.info("Reading gtf for gene name map")
    gene_name_map = utils.read_gtf_gene_symbol_to_id()

    # Transpose because BIRD wants features x obs.
    obs_by_feature = pd.DataFrame(
        utils.ensure_arr(x.X),
        index=x.obs_names,
        columns=x.var_names,
    )
    x_df = obs_by_feature.T
    assert np.all(x_df.values >= 0.0)
    # Every row label must be present in the map; a missing one raises KeyError.
    x_df.index = [gene_name_map[g] for g in x_df.index]

    # Write output (tab-separated table).
    logging.info(f"Writing output to {args.output_table_txt}")
    x_df.to_csv(args.output_table_txt, sep="\t")
def Create_Scanpy_Anndata(storage_mount_point, sampleID, annotation_dict):
    '''
    Build an annotated AnnData object for one sample.

    In:
        storage_mount_point: Data storage mount location (string prefix of the file path)
        sampleID: ID numbers of samples from the metadata table (ex: 2235-1)
        annotation_dict: Dictionary of all the sample IDs and metadata; the
            first element of each entry is appended to the mount point to
            form the h5 path, the remaining elements are "name:value" strings

    Out:
        New filled AnnData object
    '''
    sample_entry = annotation_dict[sampleID]
    metadata_fields = sample_entry[1:]

    # genome='hg19' or genome='GRCh38'
    h5_location = ''.join([storage_mount_point, sample_entry[0]])
    newAdata = sc.read_10x_h5(h5_location)

    # Cellranger output can contain duplicate gene names, so make them unique.
    newAdata.var_names_make_unique()

    # Every "name:value" field becomes a constant obs column.
    print('\nAdding Metadata to individual samples.\n')
    for field in metadata_fields:
        pieces = field.split(':')
        newAdata.obs[pieces[0]] = pieces[1]

    return newAdata
def load_ds(path):
    """Load one sample from a 10x H5 file or a matrix directory and annotate it.

    H5 files are named like this
    GSM4698176_Sample_1_filtered_feature_bc_matrix.h5
    Fastcar outputs are in …fastcar/{sample}/matrix.mtx.gz

    Parameters
    ----------
    path
        Either a directory containing ``matrix.mtx.gz`` (the directory name
        is used as the sample name) or a ``.h5`` file (the sample name is the
        second and third ``_``-separated parts of the file name).

    Returns
    -------
    The filtered AnnData object with sample metadata from ``SAMPLES`` in
    ``obs``.

    Raises
    ------
    ValueError
        If ``path`` is neither of the two supported inputs.
    """
    fname = os.path.basename(path)
    if os.path.exists(os.path.join(path, "matrix.mtx.gz")):
        sample = fname
        ds = sc.read_10x_mtx(path)
    elif fname.endswith(".h5"):
        # Was `endwith`, which raised AttributeError for every input that was
        # not a matrix directory, so this branch and the ValueError below
        # were unreachable.
        sample = "_".join(fname.split("_")[1:3])
        ds = sc.read_10x_h5(path)
    else:
        raise ValueError(f"Unknown input path {path}")

    ds.var_names = rename_genes(ds.var_names)
    ds.var_names_make_unique(join=".")
    ds.obs["orig.ident"] = sample
    # The pattern is a regular expression, so say so explicitly: pandas >= 2
    # treats the pattern as a literal string by default, which would leave
    # the "-<digit>" suffix in place. Raw string avoids the invalid "\d"
    # escape.
    ds.obs_names = sample + "_" + ds.obs_names.str.replace(r"-\d$", "", regex=True)

    sc.pp.filter_cells(ds, min_genes=200)
    sc.pp.filter_genes(ds, min_cells=3)

    # Copy the metadata of the first row matching this sample into obs.
    meta = SAMPLES.loc[SAMPLES.Sample == sample, :]
    ds.obs["Patient"] = meta.Patient.values[0]
    ds.obs["Day of intubation"] = meta["Day of intubation"].values[0]
    ds.obs["COVID-19"] = meta["COVID-19"].values[0]
    ds.obs["Sample"] = sample
    return ds
def _left_join_on_index(frame, table_path):
    """Left-join the tab-separated table at *table_path* onto *frame* by index."""
    extra = pd.read_csv(table_path, sep='\t', header=0, index_col=0)
    # No suffix is supplied for either side. NOTE(review): pandas is expected
    # to refuse overlapping column names in that case -- confirm against the
    # installed version.
    return frame.merge(
        extra,
        how='left',
        left_index=True,
        right_index=True,
        suffixes=(False, False),
    )


def read_10x(input_10x_h5, input_10x_mtx, genome='hg19', var_names='gene_symbols', extra_obs=None, extra_var=None):
    """
    Wrapper function for sc.read_10x_h5() and sc.read_10x_mtx(), mainly to
    support adding extra metadata

    Parameters
    ----------
    input_10x_h5
        Path to a 10x HDF5 file; takes precedence when not ``None``.
    input_10x_mtx
        Path to a 10x matrix directory; used when ``input_10x_h5`` is ``None``.
    genome
        Forwarded to ``sc.read_10x_h5`` (h5 input only).
    var_names
        Forwarded to ``sc.read_10x_mtx`` (mtx input only).
    extra_obs, extra_var
        Optional paths to tab-separated tables (header row, first column as
        index) that are left-joined by index onto ``adata.obs`` and
        ``adata.var`` respectively.

    Returns
    -------
    The AnnData object.

    Raises
    ------
    ValueError
        If both ``input_10x_h5`` and ``input_10x_mtx`` are ``None``.
    """
    if input_10x_h5 is not None:
        adata = sc.read_10x_h5(input_10x_h5, genome=genome)
    elif input_10x_mtx is not None:
        adata = sc.read_10x_mtx(input_10x_mtx, var_names=var_names)
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on `adata`.
        raise ValueError(
            'Either `input_10x_h5` or `input_10x_mtx` must be provided')

    if extra_obs:
        adata.obs = _left_join_on_index(adata.obs, extra_obs)
    if extra_var:
        adata.var = _left_join_on_index(adata.var, extra_var)
    return adata
def test_filter():
    """Run ``ddl.pp.filter_bcr`` on the test inputs and write both results.

    Nothing is asserted; the test passes when every step completes. Note
    that the filtered object is written back to ``tests/test.h5``, the same
    path it was read from.
    """
    rna = sc.read_10x_h5("tests/sctest.h5")
    vdj = ddl.read_h5("tests/test.h5")

    rna.obs["filter_rna"] = False
    vdj, rna = ddl.pp.filter_bcr(vdj, rna)

    rna.write("tests/sctest.h5ad", compression="gzip")
    vdj.write_h5("tests/test.h5", compression="bzip2")
    print(vdj)
def test_read_10x_v1():
    """The 1.2.0 matrix directory and HDF5 file must load to equal AnnData objects."""
    v1_root = os.path.join(ROOT, '1.2.0')
    from_mtx = sc.read_10x_mtx(
        os.path.join(v1_root, 'filtered_gene_bc_matrices', 'hg19_chr21'),
        var_names='gene_symbols',
    )
    from_h5 = sc.read_10x_h5(os.path.join(v1_root, 'filtered_gene_bc_matrices_h5.h5'))
    assert_anndata_equal(from_mtx, from_h5)
def test_read_10x_v3():
    """The 3.0.0 matrix directory and HDF5 file must load to equal AnnData objects.

    The ``genome`` column is dropped from the HDF5 result before comparing.
    """
    v3_root = ROOT / '3.0.0'
    from_mtx = sc.read_10x_mtx(v3_root / 'filtered_feature_bc_matrix', var_names='gene_symbols')
    from_h5 = sc.read_10x_h5(v3_root / 'filtered_feature_bc_matrix.h5')
    from_h5.var.drop(columns="genome", inplace=True)
    assert_anndata_equal(from_mtx, from_h5)
def from_10x_HDF5(filename, genome=None):
    """Load a 10x HDF5 file as a dense observations-by-variables DataFrame.

    Parameters
    ----------
    filename
        Path handed to ``sc.read_10x_h5``.
    genome
        Forwarded to ``sc.read_10x_h5`` as ``genome``.

    Returns
    -------
    Whatever ``_clean_up`` returns for the dense DataFrame (rows are
    ``obs_names``, columns are ``var_names``).
    """
    # Keyword arguments instead of positional ones, so the call does not
    # depend on the parameter order of `read_10x_h5`.
    # NOTE(review): newer scanpy releases are believed to make these
    # parameters keyword-only -- confirm against the pinned version.
    ad = sc.read_10x_h5(filename, genome=genome, gex_only=True)
    # `todense()` materialises the whole matrix in memory.
    dataMatrix = pd.DataFrame(ad.X.todense(), columns=ad.var_names, index=ad.obs_names)
    return _clean_up(dataMatrix)
def from_matrix_h5(cls, path_to_matrix):
    """
    Factory function: build an instance of ``cls`` from the gene expression
    matrix in hdf5 formatted 10x output.

    The file is read with ``gex_only=True`` and variable names are made
    unique before the AnnData object is handed to ``cls``.
    """
    expression = scanpy.read_10x_h5(path_to_matrix, gex_only=True)
    expression.var_names_make_unique()
    return cls(expression)
def read_10x_h5(filename: PathLike, extended: bool = True, *args, **kwargs) -> MuData:
    """
    Read data from 10X Genomics-formatted HDF5 file

    This function uses scanpy.read_10x_h5() internally
    and patches its behaviour to:
    - attempt to read `interval` field for features;
    - attempt to locate peak annotation file and add peak annotation;
    - attempt to locate fragments file.

    The last two steps are delegated to ``initialise_default_files`` and only
    run when the resulting MuData has an ``atac`` modality.

    Parameters
    ----------
    filename : PathLike
        Path to 10X HDF5 file (.h5)
    extended : bool, optional (default: True)
        Perform extended functionality automatically such as
        locating peak annotation and fragments files.
    *args, **kwargs
        Forwarded to ``scanpy.read_10x_h5`` (which is always called with
        ``gex_only=False``).

    Returns
    -------
    MuData
        Built from the AnnData object that scanpy returned.
    """
    adata = sc.read_10x_h5(filename, gex_only=False, *args, **kwargs)

    if extended:
        # Read the optional `interval` field for features. The context
        # manager closes the file on every path, including when a lookup
        # below raises.
        with h5py.File(filename, "r") as h5file:
            features = h5file["matrix"]["features"]
            if "interval" in features:
                intervals = np.array(features["interval"]).astype(str)
            else:
                intervals = None

        if intervals is not None:
            adata.var["interval"] = intervals
            print(f"Added `interval` annotation for features from {filename}")

    mdata = MuData(adata)

    if extended and "atac" in mdata.mod:
        initialise_default_files(mdata, filename)

    return mdata
def basic_analysis(filename):
    """Run a fixed scanpy pipeline on a 10x HDF5 file and save the results.

    Steps, in order: ``recipe_zheng17`` preprocessing, neighbours, Louvain
    clustering, UMAP, and gene ranking per Louvain group. The object is
    written to ``./write/result.h5ad`` and two plots are saved.
    """
    cluster_key = "louvain"
    data = sc.read_10x_h5(filename)

    # Preprocessing and neighbourhood graph.
    sc.pp.recipe_zheng17(data)
    sc.pp.neighbors(data)

    # Clustering, embedding and marker ranking.
    sc.tl.louvain(data)
    sc.tl.umap(data)
    sc.tl.rank_genes_groups(data, cluster_key)

    data.write("./write/result.h5ad")

    # Plotting.
    sc.pl.umap(data, color=cluster_key, save=".png")
    sc.pl.rank_genes_groups(data, save=".pdf")