def test_read_10x_mtx():
    sc.read_10x_mtx(
        os.path.join(ROOT, '1.2.0', 'filtered_gene_bc_matrices', 'hg19_chr21'),
        var_names='gene_symbols',
        cache=True,
    )
    sc.read_10x_mtx(
        os.path.join(ROOT, '3.0.0', 'filtered_feature_bc_matrix'),
        var_names='gene_symbols',
        cache=True,
    )

def _read_raw_dataset(self):
    # sars_data_path = next(self._data_path_sars.glob('filtered_feature_bc_matrix'))
    # mock_data_path = next(self._data_path_mock.glob('filtered_feature_bc_matrix'))
    # sars_dataset = sc.read_10x_mtx(sars_data_path, cache=True)
    # mock_dataset = sc.read_10x_mtx(mock_data_path, cache=True)
    # dataset = sars_dataset.concatenate(mock_dataset, batch_categories=['SARS2', 'MOCK'], index_unique=None)
    data_files = glob.glob(str(self._data_path / 'data/*/filtered_feature_bc_matrix'))
    sample_suffix = '_MON_crispr'

    # first load
    adatas = {}
    for i, file in enumerate(data_files):
        if i == 0:
            adata = sc.read_10x_mtx(file, cache=True)
            batch_key = file.split('/filtered_')[0].split('/')[-1].split(sample_suffix)[0]
            adata.var_names_make_unique()
        else:
            key = file.split('/filtered_')[0].split('/')[-1].split(sample_suffix)[0]
            adatas[key] = sc.read_10x_mtx(file, cache=True)
            adatas[key].var_names_make_unique()
    adata = adata.concatenate(
        *adatas.values(),
        batch_categories=[batch_key] + list(adatas.keys()),
        index_unique=None,
    )
    del adatas

    # drop sars-cov-2 counts
    adata = adata[:, :-1]
    # drop suffixes from index
    adata.obs.index = adata.obs.index.str.replace('-SARS2', '').str.replace('-MOCK', '')
    # drop duplicate index
    adata = adata[~adata.obs.index.duplicated(keep=False)]
    return adata

def join_SLAM(bdata, letter):
    for time in ['old', 'new']:
        adata = sc.read_10x_mtx(
            '/fast/scratch/groups/ag_bluethgen/count' + letter + '/' + time + '_matrix',
            make_unique=False,
        )
        adata.var_names_make_unique()

        # intersect vars
        bdata = bdata[:, np.isin(bdata.var_names, adata.var_names)].copy()
        adata = adata[:, np.isin(adata.var_names, bdata.var_names)].copy()

        # intersect obs (filtered)
        new_index = [index[:-2] for index in adata.obs_names]  # clean index names
        adata.obs_names = new_index
        adata = adata[np.isin(adata.obs_names, bdata.obs_names), :].copy()

        # align
        adata = adata[:, np.argsort(adata.var_names)].copy()
        bdata = bdata[:, np.argsort(bdata.var_names)].copy()
        adata = adata[np.argsort(adata.obs_names), :].copy()
        bdata = bdata[np.argsort(bdata.obs_names), :].copy()
        print('adata ', adata.n_obs, adata.n_vars)
        print('bdata ', bdata.n_obs, bdata.n_vars)

        # add
        bdata.layers[time] = adata.X

    # overwrite u and s, then redo pool, then normalize, but keep umap. No filtering
    bdata.layers['unspliced'] = bdata.layers['new']
    bdata.layers['spliced'] = bdata.layers['old']
    scv.pp.neighbors(bdata)
    scv.pp.moments(bdata)  # pool, normalize
    return bdata

def read_10x(input_10x_h5,
             input_10x_mtx,
             genome='hg19',
             var_names='gene_symbols',
             extra_obs=None,
             extra_var=None):
    """
    Wrapper function for sc.read_10x_h5() and sc.read_10x_mtx(), mainly to
    support adding extra metadata.
    """
    if input_10x_h5 is not None:
        adata = sc.read_10x_h5(input_10x_h5, genome=genome)
    elif input_10x_mtx is not None:
        adata = sc.read_10x_mtx(input_10x_mtx, var_names=var_names)

    if extra_obs:
        obs_tbl = pd.read_csv(extra_obs, sep='\t', header=0, index_col=0)
        adata.obs = adata.obs.merge(
            obs_tbl,
            how='left',
            left_index=True,
            right_index=True,
            suffixes=(False, False),
        )

    if extra_var:
        var_tbl = pd.read_csv(extra_var, sep='\t', header=0, index_col=0)
        adata.var = adata.var.merge(
            var_tbl,
            how='left',
            left_index=True,
            right_index=True,
            suffixes=(False, False),
        )
    return adata

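# Hedged usage sketch for the read_10x() wrapper above: the directory and TSV
# paths are hypothetical placeholders, not from the original source; extra_obs
# is assumed to be a tab-separated table indexed by cell barcode whose columns
# are merged into adata.obs.
adata = read_10x(
    input_10x_h5=None,
    input_10x_mtx='sample1/outs/filtered_feature_bc_matrix',
    extra_obs='sample1/cell_metadata.tsv',
)
print(adata.obs.columns.tolist())
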
def load_ds(path):
    """
    H5 files are named like this: GSM4698176_Sample_1_filtered_feature_bc_matrix.h5
    Fastcar outputs are in …fastcar/{sample}/matrix.mtx.gz
    """
    fname = os.path.basename(path)
    if os.path.exists(os.path.join(path, "matrix.mtx.gz")):
        sample = fname
        ds = sc.read_10x_mtx(path)
    elif fname.endswith(".h5"):
        sample = "_".join(fname.split("_")[1:3])
        ds = sc.read_10x_h5(path)
    else:
        raise ValueError(f"Unknown input path {path}")

    ds.var_names = rename_genes(ds.var_names)
    ds.var_names_make_unique(join=".")
    ds.obs["orig.ident"] = sample
    ds.obs_names = sample + "_" + ds.obs_names.str.replace(r"-\d$", "")
    sc.pp.filter_cells(ds, min_genes=200)
    sc.pp.filter_genes(ds, min_cells=3)

    meta = SAMPLES.loc[SAMPLES.Sample == sample, :]
    ds.obs["Patient"] = meta.Patient.values[0]
    ds.obs["Day of intubation"] = meta["Day of intubation"].values[0]
    ds.obs["COVID-19"] = meta["COVID-19"].values[0]
    ds.obs["Sample"] = sample
    return ds

def test_recipe():
    try:
        adata = sc.datasets.pbmc3k()
    except Exception:
        fname = 'pbmc3k_filtered_gene_bc_matrices.tar.gz'
        url = 'http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/' + fname
        r = requests.get(url, stream=True)
        if r.status_code == 200:
            with open('filtered_gene_bc_matrices.tar.gz', 'wb') as f:
                f.write(r.raw.read())
        os.system('tar -xzvf filtered_gene_bc_matrices.tar.gz')
        adata = sc.read_10x_mtx('filtered_gene_bc_matrices/hg19')

    _adata = adata.copy()
    ddl.pp.external.recipe_scanpy_qc(_adata)
    assert not _adata.obs['filter_rna'].empty

    _adata = adata.copy()
    ddl.pp.external.recipe_scanpy_qc(_adata, mito_cutoff=None)  # weird segmentation fault in the tests
    assert not _adata.obs['gmm_pct_count_clusters_keep'].empty

    _adata = adata.copy()
    ddl.pp.external.recipe_scanpy_qc(_adata, min_counts=100, max_counts=20000)

    _adata = adata.copy()
    ddl.pp.external.recipe_scanpy_qc(_adata, min_counts=100)

    _adata = adata.copy()
    ddl.pp.external.recipe_scanpy_qc(_adata, max_counts=20000)

def read_file(filename, transpose=False):
    adata = None
    if os.path.exists(filename):
        if os.path.isdir(filename):
            adata = sc.read_10x_mtx(filename)
        elif os.path.isfile(filename):
            name, filetype = os.path.splitext(filename)
            if filetype == ".txt":
                adata = sc.read_text(filename)
            elif filetype == ".csv":
                adata = sc.read_csv(filename)
            elif filetype == ".h5ad":
                adata = sc.read(filename)
            else:
                print(
                    "ERROR: the format must be [H5AD|CSV|TXT] for file or 10x-MTX for directory."
                )
                sys.exit()
        if transpose:
            adata = adata.transpose()
    else:
        sys.exit("ERROR: no such file or directory.")

    if not isinstance(adata.X, np.ndarray):
        X = adata.X.toarray()
        adata = anndata.AnnData(X, obs=adata.obs, var=adata.var)
    return adata

def load_data(data):
    if isfile(data):
        name, extension = splitext(data)
        if extension == ".h5ad":
            adata = sc.read_h5ad(data)
        elif extension == ".loom":
            adata = sc.read_loom(data)
        else:
            raise click.FileError(data, hint="does not have a valid extension [.h5ad | .loom]")
    elif isdir(data):
        if not data.endswith(sep):
            data += sep
        adata = sc.read_10x_mtx(data)
    else:
        raise click.FileError(data, hint="not a valid file or path")

    if not set_obs_names == "":
        if set_obs_names not in adata.obs_keys():
            raise click.UsageError(f"obs {set_obs_names} not found, options are: {adata.obs_keys()}")
        adata.obs_names = adata.obs[set_obs_names]
    if not set_var_names == "":
        if set_var_names not in adata.var_keys():
            raise click.UsageError(f"var {set_var_names} not found, options are: {adata.var_keys()}")
        adata.var_names = adata.var[set_var_names]
    if make_obs_names_unique:
        adata.obs.index = make_index_unique(adata.obs.index)
    if make_var_names_unique:
        adata.var.index = make_index_unique(adata.var.index)
    if not adata._obs.index.is_unique:
        click.echo("Warning: obs index is not unique")
    if not adata._var.index.is_unique:
        click.echo("Warning: var index is not unique")
    return adata

def read_10x_data(input_file, format_type='10x_h5', backed=None, transpose=False, sparse=False):
    if format_type == '10x_h5':
        adata = sc.read_10x_h5(input_file)
    elif format_type == '10x_mtx':
        adata = sc.read_10x_mtx(input_file)
    elif format_type == '10x_h5ad':
        adata = sc.read_h5ad(input_file, backed=backed)
    elif format_type == "10x_csv":
        adata = sc.read_csv(input_file)
    elif format_type == "10x_txt":
        adata = sc.read_csv(input_file, delimiter="\t")
    else:
        raise ValueError(
            "`format_type` needs to be one of '10x_h5', '10x_mtx', '10x_h5ad', '10x_csv' or '10x_txt'"
        )

    if transpose:
        adata = adata.transpose()
    if sparse:
        adata.X = csr_matrix(adata.X, dtype='float32')
    adata.var_names_make_unique()
    adata.obs_names_make_unique()
    return adata

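# Hedged usage sketch for read_10x_data() above: the input path is a
# hypothetical Cell Ranger output directory. Shows loading an mtx directory
# and converting the counts to a float32 CSR matrix via the sparse flag.
adata = read_10x_data(
    'sample1/outs/filtered_feature_bc_matrix',
    format_type='10x_mtx',
    sparse=True,
)
print(adata)
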
def test_read_10x(tmp_path, mtx_path, h5_path, prefix):
    if prefix is not None:
        # Build files named "prefix_XXX.xxx" in a temporary directory.
        mtx_path_orig = mtx_path
        mtx_path = tmp_path / "filtered_gene_bc_matrices_prefix"
        mtx_path.mkdir()
        for item in mtx_path_orig.iterdir():
            if item.is_file():
                shutil.copyfile(item, mtx_path / f"{prefix}{item.name}")

    mtx = sc.read_10x_mtx(mtx_path, var_names="gene_symbols", prefix=prefix)
    h5 = sc.read_10x_h5(h5_path)

    # Drop genome column for comparing v3
    if "3.0.0" in str(h5_path):
        h5.var.drop(columns="genome", inplace=True)

    # Check equivalence
    assert_anndata_equal(mtx, h5)

    # Test that it can be written:
    from_mtx_pth = tmp_path / "from_mtx.h5ad"
    from_h5_pth = tmp_path / "from_h5.h5ad"
    mtx.write(from_mtx_pth)
    h5.write(from_h5_pth)

    assert_anndata_equal(sc.read_h5ad(from_mtx_pth), sc.read_h5ad(from_h5_pth))

def read_cellranger(fn, args, rm_zero_cells=True, add_sample_id=True, **kw):
    """read cellranger results

    Assumes the Sample_ID may be extracted from the cellranger output dirname,
    e.g. `.../Sample_ID/outs/filtered_feature_bc_matrix.h5`
    """
    if fn.endswith('.h5'):
        dirname = os.path.dirname(fn)
        data = sc.read_10x_h5(fn)
        data.var['gene_symbols'] = list(data.var_names)
        data.var_names = list(data.var['gene_ids'])
    else:
        mtx_dir = os.path.dirname(fn)
        dirname = os.path.dirname(mtx_dir)
        data = sc.read_10x_mtx(mtx_dir, gex_only=args.gex_only, var_names='gene_ids')
        data.var['gene_ids'] = list(data.var_names)

    if add_sample_id:
        barcodes = [b.split('-')[0] for b in data.obs.index]
        if len(barcodes) == len(set(barcodes)):
            data.obs_names = barcodes
        sample_id = os.path.basename(os.path.dirname(dirname))
        data.obs['sample_id'] = sample_id
        data.obs['sample_id'] = data.obs['sample_id'].astype('category')
        data.obs_names = [i + '-' + sample_id for i in data.obs_names]
    return data

def read_adata(
        gex_data,       # filename
        gex_data_type,  # string describing file type
):
    ''' Split this out so that other code can use it. Read GEX data.
    '''
    print('reading:', gex_data, 'of type', gex_data_type)
    if gex_data_type == 'h5ad':
        adata = sc.read_h5ad(gex_data)
    elif gex_data_type == '10x_mtx':
        adata = sc.read_10x_mtx(gex_data)
    elif gex_data_type == '10x_h5':
        adata = sc.read_10x_h5(gex_data, gex_only=True)
    elif gex_data_type == 'loom':
        adata = sc.read_loom(gex_data)
    else:
        print('unrecognized gex_data_type:', gex_data_type,
              "should be one of ['h5ad', '10x_mtx', '10x_h5', 'loom']")
        exit()

    if adata.isview:  # this is so weird
        adata = adata.copy()
    return adata

def make_raw_dataset(samples, path, name):
    """
    Load, preprocess and concatenate a dataset from multiple RNAseq samples.

    Inputs:
        samples: dictionary of sample file prefixes as keys and timepoint metadata as values
        path: path to directory containing sample files
        name: dataset name for labeling AnnData object metadata
    Output:
        AnnData object of concatenated samples, annotated with dataset,
        timepoint, and sample id labels
    """
    anndata_dict = {}
    for sm in samples.keys():
        print(sm)
        # read in data from GEO file
        data = sc.read_10x_mtx(path, prefix=sm, cache=True)
        # add metadata information
        data.obs['dataset'] = name
        data.obs['timepoint'] = samples[sm]
        # add to dict for concatenation
        anndata_dict[sm] = data
    # concatenate samples
    data_full = ad.concat(anndata_dict, join='outer', label='sample id',
                          index_unique='_', fill_value=0.0)
    return data_full

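# Hedged usage sketch for make_raw_dataset() above: the GEO prefixes, the
# download directory and the timepoint labels are hypothetical. Each prefix is
# assumed to match files such as <prefix>matrix.mtx.gz, <prefix>barcodes.tsv.gz
# and <prefix>features.tsv.gz inside `path`.
samples = {
    'GSMxxxxxxx_day0_': 'day0',
    'GSMxxxxxxx_day7_': 'day7',
}
adata_full = make_raw_dataset(samples, path='data/geo_download/', name='example_dataset')
print(adata_full.obs['sample id'].value_counts())
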
def read_10x(file):
    # read 10x data
    # verbosity: errors (0), warnings (1), info (2), hints (3)
    sc.settings.verbosity = 3
    sc.logging.print_header()
    sc.settings.set_figure_params(dpi=80, facecolor='white')

    # the file that will store the analysis results
    results_file = 'write/pbmc3k.h5ad'

    # os.chdir(file)
    dir_name = ''
    path10x = file
    for i in os.listdir(file):
        if i[:6] == 'filter':
            dir_name = i
    if os.listdir(file + '/' + dir_name)[0] == 'hg19':
        path10x = file + '/' + dir_name + '/hg19/'  # the directory with the `.mtx` file
    else:
        path10x = file + '/' + dir_name + '/'  # the directory with the `.mtx` file

    adata = sc.read_10x_mtx(
        path10x,
        # use gene symbols for the variable names (variables-axis index)
        var_names='gene_symbols',
        cache=True)
    adata.var_names_make_unique()
    # os.chdir('../../..')
    adata.uns['dataset'] = 'data10x'
    return AnnData(adata)

def read_data(inp_path, out_path):
    global adata
    adata = sc.read_10x_mtx(
        inp_path,                  # the directory with the `.mtx` file
        var_names='gene_symbols',  # use gene symbols for the variable names (variables-axis index)
        cache=True)

def open_10_genomics_data(dir):
    """
    Open 10x Genomics data into an AnnData object.
    """
    adata = sc.read_10x_mtx(dir, var_names='gene_symbols', cache=True)
    adata.var_names_make_unique()
    return adata

def test_read_10x_v1():
    v1_mtx = sc.read_10x_mtx(
        os.path.join(ROOT, '1.2.0', 'filtered_gene_bc_matrices', 'hg19_chr21'),
        var_names='gene_symbols',
    )
    v1_h5 = sc.read_10x_h5(
        os.path.join(ROOT, '1.2.0', 'filtered_gene_bc_matrices_h5.h5'))
    assert_anndata_equal(v1_mtx, v1_h5)

def load_data(path, dtype='dge'):
    if dtype == 'dge':
        dataset = ad.read_text(path)
    elif dtype == '10x':
        dataset = sc.read_10x_mtx(path, var_names='gene_symbols', cache=True)
    return dataset

def test_read_10x_v3():
    v3_mtx = sc.read_10x_mtx(
        ROOT / '3.0.0' / 'filtered_feature_bc_matrix',
        var_names='gene_symbols',
    )
    v3_h5 = sc.read_10x_h5(ROOT / '3.0.0' / 'filtered_feature_bc_matrix.h5')
    v3_h5.var.drop(columns="genome", inplace=True)
    assert_anndata_equal(v3_mtx, v3_h5)

def get_adata(matrix_path):
    """
    Convert a 10x Feature-Barcode Matrix file to AnnData.

    :param str matrix_path: path to 10x feature-barcode matrix file
    :return anndata: AnnData object
    """
    sc.settings.verbosity = 3
    adata = sc.read_10x_mtx(matrix_path, var_names='gene_symbols', cache=True)
    adata.var_names_make_unique()
    return adata

def from_matrix_mtx(cls, path_to_matrix):
    """
    Factory function for reading a gene expression matrix from mtx-formatted 10x output.
    """
    anndata_matrix = scanpy.read_10x_mtx(path_to_matrix, gex_only=True)
    anndata_matrix.var_names_make_unique()
    return cls(anndata_matrix)

def get_file(source, accession, matrixFile):
    matrix = zipfile.ZipFile(matrixFile.file, 'r')
    # matrix = zipfile.ZipFile(matrixFile, 'r')
    matrix.extractall("/tmp")
    filePath = "/tmp/" + source + "/" + accession
    adata = sc.read_10x_mtx(filePath, var_names="gene_symbols", cache=True)
    # shutil.rmtree(filePath)
    return adata

def read_10x_mtx(filename: PathLike, atac_only: bool = True, *args, **kwargs) -> AnnData:
    adata = sc.read_10x_mtx(filename, gex_only=False, *args, **kwargs)
    if atac_only:
        adata = adata[:, list(map(lambda x: x == "Peaks", adata.var["feature_types"]))]
    return adata

def read_10x_mtx(filename: PathLike, prot_only: bool = True, *args, **kwargs) -> AnnData:
    adata = sc.read_10x_mtx(filename, gex_only=False, *args, **kwargs)
    if prot_only:
        adata = adata[:, list(map(lambda x: x == "Antibody Capture", adata.var["feature_types"]))].copy()
    return adata

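# Hedged usage sketch for the two read_10x_mtx() wrappers above (Peaks and
# Antibody Capture filters): the path is a hypothetical CITE-seq or multiome
# filtered_feature_bc_matrix directory produced by Cell Ranger.
protein = read_10x_mtx('sample1/outs/filtered_feature_bc_matrix', prot_only=True)
print(protein.var['feature_types'].unique())  # expected: ['Antibody Capture']
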
def test_pbmc_cite(save_path):
    file_path = os.path.join(
        save_path, "10X/pbmc_10k_protein_v3/filtered_feature_bc_matrix.tar.gz")
    sp = os.path.join(save_path, "10X/pbmc_10k_protein_v3/")
    tar = tarfile.open(file_path, "r:gz")
    tar.extractall(path=sp)
    tar.close()
    dataset = sc.read_10x_mtx(os.path.join(sp, "filtered_feature_bc_matrix"),
                              gex_only=False)
    organize_cite_seq_10x(dataset)
    unsupervised_training_one_epoch(dataset)

def generate_adata_from_10X(session_ID, data_type="10X_mtx"):
    data_dir = save_analysis_path + str(session_ID) + "/raw_data/"
    if data_type == "10X_mtx":
        adata = sc.read_10x_mtx(data_dir, cache=False)
    elif data_type == "10X_h5":
        adata = sc.read_10x_h5(data_dir + "data.h5ad")
    else:
        print("[ERROR] data type not recognized - returning None")
        return None

    cache_adata(session_ID, adata)
    return adata

def load3k(cells: 'mito all seurat' = 'mito', subsample=.15, seed=None) -> 'anndata object':
    adata = sc.read_10x_mtx('../data/3k/hg19/', var_names='gene_symbols', cache=True)
    adata.obs['labels'] = loadlabels(
        load("../data/3k/pbmc.3k.labels"),
        load("../data/3k/hg19/barcodes.tsv"))
    adata = filter(adata, cells)
    adata = do_subsample(adata, subsample, seed)
    return adata

def load_anndata_from_input_and_output(
        input_dir_10x: str,
        cellbender_h5: str,
        analyzed_barcodes_only: bool = True) -> 'anndata.AnnData':
    """Load the remove-background output count matrix into an AnnData object.

    Args:
        input_dir_10x: Raw 10x dir.
        cellbender_h5: Output h5 file created by remove-background (can be
            filtered or not).
        analyzed_barcodes_only: Argument passed to anndata_from_h5().
            False to load all barcodes, so that the size of the AnnData object
            will match the size of the input raw count matrix.
            True to load a limited set of barcodes: only those analyzed by the
            algorithm. This allows relevant latent variables to be loaded
            properly into adata.obs and adata.obsm, rather than adata.uns.

    Return:
        adata_out: AnnData object with counts before and after remove-background,
            as well as inferred latent variables from remove-background.
    """
    # Load input data.
    adata_10x = sc.read_10x_mtx(
        path=input_dir_10x,
        # var_names='gene_symbols',
        var_names='gene_ids',
        make_unique=False)

    # Load remove-background output data.
    # We need to do this because of the bug here:
    # https://github.com/broadinstitute/CellBender/issues/57
    adata_out = anndata_from_h5(cellbender_h5,
                                analyzed_barcodes_only=analyzed_barcodes_only)

    # Subset the raw dataset to the relevant barcodes.
    adata_10x = adata_10x[adata_out.obs.index]

    # Put count matrices into 'layers' in anndata for clarity.
    adata_out.layers['counts_raw'] = adata_10x.X.copy()
    adata_out.layers['counts_cellbender'] = adata_out.X.copy()

    # Pre-compute a bit of metadata.
    # adata_out.var['n_cellranger'] = np.array(
    #     adata_out.layers['cellranger'].sum(axis=0)
    # ).squeeze()
    # adata_out.var['n_cellbender'] = np.array(
    #     adata_out.layers['cellbender'].sum(axis=0)
    # ).squeeze()

    return adata_out

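# Hedged usage sketch for load_anndata_from_input_and_output() above: both
# paths are hypothetical placeholders. The raw 10x directory and the CellBender
# output h5 are assumed to come from the same sample, so the barcode subsetting
# lines up.
adata = load_anndata_from_input_and_output(
    input_dir_10x='sample1/outs/raw_feature_bc_matrix',
    cellbender_h5='sample1/cellbender/output_filtered.h5',
    analyzed_barcodes_only=True,
)
print(list(adata.layers.keys()))  # ['counts_raw', 'counts_cellbender']
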
def read_10x_data(input_file, format_type='10x_h5', backed=None):
    if format_type == '10x_h5':
        adata = sc.read_10x_h5(input_file)
    elif format_type == '10x_mtx':
        adata = sc.read_10x_mtx(input_file)
    elif format_type == '10x_h5ad':
        adata = sc.read_h5ad(input_file, backed=backed)
    elif format_type == "10x_csv":
        adata = sc.read_csv(input_file)
    else:
        raise ValueError(
            "`format_type` needs to be one of '10x_h5', '10x_mtx', '10x_h5ad' or '10x_csv'")
    adata.var_names_make_unique()
    return adata

def main(src_dir, out_dir, out_prefix, mito_prefix):
    sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
    sc.logging.print_versions()
    sc.settings.set_figure_params(dpi=80)
    sc.settings.figdir = out_dir + "/"

    adata = sc.read_10x_mtx(src_dir, var_names='gene_symbols', cache=True)
    adata.var_names_make_unique()
    adata2 = adata.copy()
    adata3 = adata.copy()
    adata4 = adata.copy()

    seurat_wf_plots(adata, out_dir, out_prefix, mito_prefix)
    recipe_seurat(adata2, out_dir, out_prefix)
    recipe_zheng17(adata3, out_dir, out_prefix)
    scanpy_qc(adata4, out_dir, out_prefix, mito_prefix)