def get(self, dataset_id):
    analysis_id = request.args.get('analysis')
    session_id = request.cookies.get('gear_session_id')
    user = geardb.get_user_from_session_id(session_id)

    if analysis_id:
        ana = geardb.Analysis(id=analysis_id, dataset_id=dataset_id,
                              session_id=session_id, user_id=user.id)
        ana.discover_type()
        adata = sc.read_h5ad(ana.dataset_path())
    else:
        ds = geardb.Dataset(id=dataset_id, has_h5ad=1)
        h5_path = ds.get_file_path()

        # Let's not fail if the file isn't there
        if not os.path.exists(h5_path):
            return {
                "success": -1,
                "message": "No h5 file found for this dataset"
            }

        adata = sc.read_h5ad(h5_path)

    return {"success": 1, "gene_symbols": adata.var.gene_symbol.tolist()}
def test_fixup_gene_symbols_seurat(self):
    if not os.path.isfile(self.seurat_path):
        return unittest.skip(
            "Skipping gene symbol conversion tests because test h5ads are not present. To create them, "
            "run local_server/test/fixtures/schema_test_data/generate_test_data.sh"
        )

    original_adata = sc.read_h5ad(self.seurat_path)
    merged_adata = sc.read_h5ad(self.seurat_merged_path)
    fixup_config = {"X": "log1p", "counts": "raw", "scale.data": "log1p"}
    fixed_adata = remix.fixup_gene_symbols(original_adata, fixup_config)

    self.assertEqual(
        merged_adata.layers["counts"][:, merged_adata.var.index == self.stable_gene].sum(),
        fixed_adata.raw.X[:, fixed_adata.var.index == self.stable_gene].sum()
    )
    self.assertAlmostEqual(
        merged_adata.X[:, merged_adata.var.index == self.stable_gene].sum(),
        fixed_adata.X[:, fixed_adata.var.index == self.stable_gene].sum()
    )
    self.assertAlmostEqual(
        merged_adata.layers["scale.data"][:, merged_adata.var.index == self.stable_gene].sum(),
        fixed_adata.layers["scale.data"][:, fixed_adata.var.index == self.stable_gene].sum()
    )
def get_train_dataloaders(train, val, batch_size=64, label_col='cell_type'):
    """
    Get train and validation dataloaders.

    Arguments
    ---------
    train: str or AnnData
        - AnnData object or filepath of saved AnnData with .h5ad ext to be used for training
    val: str or AnnData
        - AnnData object or filepath of saved AnnData with .h5ad ext to be used for validation
    batch_size: int
        - batch size for dataloaders
    label_col: str
        - name of the .obs column containing the labels used by PollockDataset
    """
    train_adata = sc.read_h5ad(train) if isinstance(train, str) else train
    val_adata = sc.read_h5ad(val) if isinstance(val, str) else val

    train_adata = normalize(train_adata)
    val_adata = normalize(val_adata, var_order=train_adata.var.index.to_list())

    train_ds = PollockDataset(train_adata, label_col=label_col)
    val_ds = PollockDataset(val_adata, label_col=label_col)

    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

    return train_dl, val_dl
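# Hedged usage sketch for get_train_dataloaders: the file names below are
# placeholders, and pulling a batch assumes PollockDataset yields one item
# per cell, which is not shown in this excerpt.
train_dl, val_dl = get_train_dataloaders("train.h5ad", "val.h5ad",
                                         batch_size=128, label_col="cell_type")
first_batch = next(iter(train_dl))  # pull one batch to sanity-check shapes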
def test_fixup_gene_symbols_sctransform(self):
    if not os.path.isfile(self.sctransform_path):
        return unittest.skip(
            "Skipping gene symbol conversion tests because test h5ads are not present. To create them, "
            "run local_server/test/fixtures/schema_test_data/generate_test_data.sh"
        )

    original_adata = sc.read_h5ad(self.sctransform_path)
    merged_adata = sc.read_h5ad(self.sctransform_merged_path)
    fixup_config = {"X": "log1p", "counts": "raw"}
    fixed_adata = remix.fixup_gene_symbols(original_adata, fixup_config)

    # sctransform does a bunch of stuff, including slightly modifying the
    # raw counts. So we can't assert for exact equality the way we do with
    # the vanilla seurat tutorial. But, the results should still be very
    # close.
    merged_raw_stable = merged_adata.layers["counts"][:, merged_adata.var.index == self.stable_gene].sum()
    fixed_raw_stable = fixed_adata.raw.X[:, fixed_adata.var.index == self.stable_gene].sum()
    self.assertLess(abs(merged_raw_stable - fixed_raw_stable), .001 * merged_raw_stable)

    self.assertAlmostEqual(
        merged_adata.X[:, merged_adata.var.index == self.stable_gene].sum(),
        fixed_adata.X[:, fixed_adata.var.index == self.stable_gene].sum(),
        0
    )
def load_file(path):
    """
    Load a single-cell dataset from a file.

    Parameters
    ----------
    path
        path to the stored file

    Returns
    -------
    AnnData
    """
    if os.path.exists(DATA_PATH + path + '.h5ad'):
        adata = sc.read_h5ad(DATA_PATH + path + '.h5ad')
    elif os.path.isdir(path):  # mtx format
        adata = read_mtx(path)
    elif os.path.isfile(path):
        if path.endswith(('.csv', '.csv.gz')):
            adata = sc.read_csv(path).T
        elif path.endswith(('.txt', '.txt.gz', '.tsv', '.tsv.gz')):
            df = pd.read_csv(path, sep='\t', index_col=0).T
            adata = AnnData(df.values,
                            dict(obs_names=df.index.values),
                            dict(var_names=df.columns.values))
        elif path.endswith('.h5ad'):
            adata = sc.read_h5ad(path)
    else:
        raise ValueError("File {} does not exist".format(path))

    if not issparse(adata.X):
        adata.X = scipy.sparse.csr_matrix(adata.X)
    adata.var_names_make_unique()
    return adata
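# Minimal example of calling load_file, assuming DATA_PATH is set at module
# level; "pbmc" is a hypothetical dataset name resolved to
# DATA_PATH + "pbmc" + ".h5ad" by the first branch above.
adata = load_file("pbmc")
print(adata.shape)     # (n_obs, n_vars)
print(adata.X.format)  # sparse matrix format, e.g. "csr"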
def load_PB_datasets(path='data'):
    # Import datasets
    droplet = scanpy.read_h5ad('/'.join([path, 'droplet_pseudobulk.h5ad']))
    facs = scanpy.read_h5ad('/'.join([path, 'facs_pseudobulk.h5ad']))
    mca = scanpy.read_h5ad('/'.join([path, 'mca_pseudobulk.h5ad']))
    return droplet, facs, mca
def test_read_10x(tmp_path, mtx_path, h5_path, prefix):
    if prefix is not None:
        # Build files named "prefix_XXX.xxx" in a temporary directory.
        mtx_path_orig = mtx_path
        mtx_path = tmp_path / "filtered_gene_bc_matrices_prefix"
        mtx_path.mkdir()
        for item in mtx_path_orig.iterdir():
            if item.is_file():
                shutil.copyfile(item, mtx_path / f"{prefix}{item.name}")

    mtx = sc.read_10x_mtx(mtx_path, var_names="gene_symbols", prefix=prefix)
    h5 = sc.read_10x_h5(h5_path)

    # Drop genome column for comparing v3
    if "3.0.0" in str(h5_path):
        h5.var.drop(columns="genome", inplace=True)

    # Check equivalence
    assert_anndata_equal(mtx, h5)

    # Test that it can be written:
    from_mtx_pth = tmp_path / "from_mtx.h5ad"
    from_h5_pth = tmp_path / "from_h5.h5ad"

    mtx.write(from_mtx_pth)
    h5.write(from_h5_pth)

    assert_anndata_equal(sc.read_h5ad(from_mtx_pth), sc.read_h5ad(from_h5_pth))
def read_species(human=True):
    """
    Returns either the human or mouse anndata object.

    human = True if you want the human object, False will give you the mouse object
    """
    if human:
        return sc.read_h5ad("DataBySpecies/human.anndata.h5ad")
    else:
        return sc.read_h5ad("DataBySpecies/mouse.anndata.h5ad")
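# Example calls, assuming the DataBySpecies/ directory with the two h5ad
# files is present in the working directory.
human_adata = read_species(human=True)
mouse_adata = read_species(human=False)
print(human_adata.n_obs, mouse_adata.n_obs)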
def test_diversity_rarefaction2(create_testfolder):
    f = create_testfolder / "test.h5ad"
    adata = sc.read_h5ad(f)
    ddl.tl.clone_rarefaction(adata, groupby='sample_id', clone_key='clone_id')
    assert 'diversity' in adata.uns
    p = ddl.pl.clone_rarefaction(adata, color='sample_id')
    assert p is not None
    adata = sc.read_h5ad(f)
    p = ddl.pl.clone_rarefaction(adata, color='sample_id')
    assert p is not None
def __init__(self, original, result):
    self.adata = sc.read_h5ad(original, True)
    self.adata_out = sc.read_h5ad(result, True)
    sum_list = self.adata.obsm["X_umap"][:, 0] - self.adata_out.obsm["X_umap_2d"][:, 0]
    diff_sum = 0
    for i in sum_list:
        diff_sum += i
    assert 0.5 > diff_sum > -0.5, "difference too large"
def load_demo_data():
    # load the data
    stream = pkg_resources.resource_stream(__name__, 'data/RNA_demo_github.h5ad')
    rna_data = sc.read_h5ad(stream)

    stream = pkg_resources.resource_stream(__name__, 'data/ACC_demo_github.h5ad')
    acc_data = sc.read_h5ad(stream)

    stream = pkg_resources.resource_stream(__name__, 'data/GLUER_demo_github.h5ad')
    gluer_data = sc.read_h5ad(stream)

    return rna_data, acc_data, gluer_data
def load_data(data_name, data_dir=None, dtype=float, load_metadata=True):
    import scanpy as sc
    import pandas as pd

    metadata = None

    if data_name.lower() == 'mnist':
        if data_dir is None:
            data_dir = "./data/"
        data_path = os.path.join(data_dir, "mnist2500_X.txt")
        X = np.loadtxt(data_path).astype(dtype)
        if load_metadata:
            metadata_path = os.path.join(data_dir, "mnist2500_labels.txt")
            metadata = np.loadtxt(metadata_path).astype(int)
            metadata = pd.DataFrame(metadata, columns=['label'])

    elif data_name.lower() in tabula_muris_tissues:
        if data_dir is None:
            data_dir = "./data/tabula-muris/04_facs_processed_data/FACS/"
        data_file = f"Processed_{data_name.title()}.h5ad"
        data_path = os.path.join(data_dir, data_file)
        X = sc.read_h5ad(data_path)
        metadata = X.obs.copy()
        X = X.obsm['X_pca']

    elif data_name.lower() == "atac":
        if data_dir is None:
            data_dir = "./data/10kPBMC_scATAC/02_processed_data/"
        data_file = "atac_pbmc_10k_nextgem_preprocessed_data.h5ad"
        data_path = os.path.join(data_dir, data_file)
        X = sc.read_h5ad(data_path)
        metadata = X.obs.copy()
        X = X.obsm['lsi']

    if load_metadata:
        return X, metadata
    else:
        return X
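# Usage sketch for load_data: the MNIST branch expects mnist2500_X.txt and
# mnist2500_labels.txt under ./data/, which are assumed to already exist.
X, metadata = load_data("mnist", data_dir="./data/")
print(X.shape, metadata["label"].nunique())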
def get(self, dataset_id):
    args = request.args
    analysis_id = args.get('analysis_id')
    session_id = request.cookies.get('gear_session_id')

    if analysis_id:
        user = geardb.get_user_from_session_id(session_id)
        ana = geardb.Analysis(id=analysis_id, dataset_id=dataset_id,
                              session_id=session_id, user_id=user.id)
        ana.discover_type()
        adata = sc.read_h5ad(ana.dataset_path())
    else:
        ds = geardb.Dataset(id=dataset_id, has_h5ad=1)
        h5_path = ds.get_file_path()

        # Let's not fail if the file isn't there
        if not os.path.exists(h5_path):
            return {
                "success": -1,
                "message": "No h5 file found for this dataset"
            }

        adata = sc.read_h5ad(h5_path)

    columns = adata.obs.columns.tolist()

    if hasattr(adata, 'obsm') and hasattr(adata.obsm, 'X_tsne'):
        columns.append('X_tsne_1')
        columns.append('X_tsne_2')

    if 'replicate' in columns:
        columns.remove('replicate')
    if 'time_point_order' in columns:
        columns.remove('time_point_order')

    # Get a map of all levels for each column
    levels = {}
    for col in columns:
        try:
            levels[col] = adata.obs[col].cat.categories.tolist()
        except AttributeError:
            # If the column is not categorical there are no levels to return
            pass

    return {
        "success": 1,
        "obs_columns": columns,
        "obs_levels": levels
    }
def oligodendroglioma() -> AnnData:
    """The original inferCNV example dataset.

    Derived from :cite:`Tirosh2016`.
    """
    with pkg_resources.path(data, "oligodendroglioma.h5ad") as p:
        return sc.read_h5ad(p)
def result_export(request):
    """Create a new dataset from selected indexes of observations."""
    pid = request.POST.get("pid", None)
    if not pid:
        return HttpResponseBadRequest()

    adata = read_h5ad(
        os.path.join(USER_PROCESS_FOLDER, str(pid), "results.h5ad"))
    hextime = hex(int(time()))[2:]
    output_path = os.path.join(DATASET_FOLDER, f"exported_{pid}_{hextime}.h5ad")

    indexes = np.fromstring(request.POST.get("index"), dtype=int, sep=",")
    adata = adata[indexes, :]
    adata.write(output_path)

    saved_file = DataSet(name=request.POST.get("name", f"export_{pid}"),
                         path=output_path,
                         description=request.POST.get("description", ""),
                         n_obs=adata.n_obs,
                         n_vars=adata.n_vars,
                         attrs=json.dumps(get_anndata_attrs(adata)))
    saved_file.save()
    return JsonResponse({'status': True, 'id': saved_file.id})
def main():
    parser = argparse.ArgumentParser(
        description='Rename a column in an H5AD file')
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input file to be read')
    parser.add_argument('-pre', '--previous_column', type=str, required=True,
                        help='Name of column which needs changing')
    parser.add_argument('-post', '--post_column', type=str, required=True,
                        help='New name for column')
    parser.add_argument('-o', '--output_file', type=str, required=True,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    adata = sc.read_h5ad(args.input_file)
    adata.obs.rename(columns={args.previous_column: args.post_column},
                     inplace=True)
    adata.write(args.output_file)
def load_h5ad_file(self, input_path, batch_size, datasets=[]):
    """
    Load input data from a h5ad file and divide into training and test set
    :param input_path: path to h5ad file
    :param batch_size: batch size to use for training
    :param datasets: a list of datasets to extract from the file
    :return: Dataset object
    """
    raw_input = sc.read_h5ad(input_path)

    # Subset dataset
    if len(datasets) > 0:
        all_ds = collections.Counter(raw_input.obs['ds'])
        for ds in all_ds:
            if ds not in datasets:
                raw_input = raw_input[raw_input.obs['ds'] != ds].copy()

    # Create training dataset
    ratios = [raw_input.obs[ctype] for ctype in raw_input.uns['cell_types']]
    self.x_data = raw_input.X.astype(np.float32)
    self.y_data = np.array(ratios, dtype=np.float32).transpose()

    # Create placeholders
    self.x_data_ph = tf.placeholder(self.x_data.dtype, self.x_data.shape, name="x_data_ph")
    self.y_data_ph = tf.placeholder(self.y_data.dtype, self.y_data.shape, name="y_data_ph")
    self.data = tf.data.Dataset.from_tensor_slices((self.x_data_ph, self.y_data_ph))
    self.data = self.data.shuffle(1000).repeat().batch(batch_size=batch_size)

    # Extract celltype and feature info
    self.labels = raw_input.uns['cell_types']
    self.sig_genes = list(raw_input.var_names)
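# Hedged usage sketch for load_h5ad_file: the owning class (called
# SignatureDataset here) and the file name are assumptions for illustration
# only. After the call, self.data is a shuffled, repeating tf.data pipeline
# fed through the two placeholders.
sig_ds = SignatureDataset()
sig_ds.load_h5ad_file("training_counts.h5ad", batch_size=128,
                      datasets=["data6k", "data8k"])
print(sig_ds.labels, len(sig_ds.sig_genes))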
def _cached_dataset(self):
    if self._ignore_cache:
        return None
    if not self._cache_path.exists():
        return None
    dataset = sc.read_h5ad(self._cache_path)
    return dataset
def test_plot_network(create_testfolder):
    f = create_testfolder / "test.h5ad"
    adata = sc.read_h5ad(f)
    ddl.pl.clone_network(adata, color=['isotype'], show=False, return_fig=False)
def read_10x_data(input_file, format_type='10x_h5', backed=None, transpose=False, sparse=False):
    if format_type == '10x_h5':
        adata = sc.read_10x_h5(input_file)
    elif format_type == '10x_mtx':
        adata = sc.read_10x_mtx(input_file)
    elif format_type == '10x_h5ad':
        adata = sc.read_h5ad(input_file, backed=backed)
    elif format_type == "10x_csv":
        adata = sc.read_csv(input_file)
    elif format_type == "10x_txt":
        adata = sc.read_csv(input_file, delimiter="\t")
    else:
        raise ValueError("`format_type` must be one of '10x_h5', '10x_mtx', "
                         "'10x_h5ad', '10x_csv', or '10x_txt'")

    if transpose:
        adata = adata.transpose()
    if sparse:
        adata.X = csr_matrix(adata.X, dtype='float32')

    adata.var_names_make_unique()
    adata.obs_names_make_unique()
    return adata
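# Example calls to read_10x_data; the input paths are placeholders.
adata_h5 = read_10x_data("filtered_feature_bc_matrix.h5", format_type="10x_h5")
adata_backed = read_10x_data("dataset.h5ad", format_type="10x_h5ad", backed="r")
adata_csv = read_10x_data("counts.csv", format_type="10x_csv", sparse=True)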
def read_adata(
        gex_data,       # filename
        gex_data_type,  # string describing file type
):
    ''' Split this out so that other code can use it. Read GEX data
    '''
    print('reading:', gex_data, 'of type', gex_data_type)
    if gex_data_type == 'h5ad':
        adata = sc.read_h5ad(gex_data)
    elif gex_data_type == '10x_mtx':
        adata = sc.read_10x_mtx(gex_data)
    elif gex_data_type == '10x_h5':
        adata = sc.read_10x_h5(gex_data, gex_only=True)
    elif gex_data_type == 'loom':
        adata = sc.read_loom(gex_data)
    else:
        print('unrecognized gex_data_type:', gex_data_type,
              "should be one of ['h5ad', '10x_mtx', '10x_h5', 'loom']")
        exit()

    if adata.isview:  # this is so weird
        adata = adata.copy()
    return adata
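# Example calls to read_adata; the paths are placeholders for whatever GEX
# input the surrounding pipeline supplies.
adata = read_adata("expression.h5ad", "h5ad")
adata = read_adata("filtered_feature_bc_matrix/", "10x_mtx")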
def main(ds, strategy):
    adata = sc.read_h5ad(ds + '.h5ad')

    if adata.raw:
        raw_adata = ad.AnnData(adata.raw.X, var=adata.raw.var, obs=adata.obs)
        var, to_keep, counts = curate_var(raw_adata.var, strategy)
        counts['dataset'] = ds + '-raw'
        counts['mapping_file'] = strategy
        report(counts)
        raw_adata.var = var
        raw_adata = raw_adata[:, to_keep]
        raw_adata.var.set_index('feature_id', inplace=True)
        adata.raw = raw_adata
        del raw_adata
        gc.collect()

    var, to_keep, counts = curate_var(adata.var, strategy)
    var['feature_is_filtered'] = False  # feature_is_filtered is default False
    counts['dataset'] = ds + '-X'
    counts['mapping_file'] = strategy
    report(counts)
    adata.var = var
    adata = adata[:, to_keep]
    adata.var.set_index('feature_id', inplace=True)

    # write the new object to the file
    adata.write(filename=ds + '.h5ad', compression='gzip')
    del adata
    gc.collect()
def get_all_gene(trainset_dir):
    dataset_list = os.listdir(trainset_dir)
    gene = []
    for dataset in dataset_list:
        if '.txt' in dataset:
            df = pd.read_csv(trainset_dir + dataset, sep='\t')
        elif '.csv' in dataset:
            df = pd.read_csv(trainset_dir + dataset)
        elif '.h5' in dataset and '.h5ad' not in dataset and '_processed' not in dataset:
            df = pd.read_hdf(trainset_dir + dataset)
        elif '.h5ad' in dataset:
            df = sc.read_h5ad(trainset_dir + dataset)
            df = df.to_df()
        else:
            continue
        file_gene = df.columns.tolist()
        for i in file_gene:
            if i == 'cell_label':
                continue
            if i not in gene:
                gene.append(i)

    with open('pre_trained/gene/new_model.txt', 'w') as gene_file:
        gene_ = ''
        for i in gene[:-2]:
            gene_ = gene_ + i + ', '
        gene_ = gene_ + gene[-2]
        gene_ = gene_.lower()
        gene_file.write(gene_)

    gene = gene_.split(', ')
    gene.append('cell_label')
    return gene
def preprocess(args):
    sc.logging.print_versions()

    # # read filtered data from a loom file
    # adata = sc.read_loom(args.loom_filtered)
    adata = sc.read_h5ad(args.anndata)

    # Total-count normalize (library-size correct) to 10,000 reads/cell
    sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)

    # log transform the data.
    sc.pp.log1p(adata)

    # identify highly variable genes.
    sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
    sc.pl.highly_variable_genes(adata)

    # keep only highly variable genes:
    adata = adata[:, adata.var['highly_variable']]

    # regress out total counts per cell and the percentage of mitochondrial genes expressed
    sc.pp.regress_out(adata, ['n_counts', 'percent_mito'], n_jobs=args.threads)

    # scale each gene to unit variance, clip values exceeding SD 10.
    sc.pp.scale(adata, max_value=10)

    adata.write(args.anndata)
def test_workflow(adata_path, upgrade_schema, obs_expected, tmp_path):
    if upgrade_schema:
        adata = sc.read_h5ad(adata_path)
        ir.io.upgrade_schema(adata)
    else:
        adata = ir.io.read_10x_vdj(adata_path)

    adata_obs_expected = pd.read_pickle(obs_expected)

    ir.tl.chain_qc(adata)
    ir.pp.ir_dist(adata)
    ir.tl.define_clonotypes(adata)
    ir.tl.clonotype_network(adata)
    ir.tl.clonal_expansion(adata)
    ir.pl.clonotype_network(adata)

    # test that writing works (i.e. all scirpy fields can be serialized)
    adata.write_h5ad(tmp_path / "adata.h5ad")

    # turn nans into consistent value (nan)
    _normalize_df_types(adata.obs)

    # # Use this code to re-generate the "expected file", if necessary.
    # adata.obs.to_pickle(obs_expected, protocol=4)

    pdt.assert_frame_equal(adata.obs, adata_obs_expected, check_dtype=False, check_categorical=False)
def main(): """Run CLI.""" parser = argparse.ArgumentParser(description=""" Get a list of all celltypes in an anndataframe. """) parser.add_argument('-h5', '--h5_anndata', action='store', dest='h5', required=True, help='H5 AnnData file.') parser.add_argument( '--cell_label', action='store', dest='cell_label', default='cluster', help='Anndata cell type label name in obs slot. (default: %(default)s)' ) options = parser.parse_args() # Load the AnnData file adata = sc.read_h5ad(filename=options.h5) np_array = np.sort(adata.obs[options.cell_label].unique().astype(str)) np.savetxt('cell_labels.csv', np_array, fmt='%s')
def test_apply_bad_schema(self, mock_get_ontology_label):
    mock_get_ontology_label.return_value = "test label"
    remix.apply_schema(self.source_h5ad_path, self.bad_config_path, self.output_h5ad_path)
    new_adata = sc.read_h5ad(self.output_h5ad_path)

    # Should refuse to write the version
    self.assertNotIn("version", new_adata.uns_keys())
def load_data(data, set_obs_names="", set_var_names="",
              make_obs_names_unique=True, make_var_names_unique=True):
    if isfile(data):
        name, extension = splitext(data)
        if extension == ".h5ad":
            adata = sc.read_h5ad(data)
        elif extension == ".loom":
            adata = sc.read_loom(data)
        else:
            raise click.FileError(data, hint="does not have a valid extension [.h5ad | .loom]")
    elif isdir(data):
        if not data.endswith(sep):
            data += sep
        adata = sc.read_10x_mtx(data)
    else:
        raise click.FileError(data, hint="not a valid file or path")

    if not set_obs_names == "":
        if set_obs_names not in adata.obs_keys():
            raise click.UsageError(f"obs {set_obs_names} not found, options are: {adata.obs_keys()}")
        adata.obs_names = adata.obs[set_obs_names]
    if not set_var_names == "":
        if set_var_names not in adata.var_keys():
            raise click.UsageError(f"var {set_var_names} not found, options are: {adata.var_keys()}")
        adata.var_names = adata.var[set_var_names]
    if make_obs_names_unique:
        adata.obs.index = make_index_unique(adata.obs.index)
    if make_var_names_unique:
        adata.var.index = make_index_unique(adata.var.index)
    if not adata._obs.index.is_unique:
        click.echo("Warning: obs index is not unique")
    if not adata._var.index.is_unique:
        click.echo("Warning: var index is not unique")
    return adata
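# Hedged usage sketch for load_data: the paths are placeholders, and the
# keyword arguments refer to the optional parameters in the signature above.
adata = load_data("dataset.h5ad")
adata_10x = load_data("filtered_feature_bc_matrix/", set_var_names="gene_ids")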
def test_overlap1(create_testfolder):
    f = create_testfolder / "test.h5ad"
    adata = sc.read_h5ad(f)
    ddl.tl.clone_overlap(adata, groupby='group3', colorby='group2')
    assert 'clone_overlap' in adata.uns
    assert isinstance(adata.uns['clone_overlap'], pd.DataFrame)
    ddl.pl.clone_overlap(adata, groupby='group3', colorby='group2')
def render_data(request):
    id_ = request.GET.get('id', None)
    if id_ is None:
        return HttpResponseBadRequest()
    worker = get_object_or_404(WorkerRecord, id=int(id_))
    dataset = get_object_or_404(DataSet, name=f"Worker_{id_}")
    dataset.attrs = json.loads(dataset.attrs)

    path = os.path.join(USER_PROCESS_FOLDER, str(worker.id), 'results.h5ad')
    annData = read_h5ad(path)

    return render(
        request, "process/data.html", {
            'worker': worker,
            'vars': annData.var.reset_index().to_html(
                index=False, classes='mb-0 table table-bordered', max_rows=10000),
            'obs': annData.obs.reset_index().to_html(
                index=False, classes='mb-0 table table-bordered', max_rows=10000),
            'dataset': dataset
        })