def load_or_train_scvi_model(model_name=model_name, anndata_path=anndata_path):
    # Try loading model, if it doesn't exist train from scratch
    print('Trying to load or train model...')
    try:
        model = scvi.model.SCVI.load(model_name)
        print('Loaded model:', model_name)
    except:
        ### DEFINE AND TRAIN MODEL
        # these hyperparameters are fine for a small dataset, with a few batches
        # if integration is a problem then you can try increasing the layers to 3
        # and hidden units to 256
        print('Creating and training model:', model_name)

        adata = anndata.read_h5ad(anndata_path)
        print(adata)

        print('Restricting to genes with minimum counts of ', min_gene_counts)
        adata.var['gene_counts'] = np.squeeze(np.asarray(adata.X.sum(0)))
        adata = adata[:, adata.var.gene_counts > min_gene_counts]
        print(adata)

        ## register adata with SCVI, for more information see
        ## https://docs.scvi-tools.org/en/stable/api/reference/scvi.data.setup_anndata.html
        adata.layers["counts"] = adata.X.copy().tocsr()  # convert to CSR format, preserve counts
        scvi.data.setup_anndata(adata, layer="counts", batch_key=batch_key)

        # typically you don't need to go tweak these parameters for training a model
        model = scvi.model.SCVI(adata, n_hidden=256, n_layers=2,
                                gene_likelihood='nb', dispersion='gene-batch')

        # MODEL TRAINING
        # this model will train quickly even without a GPU, 25 epochs is not quite enough to
        # finish training, but this notebook is meant to run quickly just for showing the entire
        # data generation pipeline
        model.train(check_val_every_n_epoch=1,
                    use_gpu=True,
                    max_epochs=125,
                    plan_kwargs={'lr': 1e-3})

        train_test_results = model.history['elbo_train']
        train_test_results['elbo_validation'] = model.history['elbo_validation']

        ### MAKE SURE THE MODEL FINISHED TRAINING FOR BEST RESULTS
        print(train_test_results)
        model.save(model_name, save_anndata=True)
        # save the training results to a csv for inspection if needed
        train_test_results.to_csv(model_name + '+train_test_results.csv')
    return model
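# A minimal usage sketch for load_or_train_scvi_model (not part of the original source).
# The values below are illustrative assumptions: the function reads the module-level names
# `model_name`, `anndata_path`, `min_gene_counts`, and `batch_key`, so they must exist
# before it is called, e.g. in a notebook cell.
model_name = 'scvi_model'        # hypothetical directory the trained model is saved to / loaded from
anndata_path = 'counts.h5ad'     # hypothetical raw-count AnnData file with a batch column in .obs
min_gene_counts = 10             # genes with fewer total counts are dropped before training
batch_key = 'batch'              # .obs column used by scVI for batch correction

model = load_or_train_scvi_model()
latent = model.get_latent_representation()  # cells x n_latent embedding from the trained model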
def populate(self):
    ad = anndata.read_h5ad(
        os.path.join(self.save_path, self.filenames[0])
    )  # obs = cells, var = genes

    # extract GeneExpressionDataset relevant attributes
    # and provide access to annotations from the underlying AnnData object.
    (
        X,
        batch_indices,
        labels,
        gene_names,
        cell_types,
        obs,
        obsm,
        var,
        _,
        uns,
    ) = extract_data_from_anndata(
        ad,
        batch_label=self.batch_label,
        ctype_label=self.ctype_label,
        class_label=self.class_label,
        use_raw=self.use_raw,
    )

    # Dataset API takes a dict as input
    obs = obs.to_dict(orient="list")
    var = var.to_dict(orient="list")

    # add external cell measurements
    Ys = []
    if self.cell_measurements_col_mappings_temp is not None:
        for name, attr_name in self.cell_measurements_col_mappings_temp.items():
            columns = uns[attr_name]
            measurement = CellMeasurement(
                name=name,
                data=obsm[name],
                columns_attr_name=attr_name,
                columns=columns,
            )
            Ys.append(measurement)

    self.populate_from_data(
        X=X,
        Ys=Ys,
        labels=labels,
        batch_indices=batch_indices,
        gene_names=gene_names,
        cell_types=cell_types,
        cell_attributes_dict=obs,
        gene_attributes_dict=var,
    )
    self.filter_cells_by_count()
    del self.cell_measurements_col_mappings_temp
def read_data(filename, seed=1, nsample=3000000, dlevel='cell_ontology_class_reannotated',
              exclude_tissues=['marrow'], return_genes=False, DATA_DIR='../../OnClass_data/'):
    name2co = get_ontology_name(DATA_DIR=DATA_DIR)[1]
    np.random.seed(seed)
    if 'facs' in filename:
        tech = 'facs'
    elif 'droplet' in filename:
        tech = 'droplet'
    else:
        tech = ''
    if not os.path.isfile(filename):
        sys.exit('%s does not exist' % filename)
    x = read_h5ad(filename)
    ncell = np.shape(x.X)[0]
    dataset = x.X
    months = np.array(x.obs['age'].tolist())
    labels = np.array(x.obs[dlevel].tolist())
    tissues = np.array(x.obs['tissue'].tolist())
    ind = []
    for i in range(ncell):
        tis = tissues[i]
        lab = labels[i]
        if tis.lower() in exclude_tissues or lab.lower() not in name2co:
            # print('%s %s' % (tis, lab))
            continue
        ind.append(i)
    ind = np.array(ind)
    dataset = dataset[ind, :]
    months = months[ind]
    labels = labels[ind]
    tissues = tissues[ind]
    annot = [name2co[y.lower()] for y in labels]
    annot = np.array(annot)

    datanames = []
    genes_list = {}
    labels = {}
    datasets = {}
    types = {}
    month_labels = {}
    uniq_age = np.unique(months)
    for m in uniq_age:
        dataname = tech + m
        datanames.append(dataname)
        index = np.array(months == m)
        datasets[dataname] = dataset[index, :]
        genes_list[dataname] = x.var.index
        labels[dataname] = annot[index]
        month_labels[dataname] = np.full(len(annot), len(index))
        types[dataname] = Counter(np.array(annot)[index])
    all_X, all_Y = extract_data(datanames, datasets, labels)
    if return_genes:
        return all_X, all_Y, genes_list
    else:
        return all_X, all_Y
def read_h5ad_st(cnt_pth: List[str],
                 ) -> pd.DataFrame:
    """read spatial data from h5ad

    Parameters:
    ----------
    cnt_pth : List[str]
        paths to spatial data (h5ad) files

    Returns:
    -------
    Pandas DataFrame of a joint matrix. Data points
    from the same file will share the same rowname prefix k;
    in the joint matrix the rowname is given as:
    "k&-[x-coordinate]x[y-coordinate]"
    if coordinates are identified. Otherwise the rownames
    are given as: "k&-original-rowname"

    """

    _cnts = list()

    for k, p in enumerate(cnt_pth):
        _data = ad.read_h5ad(p)
        if "x" in _data.obs.keys():
            new_idx = [str(k) + "&-" + str(x) + "x" + str(y) for
                       x, y in zip(_data.obs["x"].values,
                                   _data.obs['y'].values,
                                   )]
        elif "spatial" in _data.obsm.keys():
            new_idx = [str(k) + "&-" + str(x) + "x" + str(y) for
                       x, y in _data.obsm["spatial"]]
        else:
            new_idx = [str(k) + "&-" + str(x) for
                       x in _data.obs_names]

        new_idx = pd.Index(new_idx)

        _data = pd.DataFrame(
            grab_anndata_counts(_data),
            index=new_idx,
            columns=_data.var_names,
        )

        _cnts.append(_data)

    cnts = pd.concat(_cnts, join="outer")
    del _cnts, _data
    cnts[pd.isna(cnts)] = 0.0
    cnts = cnts.astype(float)

    return cnts
def preprocess(self):
    print("Preprocessing dataset")
    ad = anndata.read_h5ad(self.save_path + self.download_name)  # obs = cells, var = genes
    gene_names = np.array(ad.var.index.values, dtype=str)
    data = ad.X.toarray()
    select = data.sum(axis=1) > 0  # Remove cells that don't express any gene
    data = data[select, :]
    print("Finished preprocessing dataset")
    return data, gene_names
def test_anndata(self):
    adata = read_h5ad(join('data', 'test.h5ad'))
    w = AnnDataWrapper(adata, cell_set_obs=['CellType'],
                       mappings_obsm=['X_umap'], mappings_obsm_names=['UMAP'])

    cells_creator = w.make_cells_file_def_creator('A', 0)
    cells = cells_creator('http://localhost:8000')
    self.assertEqual(cells, {
        'type': 'cells',
        'fileType': 'anndata-cells.zarr',
        'url': 'http://localhost:8000/A/0/anndata.zarr',
        'options': {
            "mappings": {
                'UMAP': {
                    'dims': [0, 1],
                    'key': 'obsm/X_umap'
                }
            }
        }
    })

    cell_sets_creator = w.make_cell_sets_file_def_creator('A', 0)
    cell_sets = cell_sets_creator('http://localhost:8000')
    self.assertEqual(cell_sets, {
        'type': 'cell-sets',
        'fileType': 'anndata-cell-sets.zarr',
        'url': 'http://localhost:8000/A/0/anndata.zarr',
        'options': [{'groupName': 'CellType', 'setName': 'obs/CellType'}]
    })
def plotGene(exFn, gene):
    prRes = anndata.read_h5ad(exFn)
    if gene in prRes.var.index:
        sc.pl.umap(prRes, color=[gene])
    else:
        print(
            "Error! Please check your input gene ID; it must be the same as in your expression file"
        )
        print(
            "Also, the missing gene could be caused by the dispersion-based gene filtering in the prerun program"
        )
def read_adata(adata_pth):
    adata = ad.read_h5ad(adata_pth)

    if hasattr(adata, "obsm") and \
            "spatial" in adata.obsm.keys():
        crd = adata.obsm["spatial"]
    elif "x" in adata.obs.keys() and \
            "y" in adata.obs.keys():
        crd = adata.obs[["x", "y"]].values
    else:
        raise ValueError("No spatial coordinates found")

    return crd
def read_annotations(data):
    if len(data) == 1:
        ext = os.path.splitext(data[0])[-1]
        if ext == '.h5ad':
            res = anndata.read_h5ad(data[0])[:, :0].copy()  # Slice to drop all genes, keeping only cell annotations
            return res
        else:
            return None
    elif len(data) == 3:
        return None
def load_palantir_data(smoothed=False):
    fn = '../../data/external/Palantir/human_cd34_bm_rep1.h5ad'
    an = anndata.read_h5ad(fn)
    genes = an.var_names
    cells = an.obs_names
    if not smoothed:
        counts = singlet.CountsTable(
            data=an.raw.X.todense().T,
            index=genes,
            columns=cells,
        )
    else:
        counts = singlet.CountsTable(
            data=an.obsm['MAGIC_imputed_data'].T,
            index=genes,
            columns=cells,
        )
    ss = singlet.SampleSheet(an.obs)
    ss['tsne_1'] = an.obsm['tsne'][:, 0]
    ss['tsne_2'] = an.obsm['tsne'][:, 1]
    ss['clusters'] = ss['clusters'].astype(str)

    ds = singlet.Dataset(
        counts_table=counts,
        samplesheet=ss,
    )

    ds.samplesheet['Cell Subtype'] = ds.samplesheet['clusters'].replace({
        '0': 'HSC',
        '1': 'HSC',
        '2': 'Ery-precursor',
        '3': 'Mono',
        '4': 'Mono-precursor',
        '5': 'CLP',
        '6': 'Mono',
        '7': 'pDC',
        '8': 'Ery',
        '9': 'Mega',
    })

    return ds
def extract_anndata_elements_from_file(self):
    logging.info(
        f"Reading in AnnData dataset: {path.basename(self.input_filename)}"
    )
    self.anndata = anndata.read_h5ad(self.input_filename, backed="r" if self.backed else None)
    logging.info("Completed reading in AnnData dataset!")

    self.obs = self.transform_dataframe_index_into_column(
        self.anndata.obs, "obs", self.obs_index_column_name)
    self.var = self.transform_dataframe_index_into_column(
        self.anndata.var, "var", self.vars_index_column_name)
def test_read(tmpdir):
    from read_partial_registry import read

    pth = tmpdir / "test.h5ad"
    orig = gen_adata((10, 20))
    orig.write(pth, compression="lzf")

    truth = ad.read_h5ad(pth)
    test = read(pth)

    assert_equal(truth, test)
    assert_equal(orig, test)
def load(tempdir, task, dataset, test=None, method=None, dependency="a related test"):
    """Load a cached h5ad file."""
    data_path = _cache_name(tempdir, task, dataset, test=test, method=method)
    assert os.path.isfile(
        data_path), "Intermediate file missing. Did {} fail?".format(dependency)
    return anndata.read_h5ad(data_path)
def qc_checks(args):
    adata = anndata.read_h5ad(args.filtered_feature_matrix)
    adata.var_names_make_unique()
    qc_by_cell, qc_by_gene = sc.pp.calculate_qc_metrics(adata)

    # current directory is set up by the CWL runner
    qc_path = Path('qc_results.hdf5').absolute()
    print('Saving QC results to', qc_path)
    with pd.HDFStore(qc_path) as store:
        store['qc_by_cell'] = qc_by_cell
        store['qc_by_gene'] = qc_by_gene
def _set_file_parameters(self):
    import anndata

    try:
        adata = anndata.read_h5ad(self._file_name)
        self.n_rows, self.n_cols = adata.shape
        all_el = self.n_rows * self.n_cols
        if sp.issparse(adata.X):
            self.sparsity = (all_el - adata.X.tocsr().count_nonzero()) / all_el
        else:
            self.sparsity = (all_el - np.count_nonzero(adata.X)) / all_el
    except OSError:
        pass
def viral_enrichment_significance(data, clusters_col, save_str, perm_no):
    perm_file = save_str + "_viral_perm.h5ad"
    fdr_file = save_str + "_FDRs.csv"
    if not os.path.exists(perm_file):
        print("The file " + perm_file + " doesn't exist.")
        print("(A) Shuffling viral data...")
        # A. Shuffle the viral UMI counts across cells perm_no times and compute the enrichment per cluster for each permutation
        cell_no = data.shape[0]
        data_perm = data.copy()
        data_perm.obs.reset_index(inplace=True)
        for i in range(0, perm_no):
            rand_indx = random.sample(range(0, cell_no), cell_no)
            data_perm.obs["ViralPerm_" + str(i)] = data_perm.obs.loc[rand_indx, "Viral+"].reset_index(drop=True)
            viral_enrichment_score(data_perm, clusters_col, viral_count_col="ViralPerm_" + str(i), suffix="_" + str(i))
        data_perm.write_h5ad(perm_file)
    else:
        if not os.path.exists(fdr_file):
            print("(A) Reading existing viral-shuffling file: " + perm_file)
            data_perm = an.read_h5ad(perm_file)

    # B. Distribution -> p-val
    if not os.path.exists(fdr_file):
        print("(B) Computing FDR from distribution of enrichment scores...")
        # Collecting enrichment score per permutation
        curr_cols = [clusters_col]
        col_name = "ViralEnrichment"
        curr_cols.extend([col for col in data_perm.obs.columns if re.search(col_name + "_", col)])
        df_for_summary = data_perm.obs[curr_cols]
        enrich_df = pd.DataFrame(index=data_perm.obs[clusters_col].drop_duplicates().sort_values())
        for i in range(0, perm_no):
            x = df_for_summary[["Enrichment_" + str(i), clusters_col]].drop_duplicates()
            enrich_df = enrich_df.merge(x, left_index=True, right_on=clusters_col).set_index(clusters_col)
        pd.DataFrame.transpose(enrich_df).hist(bins=15)

        # comparing real enrichments per cluster to permutations
        real_enrichments = data.obs[[clusters_col, "Enrichment_"]].drop_duplicates().set_index(clusters_col)
        FDR = {}
        for clust in set(data.obs[clusters_col]):
            FDR[clust] = (sum(float(real_enrichments.loc[clust]) <= enrich_df.loc[clust, :]) + 1) / (perm_no + 1)
        FDR_df = pd.DataFrame.from_dict(FDR, orient="index").reset_index().rename(columns={"index": "clusters", 0: "FDR"})
        FDR_df.to_csv(save_str + "_FDRs.csv")
    else:
        print("(B) Reading existing FDR file: " + fdr_file)
        FDR_df = pd.read_csv(fdr_file, header=0, index_col=[0])

    adj_FDR = fdrcorrection(FDR_df["FDR"])
    FDR_df["adj_FDR"] = adj_FDR[1]
    FDR_df.to_csv(save_str + "_FDRs.csv")
    return FDR_df
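# The per-cluster value computed above is an empirical permutation p-value with the standard
# +1 correction: p = (#permutations with enrichment >= observed + 1) / (perm_no + 1), so it can
# never be exactly zero. A small self-contained sketch of the same calculation on toy numbers
# (not from the original data):
import numpy as np

perm_scores = np.array([0.8, 1.1, 0.9, 1.4, 0.7])  # enrichment under 5 hypothetical permutations
observed = 1.2                                      # enrichment in the real (unshuffled) labels
p = (np.sum(observed <= perm_scores) + 1) / (len(perm_scores) + 1)
print(p)                                            # -> (1 + 1) / 6, roughly 0.33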
def load_normalized_data(file_path, data_name='facs', flag_size_factor=True,
                         total_ct_per_cell=1e4, flag_log1p=True):
    """load normalized data

    1. Load filtered data for both FACS and droplet
    2. Size factor normalization to total_ct_per_cell counts per cell (default 1e4)
    3. log(x+1) transform
    4. Combine the data

    Args:
        file_path (str): file path. Should contain both FACS data facs_filtered.h5ad
            and droplet data droplet_filtered.h5ad

    Returns:
        adata_combine (AnnData): Combined data for FACS and droplet
    """
    if data_name == 'facs':
        file_name = 'tabula-muris-senis-facs-official-raw-obj.h5ad'
    elif data_name == 'droplet':
        file_name = 'tabula-muris-senis-droplet-official-raw-obj.h5ad'
    elif data_name == 'facs_old':
        file_name = 'facs_filtered.h5ad'
    elif data_name == 'droplet_old':
        file_name = 'droplet_filtered.h5ad'
    else:
        return None

    # Load filtered data
    adata = read_h5ad(f'{file_path}/{file_name}')

    # Update annotations
    adata.obs['n_genes'] = (adata.X > 0).sum(axis=1)
    adata.obs['n_counts'] = (adata.X).sum(axis=1)
    adata.obs['age_num'] = [int(x.replace('m', '')) for x in adata.obs['age']]

    # Size factor normalization
    if flag_size_factor == True:
        sc.pp.normalize_per_cell(adata, counts_per_cell_after=total_ct_per_cell)

    # log(x+1) transform
    if flag_log1p == True:
        sc.pp.log1p(adata)

    # Filter data
    if 'facs' in data_name:
        ind_select = adata.obs['age'].isin(['3m', '18m', '24m'])
        adata = adata[ind_select, ]

    return adata
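# For reference, the two normalization steps in the function above are roughly equivalent to
# the following manual computation (a sketch assuming a small dense count matrix `X`,
# cells x genes, with the default total_ct_per_cell of 1e4):
import numpy as np

X = np.array([[2., 8.], [1., 3.]])            # toy counts, 2 cells x 2 genes
size_factors = X.sum(axis=1, keepdims=True)   # per-cell total counts
X_norm = X / size_factors * 1e4               # rescale each cell to total_ct_per_cell counts
X_log = np.log1p(X_norm)                      # log(x + 1) transform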
def test_backed_anndata_scvi(save_path):
    adata = scvi.data.synthetic_iid()
    path = os.path.join(save_path, "test_data.h5ad")
    adata.write_h5ad(path)
    adata = anndata.read_h5ad(path, backed="r+")
    setup_anndata(adata, batch_key="batch")

    model = SCVI(adata, n_latent=5)
    model.train(1, train_size=0.5)
    assert model.is_trained is True
    z = model.get_latent_representation()
    assert z.shape == (adata.shape[0], 5)
    model.get_elbo()
def test_to_memory_full(tmp_path, array_type):
    backed_pth = tmp_path / "backed.h5ad"
    mem_adata = gen_adata((15, 10), X_type=array_type)
    mem_adata.raw = gen_adata((15, 12), X_type=array_type)
    mem_adata.write_h5ad(backed_pth, compression="lzf")

    backed_adata = ad.read_h5ad(backed_pth, backed="r")
    assert_equal(mem_adata, backed_adata.to_memory())

    # Test that raw can be removed
    del backed_adata.raw
    del mem_adata.raw
    assert_equal(mem_adata, backed_adata.to_memory())
def convert_pbmc3k(self, **kwargs):
    random_string = "".join(
        random.choice(string.ascii_letters) for _ in range(8))
    data_locator = f"/tmp/test_{random_string}.cxg"
    self.fixtures.append(data_locator)
    source_h5ad = anndata.read_h5ad(
        f"{PROJECT_ROOT}/example-dataset/pbmc3k.h5ad")
    write_cxg(adata=source_h5ad, container=data_locator, title="pbmc3k", **kwargs)
    config = app_config(data_locator)
    return CxgAdaptor(DataLocator(data_locator), config)
def read_h5ad(path):
    try:
        adata = anndata.read_h5ad(path)
    except:
        return "file_error"
    if 'cluster_names' in adata.uns:
        adata.uns['cluster_names'] = bidict(adata.uns['cluster_names'])
        for key in list(adata.uns['cluster_names'].keys()):
            adata.uns['cluster_names'][int(key)] = \
                adata.uns['cluster_names'].pop(key, None)
    name_genes(adata, inplace=True)
    return adata
def test_backed_anndata(save_path):
    adata = scvi.data.synthetic_iid()
    path = os.path.join(save_path, "test_data.h5ad")
    adata.write_h5ad(path)
    adata = anndata.read_h5ad(path, backed="r+")
    adata_manager = generic_setup_adata_manager(adata, batch_key="batch")

    # test get item
    bd = AnnTorchDataset(adata_manager)
    bd[np.arange(adata.n_obs)]

    # sparse
    adata = scvi.data.synthetic_iid()
    adata.X = csr_matrix(adata.X)
    path = os.path.join(save_path, "test_data2.h5ad")
    adata.write_h5ad(path)
    adata = anndata.read_h5ad(path, backed="r+")
    adata_manager = generic_setup_adata_manager(adata, batch_key="batch")

    # test get item
    bd = AnnTorchDataset(adata_manager)
    bd[np.arange(adata.n_obs)]
def annotate_file(filtered_file: Path, unfiltered_file: Path,
                  token: str) -> Tuple[anndata.AnnData, anndata.AnnData]:
    # Get the directory
    data_set_dir = fspath(unfiltered_file.parent.stem)
    # And the tissue type
    tissue_type = get_tissue_type(data_set_dir, token)

    filtered_adata = anndata.read_h5ad(filtered_file)
    unfiltered_adata = anndata.read_h5ad(unfiltered_file)

    filtered_adata.obs['barcode'] = filtered_adata.obs.index
    filtered_adata.obs['dataset'] = data_set_dir
    filtered_adata.obs['organ'] = tissue_type
    filtered_adata.obs['modality'] = 'rna'

    cells = list(filtered_adata.obs.index)
    unfiltered_subset = unfiltered_adata[cells, :].copy()
    unfiltered_subset.obs = filtered_adata.obs
    unfiltered_subset.obsm = filtered_adata.obsm
    print(unfiltered_subset.obsm.keys())

    return unfiltered_subset.copy(), filtered_adata
def get_train_test_data(self, data_file, split):
    file_object = anndata.read_h5ad(data_file)
    matrix = file_object.X
    num_cells_in_training = int(matrix.shape[1] * split)
    transpose = matrix.transpose()
    training_data = transpose[0:num_cells_in_training][:].transpose()
    test_data = transpose[num_cells_in_training:][:].transpose()
    print(f"Training data size : {training_data.shape}. Test data size : {test_data.shape}.")
    return training_data, test_data
def run(args): """UMAP """ # Parse options... options = args # end of parsing commandline options info = options.info warn = options.warn debug = options.debug error = options.error # take options h5ad_fname = options.afile out_fname = options.ofile data_source = options.data_source n_neighbors = options.n_neighbors min_dist = options.min_dist if data_source != "X": if data_source == "lsi": data_source = "lsa" data_source = "X_" + data_source random_state = options.random_state # read h5ad adata = read_h5ad(h5ad_fname) # UMAP m = umap.UMAP(metric="euclidean", init="spectral", random_state=random_state, n_neighbors=n_neighbors, min_dist=min_dist, n_components=2).fit(adata.obsm[data_source]) adata.obsm['X_umap'] = m.embedding_ adata.uns['umap'] = { 'params': { 'metric': 'euclidean', 'init': 'spectral', 'random_state': random_state, 'n_neighbors': n_neighbors, 'min_dist': min_dist } } # write h5a adata.write_h5ad(filename=out_fname) return
def _transform_single_h5ad(
    adata_path,
    output_path,
    chrom_size_path,
    bin_size,
    step_size,
    window_size,
    compression,
):
    """Resize non-overlapping chromosome bin count adata"""
    if (step_size % bin_size != 0) or (window_size % bin_size != 0):
        raise ValueError(
            "step_size and window_size need to be integral multiples of bin_size"
        )
    n = step_size // bin_size
    m = window_size // bin_size

    adata = anndata.read_h5ad(adata_path)
    # somehow, I need to copy this out otherwise it's super slow
    chrom_idx = adata.var["chrom"].values.copy()
    csc_data = adata.X.tocsc()

    chrom_dict = parse_chrom_size(chrom_size_path)
    chrom_data_list = []
    for chrom in chrom_dict.keys():
        chrom_csc_data = csc_data[:, chrom_idx == chrom]
        chunk_generator = (
            ss.csc_matrix(chrom_csc_data[:, i:i + m].sum(axis=1))
            for i in range(0, chrom_csc_data.shape[1], n)
        )
        chrom_data = ss.hstack(list(chunk_generator))
        chrom_data_list.append(chrom_data)
    total_data = ss.hstack(chrom_data_list)

    # TODO add all necessary info in adata.uns
    adata = anndata.AnnData(
        X=total_data,
        obs=adata.obs,
        var=generate_chrom_bin_bed_dataframe(
            chrom_size_path, window_size=window_size, step_size=step_size
        ),
        uns=dict(
            bin_size=window_size, step_size=step_size, chrom_size_path=chrom_size_path
        ),
    )

    adata.write(filename=output_path, compression=compression)
    return output_path
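# The windowing arithmetic above on concrete, purely illustrative numbers: with bin_size=5000,
# step_size=10000 and window_size=50000, n = 2 and m = 10, so each output column sums a sliding
# window of 10 consecutive 5 kb bins, advancing 2 bins (10 kb) at a time. A small sketch of the
# same summation on a dense toy matrix:
import numpy as np

counts = np.arange(12).reshape(1, 12)   # 1 cell x 12 consecutive bins
n, m = 2, 4                             # step of 2 bins, window of 4 bins
windows = np.column_stack([counts[:, i:i + m].sum(axis=1)
                           for i in range(0, counts.shape[1], n)])
print(windows.shape)                    # -> (1, 6) overlapping-window counts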
def run(args): """Cluster """ # Parse options... options = args # end of parsing commandline options info = options.info warn = options.warn debug = options.debug error = options.error # take options h5ad_fname = options.afile out_fname = options.ofile method = options.method data_source = options.data_source n_neighbors = options.n_neighbors if data_source != "X": if data_source == "lsi": data_source = "lsa" data_source = "X_" + data_source random_state = options.random_state dbscan_min_samples = options.min_samples dbscan_min_cluster_size = options.min_cluster_size # read h5ad adata = read_h5ad(h5ad_fname) # clustering if method == "louvain": adata = louvain_clustering(adata, data_source, n_neighbors=n_neighbors) elif method == "leiden": adata = leiden_clustering(adata, data_source, n_neighbors=n_neighbors) elif method == "dbscan": adata = dbscan_clustering( adata, data_source, dbscan_min_samples=dbscan_min_samples, dbscan_min_cluster_size=dbscan_min_cluster_size) elif method == "spectral": adata = spectral_clustering(adata, data_source, n_neighbors=n_neighbors) # write h5a adata.write_h5ad(filename=out_fname) return
def main(input_dir, output_dir):
    output_dir.mkdir(exist_ok=True)
    for h5ad_file in ["secondary_analysis.h5ad", "scvelo_annotated.h5ad"]:
        adata = read_h5ad(input_dir / h5ad_file)
        if "rank_genes_groups" in adata.uns:
            # Handle marker genes by putting top n per cluster in `obs` for `factors` visualization.
            marker_genes = []
            for i in range(NUM_MARKER_GENES_TO_VISUALIZE):
                adata.obs[f"marker_gene_{str(i)}"] = ["" for v in adata.obs.index]
                for cluster in adata.obs["leiden"]:
                    marker_gene = adata.uns["rank_genes_groups"]["names"][i][cluster]
                    adata.obs[f"marker_gene_{str(i)}"][adata.obs["leiden"] == cluster] = marker_gene
                    marker_genes.append(marker_gene)
            adata.var["marker_genes_for_heatmap"] = [
                gene in marker_genes for gene in adata.var.index
            ]
        if "dispersions_norm" in adata.var:
            top_dispersion = adata.var["dispersions_norm"][sorted(
                range(len(adata.var["dispersions_norm"])),
                key=lambda k: adata.var["dispersions_norm"][k],
            )[-len(adata.obs['leiden'].unique()) * NUM_MARKER_GENES_TO_VISUALIZE:][0]]
            adata.var["top_highly_variable"] = (adata.var["dispersions_norm"] > top_dispersion)
        for layer in adata.layers:
            if isinstance(adata.layers[layer], sparse.spmatrix):
                adata.layers[layer] = adata.layers[layer].tocsc()
        # All data from secondary_analysis is scaled at the moment to zero-mean unit-variance
        # https://github.com/hubmapconsortium/salmon-rnaseq/blob/master/bin/analysis/scanpy_entry_point.py#L47
        # We currently cannot visualize this in Vitessce so we replace `X` with the log-normalized raw counts:
        # https://github.com/hubmapconsortium/salmon-rnaseq/commit/9cf1dd4dbe4538b565a0355f56399d3587827eff
        # Ideally, we should be able to manage the `layers` and `X` simultaneously in `zarr` but currently we cannot:
        # https://github.com/theislab/anndata/issues/524
        if (SECONDARY_ANALYSIS == h5ad_file):
            adata.layers['scaled'] = adata.X.copy()
            adata.X = adata.layers['unscaled'].copy()
        zarr_path = output_dir / (Path(h5ad_file).stem + ".zarr")

        # If the matrix is sparse, it's best for performance to
        # use non-sparse formats to keep the portal responsive.
        # In the future, we should be able to use CSC sparse data natively
        # and get equal performance:
        # https://github.com/theislab/anndata/issues/524
        if isinstance(adata.X, sparse.spmatrix):
            adata.X = adata.X.todense()
        adata.write_zarr(zarr_path, [adata.shape[0], VAR_CHUNK_SIZE])
def __init__(self, data_path, num_gene, shared_gene_mask=None, filter_mask=None, normalized=True):
    self.data_path = data_path
    self.num_gene = num_gene
    self.gene_mask = shared_gene_mask
    self.filter_mask = filter_mask
    self.normalized = normalized

    self.anndata = anndata.read_h5ad(data_path)
    self.num_class = len(self.anndata.obs['labels'].unique().tolist())

    if self.filter_mask is None:
        self.filter_mask, _ = sc.pp.filter_genes(self.anndata, min_counts=1, inplace=False)
    else:
        print("use shared filter")
    self.anndata = self.anndata[:, self.filter_mask]

    if self.gene_mask is None:
        self.gene_mask = self.select_gene(data=self.anndata.X, num_gene=self.num_gene)
    else:
        print("use shared gene mask")

    if self.normalized:
        anndata_norm = self.anndata.copy()
        sc.pp.normalize_per_cell(anndata_norm, counts_per_cell_after=1_000_000)
        sc.pp.log1p(anndata_norm)
        anndata_norm.X = anndata_norm.X.toarray()
        anndata_norm.X -= anndata_norm.X.mean(axis=0)
        anndata_norm.X /= anndata_norm.X.std(axis=0)
        if np.isnan(anndata_norm.X).any():
            print("Detected NaN, fixing with nan_to_num")
            anndata_norm.X = np.nan_to_num(anndata_norm.X)
        assert (not np.isnan(anndata_norm.X).any())
        self.anndata_preprocessed = anndata_norm[:, self.gene_mask].copy()
    else:
        self.anndata_preprocessed = self.anndata[:, self.gene_mask].copy()

    self.X = torch.tensor(self.anndata_preprocessed.X)
    self.id_to_batch, self.batch_to_id = self.get_batch_map()
    self.id_to_cell, self.cell_to_id = self.get_cell_map()
    self.cell_label_tensor = torch.tensor(
        [self.cell_to_id[e] for e in self.anndata.obs['labels']])
    self.batch_id_tensor = torch.tensor(
        [self.batch_to_id[e] for e in self.anndata.obs['batch_id']])
def test_load_to_h5ad(self):
    with tempfile.TemporaryDirectory() as tmpdir:
        dname = os.path.join(tmpdir, "dense.h5ad")
        sname = os.path.join(tmpdir, "sparse.h5ad")
        self.workflow.output_dir = tmpdir

        self.workflow.load_data_and_save_h5ad("dense.h5ad")
        data = ad.read_h5ad(dname)
        npt.assert_array_almost_equal_nulp(data.X, self.workflow.data.values)
        os.remove(dname)

        self.workflow.load_data_and_save_h5ad("sparse.h5ad", to_sparse=True)
        data = ad.read_h5ad(sname)
        self.assertTrue(sps.isspmatrix_csr(data.X))
        npt.assert_array_almost_equal_nulp(data.X.A, self.workflow.data.values.A)
        os.remove(sname)