def load_data(data, set_obs_names="", set_var_names="",
              make_obs_names_unique=True, make_var_names_unique=True):
    """Load an expression matrix into an AnnData object.

    Parameters
    ----------
    data : str
        Path to a .h5ad or .loom file, or to a 10x mtx directory.
    set_obs_names : str
        If non-empty, name of an obs column to use as the cell index.
    set_var_names : str
        If non-empty, name of a var column to use as the gene index.
    make_obs_names_unique, make_var_names_unique : bool
        Deduplicate the respective index after loading.

    Raises
    ------
    click.FileError
        If `data` is not a readable file/directory of a supported format.
    click.UsageError
        If a requested obs/var column does not exist.

    Notes
    -----
    BUG FIX: the option names below were previously referenced as free
    variables (undefined in this scope); they are now keyword parameters
    with no-op-compatible defaults so `load_data(path)` still works.
    """
    if isfile(data):
        name, extension = splitext(data)
        if extension == ".h5ad":
            adata = sc.read_h5ad(data)
        elif extension == ".loom":
            adata = sc.read_loom(data)
        else:
            raise click.FileError(data, hint="does not have a valid extension [.h5ad | .loom]")
    elif isdir(data):
        # scanpy's 10x reader expects a trailing separator on the directory
        if not data.endswith(sep):
            data += sep
        adata = sc.read_10x_mtx(data)
    else:
        raise click.FileError(data, hint="not a valid file or path")

    if set_obs_names:
        if set_obs_names not in adata.obs_keys():
            raise click.UsageError(f"obs {set_obs_names} not found, options are: {adata.obs_keys()}")
        adata.obs_names = adata.obs[set_obs_names]
    if set_var_names:
        if set_var_names not in adata.var_keys():
            raise click.UsageError(f"var {set_var_names} not found, options are: {adata.var_keys()}")
        adata.var_names = adata.var[set_var_names]
    if make_obs_names_unique:
        adata.obs.index = make_index_unique(adata.obs.index)
    if make_var_names_unique:
        adata.var.index = make_index_unique(adata.var.index)
    # use the public accessors rather than AnnData's private _obs/_var
    if not adata.obs.index.is_unique:
        click.echo("Warning: obs index is not unique")
    if not adata.var.index.is_unique:
        click.echo("Warning: var index is not unique")
    return adata
def loom2h5ad(loomFile, h5adFile):
    """Convert a Seurat-derived velocyto loom file to .h5ad.

    Reads spliced counts as X, renames obs/var metadata to scanpy
    conventions, stores raw counts in `.raw` and a 'raw_data' layer, and
    promotes the 'norm_data' layer to X before writing.

    Parameters
    ----------
    loomFile : str
        Input .loom path.
    h5adFile : str
        Output .h5ad path.
    """
    adata = sc.read_loom(loomFile, sparse=True, X_name='spliced',
                         obs_names='cell', var_names='geneID', dtype='float32')
    adata.var.set_index(['Gene'], inplace=True)
    # drop the leading character of each barcode and use '-' separators
    adata.obs['CellID'] = adata.obs['CellID'].apply(
        lambda x: x[1:].replace('_', '-'))
    adata.obs.set_index(['CellID'], inplace=True)
    adata.obs.rename(columns={
        'seurat_clusters': 'louvain',
        'nFeature_RNA': 'n_genes',
        'nCount_RNA': 'n_counts',
        'coor_x': 'x',
        'coor_y': 'y'
    }, inplace=True)
    adata.obsm['X_pca'] = adata.obsm.pop("pca_cell_embeddings")
    adata.obsm['X_umap'] = adata.obsm.pop("umap_cell_embeddings")
    adata.varm['PCs'] = adata.varm.pop("pca_feature_loadings")
    del adata.layers['scale_data']
    # BUG FIX: the rename above already turned 'coor_x'/'coor_y' into
    # 'x'/'y', so selecting the old names here raised a KeyError.
    rawdata = anndata.AnnData(X=adata.X,
                              obs=adata.obs[['x', 'y']],
                              var=adata.var[['Selected']])
    adata.raw = rawdata
    adata.layers['raw_data'] = adata.X
    adata.X = adata.layers['norm_data']
    del adata.layers['norm_data']
    adata.write(h5adFile)
def read_adata(
        gex_data,  # filename
        gex_data_type  # string describing file type
):
    '''Read GEX data into an AnnData object.

    Split this out so that other code can use it.

    gex_data: path to the expression data
    gex_data_type: one of 'h5ad', '10x_mtx', '10x_h5', 'loom'
    '''
    print('reading:', gex_data, 'of type', gex_data_type)
    if gex_data_type == 'h5ad':
        adata = sc.read_h5ad(gex_data)
    elif gex_data_type == '10x_mtx':
        adata = sc.read_10x_mtx(gex_data)
    elif gex_data_type == '10x_h5':
        adata = sc.read_10x_h5(gex_data, gex_only=True)
    elif gex_data_type == 'loom':
        adata = sc.read_loom(gex_data)
    else:
        print('unrecognized gex_data_type:', gex_data_type,
              "should be one of ['h5ad', '10x_mtx', '10x_h5', 'loom']")
        # BUG FIX: bare exit() terminated with status 0, signalling success
        # to the calling shell even though loading failed.
        exit(1)
    # NOTE(review): `isview` is the legacy anndata spelling (`is_view` in
    # current releases) — keep as-is for compatibility with the pinned version.
    if adata.isview:  # this is so weird
        adata = adata.copy()
    return adata
def import_scv_loom(loom):
    """Read a velocyto/scVelo loom file (spliced counts as X) and print a
    short summary of the resulting AnnData object."""
    print('Importing loom file...')
    result = sc.read_loom(loom, sparse=True, X_name='spliced')
    print("Column attributes:", result.obs_keys())
    print("Row attributes:", result.var_keys())
    print('Shape: ', result.shape)
    return result
def read_velocyto_loom(fn, args, **kw):
    """Load a velocyto loom keyed by Ensembl accession, tag each cell with
    the sample id derived from the file name, and return the AnnData.

    The original 'Gene' column is kept as var['gene_symbols'] and the var
    index is renamed 'gene_ids'; barcodes get a '-<sample_id>' suffix
    after scVelo's obs-name cleanup.
    """
    sample_id = os.path.splitext(os.path.basename(fn))[0]
    data = sc.read_loom(fn, var_names='Accession')
    data.var.rename(columns={'Gene': 'gene_symbols'}, inplace=True)
    data.obs['sample_id'] = sample_id
    data.obs['sample_id'] = data.obs['sample_id'].astype('category')
    sv.utils.clean_obs_names(data)
    data.obs_names = [f'{name}-{sample_id}' for name in data.obs_names]
    data.var.index.name = 'gene_ids'
    return data
def _read_obj(input_obj, input_format='anndata', **kwargs):
    """Read an expression object from disk ('anndata' or 'loom') and
    normalize boolean-like columns in .obs/.var via _fix_booleans.

    Raises NotImplementedError for any other input_format.
    """
    readers = {'anndata': sc.read, 'loom': sc.read_loom}
    if input_format not in readers:
        raise NotImplementedError(
            'Unsupported input format: {}'.format(input_format))
    adata = readers[input_format](input_obj, **kwargs)
    adata.var = _fix_booleans(adata.var)
    adata.obs = _fix_booleans(adata.obs)
    return adata
def _load(self):
    """Dispatch to the reader matching self.input_format.

    Returns the loaded object, or None when the format is unrecognized.
    """
    fmt = self.input_format
    if fmt == 'h5ad':
        return sc.read_h5ad(self.input_filename)
    if fmt == 'loom':
        return sc.read_loom(self.input_filename)
    if fmt == '10x':
        return self._load_10x()
    if fmt in ('mtx', 'mex'):
        return sc.read_mtx(self.input_filename)
    if fmt == 'bustools-count':
        return self._load_bustools_count()
    return None
def fitting_gamma(adata, path):
    """Fit per-gene degradation rates (gammas) and estimate RNA velocity.

    Subsets the loom at path["path"] to the cells of `adata`, copies the
    leiden labels over, round-trips through path["loom"], then runs the
    velocyto pipeline (gene filtering, size normalization, PCA, kNN
    imputation, gamma fit, velocity extrapolation).

    Parameters
    ----------
    adata : AnnData
        Clustered dataset; obs index selects cells, obs['leiden'] gives labels.
    path : dict
        Must contain 'path' (input loom) and 'loom' (scratch loom path).

    Returns
    -------
    vcy.VelocytoLoom
        The fitted velocyto object.
    """
    bdata = scp.read_loom(path["path"], sparse=True, X_name='spliced')
    # keep only the cells present in adata, in the same order
    bdata = bdata[adata.obs.index, ]
    bdata.obs['leiden'] = adata.obs['leiden']
    # velocyto reads from a loom file, so write the subset back out
    bdata.write_loom(path['loom'])
    vlm = vcy.VelocytoLoom(path["loom"])
    # gene filtering: spliced then unspliced detection thresholds
    vlm.score_detection_levels(min_cells_express=20, min_expr_counts=50)
    vlm.filter_genes(by_detection_levels=True)
    vlm.score_detection_levels(min_expr_counts=0, min_cells_express=0,
                               min_expr_counts_U=25, min_cells_express_U=20)
    vlm.filter_genes(by_detection_levels=True)
    # print(vlm.S.shape)
    # size normalization to the median initial cell size
    vlm._normalize_S(relative_size=vlm.initial_cell_size,
                     target_size=np.median(vlm.initial_cell_size))
    vlm._normalize_U(relative_size=vlm.initial_Ucell_size,
                     target_size=np.median(vlm.initial_Ucell_size))
    # PCA
    vlm.perform_PCA()
    pcn = vlm.S.shape[0]  # NOTE(review): computed but unused below
    # plt.plot(np.cumsum(vlm.pca.explained_variance_ratio_)[:100])
    # pick the component count where the explained-variance curve flattens
    n_comps = np.where(
        np.diff(np.diff(np.cumsum(vlm.pca.explained_variance_ratio_)) > 0.002)
    )[0][0]
    plt.axvline(n_comps, c="k")
    print("n_comps", n_comps)
    # KNN smoothing; k = 2.5% of vlm.S.shape[1] (cells, in velocyto's
    # genes-by-cells convention — TODO confirm)
    k = int(vlm.S.shape[1] * 0.025)
    print("k", k)
    vlm.knn_imputation(n_pca_dims=n_comps, k=k, balanced=True,
                       b_sight=k * 8, b_maxl=k * 4, n_jobs=16)
    # fit gamma
    vlm.fit_gammas()
    # estimate velocity under a constant-velocity extrapolation
    vlm.predict_U()
    vlm.calculate_velocity()
    vlm.calculate_shift(assumption="constant_velocity")
    vlm.extrapolate_cell_at_t(delta_t=1., )
    print("velocity Done!")
    return vlm
def loadData(SJout, loom_data, gene_counts, changeo_db):
    """Load the inputs for the IGH isotype-switching analysis.

    Parameters
    ----------
    SJout : str
        STAR SJ.out file, parsed for antibody / switch transcripts.
    loom_data : str
        Loom file with gene expression.
    gene_counts : str
        .h5ad file with gene counts.
    changeo_db : str
        Change-O database of heavy-chain assemblies.

    Returns
    -------
    tuple
        (ab_tx, switch_tx, adata, loom_adata, changeo_db_H)
    """
    ab_tx, switch_tx = loadSJoutIGH(SJout)
    ###
    # Assemblies (Changeo)
    changeo_db_H = loadChangeoDbH(changeo_db)
    ###
    # Gene counts
    ###
    print("loading anndata")
    # (removed a dead `adata = 'placeholder'` assignment that was
    # immediately overwritten below)
    loom_adata = sc.read_loom(loom_data)
    adata = sc.read_h5ad(gene_counts)
    return ab_tx, switch_tx, adata, loom_adata, changeo_db_H
def getdata(path):
    """Read a .loom file into an AnnData object.

    Parameters:
    ----------
    path: str
        absolute file path (.loom), with 'var_names'/'obs_names'
        attributes used as the gene and cell indices.

    Returns:
    -------
    adata: scanpy.adata
        scanpy adata object
    """
    return sc.read_loom(path, var_names='var_names', obs_names='obs_names')
def _load_loom(self, markers):
    """Locate and read the dataset's loom file, drop every layer, and
    stash each marker column under ad.uns['marker_<col>'].

    Raises ScelVisException when no loom file exists at the resolved path.
    """
    indir = self.args.indir
    if indir.endswith(".loom"):
        loom_file = indir
    else:
        loom_file = os.path.join(indir, "data.loom")
    if not os.path.isfile(loom_file):
        raise ScelVisException("cannot find loom file at %s" % loom_file)
    logger.info("Reading data from %s", loom_file)
    x_layer = "spliced" if self.args.use_raw else "norm_data"
    ad = sc.read_loom(loom_file, X_name=x_layer)
    # iterate over a snapshot of the keys since we delete while looping
    for layer in list(ad.layers.keys()):
        logger.info("Removing unused layer %s" % layer)
        del ad.layers[layer]
    for col in markers.columns:
        ad.uns["marker_" + col] = markers[col].values
    return ad
def renameV(self, loom_file):
    """\
    Load the velocyto output loom file into self.adataV and interactively
    rewrite its cell-barcode index.

    Parameters
    ----------
    loom_file
        Required. Path to the .loom file to read (genes keyed by the
        'Gene' row attribute).

    Notes
    -----
    Prompts on stdin for a phrase to remove from the barcodes and its
    replacement, then strips every literal 'x' character from the index
    (velocyto appends a trailing 'x' to barcodes; note this also removes
    any interior 'x'). `display` is assumed to be IPython.display.display
    -- TODO confirm.
    """
    self.adataV = sc.read_loom(loom_file, var_names='Gene')
    display(self.adataV.obs)
    remove = input('Phase to remove in cell barcodes: ')
    replace = input(
        'Phrase to replace the removed phrase of cell barcodes: ')
    self.adataV.obs.index = self.adataV.obs.index.str.replace(
        remove, replace)
    self.adataV.obs.index = self.adataV.obs.index.str.replace('x', '')
def initialFiltering(args):
    """Read an unfiltered loom file, apply basic QC cuts, and write both a
    filtered loom (for pySCENIC) and an AnnData file.

    Reads: args.loom_input, args.thr_min_genes, args.thr_min_cells,
    args.thr_n_genes, args.thr_pct_mito.
    Writes: args.loom_filtered (loom), args.anndata (.h5ad).
    """
    sc.logging.print_versions()
    ##################################################
    # read unfiltered data from a loom file
    adata = sc.read_loom(args.loom_input)
    ##################################################
    # basic filtering / stats
    nCountsPerGene = np.sum(adata.X, axis=0)
    nCellsPerGene = np.sum(adata.X > 0, axis=0)
    # Show info
    print("Number of counts (in the dataset units) per gene:",
          nCountsPerGene.min(), " - ", nCountsPerGene.max())
    print("Number of cells in which each gene is detected:",
          nCellsPerGene.min(), " - ", nCellsPerGene.max())
    nCells = adata.X.shape[0]
    # pySCENIC thresholds (reported only; the cuts below use args values)
    minCountsPerGene = 3 * .01 * nCells  # 3 counts in 1% of cells
    print("minCountsPerGene: ", minCountsPerGene)
    minSamples = .01 * nCells  # 1% of cells
    print("minSamples: ", minSamples)
    ####################
    # initial cuts
    sc.pp.filter_cells(adata, min_genes=args.thr_min_genes)
    sc.pp.filter_genes(adata, min_cells=args.thr_min_cells)
    ####################
    # mito and genes/counts cuts
    mito_genes = adata.var_names.str.startswith('MT-')
    # for each cell compute fraction of counts in mito genes vs. all genes
    if (sum(mito_genes) == 0):
        adata.obs['percent_mito'] = 0.0
    else:
        adata.obs['percent_mito'] = np.ravel(
            np.sum(np.asmatrix(adata[:, mito_genes].X), axis=1)) / np.ravel(
                np.sum(adata.X, axis=1))
    # add the total counts per cell as observations-annotation to adata
    adata.obs['n_counts'] = np.ravel(adata.X.sum(axis=1))
    ####################
    # plotting:
    sc.pl.violin(adata, ['n_genes', 'n_counts', 'percent_mito'],
                 jitter=0.4, multi_panel=True)
    sc.pl.scatter(adata, x='n_counts', y='percent_mito')
    sc.pl.scatter(adata, x='n_counts', y='n_genes')
    # apply the configured upper bounds on genes-per-cell and mito fraction
    adata = adata[adata.obs['n_genes'] < args.thr_n_genes, :]
    adata = adata[adata.obs['percent_mito'] < args.thr_pct_mito, :]
    ##################################################
    # output to loom file (loompy expects genes-by-cells, hence transpose):
    row_attrs = {
        "Gene": np.array(adata.var_names),
    }
    col_attrs = {
        "CellID": np.array(adata.obs_names),
        "nGene": np.array(np.sum(adata.X.transpose() > 0, axis=0)).flatten(),
        "nUMI": np.array(np.sum(adata.X.transpose(), axis=0)).flatten(),
    }
    lp.create(args.loom_filtered, adata.X.transpose(), row_attrs, col_attrs)
    adata.write(args.anndata)
adata.var.index = adata.var.index.astype(str) # Check if var index is unique if len(np.unique(adata.var.index)) < len( adata.var.index) and not args.make_var_index_unique: raise Exception( "VSN ERROR: AnnData var index is not unique. This can be fixed by making it unique. To do so update the following param 'makeVarIndexUnique = true' (under params.sc.sc_file_converter) in your config." ) if len(np.unique(adata.var.index)) < len( adata.var.index) and args.make_var_index_unique: adata.var_names_make_unique() print("Making AnnData var index unique...") # Sort var index adata = adata[:, np.sort(adata.var.index)] adata.write_h5ad(filename="{}.h5ad".format(FILE_PATH_OUT_BASENAME)) elif INPUT_FORMAT == 'loom' and OUTPUT_FORMAT == 'h5ad': adata = sc.read_loom(FILE_PATH_IN, sparse=True, validate=False) adata = add_sample_id(adata=adata, args=args) # If is tag_cell_with_sample_id is given, add the sample ID as suffix if args.tag_cell_with_sample_id: adata = tag_cell(adata=adata, tag=args.sample_id, remove_10x_gem_well=args.remove_10x_gem_well) adata.var.index = adata.var.index.astype(str) # Check if var index is unique if len(np.unique(adata.var.index)) < len( adata.var.index) and not args.make_var_index_unique: raise Exception( "VSN ERROR: AnnData var index is not unique. This can be fixed by making it unique. To do so update the following param 'makeVarIndexUnique = true' (under params.sc.sc_file_converter) in your config." ) if len(np.unique(adata.var.index)) < len( adata.var.index) and args.make_var_index_unique:
sc.pp.regress_out(adata, ['orig_ident']) #scale data sc.pp.scale(adata, max_value=10) #calculate PCA sc.tl.pca(adata, svd_solver='arpack', n_comps=50) return adata #define marker genes for plotting genes = [ 'Muc2', 'Aqp8', 'Krt20', 'Isg15', 'Reg3g', 'Chga', 'Top2a', 'Lgr5', 'Smoc2', 'Ascl2', 'Cdk4', 'Rps15a' ] cycling = sc.read_loom( "/Users/fr7/git_repos/single_cell/experiment_4/FINAL/integrated/control/cycling.loom", sparse=True) sc.pp.filter_genes(cycling, min_counts=1) cycling = pre_process(cycling) sc.pl.pca_variance_ratio(cycling, log=True, n_pcs=50, save=".cycling.png") sc.pp.neighbors(cycling, n_pcs=35, n_neighbors=50) sc.tl.umap(cycling) sc.pl.umap(cycling, color=genes, save=".cycling.png") sc.tl.leiden(cycling, resolution=1) sc.pl.umap(cycling, color=['leiden', 'ClusterName'],
# Cross-validation setup for the ccAF cell-cycle classifier.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# Custom classes for classification
import classifiersV3 as cl
# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
# NOTE(review): the backend is selected after pyplot was imported; modern
# matplotlib allows switching, but conventionally use('Agg') comes first.
matplotlib.use('Agg')
from matplotlib.backends.backend_pdf import PdfPages
# Load up loom file
print('\nLoading data...')
ccAF1_scanpy = sc.read_loom('data/cellcycle_int_integrated.loom')
# Parameters for CV
numSamples = 1000
nfolds = 100
# Initialize helper vars/indices for subsetting data (train/test)
nCells = ccAF1_scanpy.shape[0]
allInds = np.arange(0, nCells)
# Precompute testing and training data sets
trainInds = []
truelab = []
testInds = []
mapMe_ACTINN = []
errorACTINN = []
adata.var.index = adata.var.index.astype(str) # Check if var index is unique if len(np.unique(adata.var.index)) < len( adata.var.index) and not args.make_var_index_unique: raise Exception( "VSN ERROR: AnnData var index is not unique. This can be fixed by making it unique. To do so update the following param 'makeVarIndexUnique = true' (under params.sc.sc_file_converter) in your config." ) if len(np.unique(adata.var.index)) < len( adata.var.index) and args.make_var_index_unique: adata.var_names_make_unique() print("Making AnnData var index unique...") # Sort var index adata = adata[:, np.sort(adata.var.index)] adata.write_h5ad(filename="{}.h5ad".format(FILE_PATH_OUT_BASENAME)) elif INPUT_FORMAT == 'loom' and OUTPUT_FORMAT == 'h5ad': adata = sc.read_loom(FILE_PATH_IN, sparse=False) adata = add_sample_id(adata=adata, args=args) # If is tag_cell_with_sample_id is given, add the sample ID as suffix if args.tag_cell_with_sample_id: adata = tag_cell(adata=adata, tag=args.sample_id) adata.var.index = adata.var.index.astype(str) # Check if var index is unique if len(np.unique(adata.var.index)) < len( adata.var.index) and not args.make_var_index_unique: raise Exception( "VSN ERROR: AnnData var index is not unique. This can be fixed by making it unique. To do so update the following param 'makeVarIndexUnique = true' (under params.sc.sc_file_converter) in your config." ) if len(np.unique(adata.var.index)) < len( adata.var.index) and args.make_var_index_unique: adata.var_names_make_unique() print("Making AnnData var index unique...")
# write h5ad file rna_adata.write(results_file) #========================== # use seurat for clustering #========================== ''' seurat.ipynb ''' #========================== #import clusters from seurat #========================== import scanpy as sc seurat = sc.read_loom('seurate_cluster_RNA.loom', sparse=False, obs_names='CellID', var_names='Gene', dtype='float32') seurat.obs['clusters']=seurat.obs['seurat_clusters'] seurat_raw=seurat seurat.var.index=[gene_info['gene_id_to_name'][gene] for gene in seurat.var.index.values] seurat.var_names_make_unique() sc.tl.rank_genes_groups(seurat, 'clusters', method='wilcoxon') sc.pl.rank_genes_groups(seurat, n_genes=25, sharey=False) # plot heatmap of specific genes specific_genes=pd.DataFrame(seurat.uns['rank_genes_groups']['names']).loc[0:20,:].T.values.flatten() X=pd.DataFrame(seurat[:,specific_genes].layers['norm_data']).T X.index=specific_genes X.columns=seurat.obs['clusters'] from scale_plot_feature import plot_heatmap
sc.pp.regress_out(adata, ['orig_ident']) #scale data sc.pp.scale(adata, max_value=10) #calculate PCA sc.tl.pca(adata, svd_solver='arpack', n_comps=100) return adata #define marker genes for plotting genes = [ 'Muc2', 'Aqp8', 'Krt20', 'Isg15', 'Reg3g', 'Chga', 'Top2a', 'Lgr5', 'Smoc2', 'Ascl2', 'Fer1l6' ] control = sc.read_loom( "/Users/fr7/git_repos/single_cell/experiment_4/FINAL/merge/control.loom", sparse=True) #pre-process control = pre_process(control) #plots to select number of components for clustering sc.pl.pca_variance_ratio(control, log=True, n_pcs=100) #compute the neighbourhood graphs sc.pp.neighbors(control, n_pcs=35, n_neighbors=4) #might want to play with n_neighbours #plot UMAPs sc.tl.umap(control) sc.pl.umap(control, color=genes, save=".control.png")
#!/data/mcgaugheyd/conda/envs/scVI/bin/python import sys import scarches as sca import scanpy as sc import pandas as pd args = sys.argv adata = sc.read_loom(args[1]) print(adata.X) n_epochs = int(args[2]) lr = float(args[3]) if args[4] == 'True': use_cuda = True else: use_cuda = False n_hidden = int(args[5]) n_latent = int(args[6]) n_hvg = int(args[7]) adata.obs['batch'] = adata.obs['batch'].astype('category') print(str(n_epochs) + " epochs") print(str(lr) + " learning rate") print(str(n_hidden) + " hidden layers") print(str(n_latent) + " latent dims") print(str(n_hvg) + " HVG")
# Map HGNC alias symbols to Ensembl IDs for gene-name conversion below.
hgncAliasEnsmbl = {}
for i in hgncEnsmbl.loc[~hgncEnsmbl['Alias symbols'].isnull()].index:
    splitUp = hgncEnsmbl.loc[i, 'Alias symbols'].split(', ')
    ensmbl = hgncEnsmbl.loc[i, 'Ensembl ID(supplied by Ensembl)']
    for j in splitUp:
        hgncAliasEnsmbl[j] = ensmbl

# Load up loom file for each glioma dataset
for set1 in [
        'GSE70630', 'GSE89567', 'GSE854465_all', 'GSE131928_10X',
        'GSE131928_Smartseq2', 'Bhaduri', 'GSE139448', 'GSE102130'
]:
    # skip datasets whose results table already exists
    if not exists('results/table1_' + str(set1) + '.csv'):
        print('\nLoading ' + set1 + ' data...')
        set1_scanpy = sc.read_loom('data/forClassification/gliomas/' + set1 + '.loom')
        # keep only genes recognized as current, previous, or alias HGNC symbols
        set1_scanpy = set1_scanpy[:, [
            True if i in hgncEnsmbl.index or i in hgncPrevEnsmbl
            or i in hgncAliasEnsmbl else False for i in set1_scanpy.var_names
        ]]
        # Convert to ensmbl
        convertMe_hgnc = pd.DataFrame(np.nan,
                                      index=set1_scanpy.var_names,
                                      columns=['ensembl'])
        numConverted_hgnc = 0
        missed = []
        for j in set1_scanpy.var_names:
            if j in hgncEnsmbl.index:
                convertMe_hgnc.loc[j, 'ensembl'] = hgncEnsmbl.loc[
                    j, 'Ensembl ID(supplied by Ensembl)']
            elif j in hgncPrevEnsmbl:
# (chunk ends here mid-branch; continuation lies outside this view)
# Entry script: embed a dataset with TNN (triplet neural network).
# argv: 1=loom path, 2=embedding dims
import sys
import tnn
from tnn.tnn import *
import scanpy as sc

args = sys.argv
adata = sc.read_loom(args[1], sparse=True)
dims = float(args[2])
# NOTE(review): `type` here is the Python builtin, so this comparison is
# always False and the normalize/log1p branch never runs — it looks like
# a variable (perhaps an argv entry) was intended. Confirm intent.
if type == 'counts':
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    adata.raw = adata
# pca
sc.tl.pca(adata)
# smaller neighborhood size for the 'onlyWE' inputs
if 'onlyWE' in args[1]:
    k = 10
else:
    k = 50
# semi-supervised
adata_tnn = adata.copy()
tnn_model = TNN(k=k, distance='pn', batch_size=64,
                n_epochs_without_progress=4, epochs=50,
                embedding_dims=dims, approx=True)
#if 'onlyDROP' in args[1]:
# scVI/SCANVI example: train on the 'paper' pancreas loom, test on HCA.
from scvi.models import SCANVI
from scvi.dataset.anndataset import AnnDatasetFromAnnData
from scvi.inference import UnsupervisedTrainer, JointSemiSupervisedTrainer, SemiSupervisedTrainer
import scanpy as sc
import numpy as np
import torch

in_loom_train = "/Users/pgarcia-nietochanzuckerberg.com/projects/cell_type_transfer/pancreas/data/paper.loom"
#in_loom_train = "/Users/pgarcia-nietochanzuckerberg.com/projects/cell_type_transfer/pancreas/data/paper_.loom"
in_loom_test = "/Users/pgarcia-nietochanzuckerberg.com/projects/cell_type_transfer/pancreas/data/hca_cellTypes.loom"

#---------------------------
# Example of pre-processing
adata = sc.read_loom(in_loom_train)
adata_test = sc.read_loom(in_loom_test)
adata.var_names_make_unique()
adata_test.var_names_make_unique()

# PRE-PROCESS
# First find do log1p
# keep genes detected in >=5% of training cells, then log-transform both sets
sc.pp.filter_genes(adata, min_cells=int(0.05 * adata.shape[0]))
sc.pp.log1p(adata)
sc.pp.log1p(adata_test)
# Then find variable genes
sc.pp.highly_variable_genes(adata, n_top_genes=2000, flavor="seurat")
# Label test data so the two sets can be distinguished after merging
adata.obs["scanvi_test"] = False
adata_test.obs["scanvi_test"] = True
def load_genotypes(self, suffix:str, from_loom:bool = False):
    """Load single-cell genotyping matrices from the HDF5 output into a
    loom file with 'quality', 'total_depth', and 'alt_depth' layers.

    Parameters
    ----------
    suffix : str
        Prefix for the output file: <out_folder>/<suffix>_genotype.loom.
    from_loom : bool
        If True, read a previously created loom into self.genotypes
        instead of rebuilding from the raw HDF5, then return.

    Side effects
    ------------
    Sets self.filename and self.genotypes_noAb; writes/extends the loom
    file on disk; finally calls self.harmonize_Abs on the open loom.
    """
    # load genotyping data from hdf5 compressed file
    self.filename = os.path.join(self.path['out_folder'], suffix + '_genotype.loom')
    if from_loom:
        try:
            self.genotypes = sc.read_loom(self.filename)
        except ValueError:
            print('loom file not found, check paths or try to load from raw files')
        return
    with h5py.File(self.path['genotypes_path'], 'r') as f:
        # import hdf5 layers into arrays
        cell_barcodes = copy.deepcopy([c.decode('utf8') for c in f['CELL_BARCODES']])
        variants = copy.deepcopy([v.decode('utf8') for v in f['VARIANTS']])
        # cells with no abs, no genotype
        no_abs = list(set(cell_barcodes) - set(np.array(self.pat_cell_barcodes_ab[0])))
        # GT matrix arrives variants-by-cells; transpose to cells-by-variants
        genotypes = pd.DataFrame(np.transpose(f['GT']), index=cell_barcodes,
                                 columns=variants).sort_index()
        genotypes.index.name = 'cell_barcode'
        # derive a per-cell sample name from the barcode suffix
        sample_name = ['abseq' + c.split('-')[-1] for c in genotypes.index]
        genotypes['sample_name'] = sample_name
        genotypes.set_index([genotypes.index, 'sample_name'], inplace=True)
        self.genotypes_noAb = genotypes.loc[no_abs]
        # may have to create loom file for this too, won't be loaded if starting from loom
        genotypes = genotypes.drop(index=no_abs)
        # missing genotype calls are encoded as 3
        genotypes[genotypes.isnull()] = 3
        #adata = ad.AnnData(np.array(genotypes), dtype='int')
        #adata.obs['cell_barcode'] = genotypes.index
        #adata.var['variant_name'] = genotypes.columns
        #adata.filename = os.path.join(self.path['out_folder'], self.filename)
        # NOTE(review): the column attribute is keyed 'sample_name' but holds
        # genotypes.columns (variant names) — looks swapped; confirm intent.
        loompy.create(self.filename, np.array(genotypes),
                      {'cell_barcode': np.array(genotypes.index)},
                      {'sample_name': np.array(genotypes.columns)})
        del genotypes
        quality = pd.DataFrame(np.transpose(f['GQ']), index=cell_barcodes,
                               columns=variants).sort_index()
        quality.index.name = 'cell_barcode'
        quality = quality.drop(index=no_abs)
        with loompy.connect(self.filename) as ds:
            ds.layers['quality'] = np.array(quality).astype(int)
        #adata.layers['quality'] = np.array(total_depth).astype(int)
        del quality
        total_depth = pd.DataFrame(np.transpose(f['DP']), index=cell_barcodes,
                                   columns=variants).sort_index()
        total_depth.index.name = 'cell_barcode'
        total_depth = total_depth.drop(index=no_abs)
        with loompy.connect(self.filename) as ds:
            ds.layers['total_depth'] = np.array(total_depth).astype(int)
        #adata.layers['total_depth'] = np.array(total_depth).astype(int)
        del total_depth
        alt_depth = pd.DataFrame(np.transpose(f['AD']), index=cell_barcodes,
                                 columns=variants).sort_index()
        alt_depth.index.name = 'cell_barcode'
        alt_depth = alt_depth.drop(index=no_abs)
        with loompy.connect(self.filename) as ds:
            ds.layers['alt_depth'] = np.array(alt_depth).astype(int)
        #adata.layers['alt_depth'] = np.array(alt_depth).astype(int)
        del alt_depth
    #adata.write(os.path.join(self.path['out_folder'], self.filename))
    with loompy.connect(self.filename) as ds:
        self.harmonize_Abs(ds)
    #self.genotypes = adata
    # calculate vaf - nan for division by 0
    #vaf = np.divide(alt_depth, total_depth)
    return
# ---- fragment: building HGNC previous/alias symbol -> Ensembl ID maps;
# the loop over rows with 'Previous symbols' starts before this chunk ----
    splitUp = hgncEnsmbl.loc[i, 'Previous symbols'].split(', ')
    ensmbl = hgncEnsmbl.loc[i, 'Ensembl ID(supplied by Ensembl)']
    for j in splitUp:
        hgncPrevEnsmbl[j] = ensmbl

# alias symbols -> Ensembl IDs
hgncAliasEnsmbl = {}
for i in hgncEnsmbl.loc[~hgncEnsmbl['Alias symbols'].isnull()].index:
    splitUp = hgncEnsmbl.loc[i, 'Alias symbols'].split(', ')
    ensmbl = hgncEnsmbl.loc[i, 'Ensembl ID(supplied by Ensembl)']
    for j in splitUp:
        hgncAliasEnsmbl[j] = ensmbl

## 3. Load up data
# Load up loom file
print('\nLoading WT data...')
adata1 = sc.read_loom('data/WT.loom')

## 4. Convert to Ensembl gene IDs
# Convert from gene symbols to Ensmbl gene IDs: keep only genes recognized
# as current, previous, or alias HGNC symbols
adata1 = adata1[:, [
    True if i in hgncEnsmbl.index or i in hgncPrevEnsmbl
    or i in hgncAliasEnsmbl else False for i in adata1.var_names
]]
convertMe_hgnc = pd.DataFrame(np.nan, index=adata1.var_names, columns=['ensembl'])
numConverted_hgnc = 0
missed = []
for j in adata1.var_names:
    if j in hgncEnsmbl.index:
        convertMe_hgnc.loc[j, 'ensembl'] = hgncEnsmbl.loc[
# (chunk ends here mid-statement; continuation lies outside this view)
def HDS(
        path1=None,
        clusters=None,
        genes=None,
        per=.1,
        # pv=0.025,
        co=.9,
        r_c=0,
        min_genes=200,
        min_cells=3,
        n_genes_by_counts=2500,
        pct_counts_mt=5,
        resolution=1):
    """Run the HDS pipeline on a loom file: cluster (or accept given
    clusters), fit velocyto gammas, compute cluster-wise r-squared, draw
    violin plots, and plot phase portraits for selected genes.

    Parameters
    ----------
    path1 : str
        Path to the input .loom file (spliced counts used as X).
    clusters : array-like or None
        Precomputed leiden labels; if falsy, clustering() is run with the
        QC/resolution parameters below.
    genes : list or None
        Genes to plot; if None, the first five r2 index entries are used.
    per : float
        Forwarded to violin_plotting.
    co, r_c : unused in the active code path (kept for interface
        compatibility with the commented-out steps).
    min_genes, min_cells, n_genes_by_counts, pct_counts_mt, resolution
        QC / clustering parameters forwarded to clustering().

    Returns
    -------
    r2
        Gene-wise r-squared values as returned by r2_feature_wise.
    """
    # path variables; all intermediate outputs live under ./loom_data
    path = {}
    path['path'] = path1
    root = os.getcwd() + '/' + 'loom_data'
    if not os.path.exists(os.getcwd() + '/' + 'loom_data'):
        os.mkdir(root)
    adata = scp.read_loom(path['path'], sparse=False, X_name='spliced')
    if not clusters:
        adata = clustering(adata, min_genes, min_cells, n_genes_by_counts,
                           pct_counts_mt, resolution)
    else:
        adata.obs['leiden'] = clusters
    path["loom"] = root + "/temp.loom"
    path["metadata"] = root + "/metadata.csv"
    path["sigi"] = root + "/significant_restoration_genes_across_all_cells.csv"
    path["sigd"] = root + "/significant_hb_genes_across_all_cells.csv"
    path['r2'] = root + "/r2_all_cells.csv"
    path[
        "velocity_age_0p025"] = root + "/Velocity_0p025_age_specific_gamma.csv"
    # plots save location
    path['save'] = root + "/"
    # fitting gamma
    vlm = fitting_gamma(adata, path)
    # compute r-squared leiden wise
    leiden = np.unique(vlm.ca['leiden'])  # NOTE(review): computed but unused
    r2 = r2_feature_wise(path, vlm, feature='leiden', m="r2")
    # mis = r2_feature_wise(path, vlm, feature='leiden',m="mis")
    violin_plotting(path, r2, per=per)
    # getting_cor(r2,pv=0.025,co=.9)
    # r2_cut(path, r_c, r2)
    # count_genes()
    # sigi = pd.read_csv(path["sigi"], index_col=0)
    # sigd = pd.read_csv(path["sigd"], index_col=0)
    if genes:
        # plot only the requested genes that survived the r2 computation
        genes = np.intersect1d(genes, r2.index)
        if len(genes) > 0:
            plot_genes_r2_phase_portrait_supp_hb(path, vlm, r2, genes,
                                                 figname=True)
        else:
            print("no common gene found!")
    else:
        genes = r2.index[:5]
        plot_genes_r2_phase_portrait_supp_hb(path, vlm, r2, genes,
                                             figname=True)
    """genes=sigi.index[:5]
    plot_genes_r2_phase_portrait_supp_hb(path, vlm, sigi, genes, figname=True)
    genes=sigd.index[:5]
    plot_genes_r2_phase_portrait_supp_hb(path, vlm, sigd, genes, figname=True)"""
    return r2
#------------------------------------ import numpy as np import matplotlib.pyplot as pl from matplotlib.pyplot import plot, savefig import scanpy as sc # fig setting sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3) sc.logging.print_versions() results_file = './testnoOLCs.h5ad' sc.settings.set_figure_params(dpi=300) # loading the dataset adata = sc.read_loom( 'data/DataSetA.neurogenicSubset.seurat.pca10.umap.tsne.annotation.noOLCs.loom' ) # pre-process, diffmap and trajectory inference adata.uns['iroot'] = np.flatnonzero(adata.obs['ClusterName'] == 'qNSCs')[10] sc.pp.neighbors(adata, n_neighbors=20, use_rep='X', method='gauss') sc.tl.diffmap(adata, n_comps=10) sc.tl.dpt(adata, n_branchings=0, n_dcs=10) #adata = read_h5ad('./test.h5ad') adata.obsm['tsne'] = adata.obsm['tsne_cell_embeddings'] fig, ax = pl.subplots(figsize=(12, 12)) axs = sc.pl.tsne(adata, color=['dpt_pseudotime', 'ClusterName'], show=False) pl.show() pl.savefig('./testnoOLCs.pdf') adata.write(results_file)
# Minimal scVelo/scanpy RNA-velocity + tSNE walkthrough.
import scvelo as scv
import scanpy as sc

# read adata object
adata = sc.read_loom('path_to_file')
# Normalize and calculate the moments
scv.pp.normalize_per_cell(adata)
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
# Calculate the velocities
scv.tl.velocity(adata)
# Here you could first filter cells.
# So they dont get included in the velocity graph
scv.tl.velocity_graph(adata)
# Command in scvelo and scanpy both work on the same object
# so we can switch between the two modules
# calculate tsne, with optional arguments
sc.tl.tsne(adata)
scv.tl.tsne(adata, perplexity=50, early_exaggeration=100,
            learning_rate=100, random_state=108, n_jobs=12)
# write tsne to file.
scv.pl.tsne(adata, color='Stimulation', save='path_to_save_figure')
scv.pl.tsne(adata, color='Batch', save='path_to_save_figure')
# calculate the velocity embedding by project velocities on the tsne embedding
#figure_file = open('figures%s.png' % i, 'w') sc.settings.set_figure_params(dpi=80) #adata = sc.read_loom('PC9Combined.loom', var_names = 'gene_symbols', sparse = True, cleanup = False, X_name = 'spliced', obs_names = 'CellID') #day00a = sc.read_text("/home/alex/lab/drugres/DropSeqFiles/D0.1500.dge", var_names = 'gene_symbols', delimiter = "\t") #day01 = sc.read_text("/home/alex/lab/drugres/DropSeqFiles/D1.txt.500.dge", var_names = 'gene_symbols', delimiter = "\t") #day02 = sc.read_text("/home/alex/lab/drugres/DropSeqFiles/D2.txt.500.dge", var_names = 'gene_symbols', delimiter = "\t") #day04 = sc.read_text("/home/alex/lab/drugres/DropSeqFiles/D4.txt.500.dge", var_names = 'gene_symbols', delimiter = "\t") #day09 = sc.read_text("/home/alex/lab/drugres/DropSeqFiles/D9.txt.500.dge", var_names = 'gene_symbols', delimiter = "\t") #day11a = sc.read_text("/home/alex/lab/drugres/DropSeqFiles/D11.txt.500.dge", var_names = 'gene_symbols', delimiter = "\t") day00 = sc.read_loom('Day0.loom', var_names='gene_symbols', sparse=True, cleanup=False, X_name='spliced', obs_names='CellID') day01 = sc.read_loom('Day01.loom', var_names='gene_symbols', sparse=True, cleanup=False, X_name='spliced', obs_names='CellID') day02 = sc.read_loom('Day02.loom', var_names='gene_symbols', sparse=True, cleanup=False, X_name='spliced', obs_names='CellID') day04 = sc.read_loom('Day04.loom',
############################################################## ## Definition of Auxilliary functions ## Subsetting adata by top HVGs def subset_hvg(adata, n_top): sc.pp.normalize_total(adata, target_sum=10000) sc.pp.log1p(adata) sc.pp.highly_variable_genes(adata, n_top_genes=n_top) adata = adata[:, adata.var['highly_variable']] return(adata) ############################################################# ## Reading reference and query datasets annData = sc.read_loom(snakemake.input.reference_loom) target_annData = sc.read_loom(snakemake.input.query_loom) ############################################################# ## ## ## Harmonizing reference and query datasets ## ## ## ############################################################# ## Subsetting annotations to cell type and batches annData.obs = annData.obs[ [snakemake.params.ref_cell_type_column, snakemake.params.ref_batch_column] ]