def write_filtered(self, biotypes=['lincRNA', 'antisense'], removeMIR=True,
        subsample=False, n_obs=2000, min_counts=1000, min_genes=300,
        min_cells=0, max_mito=20):
    outdir = os.path.join(self.outdir, "filter")
    mkdir(outdir)
    adatas = self.load_mtx(biotypes=biotypes, removeMIR=removeMIR)
    from scimmunity.filtering import filter_and_merge
    # preprocess, filter, and merge
    adata = filter_and_merge(adatas, self.sample_names, outdir,
        subsample=subsample, n_obs=n_obs, min_counts=min_counts,
        min_genes=min_genes, min_cells=min_cells, max_mito=max_mito)
    adata = self.add_metadata_to_adata(adata)
    # write adata
    adata.write(self.filtered)
    return
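# Usage sketch, assuming an instance `analysis` of the enclosing class:
# the thresholds are per-cell/per-gene QC cuts (keep cells with >=1000
# counts, >=300 genes, and <=20% mitochondrial reads), and the merged,
# filtered AnnData is written to `analysis.filtered`.
#
#   analysis.write_filtered(min_counts=1000, min_genes=300, max_mito=20)
#   adata = sc.read(analysis.filtered)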
def infercnv(self, reduction, ref_groups, annotation_key='Phenotype',
        sample_key='sample_name', gene_order_dir=None, use_name=True,
        write=True, cores=4, mem=32, partition='all', time=24):
    import infercnv.pipeline as cnv
    if gene_order_dir is None:
        gene_order_dir = cnv.GENE_ORDER_DIR
    out = reduction.out.replace('reduction', 'infercnv')
    mkdir(out)
    cnv.run_all_samples(reduction.adata, annotation_key, sample_key,
        ref_groups, self.reference, out=out, gene_order_dir=gene_order_dir,
        use_name=use_name, write=write, cores=cores, mem=mem,
        partition=partition, time=time)
    return
def set_highest_ranked_phenotype(self, df_rank, name='', keepcluster=False):
    cluster2phenotype = {}
    for col in df_rank.columns:
        # if all phenotypes are equally ranked, set phenotype as NA
        if len(df_rank[col].unique()) == 1:
            cluster2phenotype[col] = 'NA'
        else:
            cluster2phenotype[col] = str(df_rank[col].idxmin())
    # make new clustering
    if len(name) > 0:
        name = '_' + name
    if keepcluster:
        key = 'phenotype_' + self.clustering + name
        self.adata.obs[key] = self.adata.obs[self.clustering].apply(
            lambda x: '{}:{}'.format(x, cluster2phenotype[x]))
    else:
        key = 'phenotype' + name
        self.adata.obs[key] = self.adata.obs[self.clustering].apply(
            lambda x: cluster2phenotype[x])
    out = os.path.join(self.out, 'phenotype')
    mkdir(out)
    plot_reps(self.adata, key, save_name=key, outdir=out)
    plot_reps(self.adata, key, save_name=key + '_ondata', outdir=out,
        legend_loc='on data', legend_fontweight='normal',
        legend_fontsize=10)
    return cluster2phenotype
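# Shape of `df_rank`, inferred from the code above: one column per cluster,
# one row per candidate phenotype, lower value = better rank, so idxmin()
# picks the winner and ties map to 'NA'. A hypothetical example:
#
#   import pandas as pd
#   df_rank = pd.DataFrame(
#       {'0': [1, 2, 3], '1': [3, 1, 2], '2': [2, 2, 2]},
#       index=['T cell', 'B cell', 'NK cell'])
#   # cluster '0' -> 'T cell', '1' -> 'B cell', '2' -> 'NA' (all tied)
#   cluster2phenotype = annotation.set_highest_ranked_phenotype(df_rank)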
def __init__(self, outdir, filtered, scaled, out='batchcorrect',
        n_epochs=50, use_batches=True, use_cuda=False, n_latent=30,
        train_size=1.0):
    '''
    Args:
        outdir (str): path to analysis output directory
        filtered (str): path to filtered raw adata input
        scaled (str): path to scaled adata output
        out (str, optional): output subfolder name
        n_epochs (int): number of scVI training epochs
        use_batches (bool): treat samples as batches in the scVI model
        use_cuda (bool): train on GPU
        n_latent (int): dimensionality of the scVI latent space
        train_size (float): fraction of cells used for training
    '''
    self.outdir = outdir
    self.out = os.path.join(outdir, out)
    self.filtered = filtered
    self.scaled = scaled

    # load raw filtered dataset
    self.gene_dataset = self.load_filtered()

    # scvi model variables
    self.n_epochs = n_epochs
    self.use_batches = use_batches
    self.use_cuda = use_cuda
    self.n_latent = n_latent
    self.train_size = train_size
    self.vae = self.get_vae()
    self.trainer = self.get_trainer(self.vae, self.train_size)

    # make output folder
    mkdir(self.out)
    return
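# Minimal usage sketch; the class name `BatchCorrect` is hypothetical
# (only the __init__ signature above is taken from this file). Constructing
# it loads the filtered dataset and builds the scVI VAE and trainer, with
# outputs under <outdir>/batchcorrect/:
#
#   bc = BatchCorrect(outdir='analysis/Whole',
#       filtered='analysis/Whole/filtered.h5ad',
#       scaled='analysis/Whole/scaled.h5ad',
#       n_epochs=50, n_latent=30, use_cuda=False)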
def plot_signature_dict(self, groupby, markersets, mode='heatmap',
        layers=['corrected_regressed', 'normalized_regressed']):
    out = self.out.replace('reduction', 'heatmap')
    mkdir(out)
    plot_phenotype_markerset(self.adata, groupby, markersets, out=out,
        mode=mode, layers=layers)
    return
def run_pca(self, n_comps=50):
    mkdir(os.path.join(self.out, 'pca'))
    sc.settings.figdir = os.path.join(self.out, 'pca')
    # arpack can give zero variance explained, so we use the auto solver
    sc.tl.pca(self.adata, n_comps=n_comps, svd_solver='auto',
        use_highly_variable=False)
    sc.settings.figdir = self.out
    sc.pl.pca_variance_ratio(self.adata, log=False, save=True)
    self.set_n_pcs(min_n_pcs=self.min_n_pcs)
def save_de(adata, label_name, method, layer, comparison, out='./',
        filtered=False, query=True, pval_cutoff=5e-3, log2fc_min=1,
        enrich=True):
    """
    Save rank_genes_groups results as a csv per group, then annotate and
    enrich the up- and down-regulated genes of each group after filtering
    by the given criteria.
    """
    filt = "_filtered" * filtered
    method_name = f"{method}_{layer}_{comparison}" + filt
    key = f"rank_genes_groups_{label_name}_{method_name}"
    # get overview of ranked genes
    df_ranked_genes = get_df_ranked_genes(adata, key)
    outdir = os.path.join(out, label_name, method_name)
    mkdir(outdir)
    df_ranked_genes.to_csv(os.path.join(outdir, f'{method_name}.csv'))
    # get up- and down-regulated genes per group based on criteria
    for i in adata.uns[key]['names'].dtype.names:
        df_up = rank_genes_groups_df(adata, group=i, key=key,
            pval_cutoff=pval_cutoff, log2fc_min=log2fc_min, log2fc_max=None)
        df_down = rank_genes_groups_df(adata, group=i, key=key,
            pval_cutoff=pval_cutoff, log2fc_max=-log2fc_min, log2fc_min=None)
        for df, direction in zip([df_up, df_down], ['up', 'down']):
            df = df.loc[(df['names'] != '') & (~df['names'].isnull()), :]
            name = f"{method}_{layer}_{clean_up_str(i)}_{comparison + filt}"
            path = os.path.join(outdir, f"{name}_{direction}.csv")
            # gene name and description annotation
            if (len(df) > 0) and query:
                df_annotation = annotate_de(df)
                df_annotation.to_csv(path)
            else:
                df.to_csv(path)
            # gene set enrichment
            if (len(df) > 0) and enrich:
                name = f"{clean_up_str(i)}_{direction}"
                gsea(list(df.names), description=name,
                    out=os.path.join(outdir, name))
    return
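# Usage sketch: `save_de` expects ranking results stored under
# adata.uns[f"rank_genes_groups_{label_name}_{method}_{layer}_{comparison}"]
# (plus "_filtered" when filtered=True). A hedged example with a
# hypothetical comparison label '1vall':
#
#   sc.tl.rank_genes_groups(adata, 'louvain', method='wilcoxon',
#       layer='normalized', use_raw=False,
#       key_added='rank_genes_groups_louvain_wilcoxon_normalized_1vall')
#   save_de(adata, 'louvain', 'wilcoxon', 'normalized', '1vall',
#       out='./de', pval_cutoff=5e-3, log2fc_min=1)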
def plot_clustering_metrics(self, reps=['latent_regressed', 'latent', 'pcs']):
    for rep in reps:
        out = os.path.join(self.out, 'clustering', rep)
        mkdir(out)
        plot_variance_ratio(self.adata, self.res_list, X=rep,
            prefix=self.prefix, rep=rep, out=out)
        plot_silhouette_coeff(self.adata, self.res_list, X=rep,
            prefix=self.prefix, rep=rep, out=out)
    return
def save_de_overlap(adata, key, markersets, out='./'):
    # combine gene signatures from all markersets
    combined_dict = {}
    for markerset in markersets:
        signature_dict = get_signature_dict(markerset, adata=adata)
        combined_dict.update(signature_dict)
    df = de_overlap(adata, key, combined_dict)
    mkdir(out)
    # name the output after all markersets, not just the last one
    name = '_'.join(markersets)
    df.to_csv(os.path.join(out, f'de_overlap_{key}_{name}.csv'), index=False)
    return
def corr_rep_gene_dict(adata, markerset, rep, prefix='', dims=[0, 1],
        offset=1, layer=None, out='./', save=True):
    """
    Calculate the correlation coefficient R of each component with each
    gene, then for each component take the average R over each gene
    signature/phenotype in the markerset.

    Args:
        adata (AnnData)
        markerset (str): name of markerset to annotate with
        rep (str): name of dimension reduction representation
        dims (list of int): 0-based indices of components from the reduction
        offset (int): offset to skip unwanted components
            (ex. 1 for diffmap, 0 for others)
        layer (str): adata.layers key to get gene expression from
        out (str): output path
    Returns:
        DataFrame of average gene correlations per component; also saves
        the correlation matrix as csv and heatmap.
    """
    mkdir(out)
    # load markerset
    gene_dict = get_signature_dict(markerset, adata=adata)
    phenotypes = list(gene_dict.keys())
    df = pd.DataFrame(index=phenotypes)
    for i in dims:
        df_list = []
        corr = corr_comp_gene(adata, rep, i, offset=offset, layer=layer)
        rs = []
        for phenotype in phenotypes:
            genes = gene_dict[phenotype]
            subset = corr.loc[genes, :].copy()
            subset['phenotype'] = phenotype
            avg_r = corr.loc[genes, 'R'].mean()
            rs.append(avg_r)
            df_list.append(subset.sort_values('R'))
        df['{}{}'.format(prefix, i + 1)] = rs
        pd.concat(df_list).to_csv(os.path.join(
            out, '{}_{}{}.csv'.format(markerset, prefix, i + 1)))
    fig, ax = plt.subplots(figsize=(6, 6))
    sns.heatmap(df, cmap='coolwarm', center=0, ax=ax)
    plt.tight_layout()
    plt.savefig(os.path.join(
        out, '{}_{}_{}_avg_gene_corr.png'.format(rep, markerset, layer)))
    plt.close()
    if save:
        df.to_csv(os.path.join(
            out, '{}_{}_{}_avg_gene_corr.csv'.format(rep, markerset, layer)))
    return df
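# Usage sketch: for a diffusion map the first component is trivial, so
# offset=1 skips it; dims=[0, 1] then correlates genes with the next two
# components. 'immune_markers' is a hypothetical markerset name.
#
#   df_avg = corr_rep_gene_dict(adata, 'immune_markers', 'diffmap',
#       prefix='DC', dims=[0, 1], offset=1, layer='normalized',
#       out='./annotation/diffmap')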
def call_doublet(self, reduction, sample_key='sample_name', thresh_list=[]):
    from scimmunity.qc import call_doublet_per_sample
    out = reduction.out.replace('reduction', 'doublet')
    mkdir(out)
    call_doublet_per_sample(reduction.adata, sample_key,
        thresh_list=thresh_list, out=out)
    return
def plot_reps_signature_dict(adata, markerset, use_raw=False, layer=None,
        out='./', reps=['umap', 'diffmap'], figsize=(4, 4)):
    mkdir(out)
    signature_dict = get_signature_dict(markerset, adata=adata)
    for celltype, genes in signature_dict.items():
        plot_reps_markers(adata, genes, markerset + '_' + celltype.strip(),
            outdir=out, reps=reps, use_raw=use_raw, layer=layer,
            figsize=figsize)
    return
def plot_reps_signature_dict(self, signature_dict, markerset, use_raw=False,
        dpi=300, layer=None, figsize=(2, 2)):
    old_dpi = rcParams["savefig.dpi"]
    rcParams["savefig.dpi"] = dpi
    out = os.path.join(self.out, 'expression')
    mkdir(out)
    for celltype, genes in signature_dict.items():
        plot_reps_markers(self.adata, genes,
            markerset + '_' + celltype.strip(), outdir=out, reps=self.reps,
            use_raw=use_raw, layer=layer, figsize=figsize)
    rcParams["savefig.dpi"] = old_dpi
    return
def __init__(self, adata, outdir, clustering, out='annotation',
        bulkprofiles=BULKPROFILES, markersets=MARKERSETS,
        pop2markersetchoices=POP2MARKERSETCHOICES,
        pop2markersets=POP2MARKERSETS, pop2bulkprofiles=POP2BULKPROFILES,
        pop2phenotype=POP2PHENOTYPE, reps=None):
    """
    Automatic cluster annotation:
    1) Average cell marker detection rate (fraction of total genes detected)
    2) Correlate cluster centroid with bulk profiles
       (use only genes with coeff. of variation >20% in the bulk dataset?)
    3) Compare DE genes to known marker genes

    Args:
        adata (AnnData): annotated data matrix
        outdir (str): path to analysis output directory
        clustering (str): key name of clustering stored in adata.obs
        out (str): name of subfolder for output (default: annotation)
    """
    self.out = os.path.join(outdir, clustering)
    self.adata = adata
    self.clustering = clustering
    self.bulkprofiles = bulkprofiles
    self.markersets = markersets
    self.pop2markersetchoices = pop2markersetchoices
    self.pop2markersets = pop2markersets
    self.pop2phenotype = pop2phenotype
    self.pop2bulkprofiles = pop2bulkprofiles
    if reps is None:
        self.reps = [
            'pca',
            'umap_pcs', 'umap_latent', 'umap_latent_regressed',
            'tsne_pcs', 'tsne_latent', 'tsne_latent_regressed',
            'diffmap_pcs', 'diffmap_latent', 'diffmap_latent_regressed',
        ]
    else:
        self.reps = reps
    # make output folder
    mkdir(self.out)
    return
def plot_frequency(self, reduction, x, y, xrot=0, yrot=45,
        xorder=None, yorder=None, sort_x=False, sort_y=False,
        explode=[], swap_axes=True, dropna=False, **kwargs):
    import scimmunity.frequency as freq
    subfolder = f"{clean_up_str(x)}_{clean_up_str(y)}"
    out = os.path.join(
        reduction.out.replace('reduction', 'frequency'), subfolder)
    mkdir(out)
    df = reduction.adata.obs.copy()
    if len(explode) > 0:
        for cols in explode:
            if isinstance(cols, str):
                cols = [cols]
            df = tcr.explode_strs(df, cols, ';')
    if sort_x:
        props = df.groupby(x)[y].count().sort_values(ascending=False)
        sort_order = list(props.index)
        print(sort_order)
        if xorder is not None:
            xorder = [v for v in sort_order if v in xorder]
        else:
            xorder = sort_order
    if sort_y:
        props = df.groupby(y)[x].count().sort_values(ascending=False)
        sort_order = props.index
        if yorder is not None:
            yorder = [v for v in sort_order if v in yorder]
        else:
            yorder = sort_order
    freq.plots(reduction.adata, df, out, x=x, y=y, xrot=xrot, yrot=yrot,
        xorder=xorder, yorder=yorder, swap_axes=swap_axes, dropna=dropna,
        **kwargs)
    freq.save_df(x, y, df, 'Frequency', out, dropna=dropna)
    freq.save_df(x, y, df, 'Count', out, dropna=dropna)
    return xorder, yorder
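# Usage sketch with hypothetical obs columns: plot the frequency of each
# phenotype per sample, exploding a ';'-delimited clonotype column first
# so each clonotype is counted separately.
#
#   xorder, yorder = analysis.plot_frequency(reduction,
#       x='sample_name', y='Phenotype', sort_x=True, explode=['clonotype'])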
def set_annotation(self, population, pop2phenotype, markersets=None):
    phenotype = pop2phenotype[population]
    self.adata.obs['Phenotype'] = self.adata.obs[phenotype]
    # reset phenotype colors to avoid slicing error
    if 'Phenotype_colors' in self.adata.uns:
        del self.adata.uns['Phenotype_colors']
    # plot the set phenotype
    out = os.path.join(self.out, 'phenotype')
    mkdir(out)
    plot_reps(self.adata, 'Phenotype', save_name='Phenotype_ondata',
        outdir=out, reps=self.reps, legend_loc='on data',
        legend_fontweight='normal', legend_fontsize=10)
    plot_reps(self.adata, 'Phenotype', save_name='Phenotype',
        outdir=out, reps=self.reps)
    # plot markerset heatmap, matrixplot, and dotplot
    if markersets is None:
        markersets = self.pop2markersets[population]
    for mode in ['heatmap', 'matrixplot', 'dotplot']:
        plot_phenotype_markerset(self.adata, 'Phenotype', markersets,
            out=self.out, mode=mode)
    return
def annotate_comp(self, reduction, rep, prefix='', dims=[0, 1], offset=1,
        layer=None, thresh=0.8, markersets=[],
        gsets=[
            'GO_Biological_Process_2018',
            'KEGG_2019_Human',
            'WikiPathways_2019_Human',
        ]):
    from scvi_analysis.component import corr_rep_gene_dict, corr_rep_gene
    out = os.path.join(reduction.out.replace('reduction', 'annotation'), rep)
    mkdir(out)
    corr_rep_gene(reduction.adata, rep, prefix=prefix, dims=dims,
        offset=offset, layer=layer, thresh=thresh, gsets=gsets, out=out)
    for markerset in markersets:
        corr_rep_gene_dict(reduction.adata, markerset, rep, prefix=prefix,
            dims=dims, offset=offset, layer=layer, out=out)
    return
def __init__(self, outdir, parent_name='Whole', parent_h5ad='corrected.h5ad',
        subset_name='Whole', subset_h5ad='corrected.h5ad',
        subset_cond=dict(), subfolder='reduction',
        pca_s=1.0, min_n_pcs=5, n_neighbors=20, n_jobs=None,
        verify_barcodes=False,
        neighbors_reps=['pcs', 'latent', 'latent_regressed'],
        default_rep='pcs',
        reductions=['pca', 'umap', 'tsne', 'diffmap'],
        res_list=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2],
        regress=False,
        regress_vars={
            'latent': ['percent_mito'],
            'normalized': ['n_counts', 'percent_mito', 'S_score', 'G2M_score'],
            'corrected': ['percent_mito'],
        }):
    """
    Args:
        pca_s (float): sensitivity for knee detection in determining
            the number of PCs
        n_neighbors (int): number of neighbors for constructing the
            neighborhood graph
        n_jobs (int): number of jobs for regressing out variables
        subset_name (str): name of subset
        subset_cond (dict of str to list): dictionary mapping an obs column
            to the list of values to include
            (ex. {'louvain': ['0', '1', '2']})
        neighbors_reps (list): representations used for neighborhood graphs
        default_rep (str): default representation for the neighborhood graph
        reductions (list): list of dimension reductions to perform
        res_list (list): list of clustering resolutions to use
    """
    self.outdir = outdir
    self.parent = os.path.join(self.outdir, parent_name, parent_h5ad)
    self.subset = os.path.join(self.outdir, subset_name, subset_h5ad)
    self.subset_name = subset_name
    self.subset_cond = subset_cond
    self.subfolder = subfolder

    # set output folders
    self.out = os.path.join(self.outdir, subset_name, subfolder)
    self.prefix = subset_name
    # make output folder
    mkdir(self.out)
    sc.settings.figdir = self.out

    if not os.path.isfile(self.subset):
        # create new subset if none exists
        self.adata = self.create_subset()
    else:
        # load existing subset adata
        self.adata = sc.read(self.subset)
        if verify_barcodes:
            # verify that the subset conditions give the same barcodes
            self.verify_barcodes()

    # parameters for dimension reduction
    self.pca_s = pca_s
    self.min_n_pcs = min_n_pcs
    self.n_neighbors = n_neighbors
    self.n_jobs = n_jobs
    self.regress = regress
    self.regress_vars = regress_vars
    self.default_rep = default_rep

    # define representations used for constructing the neighborhood graph
    self.neighbors_reps = neighbors_reps
    self.neighbors_kwargs_list = [{'use_rep': 'X_' + rep}
        for rep in self.neighbors_reps]
    self.neighbors_list = []  # store names of neighborhood graphs
    self.all_reps = reductions + [f'{x}_{y}'
        for x in reductions if x != 'pca' for y in neighbors_reps]

    # clustering resolutions
    self.res_list = res_list
    return
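# Minimal usage sketch; the class name `Reduction` is hypothetical. This
# subsets the whole dataset to louvain clusters 0-2 (the subset_cond form
# comes from the docstring above) and writes outputs under
# <outdir>/Tcell/reduction/:
#
#   red = Reduction(outdir, parent_name='Whole',
#       parent_h5ad='corrected.h5ad', subset_name='Tcell',
#       subset_h5ad='corrected.h5ad',
#       subset_cond={'louvain': ['0', '1', '2']})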
def __init__(self, name, samplesheet, gtf, scdir,
        sample_names=[], sample_name_col='sample_name',
        whole='Whole', gex_col='gex', vdj_col='vdj',
        metadata_cols=[], dpi=300,
        n_epochs=50, use_batches=True, use_cuda=False, n_latent=30,
        train_size=1.0):
    self.samplesheet = pd.read_csv(samplesheet)
    if not sample_names:
        self.sample_names = self.samplesheet[sample_name_col].tolist()
    else:
        inds = self.samplesheet[sample_name_col].isin(sample_names)
        self.samplesheet = self.samplesheet[inds].reset_index(drop=True)
        self.sample_names = self.samplesheet[sample_name_col].tolist()
    self.gtf = gtf
    self.whole = whole
    self.alignments = self.samplesheet[gex_col].tolist()
    self.vdj_alignments = self.samplesheet[vdj_col].tolist()

    # import metadata
    if not metadata_cols:
        metadata_cols = list(self.samplesheet.columns)
        metadata_cols.remove(gex_col)
        metadata_cols.remove(vdj_col)
    self.metadata_cols = metadata_cols

    # scvi arguments
    self.scvi_kwargs = {
        'n_epochs': n_epochs,
        'use_batches': use_batches,
        'use_cuda': use_cuda,
        'n_latent': n_latent,
        'train_size': train_size,
    }

    # set analysis name
    self.name = name

    # set output paths
    self.scdir = scdir
    self.outdir = os.path.join(self.scdir, self.name)
    self.filtered = os.path.join(self.outdir, whole, 'filtered.h5ad')
    self.corrected = os.path.join(self.outdir, whole, 'corrected.h5ad')
    self.pkl = os.path.join(self.outdir, 'scvi.model.pkl')
    self.no_batch_pkl = os.path.join(self.outdir, 'no_batch_scvi.model.pkl')
    print('Analysis saved at ' + self.outdir)
    mkdir(self.outdir)

    # set working directory for cache files
    os.chdir(self.outdir)
    set_plotting_params(dpi=dpi)
    return
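# Usage sketch; the class name `Analysis` is hypothetical, and the csv
# below shows the expected samplesheet layout: one row per sample, with
# the `gex`/`vdj` columns pointing to alignment outputs and every other
# column treated as per-sample metadata unless metadata_cols is given.
#
#   # samplesheet.csv (hypothetical):
#   # sample_name,gex,vdj,tissue
#   # pt1,align/pt1_gex,align/pt1_vdj,tumor
#   # pt2,align/pt2_gex,align/pt2_vdj,normal
#   analysis = Analysis('myrun', 'samplesheet.csv',
#       gtf='genes.gtf', scdir='/path/to/scdir')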