def run_gsea(
        cls_file: str,
        gct_file: str,
        gmt_file: str,
        save_dir: str
):
    """
    Run GSEA on inputs that already exist on disk.

    :param cls_file: path passed to ``gseapy.gsea`` as ``cls``
    :param gct_file: path passed to ``gseapy.gsea`` as ``data``
    :param gmt_file: path passed to ``gseapy.gsea`` as ``gene_sets``
    :param save_dir: existing directory passed to ``gseapy.gsea`` as ``outdir``
    :return: None
    :raises FileNotFoundError: if any of the four paths does not exist
    """
    # Explicit checks instead of ``assert``: assertions are removed when Python
    # runs with -O, which would silently skip the validation.
    for label, path in (
            ("cls_file", cls_file),
            ("gct_file", gct_file),
            ("gmt_file", gmt_file),
            ("save_dir", save_dir),
    ):
        if not os.path.exists(path):
            raise FileNotFoundError(
                "{} does not exist: {}".format(label, path)
            )

    gseapy.gsea(
        data=gct_file,
        gene_sets=gmt_file,
        cls=cls_file,
        outdir=save_dir,
        processes=4,
        verbose=True
    )
def gsea_wrapper(
    counts: Counts,
    cell_labels: Union[List[int], np.ndarray],
    gene_set_gmt: str,
    output_dir: Union[Path, str],
    **kwargs,
):
    """
    Run ``gp.gsea`` on a ``Counts`` object, uncoupled from `CellORM` infrastructure.

    Args:
        counts: object whose ``to_df()`` result is transposed and used as the
            expression table
        cell_labels: class labels, passed to ``gp.gsea`` as ``cls``
        gene_set_gmt: passed to ``gp.gsea`` as ``gene_sets``
        output_dir: converted to ``str`` and passed as ``outdir``
        **kwargs: extra ``gp.gsea`` arguments; they override both
            ``GSEA.DEFAULT_KWARGS`` and the four arguments built here

    Raises:
        SinglePopulationError: when ``gp.gsea`` raises ``IndexError``; the
            first cell label is passed as its argument.
    """
    # Build the expression table: transposed counts with a leading
    # "Description" column and the index named "NAME".
    expression = counts.to_df().T
    expression.insert(0, "Description", "None")
    expression.index = expression.index.rename("NAME")

    # Precedence (lowest to highest): defaults, wrapper-built arguments,
    # caller-supplied kwargs.
    call_args = dict(GSEA.DEFAULT_KWARGS)
    call_args.update(
        data=expression,
        gene_sets=gene_set_gmt,
        cls=cell_labels,
        outdir=str(output_dir),
    )
    call_args.update(kwargs)

    print(call_args["permutation_num"])
    try:
        gp.gsea(**call_args)
    except IndexError:
        raise SinglePopulationError(cell_labels[0])
def main(cls_file: str,
         gct_file: str,
         gene_set: str = 'GO_Biological_Process_2017b',
         permutation_type: str = 'phenotype',
         method: str = "ratio_of_classes",
         output_dir: str = "gsea_report",
         format: str = "pdf",
         permutation_num: int = 1000,
         threads: int = 1) -> None:
    """
    Perform GSEA processing.

    The class vector is parsed from ``cls_file`` with
    ``gp.parser.gsea_cls_parser``. The expression table is read from
    ``gct_file`` as a delimited table: the first two lines are skipped, the next
    line is the header and the first column is the index.

    :param cls_file: path of the class file to parse
    :param gct_file: path of the expression table
    :param gene_set: passed to ``gp.gsea`` as ``gene_sets``
    :param permutation_type: passed through to ``gp.gsea``
    :param method: passed through to ``gp.gsea``
    :param output_dir: passed to ``gp.gsea`` as ``outdir``
    :param format: passed through to ``gp.gsea``
    :param permutation_num: passed through to ``gp.gsea``
    :param threads: passed to ``gp.gsea`` as ``processes``
    """
    # Only the class vector is needed; the two phenotype names are discarded.
    _, _, class_vector = gp.parser.gsea_cls_parser(cls_file)
    gene_exp = pd.read_table(gct_file, header=0, index_col=0, skiprows=2)

    gsea_options = dict(
        data=gene_exp,
        cls=class_vector,
        permutation_type=permutation_type,
        permutation_num=permutation_num,
        outdir=output_dir,
        gene_sets=gene_set,
        method=method,
        processes=threads,
        format=format,
        graph_num=50,
    )
    gp.gsea(**gsea_options)
def run(data,
        gmt,
        cls,
        permutation_type='phenotype',
        method='signal_to_noise',
        permution_num=1000):
    """Run ``gp.gsea`` after swapping in ``GSEA``'s own ranking metric.

    Before the analysis, ``ranking_metric`` is monkey-patched on the
    ``gseapy.gsea`` submodule (which implementation is chosen depends on the
    installed gseapy version) and on ``gp.algorithm``.

    Args:
        data, gmt, cls: forwarded positionally to ``gp.gsea`` and afterwards
            to the ``GSEA`` constructor.
        permutation_type: forwarded to ``gp.gsea``.
        method: forwarded to ``gp.gsea``.
        permution_num: forwarded to ``gp.gsea`` as ``permutation_num``
            (the spelling of this parameter is part of the signature).

    Returns:
        ``GSEA(res.res2d, data, gmt, cls)`` built from the gseapy result.
    """
    prefix = gp.__name__ + "."
    # Walk gseapy's submodules to locate "gseapy.gsea" and patch it in place.
    for importer, modname, ispkg in pkgutil.iter_modules(
            gp.__path__, prefix):
        if modname == "gseapy.gsea":
            module = __import__(modname, fromlist="dummy")
            vs = gp.__version__.split(".")
            # gseapy versions below 0.9 get `_ranking_metric`, all others
            # get `_ranking_metric2`.
            if int(vs[0]) == 0 and int(vs[1]) < 9:
                module.ranking_metric = GSEA._ranking_metric
            else:
                module.ranking_metric = GSEA._ranking_metric2
            # NOTE(review): the original indentation of this statement was
            # lost; it is assumed to run whenever the module is found --
            # confirm against the upstream file.
            gp.algorithm.ranking_metric = GSEA._ranking_metric
    # Output goes to an "images" directory next to this source file.
    res = gp.gsea(data,
                  gmt,
                  cls,
                  permutation_type=permutation_type,
                  permutation_num=permution_num,
                  outdir=os.path.join(
                      os.path.dirname(os.path.realpath(__file__)), 'images'),
                  method=method)
    return GSEA(res.res2d, data, gmt, cls)
def run_gsea(gene_exp: str, gene_set: str, phenotype_class: str, permutations: int = 500, output_dir: str = GSEA):
    """Run GSEA on a given dataset with a given gene set.

    The call always uses phenotype permutation, a maximum gene set size of
    3000, four processes, PNG as the figure format and ``no_plot=True``.

    :param gene_exp: file with gene expression data
    :param gene_set: gmt files containing pathway gene sets
    :param phenotype_class: cls file containing information on class labels
    :param permutations: number of permutations
    :param output_dir: output directory
    :return: the object returned by ``gseapy.gsea``
    """
    fixed_settings = {
        'max_size': 3000,
        'permutation_type': 'phenotype',
        'no_plot': True,  # plotting is skipped
        'processes': 4,
        'format': 'png',
    }
    result = gseapy.gsea(
        data=gene_exp,
        gene_sets=gene_set,
        cls=phenotype_class,
        permutation_num=permutations,
        outdir=output_dir,
        **fixed_settings
    )
    return result
def calculate_genesets():
    """Run ``gp.gsea`` on the module-level inputs and return its result.

    Reads these names from the enclosing scope: ``gene_exp`` (data),
    ``gene_set`` (gene sets), ``grouping`` (class labels), ``permtype``
    (permutation type) and ``statmethod`` (ranking method).

    The run uses 100 permutations, a fixed seed of 7 and four processes.
    ``outdir`` is ``None`` and ``no_plot`` is ``True``.
    """
    options = {
        'data': gene_exp,
        'gene_sets': gene_set,
        'cls': grouping,
        'permutation_type': permtype,
        'permutation_num': 100,
        'outdir': None,
        'no_plot': True,
        'method': statmethod,
        'processes': 4,
        'seed': 7,
        'format': 'png',
    }
    return gp.gsea(**options)
def run(data,
        gmt,
        cls,
        permutation_type='phenotype',
        method='signal_to_noise',
        permution_num=1000):
    """Run ``gp.gsea`` after swapping in ``GSEA``'s own ranking metric.

    Before the analysis, ``ranking_metric`` is monkey-patched on the
    ``gseapy.gsea`` submodule (which implementation is chosen depends on the
    installed gseapy version) and on ``gp.algorithm``.

    Args:
        data, gmt, cls: forwarded positionally to ``gp.gsea`` and afterwards
            to the ``GSEA`` constructor.
        permutation_type: forwarded to ``gp.gsea``.
        method: forwarded to ``gp.gsea``.
        permution_num: forwarded to ``gp.gsea`` as ``permutation_num``
            (the spelling of this parameter is part of the signature).

    Returns:
        ``GSEA(res.res2d, data, gmt, cls)`` built from the gseapy result.
    """
    prefix = gp.__name__ + "."
    # Walk gseapy's submodules to locate "gseapy.gsea" and patch it in place.
    for importer, modname, ispkg in pkgutil.iter_modules(gp.__path__, prefix):
        if modname == "gseapy.gsea":
            module = __import__(modname, fromlist="dummy")
            vs = gp.__version__.split(".")
            # gseapy versions below 0.9 get `_ranking_metric`, all others
            # get `_ranking_metric2`.
            if int(vs[0]) == 0 and int(vs[1]) < 9:
                module.ranking_metric = GSEA._ranking_metric
            else:
                module.ranking_metric = GSEA._ranking_metric2
            # NOTE(review): the original indentation of this statement was
            # lost; it is assumed to run whenever the module is found --
            # confirm against the upstream file.
            gp.algorithm.ranking_metric = GSEA._ranking_metric
    # Output goes to an "images" directory next to this source file.
    res = gp.gsea(data,
                  gmt,
                  cls,
                  permutation_type=permutation_type,
                  permutation_num=permution_num,
                  outdir=os.path.join(os.path.dirname(os.path.realpath(__file__)), 'images'),
                  method=method)
    return GSEA(res.res2d, data, gmt, cls)
def run_gsea(self,
             ordered_df,
             classes,
             db=None,
             db_name=None,
             processes=4,
             no_plot=True,
             method='signal_to_noise',
             permutation_type='gene_set'):
    """Run ``gseapy.gsea`` and store its output under ``self.gsea_save``.

    :param ordered_df: expression data, passed as the first positional argument
    :param classes: class labels, passed as ``cls``
    :param db: gene set source; falls back to the module-level ``hallmarks_db``
    :param db_name: name of the output sub-directory; defaults to the part of
        ``db`` after its last ``/``
    :param processes: passed through to ``gseapy.gsea``
    :param no_plot: passed through to ``gseapy.gsea``
    :param method: passed through to ``gseapy.gsea``
    :param permutation_type: passed through to ``gseapy.gsea``
    :return: the object returned by ``gseapy.gsea``
    """
    gene_sets = hallmarks_db if db is None else db
    label = gene_sets.split("/")[-1] if db_name is None else db_name
    target_dir = os.path.join(self.gsea_save, label)

    return gseapy.gsea(ordered_df,
                       gene_sets=gene_sets,
                       cls=classes,
                       outdir=target_dir,
                       method=method,
                       no_plot=no_plot,
                       processes=processes,
                       permutation_type=permutation_type)
def perform_gsea(data: Union[str, pd.DataFrame],
                 gmt: str,
                 class_vector: List,
                 output_dir: str,
                 min_size: int,
                 max_size: int,
                 permutation_type: str,
                 permutation_num: int,
                 method: str):
    """Run GSEA on a given dataset and geneset.

    Every argument is forwarded to ``gseapy.gsea``: ``gmt`` as ``gene_sets``,
    ``class_vector`` as ``cls`` and ``output_dir`` as ``outdir``. The call
    always disables plotting and uses a single process.

    :return: the object returned by ``gseapy.gsea``
    """
    call_kwargs = {
        'data': data,
        'gene_sets': gmt,
        'cls': class_vector,
        'min_size': min_size,
        'max_size': max_size,
        'permutation_type': permutation_type,
        'permutation_num': permutation_num,
        'method': method,
        'outdir': output_dir,
        'no_plot': True,
        'processes': 1,
    }
    return gseapy.gsea(**call_kwargs)
def do_gsea(matrix, cls, gmt, out_dir):
    """Run GSEA and write the result table to ``<out_dir>/gsea_result.tsv``.

    :param matrix: tab-separated file; first row is the header, first column
        the index
    :param cls: class labels, passed to ``gp.gsea`` as ``cls``
    :param gmt: gene sets, passed to ``gp.gsea`` as ``gene_sets``
    :param out_dir: gseapy output directory; also receives the result table
    """
    expression = pd.read_csv(matrix, sep='\t', header=0, index_col=0)
    click.echo('Running GSEA')

    settings = dict(
        permutation_type='phenotype',
        permutation_num=100,
        no_plot=True,
        method='signal_to_noise',
        processes=4,
        format='png',
    )
    result = gp.gsea(data=expression, gene_sets=gmt, cls=cls,
                     outdir=out_dir, **settings)

    # Persist the result table as tab-separated text.
    result.res2d.to_csv(os.path.join(out_dir, 'gsea_result.tsv'), sep='\t')
def test():
    """Run GSEA on the bundled P53 example data and save a bar plot of FDR.

    Reads ``GSEApy/data/P53.cls`` and ``GSEApy/data/P53_resampling_data.txt``.
    Runs ``gp.gsea`` against ``KEGG_2016`` with gseapy output in ``output``.
    Writes the plot of the first five result rows to ``figure-gsea.pdf``.
    """
    # Example .cls content: a "50 2 1" header, a "#MUT WT" line, then one
    # MUT/WT label per sample.
    classfile = 'GSEApy/data/P53.cls'
    # Example expression table: a NAME column followed by one column per
    # sample (786-0, BT-549, CCRF-CEM, ...).
    geneexpfile = "GSEApy/data/P53_resampling_data.txt"

    # The class file is still parsed, but its labels are not used below.
    _pheno_a, _pheno_b, _class_vector = gp.parser.gsea_cls_parser(classfile)
    expression = pd.read_table(geneexpfile)

    # Hard-coded labels: 25 control samples followed by 25 treated samples.
    labels = ['Control'] * 25 + ['Drug Treatment'] * 25
    gs_res = gp.gsea(data=expression,
                     gene_sets='KEGG_2016',
                     cls=labels,
                     permutation_type='phenotype',
                     outdir='output',
                     method='signal_to_noise',
                     format='png')

    with plt.style.context('ggplot'):
        table = gs_res.res2d.reset_index()
        table.head(5).plot.barh(y='fdr', x='Term', fontsize=10)
        plt.savefig('figure-gsea.pdf')
def gsea_enrichr(diff, treat, ctrl, log2fc, padj, go):
    """Run Enrichr and GSEA for one ``treat`` vs ``ctrl`` comparison.

    Args:
        diff: path of the Excel workbook with the differential expression
            results. It is read with ``read_excel(sheet_name=None)`` and must
            contain the sheets ``sig-all.log2fc<log2fc>-padj<padj>``,
            ``sig-up`` and ``sig-down``.
        treat: treatment group name; used in output names and as a prefix to
            select ``TPM`` columns.
        ctrl: control group name; used the same way as ``treat``.
        log2fc: only used to build the name of the "sig-all" sheet.
        padj: only used to build the name of the "sig-all" sheet.
        go: iterable of gene set names. The whole collection is passed to
            ``gp.enrichr``; ``gp.gsea`` is run once per entry.

    Returns:
        None. Results are written below ``differential_expression/GO`` and
        failures are logged below ``temp/blacklist.GO``.
    """
    import os
    from pathlib import Path

    out_gsea_name = "%s_vs_%s" % (treat, ctrl)
    gsea_root = "differential_expression/GO/GSEA_%s" % out_gsea_name

    # Comparisons listed in the blacklist have no significant genes: only
    # create empty GSEA report files so downstream steps find their inputs.
    blacklist = []
    if os.path.isfile("temp/blacklist.txt"):
        with open("temp/blacklist.txt") as black:
            blacklist = [bla.strip().split("/")[-1] for bla in black]

    if diff.split("/")[-1] in blacklist:
        print("Skip GSEA and Enrichr Procedure for %s vs %s." % (treat, ctrl))
        for domain in go:
            # The directory name is formatted with the same style as the
            # file name; previously the "%s" placeholders were left
            # unformatted and the real directory was never created.
            domain_dir = "%s/%s" % (gsea_root, domain)
            os.makedirs(domain_dir, exist_ok=True)
            Path("%s/gseapy.gsea.gene_sets.report.csv" % domain_dir).touch()
        return

    # Heavy dependencies are only needed when there is real work to do.
    from pandas import read_excel
    import gseapy as gp

    # Parse the significant results.
    al_res = read_excel(diff, sheet_name=None)
    sig_deg = al_res["sig-all.log2fc%s-padj%s" % (log2fc, padj)]
    degs_sig = [
        deg.gene_name.squeeze()
        for deg in (sig_deg, al_res['sig-up'], al_res['sig-down'])
    ]

    # Directory for the per-comparison failure logs.
    os.makedirs("temp/blacklist.GO", exist_ok=True)

    # Enrichr on the all / up / down gene lists.
    for glist, gl_type in zip(degs_sig, ['all', 'up', 'down']):
        outdir = 'differential_expression/GO/Enrichr_%s/%s' % (out_gsea_name,
                                                                gl_type)
        outfile = "{o}/{t}.enrichr.reports.txt".format(o=outdir, t=gl_type)
        # Skip work whose report already exists.
        if os.path.isfile(outfile):
            continue
        try:
            gp.enrichr(gene_list=glist,
                       gene_sets=go,
                       description=gl_type,
                       cutoff=0.1,
                       outdir=outdir)
        except Exception:
            log1 = "Enrichr Server No response: %s vs %s, %s \n" % (
                treat,
                ctrl,
                gl_type,
            )
            log2 = "the lenght of input gene list = %s \n" % (len(glist))
            print(log1, log2)
            with open(
                    "temp/blacklist.GO/blacklist.enrichr.degs.%s_vs_%s.txt" %
                (treat, ctrl), 'a') as black:
                black.write(log1)
                black.write(log2)

    # Select the TPM columns for GSEA. The "TPM." prefix is removed by
    # slicing: str.lstrip("TPM.") strips a *set of characters* and would
    # also eat leading T/P/M/. characters of the group name itself.
    tpm_cols = [col for col in sig_deg.columns if col.startswith("TPM")]
    tpm_groups = [
        col[len("TPM."):] if col.startswith("TPM.") else col[len("TPM"):]
        for col in tpm_cols
    ]
    treat_cols = [
        col for col, group in zip(tpm_cols, tpm_groups)
        if group.startswith(treat)
    ]
    ctrl_cols = [
        col for col, group in zip(tpm_cols, tpm_groups)
        if group.startswith(ctrl)
    ]
    # Class labels follow the column order: treatment first, then control.
    cls_vec = [treat] * len(treat_cols) + [ctrl] * len(ctrl_cols)
    expression = sig_deg[['gene_name'] + treat_cols + ctrl_cols]

    # GSEA, one run per gene set.
    for domain in go:
        outdir = "%s/%s" % (gsea_root, domain)
        outfile = "%s/gseapy.gsea.gene_sets.report.csv" % outdir
        # Skip work whose report already exists.
        if os.path.isfile(outfile):
            continue
        try:
            gp.gsea(data=expression,
                    gene_sets=domain,
                    cls=cls_vec,
                    min_size=15,
                    max_size=500,
                    outdir=outdir)
        except Exception:
            log1 = "Oops...%s_vs_%s: skip GSEA plotting for %s, please adjust paramters for GSEA input.\n" % (
                treat, ctrl, domain)
            log2 = "the lenght of input degs = %s \n" % expression.shape[0]
            print(log1, log2)
            # Leave an empty report so the failed run is not retried; make
            # sure the directory exists in case gseapy failed before
            # creating it.
            os.makedirs(outdir, exist_ok=True)
            Path(outfile).touch()
            with open(
                    "temp/blacklist.GO/blacklist.gsea.degs.%s_vs_%s.txt" %
                (treat, ctrl), 'a') as black:
                black.write(log1)
                black.write(log2)
    return
def set_c(input_data, group_info, group_samples, session_key):
    """Run GSEA and a Fisher-exact-test "MRA" step for a user gene set.

    All outputs are written to the per-session image directory built from
    ``session_key``. Progress is reported through
    ``current_task.update_state`` under the ``process_percent3`` key.

    Args:
        input_data: list of gene identifiers (joined with tabs into the GMT
            line, so they must be strings).
        group_info: one label per sample group; ``group_info[0]`` is also
            used as the network plot title.
        group_samples: list of sample-name lists, one per group, used to
            select columns of the module-level ``all_expr_df``.
        session_key: name of the per-session output directory.

    Returns:
        ``random.random()`` -- the value carries no analysis result
        (NOTE(review): presumably a placeholder task result; confirm with
        the caller).
    """

    def fet_f(a1, b1, total_gene_assum):
        """Return (len(a1), overlap size, Fisher exact p-value) for a1 vs b1."""
        a1_set = set(a1)
        b1_set = set(b1)
        n_both = len(a1_set & b1_set)
        n_a1_only = len(a1_set) - n_both
        n_b1_only = len(b1_set) - n_both
        oddsratio, pvalue = stats.fisher_exact(
            [[n_both, n_b1_only],
             [n_a1_only,
              total_gene_assum - (n_both + n_b1_only + n_a1_only)]])
        return len(a1), n_both, pvalue

    plot_path = "/home/ubuntu/django_proj/pcta_updated/main/static/images/" + session_key + "/"
    template_plot_path = "images/" + session_key + "/"
    test_name = "USER_SET"

    # Start from an empty session directory.
    if not os.path.exists(plot_path):
        os.mkdir(plot_path)
    else:
        for f in glob.glob(plot_path + "*"):
            os.remove(f)

    filecount = 0
    file_list = []
    pt = MY_PLOT()
    df_all = all_expr_df

    #############GSEA#############
    # Single-line GMT file describing the user gene set.
    fixed_path_gmt = plot_path + 'user.gmt'
    with open(fixed_path_gmt, 'w') as gmt_file:
        gmt_file.write('USER_SET\tNA\t' + '\t'.join(input_data))

    sample_list = group_samples
    # One string label per sample, group by group. Built as a real list:
    # a lazy ``map`` object is not a list on Python 3.
    class_vector = []
    for i, item in enumerate(sample_list):
        class_vector.extend([str(group_info[i])] * len(item))
    current_task.update_state(state='PROGRESS', meta={'process_percent3': 20})

    df_user_s = pd.concat([df_all[s] for s in sample_list], axis=1)
    df_user_s.columns = range(len(df_user_s.columns))
    df_user_s = df_user_s.reset_index()
    gseapy.gsea(data=df_user_s,
                gene_sets=fixed_path_gmt,
                cls=class_vector,
                outdir=plot_path,
                min_size=2,
                max_size=1000,
                weighted_score_type=1,
                permutation_type='gene_set',
                method='signal_to_noise',
                ascending=False,
                figsize=(6.5, 6),
                format='png')
    file_list.append(template_plot_path + "USER_SET.gsea.png")
    filecount += 1
    #############GSEA#############
    current_task.update_state(state='PROGRESS', meta={'process_percent3': 40})

    #############MRA#############
    mra_set_t = mra_set.T
    mra_list = list(set(mra_set.index.tolist()))
    mra_targets = [mra_set_t[x].values.tolist() for x in mra_list]
    # Target lists must be real lists: fet_f needs len() on them.
    mra_targets = [
        [str(y) for y in x[0]] if isinstance(x[0], list) else [str(x[0])]
        for x in mra_targets
    ]
    total_genes = len(set(mra_set.values.flatten()))
    pvals = [fet_f(targets, input_data, total_genes) for targets in mra_targets]
    # Keep candidates with p < 0.01 and more than 10 targets.
    pvals_list = [[mra_list[i], item[0], item[1], item[2]]
                  for i, item in enumerate(pvals)
                  if item[2] < 0.01 and item[0] > 10]
    table_arr = pvals_list  # table data
    current_task.update_state(state='PROGRESS', meta={'process_percent3': 60})

    with open(plot_path + 'mra_candidates.tsv', "w") as tsv_file:
        tsv_file.write("Gene(EntrezID)\tTF_targets\tMapped_genes\tP-value\n")
        for row in table_arr:
            tsv_file.write('\t'.join([str(y) for y in row]) + '\n')

    network_data = pd.DataFrame(data=pvals_list,
                                columns=['TF', 'targets', 'mapped', 'pval'])
    network_data = network_data.set_index('TF')
    network_data['prob_mapped'] = network_data['mapped'] / network_data['targets']
    network_data = network_data.sort_values('prob_mapped', ascending=False)
    # Only the ten best-mapped candidates are plotted.
    network_data = network_data.loc[network_data.index.tolist()[:10]]
    # Same progress key as every other update in this function; the original
    # used 'process_percent' here only.
    current_task.update_state(state='PROGRESS', meta={'process_percent3': 80})

    selected_network_expr = df_all[group_samples[0]].loc[
        network_data.index.tolist()[:10]]
    pt.network_plot(network_data,
                    selected_network_expr,
                    tit=group_info[0],
                    filename=plot_path + test_name + str(filecount))
    file_list.append(template_plot_path + test_name + str(filecount) + ".png")
    current_task.update_state(state='PROGRESS', meta={'process_percent3': 100})
    #############MRA#############
    return random.random()
async def GSEAonExperiments(data,
                            experiments,
                            res=None,
                            savename='',
                            scaling=None,
                            geneset='GO_Biological_Process_2015',
                            cores=8,
                            cleanfunc=lambda i: i.split('(GO')[0]):
    """
    Will run GSEA on a set of experiments

    Args:
    -----
      data: a pandas.df rows: gene counts; columns: [experimentA_X,...,
        experimentD_X..., control_X] where X is the replicate number. The last
        column is never used; columns containing 'AAVS1' are used as controls.
      experiments: a list of experiment names (here experimentA,.. experimentD)
      scaling: a dict(experiment:(mean,std)) of scaling factors and their
        associated standard error for each experiment. It is looked up with
        the second '_'-separated field of the experiment name.
      res: a dict of previous results; experiments already present as keys
        are not rerun. A fresh dict is created when omitted.
      savename: if you want to save the plots as pdfs, provides a location/name
      geneset: the geneset to run it on. (can be a filepath to your own geneset)
      cores: to run GSEA on
      cleanfunc: a func applied to the names of the gene sets to change it in
        some way (often to make it more readable)

    Returns
    -------
      plots the results
      1: returns a matrix with the enrichment for each term for each experiment
      2: returns a dict(experiment:pd.df) with dataframe being the output of
        GSEA (with pvalues etc..) for each experiment
    """
    # None sentinels instead of mutable defaults: a shared default dict would
    # silently carry results over from one call to the next.
    if res is None:
        res = {}

    for val in experiments:
        print(val)
        # Copy so that the rescaling below never writes into a view of `data`.
        totest = data[[
            v for v in data.columns[:-1] if val + '-' in v or 'AAVS1' in v
        ]].copy()
        cls = [
            'Condition' if val + '-' in v else 'DMSO' for v in totest.columns
        ]
        if scaling:
            factor = scaling[val.split('_')[1]]
            # Only rescale when the factor exceeds its standard error.
            if abs(factor[0]) > factor[1]:
                print("rescaling this one")
                cols = [c for c in totest.columns if val + '-' in c]
                totest[cols] = totest[cols] * (2**factor[0])
        if val in res:
            print(val + " is already in set")
            continue
        res[val] = gseapy.gsea(data=totest,
                               gene_sets=geneset,
                               cls=cls,
                               no_plot=False,
                               processes=cores)
        res[val].res2d['Term'] = list(res[val].res2d.index)

    # One bar plot per experiment. Each figure shows its own experiment; the
    # original always plotted the last one processed.
    for name, result in res.items():
        result.res2d['Term'] = [cleanfunc(term) for term in result.res2d['Term']]
        plt.figure(name)
        # NOTE(review): hue_order is given a column name and no `hue` is set;
        # kept as in the original, confirm the intended seaborn arguments.
        sns.barplot(data=result.res2d.iloc[:25],
                    x="es",
                    y="Term",
                    hue_order="geneset_size").set_title(name)

    # Build the experiment x term matrix of enrichment scores (0 = absent).
    terms = set()
    for result in res.values():
        terms.update(set(result.res2d.Term))
    scores = {term: [0] * len(res) for term in terms}
    for n, result in enumerate(res.values()):
        for _, row in result.res2d.iterrows():
            scores[row.Term][n] = row.es
    pres = pd.DataFrame(scores, index=list(res.keys()))

    # The heatmaps and the CSV are built from the score matrix `pres`; the
    # original passed the result dict `res`, which has no index/T/to_csv.
    a = sns.clustermap(figsize=(25, 20),
                       data=pres,
                       vmin=-1,
                       vmax=1,
                       yticklabels=pres.index,
                       cmap=plt.cm.RdYlBu)
    b = sns.clustermap(-pres.T.corr(), cmap=plt.cm.RdYlBu, vmin=-1, vmax=1)
    if savename:
        pres.to_csv(savename + ".csv")
        a.savefig(savename + "_genesets.pdf")
        b.savefig(savename + "_correlation.pdf")
    return pres, res
'Pierre_sets': 'tracks/GSEA_gene_sets/Pierre_gene_sets.gmt', 'Pierre_sets_TLX_enh_TSS': 'tracks/GSEA_gene_sets/Pierre_gene_sets_plus.gmt', 'Pierre_sets_v2': 'tracks/GSEA_gene_sets/Pierre_gene_sets_v2.gmt' } #~ for g_set in gs_dic.keys(): for g_set in ['Pierre_sets_v2']: out_dir = 'GSEA/TLX3vsRAG_' + g_set + '_classic_std' gs_res = gp.gsea( data=tbn, gene_sets=gs_dic[g_set], weighted_score_type=0, #~ method = 'ratio_of_classes', min_size=10, max_size=10000, graph_num=150, permutation_type='gene_set', outdir=out_dir, cls=classes) # plotting gsea_results = gs_res.res2d with plt.style.context('ggplot'): gsea_results = gsea_results.reset_index() gsea_results.head(40).plot.barh(y='fdr', x='Term', figsize=(18, 6), fontsize=10) plt.gca().invert_yaxis()
def set_calculator(input_data,
                   group_info,
                   group_samples,
                   session_key,
                   set_name='USER_GENE_SET'):
    """Run GSEA for a user gene set and, if significant, an MRA network step.

    Outputs are written to the per-session image directory built from
    ``session_key`` and then copied to the matching nginx directory.

    Args:
        input_data: list of gene identifiers (joined with tabs into the GMT
            line, so they must be strings).
        group_info: one label per sample group; ``group_info[0]`` is also
            used as the network plot title.
        group_samples: list of sample-name lists, one per group, used to
            select columns of the module-level ``all_expr_df``. The first two
            groups are used for the fold change.
        session_key: name of the per-session output directory.
        set_name: name given to the gene set in the GMT file and used in the
            GSEA plot file name.

    Returns:
        ``(file_list, table_arr, gsea_mapping_rate)``: template-relative image
        paths (``" "`` where a plot is not available), the candidate table
        rows, and the percentage of input genes found in the expression index
        as a string (``'Not Applicable'`` when the GSEA p-value is above 0.05).
    """

    def fet_f(a1, b1, total_gene_assum):
        """Return (len(a1), overlap size, Fisher exact p-value) for a1 vs b1."""
        a1_set = set(a1)
        b1_set = set(b1)
        n_both = len(a1_set & b1_set)
        n_a1_only = len(a1_set) - n_both
        n_b1_only = len(b1_set) - n_both
        oddsratio, pvalue = stats.fisher_exact(
            [[n_both, n_b1_only],
             [n_a1_only,
              total_gene_assum - (n_both + n_b1_only + n_a1_only)]])
        return len(a1), n_both, pvalue

    plot_path = "/home/ubuntu/django_proj/pcta_updated/main/static/images/" + session_key + "/"
    nginx_plot_path = "/home/ubuntu/django_proj/pcta_updated/main/staticimages/" + session_key + "/"
    template_plot_path = "images/" + session_key + "/"
    test_name = "USER_SET"

    # Each directory is created or emptied on its own, so a leftover in one
    # of them can no longer break the creation of the other.
    for directory in (plot_path, nginx_plot_path):
        if not os.path.exists(directory):
            os.mkdir(directory)
        else:
            for f in glob.glob(directory + "*"):
                os.remove(f)

    filecount = 0
    file_list = []
    pt = MY_PLOT()
    df_all = all_expr_df

    # Share of the input genes present in the expression index, in percent.
    n_mapped = len(set(input_data).intersection(df_all.index.tolist()))
    list_numb = len(input_data)
    gsea_mapping_rate = "%.2f" % (float(n_mapped) / float(list_numb) * 100)

    #############GSEA#############
    # Single-line GMT file describing the user gene set.
    fixed_path_gmt = plot_path + 'user.gmt'
    with open(fixed_path_gmt, 'w') as gmt_file:
        gmt_file.write(set_name + '\tNA\t' + '\t'.join(input_data))

    sample_list = group_samples
    # One string label per sample, group by group. Built as a real list:
    # a lazy ``map`` object is not a list on Python 3.
    class_vector = []
    for i, item in enumerate(sample_list):
        class_vector.extend([str(group_info[i])] * len(item))

    df_user_s = pd.concat([df_all[s] for s in sample_list], axis=1)
    df_user_s.columns = range(len(df_user_s.columns))
    df_user_s = df_user_s.reset_index()
    gsea_result = gseapy.gsea(data=df_user_s,
                              gene_sets=fixed_path_gmt,
                              cls=class_vector,
                              outdir=plot_path,
                              min_size=2,
                              max_size=1000,
                              weighted_score_type=1,
                              permutation_type='phenotype',
                              method='signal_to_noise',
                              ascending=False,
                              figsize=(6.5, 6),
                              format='png')
    # GSEA p-value of the first result entry. next(iter(...)) works on both
    # Python 2 and 3, where dict views cannot be indexed.
    first_term = next(iter(gsea_result.results))
    pval_es = gsea_result.results[first_term]['pval']
    significant = pval_es <= 0.05

    if significant:
        file_list.append(template_plot_path + set_name + ".gsea.png")
    else:
        file_list.append(" ")
        gsea_mapping_rate = 'Not Applicable'
    filecount += 1
    #############GSEA#############

    # Difference of per-gene medians between the first two sample groups.
    fold_change = all_expr_df[sample_list[0]].median(
        axis=1) - all_expr_df[sample_list[1]].median(axis=1)

    if significant:
        #############MRA#############
        mra_set_t = mra_set.T
        mra_list = list(set(mra_set.index.tolist()))
        mra_targets = [mra_set_t[x].values.tolist() for x in mra_list]
        # Target lists must be real lists: fet_f needs len() on them.
        mra_targets = [
            [str(y) for y in x[0]] if isinstance(x[0], list) else [str(x[0])]
            for x in mra_targets
        ]
        total_genes = len(set(mra_set.values.flatten()))
        pvals = [
            fet_f(targets, input_data, total_genes) for targets in mra_targets
        ]
        # Keep candidates with p < 0.01, fold change >= 0.1 and > 10 targets.
        pvals_list = [[
            mra_list[i],
            int(item[0]),
            int(item[1]),
            float("{0:.4f}".format(item[2])),
            float("{0:.4f}".format(fold_change.loc[mra_list[i]]))
        ] for i, item in enumerate(pvals)
                      if item[2] < 0.01 and fold_change.loc[mra_list[i]] >= 0.1
                      and item[0] > 10]

        network_data = pd.DataFrame(
            data=pvals_list,
            columns=['TF', 'targets', 'mapped', 'pval', 'fc'])
        network_data = network_data.set_index('TF')
        network_data['Symbol'] = pcta_id.loc[[
            str(x) for x in network_data.index.tolist()
        ]]['Symbol']
        network_data['prob_mapped'] = network_data['mapped'] / network_data['targets']
        network_data = network_data.sort_values('prob_mapped', ascending=False)
        # Only the ten best-mapped candidates are reported.
        network_data = network_data.loc[network_data.index.tolist()[:10]]
        network_data = network_data.round(4)
        # astype returns a new Series; assign it back so the cast takes effect.
        network_data['targets'] = network_data['targets'].astype(int)
        network_data['mapped'] = network_data['mapped'].astype(int)

        # Copy before adding a column so the selection is never a view.
        selected_network_expr = df_all[group_samples[0]].loc[
            network_data.index.tolist()[:10]].copy()
        selected_network_expr['Symbol'] = pcta_id.loc[[
            str(x) for x in selected_network_expr.index.tolist()
        ]]['Symbol']
        network_data = network_data.set_index('Symbol')
        selected_network_expr = selected_network_expr.set_index('Symbol')
        pt.network_plot(network_data,
                        selected_network_expr,
                        tit=group_info[0],
                        filename=plot_path + test_name + str(filecount))
        file_list.append(template_plot_path + test_name + str(filecount) +
                         ".png")

        network_data = network_data[[
            'targets', 'mapped', 'prob_mapped', 'fc', 'pval'
        ]]
        # Row access upcasts the counts to float; turn them back into ints.
        table_arr = []
        for symbol in network_data.index.tolist():
            row = network_data.loc[symbol].tolist()
            table_arr.append([symbol, int(row[0]), int(row[1])] + row[2:])
        network_data.to_csv(plot_path + 'mra_candidates.csv')
        # NOTE(review): the paths are interpolated into a shell command and
        # contain session_key; confirm it can never hold shell metacharacters.
        os.system("cp -rf %s/* %s" % (plot_path, nginx_plot_path))
        #############MRA#############
    else:
        table_arr = []
        file_list.append(" ")
    return file_list, table_arr, gsea_mapping_rate
import gseapy as gp
import numpy as np
from os.path import join
import pandas as pd

# Differential expression results, restricted to rows with padj < 0.05 and
# no missing values.
tbl = pd.read_table(join('tracks', 'TLX3vsRAG-results_genes.txt'), index_col=0)
tbl = tbl.loc[tbl.padj < 0.05].dropna()

# Gene ID -> gene name table, one row per ID/name pair, aligned to `tbl`.
names = pd.read_table(
    "tracks/annot_tracks/references/mm9/mm9_EnsemblTransc_GeneNames.txt",
    index_col=0,
    header=0,
    names=['GeneID', 'TransID', 'Gene_name'])
names = names.drop(columns='TransID').drop_duplicates().loc[tbl.index]

# Expression table for GSEA: upper-cased gene name first, then the remaining
# columns, with the statistics columns removed and a fresh integer index.
stat_columns = [
    'baseMean', 'log2FoldChange', 'lfcSE', 'stat', 'pvalue', 'padj'
]
tbn = pd.concat([names, tbl], axis=1).drop(columns=stat_columns)
tbn['Gene_name'] = tbn['Gene_name'].str.upper()
tbn = tbn.reset_index(drop=True)

## ==== GSEAPY
classes = ['RAG'] * 3 + ['TLX3'] * 3
g_set = 'Reactome_2013'
gs_res = gp.gsea(data=tbn, gene_sets=g_set, outdir='_test', cls=classes)
# ... as a dictionary: gene set name -> gene set
genesets = df_genesets.T.symbols.to_dict()
# print("Gene sets:", genesets)

# Gene set enrichment analysis
# The gsea module produces GSEA results. (https://pypi.org/project/gseapy/)
# http://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Main_Page
import gseapy

print(F"Running GSEA on {len(df_cov2)} genes")

# Rearrange columns to have SARS samples first, Mock samples second.
# The class labels are taken from the same sorted series as the columns, so
# label i always describes column i (the unsorted series could be in a
# different order than the rearranged columns).
calu3_sorted = calu3.sort_values()
df_cov2 = df_cov2[list(calu3_sorted.index)]

gsea_res = gseapy.gsea(data=df_cov2,
                       gene_sets=genesets,
                       cls=list(~calu3_sorted),
                       outdir=str(ROOT / "gsea"),
                       min_size=2)

# One row per gene set. A list (not a set) keeps the rows in a defined order
# and is a supported input for the DataFrame constructor.
gsea_rows = [(gs, info['fdr'], info['nes'], info['es'], info['pval'])
             for (gs, info) in gsea_res.results.items()]
df_gsea_results = pd.DataFrame(
    data=gsea_rows,
    columns=["Geneset", "FDR", "Norm. score", "Score", "p-value"],
)
df_gsea_results = df_gsea_results.sort_values("Norm. score", ascending=False)
print(df_gsea_results.to_markdown())
# Example output of an earlier run (the row labels in the first column may
# differ):
# |    | Geneset                                    |      FDR |   Norm. score |     Score |   p-value |
# |---:|:-------------------------------------------|---------:|--------------:|----------:|----------:|
# |  4 | HALLMARK_IL6_JAK_STAT3_SIGNALING           | 0.341983 |       1.43446 |  0.875    | 0.0584046 |
# |  6 | HALLMARK_TNFA_SIGNALING_VIA_NFKB           | 0.204814 |       1.39715 |  0.785688 | 0.103362  |