def run(self):
    """Re-plot every enrichment term found in an existing GSEA output folder.

    Looks under ``<indir>*/edb/`` for ``results.edb``, a ``.rnk`` ranking
    file and ``gene_sets.gmt``, recomputes the running enrichment score for
    each ``DTG`` entry of ``results.edb`` and hands it to ``gsea_plot``.

    Reads ``self.indir``, ``self.outdir``, ``self.min_size``,
    ``self.max_size``, ``self.weighted_score_type``, ``self.figsize``,
    ``self.format``, ``self.module`` and ``self.verbose``; stores the path
    of the gene-set file in ``self.gene_sets``.

    Raises:
        AssertionError: if ``self.min_size`` is greater than
            ``self.max_size``.
        SystemExit: with status 1 when one of the three required files
            cannot be located.
    """
    assert self.min_size <= self.max_size
    import glob
    from bs4 import BeautifulSoup

    # The logger has to exist before the file lookup: the error branch below
    # reports through it, and since ``logger`` is a local name, touching it
    # before assignment raises UnboundLocalError and hides the real problem.
    # The output directory is created first, presumably because the logger
    # may write inside it (confirm against ``_log_init``).
    mkdirs(self.outdir)
    logger = self._log_init(
        module=self.module,
        log_level=logging.INFO if self.verbose else logging.WARNING)

    # Locate the GSEA result files.
    try:
        results_path = glob.glob(self.indir + '*/edb/results.edb')[0]
        rank_path = glob.glob(self.indir + '*/edb/*.rnk')[0]
        gene_set_path = glob.glob(self.indir + '*/edb/gene_sets.gmt')[0]
    except IndexError as e:
        logger.debug(e)
        logger.error("Could not locate GSEA files in the given directory!")
        sys.exit(1)

    # Phenotype labels come from the .cls file; when no .cls file is present
    # (treated here as a prerank result) both labels are left empty.
    cls_path = glob.glob(self.indir + '*/edb/*.cls')
    if cls_path:
        phenoPos, phenoNeg, _ = gsea_cls_parser(cls_path[0])
    else:
        phenoPos, phenoNeg = '', ''

    # Start re-plotting.
    self.gene_sets = gene_set_path

    # Gene sets, filtered by the configured size limits.
    gene_set_dict = gsea_gmt_parser(
        gene_set_path, min_size=self.min_size, max_size=self.max_size)

    # Ranking metric.
    rank_metric = self._rank_metric(rank_path)
    correl_vector = rank_metric['rank'].values
    gene_list = rank_metric['gene_name']

    # Count the enrichment terms; the handle is closed as soon as the
    # document has been parsed.
    with open(results_path) as edb_handle:
        database = BeautifulSoup(edb_handle, features='xml')
    length = len(database.findAll('DTG'))

    for idx in range(length):
        # Statistics recorded in results.edb for this term.
        enrich_term, hit_ind, nes, pval, fdr = gsea_edb_parser(
            results_path, index=idx)
        # NOTE(review): ``get`` yields None when the term is absent from
        # gene_set_dict (e.g. removed by the size filter) - confirm that
        # enrichment_score copes with that.
        gene_set = gene_set_dict.get(enrich_term)

        # Running enrichment score (third element of the result).
        RES = enrichment_score(
            gene_list=gene_list,
            gene_set=gene_set,
            weighted_score_type=self.weighted_score_type,
            correl_vector=correl_vector)[2]

        # Plot.
        gsea_plot(rank_metric, enrich_term, hit_ind, nes, pval, fdr, RES,
                  phenoPos, phenoNeg, self.figsize, self.format,
                  self.outdir, self.module)

    logger.info(
        "Congratulations! Your plots have been reproduced successfully!")
def run(self):
    """Re-plot the first ``self.fignum`` enrichment terms of a GSEA result.

    Looks under ``<indir>*/edb/`` for ``results.edb``, a ``.rnk`` ranking
    file and ``gene_sets.gmt``, recomputes the running enrichment score for
    each selected term and hands it to ``gsea_plot``.

    Reads ``self.indir``, ``self.outdir``, ``self.min_size``,
    ``self.max_size``, ``self.fignum``, ``self.weighted_score_type``,
    ``self.figsize``, ``self.format`` and ``self.module``; stores the path
    of the gene-set file in ``self.gene_sets``.

    Raises:
        AssertionError: if ``self.min_size`` exceeds ``self.max_size`` or
            ``self.fignum`` is not positive.
        SystemExit: with status 1 when one of the three required files
            cannot be located (a message is written to stderr first).
    """
    assert self.min_size <= self.max_size
    assert self.fignum > 0
    import glob
    from bs4 import BeautifulSoup

    # Locate the GSEA result files.
    try:
        results_path = glob.glob(self.indir + '*/edb/results.edb')[0]
        rank_path = glob.glob(self.indir + '*/edb/*.rnk')[0]
        gene_set_path = glob.glob(self.indir + '*/edb/gene_sets.gmt')[0]
    except IndexError:
        sys.stderr.write(
            "Could not locate GSEA files in the given directory!")
        sys.exit(1)

    # Phenotype labels come from the .cls file; when no .cls file is present
    # (treated here as a prerank result) both labels are left empty.
    cls_path = glob.glob(self.indir + '*/edb/*.cls')
    if cls_path:
        phenoPos, phenoNeg, _ = gsea_cls_parser(cls_path[0])
    else:
        phenoPos, phenoNeg = '', ''

    # Start re-plotting.
    self.gene_sets = gene_set_path

    # Gene sets.
    gene_set_dict = self.parse_gmt(gmt=gene_set_path)

    # Ranking metric: values are the correlations, index holds the genes.
    rank_metric = self._load_ranking(rank_path)
    correl_vector = rank_metric.values
    gene_list = rank_metric.index.values

    # Count the enrichment terms; the handle is closed as soon as the
    # document has been parsed.
    with open(results_path) as edb_handle:
        database = BeautifulSoup(edb_handle, features='xml')
    length = len(database.findAll('DTG'))

    # Never ask for more figures than there are terms.
    fig_num = min(self.fignum, length)

    for idx in range(fig_num):
        # Statistics recorded in results.edb for this term.
        enrich_term, hit_ind, nes, pval, fdr = gsea_edb_parser(
            results_path, index=idx)
        # NOTE(review): ``get`` yields None when the term is absent from
        # gene_set_dict - confirm that enrichment_score copes with that.
        gene_set = gene_set_dict.get(enrich_term)

        # Running enrichment score (last element of the result); nperm=0 is
        # passed because only that element is used here.
        RES = enrichment_score(
            gene_list=gene_list,
            correl_vector=correl_vector,
            gene_set=gene_set,
            weighted_score_type=self.weighted_score_type,
            nperm=0)[-1]

        # Plot.
        gsea_plot(rank_metric, enrich_term, hit_ind, nes, pval, fdr, RES,
                  phenoPos, phenoNeg, self.figsize, self.format,
                  self.outdir, self.module)

    self._logger.info(
        "Congratulations! Your plots have been reproduced successfully!\n")
def run(self):
    """Redraw the enrichment plots of a finished GSEA run.

    The required files (``results.edb``, a ``.rnk`` file and
    ``gene_sets.gmt``) are searched below ``<indir>*/edb/``.  Every term
    returned by ``gsea_edb_parser`` whose p-value is not above 0.1 gets its
    running enrichment score recomputed and is drawn with ``gseaplot`` into
    ``<outdir>/<term>.<module>.<format>`` (``/`` and ``:`` in the term are
    replaced by ``_`` for the file name).

    Sets ``self.gene_sets`` to the gene-set file path.

    Raises:
        AssertionError: if ``self.min_size`` exceeds ``self.max_size``.
        Exception: if any of the three required files is missing.
    """
    assert self.min_size <= self.max_size

    # Locate the mandatory GSEA files; the first glob without a match aborts.
    edb_patterns = ('*/edb/results.edb', '*/edb/*.rnk', '*/edb/gene_sets.gmt')
    try:
        results_path, rank_path, gene_set_path = [
            glob.glob(self.indir+pattern)[0] for pattern in edb_patterns]
    except IndexError:
        raise Exception("Could not locate GSEA files in the given directory!")

    # Phenotype labels: read from the .cls file when there is one, otherwise
    # (prerank results) leave both empty.
    cls_files = glob.glob(self.indir+'*/edb/*.cls')
    if not cls_files:
        pos = neg = ''
    else:
        pos, neg, _ = gsea_cls_parser(cls_files[0])

    # Gene sets.
    self.gene_sets = gene_set_path
    gene_set_dict = self.parse_gmt(gmt=gene_set_path)

    # Ranking: values are the correlations, the index holds the gene names.
    rank_metric = self._load_ranking(rank_path)
    correl_vector = rank_metric.values
    gene_list = rank_metric.index.values

    # Walk through the terms stored in results.edb.
    for enrich_term, (hit_ind, nes, pval, fdr) in gsea_edb_parser(results_path).items():
        gene_set = gene_set_dict.get(enrich_term)
        # Terms with a p-value above 0.1 are not plotted.
        if float(pval) > 0.1:
            continue

        # Running enrichment score is the last element of the result.
        score_result = enrichment_score(
            gene_list=gene_list,
            correl_vector=correl_vector,
            gene_set=gene_set,
            weighted_score_type=self.weighted_score_type,
            nperm=0)
        RES = score_result[-1]

        # File-system friendly variant of the term for the output name.
        term = enrich_term.replace('/','_')
        term = term.replace(":","_")
        outfile = '{0}/{1}.{2}.{3}'.format(
            self.outdir, term, self.module, self.format)

        plot_args = dict(
            rank_metric=rank_metric,
            term=enrich_term,
            hit_indices=hit_ind,
            nes=nes,
            pval=pval,
            fdr=fdr,
            RES=RES,
            pheno_pos=pos,
            pheno_neg=neg,
            figsize=self.figsize,
            ofname=outfile,
        )
        gseaplot(**plot_args)

    self._logger.info("Congratulations! Your plots have been reproduced successfully!\n")
def gsea_signed_single(sig, values, midpoint=0, weighted_score_type=0):
    """Run a signed GSEA of one signature against one set of values.

    Values of genes that the signature marks as negative
    (``sig.values[gene] < 0``) are mirrored around ``midpoint``; the values
    are then sorted in descending order and passed to ``enrichment_score``
    with 1000 permutations and a fixed random seed (1028).

    Args:
        sig: signature object; ``sig.values`` is indexed by gene and
            ``sig.genes`` is iterable.
        values: pandas-style series of per-gene values with a unique index.
        midpoint: value around which negative-signature genes are mirrored.
        weighted_score_type: forwarded unchanged to ``enrichment_score``.

    Returns:
        Tuple ``(es, ind, RES, esnull, LE_genes)``.  ``LE_genes`` lists the
        leading-edge genes: hits up to the position of the maximum of
        ``RES`` when ``es > 0``, otherwise hits from the end back to the
        position of the minimum.  Genes with a negative signature value are
        wrapped in parentheses.

    Raises:
        AssertionError: if ``values.index`` contains duplicates.
    """
    assert values.index.is_unique, "Index for values must be unique"

    # Mirror the values of negatively signed genes around the midpoint.
    mirrored = values.copy()
    for name in sig.values.index:
        if not (sig.values[name] < 0 and name in mirrored.index):
            continue
        mirrored[name] = (mirrored[name] - midpoint) * -1 + midpoint

    # Convert to the ranked gene list / correlation vector expected by
    # enrichment_score.
    ranked = mirrored.sort_values(ascending=False)
    gene_list = ranked.index.tolist()

    es, esnull, ind, RES = enrichment_score(
        gene_list,
        gene_set=list(sig.genes),
        correl_vector=ranked.values,
        weighted_score_type=weighted_score_type,
        nperm=1000,
        rs=np.random.RandomState(1028))

    def _labelled(position):
        # Gene at this rank, parenthesised when its signature value is negative.
        name = gene_list[position]
        if sig.values[name] < 0:
            return "(" + name + ")"
        return name

    # Collect the leading-edge genes.
    LE_genes = []
    if es > 0:
        peak = np.argmax(RES)
        for hit in ind:
            if hit > peak:
                break
            LE_genes.append(_labelled(hit))
    else:
        trough = np.argmin(RES)
        for hit in ind[::-1]:
            if hit < trough:
                break
            LE_genes.append(_labelled(hit))

    return es, ind, RES, esnull, LE_genes