def runSamplesPermu(self, df, gmt=None):
    """Single Sample GSEA workflow with permutation procedure.

    Each column of ``df`` is handled as one sample: its values are sorted
    according to ``self.ascending``, scored with ``gsea_compute`` and
    written to a sub-directory of the output directory named after the
    column. ``self.res2d.es`` is collected per sample in
    ``self.resultsOnSamples`` and, once every sample is done, ``self._save``
    is called with the top-level output directory.

    :param df: table iterated column-wise as ``(name, series)`` pairs
        (presumably a pandas DataFrame of genes x samples -- confirm
        against the caller).
    :param gmt: gene set collection forwarded unchanged to
        ``gsea_compute`` and ``self._save_results``.
    :return: None.
    :raises AssertionError: if ``self.min_size`` exceeds ``self.max_size``.

    NOTE: ``self.outdir`` is reassigned to each sample's sub-directory and
    is left pointing at the last one when the method returns.
    """
    assert self.min_size <= self.max_size
    mkdirs(self.outdir)
    self.resultsOnSamples = OrderedDict()
    # remember the top-level directory; self.outdir is rewritten per sample
    outdir = self.outdir
    # iterate through each sample; DataFrame.items() is used because
    # DataFrame.iteritems() was removed in pandas 2.0
    for name, ser in df.items():
        self.outdir = os.path.join(outdir, str(name))
        self._logger.info("Run Sample: %s " % name)
        mkdirs(self.outdir)
        # sort ranking values from high to low or reverse
        dat2 = ser.sort_values(ascending=self.ascending)
        # compute ES, NES, pval, FDR, RES
        gsea_results, hit_ind, rank_ES, subsets = gsea_compute(
            data=dat2,
            n=self.permutation_num,
            gmt=gmt,
            weighted_score_type=self.weighted_score_type,
            permutation_type='gene_set',
            method=None,
            pheno_pos='',
            pheno_neg='',
            classes=None,
            ascending=self.ascending,
            processes=self._processes,
            seed=self.seed,
            single=True,
            scale=self.scale)

        # write file
        res_zip = zip(subsets, list(gsea_results), hit_ind, rank_ES)
        self._save_results(zipdata=res_zip, outdir=self.outdir,
                           module=self.module, gmt=gmt, rank_metric=dat2,
                           permutation_type="gene_sets")
        # self.res2d is read right after _save_results (presumably set by
        # it -- confirm); keep this sample's enrichment scores
        self.resultsOnSamples[name] = self.res2d.es
        # plotting
        if self._noplot:
            continue
        self._logger.info("Plotting Sample: %s \n" % name)
        self._plotting(rank_metric=dat2, results=self.results,
                       res2d=self.res2d, graph_num=self.graph_num,
                       outdir=self.outdir, figsize=self.figsize,
                       format=self.format, module=self.module)

    # save es, nes to file, using the top-level directory
    self._save(outdir)

    return
def run(self):
    """Run the GSEA prerank workflow.

    Steps, in order: create the output directory and logger, build the
    ranking from ``self.rnk``, set the core count, parse and filter the
    gene sets, compute the statistics with ``gsea_compute``, save the
    results and draw the figures.

    :return: None.
    :raises AssertionError: if ``self.min_size`` exceeds ``self.max_size``
        or the ranking has fewer than two entries.
    """
    assert self.min_size <= self.max_size
    mkdirs(self.outdir)
    logger = self._log_init(
        module=self.module,
        log_level=logging.INFO if self.verbose else logging.WARNING,
    )

    ranking = self._rank_metric(self.rnk)
    assert len(ranking) > 1

    # cpu numbers
    self._set_cores()

    # Start analysis
    logger.info("Parsing data files for GSEA.............................")
    # keep only the gene sets that pass the size filter for the ranked genes
    gene_sets = gsea_gmt_parser(
        self.gene_sets,
        min_size=self.min_size,
        max_size=self.max_size,
        gene_list=ranking['gene_name'].values,
    )
    n_sets_msg = "%04d gene_sets used for further statistical testing....."% len(gene_sets)
    logger.info(n_sets_msg)
    logger.info("Start to run GSEA...Might take a while..................")

    # compute ES, NES, pval, FDR, RES
    compute_args = dict(
        data=ranking,
        n=self.permutation_num,
        gmt=gene_sets,
        weighted_score_type=self.weighted_score_type,
        permutation_type='gene_set',
        method=None,
        phenoPos=self.pheno_pos,
        phenoNeg=self.pheno_neg,
        classes=None,
        ascending=self.ascending,
        seed=self.seed,
        processes=self._processes,
        prerank=True,
    )
    stats, hit_indices, running_es, set_names = gsea_compute(**compute_args)

    logger.info("Start to generate gseapy reports, and produce figures...")
    self._save_results(
        zipdata=zip(set_names, list(stats), hit_indices, running_es),
        outdir=self.outdir,
        module=self.module,
        gmt=gene_sets,
        rank_metric=ranking,
        permutation_type="gene_sets",
    )

    # Plotting
    self._plotting(
        rank_metric=ranking,
        results=self.results,
        res2d=self.res2d,
        graph_num=self.graph_num,
        outdir=self.outdir,
        figsize=self.figsize,
        format=self.format,
        module=self.module,
    )
    logger.info("Congratulations. GSEApy run successfully................")
def run(self):
    """Run the GSEA prerank workflow.

    Loads the ranking from ``self.rnk``, loads the gene sets for the
    ranked genes, computes the statistics with ``gsea_compute``, saves the
    results, and plots unless ``self._noplot`` is set. When
    ``self._outdir`` is None, ``self._tmpdir.cleanup()`` is called at the
    end.

    :return: None.
    :raises AssertionError: if ``self.min_size`` exceeds ``self.max_size``
        or the ranking has fewer than two entries.
    """
    assert self.min_size <= self.max_size

    # parsing rankings
    ranking = self._load_ranking(self.rnk)
    assert len(ranking) > 1

    # cpu numbers
    self._set_cores()

    # Start analysis
    log = self._logger
    log.info("Parsing data files for GSEA.............................")
    # build the gene sets dictionary from the genes present in the ranking
    gene_sets = self.load_gmt(gene_list=ranking.index.values, gmt=self.gene_sets)
    n_sets_msg = "%04d gene_sets used for further statistical testing....."% len(gene_sets)
    log.info(n_sets_msg)
    log.info("Start to run GSEA...Might take a while..................")

    # compute ES, NES, pval, FDR, RES
    compute_args = dict(
        data=ranking,
        n=self.permutation_num,
        gmt=gene_sets,
        weighted_score_type=self.weighted_score_type,
        permutation_type='gene_set',
        method=None,
        pheno_pos=self.pheno_pos,
        pheno_neg=self.pheno_neg,
        classes=None,
        ascending=self.ascending,
        processes=self._processes,
        seed=self.seed,
    )
    stats, hit_indices, running_es, set_names = gsea_compute(**compute_args)

    log.info("Start to generate gseapy reports, and produce figures...")
    self._save_results(
        zipdata=zip(set_names, list(stats), hit_indices, running_es),
        outdir=self.outdir,
        module=self.module,
        gmt=gene_sets,
        rank_metric=ranking,
        permutation_type="gene_sets",
    )

    # Plotting is optional
    plotting_enabled = not self._noplot
    if plotting_enabled:
        self._plotting(
            rank_metric=ranking,
            results=self.results,
            res2d=self.res2d,
            graph_num=self.graph_num,
            outdir=self.outdir,
            figsize=self.figsize,
            format=self.format,
            module=self.module,
            phenoPos=self.pheno_pos,
            phenoNeg=self.pheno_neg,
        )

    log.info("Congratulations. GSEApy run successfully................\n")
    # no explicit output directory was given: clean up the temporary one
    if self._outdir is None:
        self._tmpdir.cleanup()
def run(self):
    """GSEA main procedure.

    Steps, in order:

    1. Take the expression table from ``self.data``: a pandas DataFrame is
       copied, a path to an existing file is read with ``pd.read_table``
       (lines starting with ``#`` are treated as comments).
    2. Parse the phenotype labels from ``self.classes``.
    3. Select the expression values, compute the ranking metric and build
       the filtered gene set dictionary.
    4. Compute the statistics with ``gsea_compute``, save the results and
       draw the figures.

    :return: None.
    :raises AssertionError: if ``self.permutation_type`` is neither
        ``"phenotype"`` nor ``"gene_set"``, if ``self.min_size`` exceeds
        ``self.max_size``, or if the table has fewer than two rows.
    :raises Exception: if ``self.data`` is neither a DataFrame nor a path
        to an existing file.
    """
    assert self.permutation_type in ["phenotype", "gene_set"]
    assert self.min_size <= self.max_size

    if isinstance(self.data, pd.DataFrame):
        # work on a copy so the caller's frame is left untouched
        df = self.data.copy()
    elif os.path.isfile(self.data):
        df = pd.read_table(self.data, comment='#')
    else:
        # the original had an unreachable sys.exit(1) after this raise
        raise Exception('Error parsing gene expression dataframe!')

    # data frame must have length > 1
    assert len(df) > 1
    # create output dirs
    mkdirs(self.outdir)
    logger = self._log_init(
        module=self.module,
        log_level=logging.INFO if self.verbose else logging.WARNING)
    # Start Analysis
    logger.info("Parsing data files for GSEA.............................")
    # phenotype labels parsing
    phenoPos, phenoNeg, cls_vector = gsea_cls_parser(self.classes)
    # select correct expression genes and values.
    dat = self.__drop_dat(df, cls_vector)
    # ranking metrics calculation.
    dat2 = ranking_metric(df=dat, method=self.method, phenoPos=phenoPos,
                          phenoNeg=phenoNeg, classes=cls_vector,
                          ascending=self.ascending)
    # filtering out gene sets and build gene sets dictionary
    gmt = gsea_gmt_parser(self.gene_sets, min_size=self.min_size,
                          max_size=self.max_size,
                          gene_list=dat2['gene_name'].values)
    logger.info(
        "%04d gene_sets used for further statistical testing....."
        % len(gmt))
    logger.info("Start to run GSEA...Might take a while..................")
    # cpu numbers
    self._set_cores()
    # compute ES, NES, pval, FDR, RES
    # note: the expression table `dat` (not the ranking `dat2`) is passed
    # here together with the method and class labels
    gsea_results, hit_ind, rank_ES, subsets = gsea_compute(
        data=dat,
        n=self.permutation_num,
        gmt=gmt,
        weighted_score_type=self.weighted_score_type,
        permutation_type=self.permutation_type,
        method=self.method,
        phenoPos=phenoPos,
        phenoNeg=phenoNeg,
        classes=cls_vector,
        ascending=self.ascending,
        seed=self.seed,
        processes=self._processes)

    logger.info("Start to generate gseapy reports, and produce figures...")
    res_zip = zip(subsets, list(gsea_results), hit_ind, rank_ES)
    self._save_results(zipdata=res_zip, outdir=self.outdir,
                       module=self.module, gmt=gmt, rank_metric=dat2,
                       permutation_type="gene_sets")

    # Plotting: rows of the expression table selected by the ranked gene
    # names, passed to _plotting as `data`
    heat_dat = dat.loc[dat2.gene_name]
    self._plotting(rank_metric=dat2, results=self.results,
                   res2d=self.res2d, graph_num=self.graph_num,
                   outdir=self.outdir, figsize=self.figsize,
                   format=self.format, module=self.module,
                   data=heat_dat, classes=cls_vector,
                   phenoPos=phenoPos, phenoNeg=phenoNeg)

    logger.info("Congratulations. GSEApy run successfully................")

    return