def run(self):
    """Run the complete GSEA workflow for this instance.

    Steps, in order: validate the configuration, parse the phenotype
    labels, load the expression data, compute the ranking metric, load the
    gene sets, compute the enrichment statistics, save the results, draw
    the heatmap and, unless ``self._noplot`` is set, produce the plots.

    The ranking metric is stored on ``self.ranking``.  When
    ``self._outdir`` is ``None`` the temporary directory held in
    ``self._tmpdir`` is cleaned up at the end.

    Raises:
        AssertionError: if ``self.method`` or ``self.permutation_type`` is
            not one of the accepted values, if ``self.min_size`` exceeds
            ``self.max_size``, or if the loaded data has fewer than two
            rows.
    """
    accepted_methods = (
        'signal_to_noise', 's2n',
        'abs_signal_to_noise', 'abs_s2n',
        't_test',
        'ratio_of_classes', 'diff_of_classes', 'log2_ratio_of_classes',
    )
    accepted_permutations = ("phenotype", "gene_set")

    # Configuration sanity checks.
    assert self.method in accepted_methods
    assert self.permutation_type in accepted_permutations
    assert self.min_size <= self.max_size

    self._logger.info("Parsing data files for GSEA.............................")

    # Parse the phenotype labels out of self.classes.
    pos_label, neg_label, class_vector = gsea_cls_parser(self.classes)

    # Load the expression values; more than one row is required.
    expression, class_map = self.load_data(class_vector)
    assert len(expression) > 1

    # Ranking metric, kept on the instance for later use.
    ranked = ranking_metric(
        df=expression,
        method=self.method,
        pos=pos_label,
        neg=neg_label,
        classes=class_map,
        ascending=self.ascending,
    )
    self.ranking = ranked

    # Gene-set dictionary restricted to the ranked genes.
    gene_set_dict = self.load_gmt(gene_list=ranked.index.values,
                                  gmt=self.gene_sets)
    used_sets_msg = (
        "%04d gene_sets used for further statistical testing....."
        % len(gene_set_dict)
    )
    self._logger.info(used_sets_msg)
    self._logger.info("Start to run GSEA...Might take a while..................")

    # Decide how many processes to use.
    self._set_cores()

    # Phenotype permutation works on the expression table; otherwise the
    # already-computed ranking is passed in.
    if self.permutation_type == 'phenotype':
        score_input = expression
    else:
        score_input = ranked

    gsea_results, hit_ind, rank_ES, subsets = gsea_compute_tensor(
        data=score_input,
        gmt=gene_set_dict,
        n=self.permutation_num,
        weighted_score_type=self.weighted_score_type,
        permutation_type=self.permutation_type,
        method=self.method,
        pheno_pos=pos_label,
        pheno_neg=neg_label,
        classes=class_vector,
        ascending=self.ascending,
        processes=self._processes,
        seed=self.seed,
    )

    self._logger.info("Start to generate GSEApy reports and figures............")
    zipped_results = zip(subsets, list(gsea_results), hit_ind, rank_ES)
    self._save_results(
        zipdata=zipped_results,
        outdir=self.outdir,
        module=self.module,
        gmt=gene_set_dict,
        rank_metric=ranked,
        permutation_type=self.permutation_type,
    )

    # Reorder the expression rows to follow the ranking before the heatmap.
    heatmap_frame = expression.loc[ranked.index]
    self._heatmat(df=heatmap_frame, classes=class_vector,
                  pheno_pos=pos_label, pheno_neg=neg_label)

    # Plotting is optional.
    if not self._noplot:
        self._plotting(
            rank_metric=ranked,
            results=self.results,
            graph_num=self.graph_num,
            outdir=self.outdir,
            figsize=self.figsize,
            format=self.format,
            pheno_pos=pos_label,
            pheno_neg=neg_label,
        )

    self._logger.info("Congratulations. GSEApy ran successfully.................\n")

    # No user-supplied output directory: drop the temporary one.
    if self._outdir is None:
        self._tmpdir.cleanup()

    return None
def run(self):
    """GSEA main procedure.

    Loads the expression data from ``self.data`` (a ``pandas.DataFrame``,
    which is copied, or a path to a file read with ``pd.read_table`` using
    ``#`` as the comment character), parses the phenotype labels, computes
    the ranking metric, filters the gene sets, runs the enrichment
    computation, saves the results and produces the plots.

    Returns:
        None.

    Raises:
        AssertionError: if ``self.permutation_type`` is not ``"phenotype"``
            or ``"gene_set"``, if ``self.min_size`` exceeds
            ``self.max_size``, or if the expression table has fewer than
            two rows.
        Exception: if ``self.data`` is neither a DataFrame nor an existing
            file path.
    """
    assert self.permutation_type in ["phenotype", "gene_set"]
    assert self.min_size <= self.max_size

    if isinstance(self.data, pd.DataFrame):
        # Work on a copy so the caller's frame is not modified.
        df = self.data.copy()
    elif os.path.isfile(self.data):
        df = pd.read_table(self.data, comment='#')
    else:
        # The original code had an unreachable ``sys.exit(1)`` after this
        # raise; it has been dropped because the raise already aborts.
        raise Exception('Error parsing gene expression dataframe!')

    # The data frame must have length > 1.
    assert len(df) > 1

    # Create the output directory and set up logging.
    mkdirs(self.outdir)
    logger = self._log_init(
        module=self.module,
        log_level=logging.INFO if self.verbose else logging.WARNING)

    # Start analysis.
    logger.info("Parsing data files for GSEA.............................")

    # Phenotype labels parsing.
    phenoPos, phenoNeg, cls_vector = gsea_cls_parser(self.classes)

    # Select the correct expression genes and values.
    dat = self.__drop_dat(df, cls_vector)

    # Ranking metric calculation.
    dat2 = ranking_metric(df=dat, method=self.method,
                          phenoPos=phenoPos, phenoNeg=phenoNeg,
                          classes=cls_vector, ascending=self.ascending)

    # Filter out gene sets by size and build the gene-set dictionary.
    gmt = gsea_gmt_parser(self.gene_sets, min_size=self.min_size,
                          max_size=self.max_size,
                          gene_list=dat2['gene_name'].values)
    logger.info(
        "%04d gene_sets used for further statistical testing....."
        % len(gmt))
    logger.info("Start to run GSEA...Might take a while..................")

    # Decide how many processes to use.
    self._set_cores()

    # Compute ES, NES, pval, FDR, RES.
    gsea_results, hit_ind, rank_ES, subsets = gsea_compute(
        data=dat, n=self.permutation_num, gmt=gmt,
        weighted_score_type=self.weighted_score_type,
        permutation_type=self.permutation_type, method=self.method,
        phenoPos=phenoPos, phenoNeg=phenoNeg, classes=cls_vector,
        ascending=self.ascending, seed=self.seed,
        processes=self._processes)

    logger.info("Start to generate gseapy reports, and produce figures...")
    res_zip = zip(subsets, list(gsea_results), hit_ind, rank_ES)
    # NOTE(review): the literal "gene_sets" is passed here instead of
    # self.permutation_type, and it is not one of the values accepted by
    # the assertion at the top of this method -- confirm this is intended.
    self._save_results(zipdata=res_zip, outdir=self.outdir,
                       module=self.module, gmt=gmt, rank_metric=dat2,
                       permutation_type="gene_sets")

    # Plotting: expression rows are reordered to follow the ranking.
    heat_dat = dat.loc[dat2.gene_name]
    self._plotting(rank_metric=dat2, results=self.results,
                   res2d=self.res2d, graph_num=self.graph_num,
                   outdir=self.outdir, figsize=self.figsize,
                   format=self.format, module=self.module,
                   data=heat_dat, classes=cls_vector,
                   phenoPos=phenoPos, phenoNeg=phenoNeg)

    logger.info("Congratulations. GSEApy run successfully................")

    return