Exemple #1
0
    def runSamplesPermu(self, df, gmt=None):
        """Single Sample GSEA workflow with permutation procedure.

        Every column of ``df`` is handled as one sample: its values are
        sorted according to ``self.ascending``, scored by ``gsea_compute``
        with ``permutation_type='gene_set'`` and ``single=True``, saved
        into a per-sample sub-directory of the output directory and,
        unless ``self._noplot`` is set, plotted.

        :param df: pandas DataFrame iterated column by column as
                   ``(column name, Series)`` pairs.
        :param gmt: gene sets, forwarded unchanged to ``gsea_compute`` and
                    ``self._save_results``.
        :return: None. ``self.res2d.es`` of every sample is stored in
                 ``self.resultsOnSamples`` under the column name, and
                 ``self._save`` is called with the top-level output
                 directory once all samples are done.
        :raises AssertionError: if ``self.min_size > self.max_size``.

        Note: ``self.outdir`` is re-pointed at each sample's sub-directory
        inside the loop and still refers to the last one on return.
        """

        assert self.min_size <= self.max_size
        mkdirs(self.outdir)
        self.resultsOnSamples = OrderedDict()
        # remember the top-level directory: self.outdir is overwritten below
        outdir = self.outdir
        # iterate through each sample; ``items`` replaces ``iteritems``,
        # which was removed in pandas 2.0, and yields the same pairs
        for name, ser in df.items():
            self.outdir = os.path.join(outdir, str(name))
            # pass ``name`` as a logging argument so that tuple labels
            # (e.g. MultiIndex columns) do not break the %-formatting
            self._logger.info("Run Sample: %s ", name)
            mkdirs(self.outdir)
            # sort ranking values from high to low or reverse
            dat2 = ser.sort_values(ascending=self.ascending)

            # compute ES, NES, pval, FDR, RES
            gsea_results, hit_ind, rank_ES, subsets = gsea_compute(
                data=dat2,
                n=self.permutation_num,
                gmt=gmt,
                weighted_score_type=self.weighted_score_type,
                permutation_type='gene_set',
                method=None,
                pheno_pos='',
                pheno_neg='',
                classes=None,
                ascending=self.ascending,
                processes=self._processes,
                seed=self.seed,
                single=True,
                scale=self.scale)

            # write file
            res_zip = zip(subsets, list(gsea_results), hit_ind, rank_ES)
            self._save_results(zipdata=res_zip,
                               outdir=self.outdir,
                               module=self.module,
                               gmt=gmt,
                               rank_metric=dat2,
                               permutation_type="gene_sets")
            # self.res2d is read right after _save_results for this sample
            self.resultsOnSamples[name] = self.res2d.es
            # plotting
            if self._noplot:
                continue
            self._logger.info("Plotting Sample: %s \n", name)
            self._plotting(rank_metric=dat2,
                           results=self.results,
                           res2d=self.res2d,
                           graph_num=self.graph_num,
                           outdir=self.outdir,
                           figsize=self.figsize,
                           format=self.format,
                           module=self.module)

        # save es, nes to file
        self._save(outdir)

        return
Exemple #2
0
    def run(self):
        """Run the GSEA prerank workflow.

        Creates the output directory and the logger, builds the ranking
        from ``self.rnk``, parses the gene sets against the ranked genes,
        computes the enrichment statistics with gene-set permutation,
        then saves the results and draws the figures.
        """

        assert self.min_size <= self.max_size
        mkdirs(self.outdir)

        # INFO when verbose, WARNING otherwise
        if self.verbose:
            level = logging.INFO
        else:
            level = logging.WARNING
        logger = self._log_init(module=self.module, log_level=level)

        ranking = self._rank_metric(self.rnk)
        assert len(ranking) > 1

        # cpu numbers
        self._set_cores()

        # start analysis
        logger.info("Parsing data files for GSEA.............................")

        # filter the gene sets and build the gene sets dictionary
        ranked_genes = ranking['gene_name'].values
        gene_sets = gsea_gmt_parser(
            self.gene_sets,
            min_size=self.min_size,
            max_size=self.max_size,
            gene_list=ranked_genes,
        )
        used_msg = "%04d gene_sets used for further statistical testing....." % len(gene_sets)
        logger.info(used_msg)

        logger.info("Start to run GSEA...Might take a while..................")
        # compute ES, NES, pval, FDR, RES
        computed = gsea_compute(
            data=ranking,
            n=self.permutation_num,
            gmt=gene_sets,
            weighted_score_type=self.weighted_score_type,
            permutation_type='gene_set',
            method=None,
            phenoPos=self.pheno_pos,
            phenoNeg=self.pheno_neg,
            classes=None,
            ascending=self.ascending,
            seed=self.seed,
            processes=self._processes,
            prerank=True,
        )
        enrich_stats, hit_indices, running_es, set_names = computed

        logger.info("Start to generate gseapy reports, and produce figures...")
        packed = zip(set_names, list(enrich_stats), hit_indices, running_es)

        self._save_results(
            zipdata=packed,
            outdir=self.outdir,
            module=self.module,
            gmt=gene_sets,
            rank_metric=ranking,
            permutation_type="gene_sets",
        )

        # plotting
        self._plotting(
            rank_metric=ranking,
            results=self.results,
            res2d=self.res2d,
            graph_num=self.graph_num,
            outdir=self.outdir,
            figsize=self.figsize,
            format=self.format,
            module=self.module,
        )

        logger.info("Congratulations. GSEApy run successfully................")
Exemple #3
0
    def run(self):
        """Run the GSEA prerank workflow.

        Loads the ranking from ``self.rnk``, loads the gene sets for the
        ranked genes, computes the enrichment statistics with gene-set
        permutation, saves the results and, unless ``self._noplot`` is
        set, draws the figures. When ``self._outdir`` is None the
        temporary directory held in ``self._tmpdir`` is cleaned up at the
        end.
        """

        assert self.min_size <= self.max_size

        # parse the rankings
        ranking = self._load_ranking(self.rnk)
        assert len(ranking) > 1

        # cpu numbers
        self._set_cores()

        # start analysis
        self._logger.info("Parsing data files for GSEA.............................")
        # filter the gene sets and build the gene sets dictionary
        gene_sets = self.load_gmt(gene_list=ranking.index.values,
                                  gmt=self.gene_sets)

        used_msg = "%04d gene_sets used for further statistical testing....." % len(gene_sets)
        self._logger.info(used_msg)
        self._logger.info("Start to run GSEA...Might take a while..................")

        # compute ES, NES, pval, FDR, RES
        computed = gsea_compute(
            data=ranking,
            n=self.permutation_num,
            gmt=gene_sets,
            weighted_score_type=self.weighted_score_type,
            permutation_type='gene_set',
            method=None,
            pheno_pos=self.pheno_pos,
            pheno_neg=self.pheno_neg,
            classes=None,
            ascending=self.ascending,
            processes=self._processes,
            seed=self.seed,
        )
        enrich_stats, hit_indices, running_es, set_names = computed

        self._logger.info("Start to generate gseapy reports, and produce figures...")
        packed = zip(set_names, list(enrich_stats), hit_indices, running_es)
        self._save_results(
            zipdata=packed,
            outdir=self.outdir,
            module=self.module,
            gmt=gene_sets,
            rank_metric=ranking,
            permutation_type="gene_sets",
        )

        # plotting is skipped when _noplot is set
        if not self._noplot:
            self._plotting(
                rank_metric=ranking,
                results=self.results,
                res2d=self.res2d,
                graph_num=self.graph_num,
                outdir=self.outdir,
                figsize=self.figsize,
                format=self.format,
                module=self.module,
                phenoPos=self.pheno_pos,
                phenoNeg=self.pheno_neg,
            )

        self._logger.info("Congratulations. GSEApy run successfully................\n")
        # no output directory was given: clean up the temporary one
        if self._outdir is None:
            self._tmpdir.cleanup()
Exemple #4
0
    def run(self):
        """GSEA main procedure.

        Reads the expression data from ``self.data`` (a pandas DataFrame,
        which is copied, or a path to a table file), parses the phenotype
        labels from ``self.classes``, computes the ranking metric, filters
        the gene sets, computes the enrichment statistics, then saves the
        results and draws the figures.

        :return: None.
        :raises AssertionError: if ``self.permutation_type`` is not
            ``"phenotype"`` or ``"gene_set"``, if
            ``self.min_size > self.max_size``, or if the expression table
            has fewer than two rows.
        :raises Exception: if ``self.data`` is neither a DataFrame nor the
            path of an existing file.
        """

        assert self.permutation_type in ["phenotype", "gene_set"]
        assert self.min_size <= self.max_size

        if isinstance(self.data, pd.DataFrame):
            # work on a copy so the caller's DataFrame is left untouched
            df = self.data.copy()
        elif os.path.isfile(self.data):
            df = pd.read_table(self.data, comment='#')
        else:
            # the raise ends the method; the former sys.exit(1) after it
            # could never run
            raise Exception('Error parsing gene expression dataframe!')
        # data frame must have length > 1
        assert len(df) > 1
        # create output dirs
        mkdirs(self.outdir)
        logger = self._log_init(
            module=self.module,
            log_level=logging.INFO if self.verbose else logging.WARNING)
        # start analysis
        logger.info("Parsing data files for GSEA.............................")

        # phenotype labels parsing
        phenoPos, phenoNeg, cls_vector = gsea_cls_parser(self.classes)
        # select correct expression genes and values
        dat = self.__drop_dat(df, cls_vector)
        # ranking metrics calculation
        dat2 = ranking_metric(df=dat,
                              method=self.method,
                              phenoPos=phenoPos,
                              phenoNeg=phenoNeg,
                              classes=cls_vector,
                              ascending=self.ascending)

        # filtering out gene sets and build gene sets dictionary
        gmt = gsea_gmt_parser(self.gene_sets,
                              min_size=self.min_size,
                              max_size=self.max_size,
                              gene_list=dat2['gene_name'].values)

        logger.info(
            "%04d gene_sets used for further statistical testing....." %
            len(gmt))
        logger.info("Start to run GSEA...Might take a while..................")
        # cpu numbers
        self._set_cores()
        # compute ES, NES, pval, FDR, RES; the expression data ``dat`` is
        # passed here, not the ranking ``dat2``
        gsea_results, hit_ind, rank_ES, subsets = gsea_compute(
            data=dat,
            n=self.permutation_num,
            gmt=gmt,
            weighted_score_type=self.weighted_score_type,
            permutation_type=self.permutation_type,
            method=self.method,
            phenoPos=phenoPos,
            phenoNeg=phenoNeg,
            classes=cls_vector,
            ascending=self.ascending,
            seed=self.seed,
            processes=self._processes)

        logger.info("Start to generate gseapy reports, and produce figures...")
        res_zip = zip(subsets, list(gsea_results), hit_ind, rank_ES)
        # NOTE(review): the literal "gene_sets" is passed here whatever
        # self.permutation_type is - confirm against _save_results
        self._save_results(zipdata=res_zip,
                           outdir=self.outdir,
                           module=self.module,
                           gmt=gmt,
                           rank_metric=dat2,
                           permutation_type="gene_sets")

        # plotting: select the expression rows by the ranking's gene names
        heat_dat = dat.loc[dat2.gene_name]
        self._plotting(rank_metric=dat2,
                       results=self.results,
                       res2d=self.res2d,
                       graph_num=self.graph_num,
                       outdir=self.outdir,
                       figsize=self.figsize,
                       format=self.format,
                       module=self.module,
                       data=heat_dat,
                       classes=cls_vector,
                       phenoPos=phenoPos,
                       phenoNeg=phenoNeg)

        logger.info("Congratulations. GSEApy run successfully................")

        return