Ejemplo n.º 1
0
    def run(self):
        """Run the single sample GSEA workflow on ``self.data``.

        Ranks the input through ``self._rank_metric``, filters the gene sets
        with ``gsea_gmt_parser``, computes the enrichment statistics with
        ``gsea_compute_ss`` and finally hands the outcome to
        ``self._save_results`` and ``self._plotting``. Returns ``None``.
        """
        # The gene-set size bounds must form a valid range.
        assert self.min_size <= self.max_size

        mkdirs(self.outdir)
        if self.verbose:
            level = logging.INFO
        else:
            level = logging.WARNING
        logger = self._log_init(module=self.module, log_level=level)

        ranking = self._rank_metric(self.data)
        assert len(ranking) > 1

        # Configure the number of worker processes.
        self._set_cores()

        logger.info("Parsing data files for GSEA.............................")
        # Index the ranking by gene name; the 'rank2' column is not passed on
        # to the computation step.
        scores = ranking.set_index('gene_name')
        del scores['rank2']

        # Filter the gene sets by size against the genes present in the ranking.
        gene_sets = gsea_gmt_parser(
            self.gene_sets,
            min_size=self.min_size,
            max_size=self.max_size,
            gene_list=scores.index.values,
        )
        logger.info("%04d gene_sets used for further statistical testing....." % len(gene_sets))

        logger.info("Start to run GSEA...Might take a while..................")
        computed = gsea_compute_ss(
            data=scores,
            n=self.permutation_num,
            gmt=gene_sets,
            weighted_score_type=self.weighted_score_type,
            seed=self.seed,
            processes=self._processes,
        )
        gsea_results, hit_ind, rank_ES, subsets = computed

        logger.info("Start to generate gseapy reports, and produce figures...")
        zipped = zip(subsets, list(gsea_results), hit_ind, rank_ES)
        self._save_results(
            zipdata=zipped,
            outdir=self.outdir,
            module=self.module,
            gmt=gene_sets,
            rank_metric=ranking,
            permutation_type="gene_sets",
        )

        # Plotting
        self._plotting(
            rank_metric=ranking,
            results=self.results,
            res2d=self.res2d,
            graph_num=self.graph_num,
            outdir=self.outdir,
            figsize=self.figsize,
            format=self.format,
            module=self.module,
        )

        logger.info("Congratulations. GSEApy run successfully................")
Ejemplo n.º 2
0
    def runSample(self, df, gmt=None):
        """Single Sample GSEA workflow for an in-memory gene ranking.

        Parameters
        ----------
        df : pandas.Series or pandas.DataFrame
            Gene ranking values. A Series, or a DataFrame with exactly one
            column, has its index moved into the first column and is then
            sorted by the value column (direction given by
            ``self.ascending``) and relabelled to ``gene_name`` / ``rank``
            with an extra ``rank2`` copy of ``rank``. A DataFrame with any
            other number of columns is used unchanged, so it must already
            provide the ``gene_name`` and ``rank2`` columns.
        gmt : optional
            Gene sets handed to ``gsea_compute_ss``. When ``None`` they are
            built from ``self.gene_sets`` with ``gsea_gmt_parser``.

        Raises
        ------
        Exception
            If ``df`` is neither a DataFrame nor a Series.
        """
        assert self.min_size <= self.max_size

        mkdirs(self.outdir)
        self._logger.info(
            "Parsing data files for GSEA.............................")

        # Bring the input into the 'gene_name' / 'rank' / 'rank2' layout.
        if isinstance(df, pd.DataFrame):
            # A single-column frame keeps its gene names in the index and
            # needs the same normalisation as a Series.
            needs_ranking = df.shape[1] == 1
            if needs_ranking:
                df = df.reset_index()
        elif isinstance(df, pd.Series):
            needs_ranking = True
            df = df.reset_index()
        else:
            raise Exception('Error parsing gene ranking values!')

        if needs_ranking:
            # Sort ranking values from high to low, or the reverse, and
            # return a new frame so the caller's object is never modified.
            df = df.sort_values(by=df.columns[1], ascending=self.ascending)
            df.columns = ['gene_name', 'rank']
            df['rank2'] = df['rank']

        # Index by gene name; 'rank2' is not passed to the computation step.
        dat2 = df.set_index('gene_name')
        del dat2['rank2']

        # Configure the number of worker processes.
        self._set_cores()

        # Filter the gene sets and build the gene set dictionary unless the
        # caller supplied one.
        if gmt is None:
            gmt = gsea_gmt_parser(self.gene_sets,
                                  min_size=self.min_size,
                                  max_size=self.max_size,
                                  gene_list=dat2.index.values)
        self._logger.info(
            "%04d gene_sets used for further statistical testing.....",
            len(gmt))
        self._logger.info(
            "Start to run GSEA...Might take a while..................")

        # Compute ES, NES, pval, FDR, RES.
        gsea_results, hit_ind, rank_ES, subsets = gsea_compute_ss(
            data=dat2,
            n=self.permutation_num,
            gmt=gmt,
            weighted_score_type=self.weighted_score_type,
            seed=self.seed,
            processes=self._processes)

        self._logger.info(
            "Start to generate gseapy reports, and produce figures...")
        res_zip = zip(subsets, list(gsea_results), hit_ind, rank_ES)

        self._save_results(zipdata=res_zip,
                           outdir=self.outdir,
                           module=self.module,
                           gmt=gmt,
                           rank_metric=df,
                           permutation_type="gene_sets")

        # Plotting
        self._plotting(rank_metric=df,
                       results=self.results,
                       res2d=self.res2d,
                       graph_num=self.graph_num,
                       outdir=self.outdir,
                       figsize=self.figsize,
                       format=self.format,
                       module=self.module)

        self._logger.info(
            "Congratulations. GSEApy run successfully................")