Python enrichment_score Examples, gseapy.algorithm.enrichment_score Python Examples

Example #1

0

Show file

    def run(self):
        """Re-draw the plots for every term stored in a GSEA output folder.

        Looks under ``<self.indir>*/edb/`` for ``results.edb``, a ``.rnk``
        file and ``gene_sets.gmt``; if any of them is missing, an error is
        logged and the process exits with status 1.  For each ``DTG`` entry
        in ``results.edb`` the statistics are read with ``gsea_edb_parser``,
        element ``[2]`` of the ``enrichment_score`` result is taken as
        ``RES`` and everything is handed to ``gsea_plot``.

        Side effects: sets ``self.gene_sets`` and creates ``self.outdir``
        (now also when the input files cannot be found, because the logger
        must exist before the lookup can report a failure).
        """
        assert self.min_size <= self.max_size

        import glob
        from bs4 import BeautifulSoup

        # Set up the logger BEFORE the file lookup.  ``logger`` is a local
        # name in this function, so using it in the ``except`` branch below
        # before it was assigned raised UnboundLocalError instead of
        # reporting the missing files.  ``mkdirs`` keeps its original
        # position ahead of ``_log_init``; whether the logger needs the
        # directory is not visible from here.
        mkdirs(self.outdir)
        logger = self._log_init(
            module=self.module,
            log_level=logging.INFO if self.verbose else logging.WARNING)

        # locate the GSEA output files
        try:
            results_path = glob.glob(self.indir + '*/edb/results.edb')[0]
            rank_path = glob.glob(self.indir + '*/edb/*.rnk')[0]
            gene_set_path = glob.glob(self.indir + '*/edb/gene_sets.gmt')[0]
        except IndexError as e:
            logger.debug(e)
            logger.error("Could not locate GSEA files in the given directory!")
            sys.exit(1)

        # extract sample names from the .cls file, if there is one
        cls_path = glob.glob(self.indir + '*/edb/*.cls')
        if cls_path:
            phenoPos, phenoNeg, _ = gsea_cls_parser(cls_path[0])
        else:
            # logic for prerank results: no phenotype labels available
            phenoPos, phenoNeg = '', ''

        # start replotting
        self.gene_sets = gene_set_path

        # obtain gene sets
        gene_set_dict = gsea_gmt_parser(gene_set_path,
                                        min_size=self.min_size,
                                        max_size=self.max_size)
        # obtain rank metrics
        rank_metric = self._rank_metric(rank_path)
        correl_vector = rank_metric['rank'].values
        gene_list = rank_metric['gene_name']

        # Count the enrichment terms in results.edb.  The file is read inside
        # a ``with`` block so the handle is closed once the document has been
        # parsed instead of being leaked.
        with open(results_path) as edb_handle:
            database = BeautifulSoup(edb_handle, features='xml')
        length = len(database.find_all('DTG'))

        for idx in range(length):
            # extract statistical results for this term from results.edb
            enrich_term, hit_ind, nes, pval, fdr = gsea_edb_parser(
                results_path, index=idx)
            gene_set = gene_set_dict.get(enrich_term)
            # calculate enrichment score
            RES = enrichment_score(
                gene_list=gene_list,
                gene_set=gene_set,
                weighted_score_type=self.weighted_score_type,
                correl_vector=correl_vector)[2]
            # plotting
            gsea_plot(rank_metric, enrich_term, hit_ind, nes, pval, fdr, RES,
                      phenoPos, phenoNeg, self.figsize, self.format,
                      self.outdir, self.module)

        logger.info(
            "Congratulations! Your plots have been reproduced successfully!")

Example #2

0

Show file

    def run(self):
        """Re-draw the plots for the first ``self.fignum`` terms of a GSEA run.

        Looks under ``<self.indir>*/edb/`` for ``results.edb``, a ``.rnk``
        file and ``gene_sets.gmt``; if any of them is missing, a message is
        written to stderr and the process exits with status 1.  At most
        ``self.fignum`` ``DTG`` entries of ``results.edb`` are processed: for
        each one the statistics are read with ``gsea_edb_parser``, the last
        element of the ``enrichment_score`` result is taken as ``RES`` and
        everything is handed to ``gsea_plot``.

        Side effect: sets ``self.gene_sets`` to the located ``.gmt`` path.
        """
        assert self.min_size <= self.max_size
        assert self.fignum > 0
        import glob
        from bs4 import BeautifulSoup

        # locate the GSEA output files
        try:
            results_path = glob.glob(self.indir + '*/edb/results.edb')[0]
            rank_path = glob.glob(self.indir + '*/edb/*.rnk')[0]
            gene_set_path = glob.glob(self.indir + '*/edb/gene_sets.gmt')[0]
        except IndexError:
            # Trailing newline added so the message does not run into
            # whatever the terminal prints next.
            sys.stderr.write(
                "Could not locate GSEA files in the given directory!\n")
            sys.exit(1)

        # extract sample names from the .cls file, if there is one
        cls_path = glob.glob(self.indir + '*/edb/*.cls')
        if cls_path:
            phenoPos, phenoNeg, _ = gsea_cls_parser(cls_path[0])
        else:
            # logic for prerank results: no phenotype labels available
            phenoPos, phenoNeg = '', ''

        # start replotting
        self.gene_sets = gene_set_path
        # obtain gene sets
        gene_set_dict = self.parse_gmt(gmt=gene_set_path)
        # obtain rank metrics
        rank_metric = self._load_ranking(rank_path)
        correl_vector = rank_metric.values
        gene_list = rank_metric.index.values

        # Count the enrichment terms in results.edb.  The file is read inside
        # a ``with`` block so the handle is closed once the document has been
        # parsed instead of being leaked.
        with open(results_path) as edb_handle:
            database = BeautifulSoup(edb_handle, features='xml')
        length = len(database.find_all('DTG'))

        # never ask for more figures than there are terms
        fig_num = min(self.fignum, length)
        for idx in range(fig_num):
            # extract statistical results for this term from results.edb
            enrich_term, hit_ind, nes, pval, fdr = gsea_edb_parser(
                results_path, index=idx)
            gene_set = gene_set_dict.get(enrich_term)
            # calculate enrichment score (nperm=0 as in the original call)
            RES = enrichment_score(
                gene_list=gene_list,
                correl_vector=correl_vector,
                gene_set=gene_set,
                weighted_score_type=self.weighted_score_type,
                nperm=0)[-1]
            # plotting
            gsea_plot(rank_metric, enrich_term, hit_ind, nes, pval, fdr, RES,
                      phenoPos, phenoNeg, self.figsize, self.format,
                      self.outdir, self.module)

        self._logger.info(
            "Congratulations! Your plots have been reproduced successfully!\n")

Example #3

0

Show file

    def run(self):
        """Replot entry point: redraw the terms of an existing GSEA run.

        The files ``results.edb``, ``*.rnk`` and ``gene_sets.gmt`` are looked
        up under ``<self.indir>*/edb/``; a generic ``Exception`` is raised
        when one of them is missing.  Every term returned by
        ``gsea_edb_parser`` whose p-value is not above 0.1 is passed to
        ``gseaplot`` together with the last element of the
        ``enrichment_score`` result, and the figure is written to
        ``<outdir>/<term>.<module>.<format>`` (with ``/`` and ``:`` in the
        term replaced by ``_``).

        Side effect: sets ``self.gene_sets`` to the located ``.gmt`` path.
        """
        assert self.min_size <= self.max_size

        # locate the GSEA output files; all of them live in the edb folder
        edb_prefix = self.indir + '*/edb/'
        try:
            results_path, rank_path, gene_set_path = [
                glob.glob(edb_prefix + pattern)[0]
                for pattern in ('results.edb', '*.rnk', 'gene_sets.gmt')
            ]
        except IndexError:
            raise Exception("Could not locate GSEA files in the given directory!")

        # phenotype labels come from the .cls file when one is present
        cls_matches = glob.glob(edb_prefix + '*.cls')
        if not cls_matches:
            # prerank results carry no class file
            pos, neg = '', ''
        else:
            pos, neg, _ = gsea_cls_parser(cls_matches[0])

        # remember which gene set file was used, then load the inputs
        self.gene_sets = gene_set_path
        gene_set_dict = self.parse_gmt(gmt=gene_set_path)
        rank_metric = self._load_ranking(rank_path)
        correl_vector = rank_metric.values
        gene_list = rank_metric.index.values

        # walk over every enrichment term stored in results.edb
        database = gsea_edb_parser(results_path)
        for enrich_term, (hit_ind, nes, pval, fdr) in database.items():
            gene_set = gene_set_dict.get(enrich_term)
            # only terms with a p-value of at most 0.1 are drawn
            if float(pval) > 0.1:
                continue

            # recompute the scores; the last returned element is plotted
            scores = enrichment_score(
                gene_list=gene_list,
                correl_vector=correl_vector,
                gene_set=gene_set,
                weighted_score_type=self.weighted_score_type,
                nperm=0,
            )
            RES = scores[-1]

            # build a file-system friendly output name for this term
            term = enrich_term.replace('/', '_')
            term = term.replace(":", "_")
            outfile = '{0}/{1}.{2}.{3}'.format(
                self.outdir, term, self.module, self.format)

            gseaplot(
                rank_metric=rank_metric,
                term=enrich_term,
                hit_indices=hit_ind,
                nes=nes,
                pval=pval,
                fdr=fdr,
                RES=RES,
                pheno_pos=pos,
                pheno_neg=neg,
                figsize=self.figsize,
                ofname=outfile,
            )

        self._logger.info("Congratulations! Your plots have been reproduced successfully!\n")

Example #4

0

Show file

def gsea_signed_single(sig, values, midpoint=0, weighted_score_type=0):
    """
    Run a signed GSEA for one signature.

    Every gene whose entry in ``sig.values`` is negative (and which occurs in
    ``values``) has its value mirrored around ``midpoint``.  The adjusted
    values are sorted in descending order and passed to ``enrichment_score``
    with ``sig.genes`` as the gene set, 1000 permutations and a
    ``RandomState`` seeded with 1028.

    Returns the tuple ``(es, ind, RES, esnull, LE_genes)``.  ``LE_genes``
    lists the hit genes up to the maximum of ``RES`` when ``es > 0``, or from
    the end back to the minimum of ``RES`` otherwise; genes with a negative
    entry in ``sig.values`` are wrapped in parentheses.
    """

    assert values.index.is_unique, "Index for values must be unique"

    signs = sig.values

    # mirror the values of negatively signed genes around the midpoint
    mirrored = values.copy()
    for name in signs.index:
        if not (signs[name] < 0 and name in mirrored.index):
            continue
        mirrored[name] = (mirrored[name] - midpoint) * -1 + midpoint

    # convert the signed signature into the inputs enrichment_score takes
    mirrored = mirrored.sort_values(ascending=False)
    gene_list = mirrored.index.tolist()

    # gsea, with a fixed seed for the permutations
    es, esnull, ind, RES = enrichment_score(
        gene_list,
        gene_set=list(sig.genes),
        correl_vector=mirrored.values,
        weighted_score_type=weighted_score_type,
        nperm=1000,
        rs=np.random.RandomState(1028))

    # Collect the leading-edge genes.  For a positive score the hits are
    # walked forwards up to the maximum of RES; otherwise they are walked
    # backwards down to the minimum of RES.
    positive = es > 0
    if positive:
        extreme = np.argmax(RES)
        hits = ind
    else:
        extreme = np.argmin(RES)
        hits = ind[::-1]

    LE_genes = []
    for position in hits:
        passed_extreme = position > extreme if positive else position < extreme
        if passed_extreme:
            break
        label = gene_list[position]
        if signs[label] < 0:
            # parentheses mark genes with a negative sign in the signature
            label = "(" + label + ")"
        LE_genes.append(label)

    return es, ind, RES, esnull, LE_genes