Esempio n. 1
0
def query_single_gset(query, all_genes=[], gene_sets=[]):
    """Return the best enrichment hit of ``query`` among ``gene_sets``.

    ``calc_pvalues`` is run for ``query`` against ``gene_sets`` with
    ``all_genes`` as background, the resulting p-values are corrected with
    ``multiple_testing_correction`` (called with its default settings;
    Benjamini-Hochberg according to the original author's note), and the
    gene set with the smallest corrected value is selected.

    Args:
        query: query genes, forwarded unchanged to ``calc_pvalues``.
        all_genes: background, forwarded as ``background=``. The default
            list is never mutated by this function.
        gene_sets: gene-set collection, forwarded unchanged.

    Returns:
        A ``(score, set_name)`` pair where ``score`` is ``-log10`` of the
        smallest corrected value and ``set_name`` is the matching entry of
        the returned set names. ``(0, None)`` when ``calc_pvalues`` yields
        nothing.
    """
    # calc_pvalues may hand back a lazy iterator (e.g. a zip object), which
    # has no len(); materialise it before testing for emptiness.
    res = list(calc_pvalues(query, gene_sets, background=all_genes))
    if not res:
        return 0, None

    # Only the names and p-values are needed here.
    set_names, p_vals, _overlap_size, _gset_size, _overlapped_genes = res
    q_vals, _rejs = multiple_testing_correction(p_vals)
    best_hit_ndx = np.argmin(q_vals)
    return -np.log10(q_vals[best_hit_ndx]), set_names[best_hit_ndx]
Esempio n. 2
0
    def enrich(self, gmt):
        """Run the enrichment test locally and return a results table.

        p = p-value computed using the Fisher exact test (Hypergeometric test)

        Not implemented here:

            combine score = log(p)·z

        see here: http://amp.pharm.mssm.edu/Enrichr/help#background&q=4

        The background is resolved from ``self.background`` and stored in
        ``self._bg``:

        * an ``int`` or an all-digit string is converted with ``int()``;
        * any other string triggers ``self.get_background()``;
        * any other scalar raises ``Exception``;
        * a non-scalar is converted with ``set()``.

        :param gmt: gene sets, forwarded to ``calc_pvalues`` as ``gene_sets``.
        :returns: a ``pandas.DataFrame`` with the columns ``Term``,
            ``Overlap``, ``P-value``, ``Adjusted P-value`` and ``Genes``,
            or ``None`` when ``calc_pvalues`` yields nothing.
        :raises Exception: if ``self.background`` is a scalar that is
            neither an integer nor a string.
        """
        background = self.background
        if isscalar(background):
            # Only text has .isdigit(); guarding the call lets other scalars
            # (e.g. a float) reach the explicit "unsupported" error below
            # instead of failing with AttributeError.
            is_digit_text = (isinstance(background, (str, bytes))
                             and background.isdigit())
            if isinstance(background, int) or is_digit_text:
                self._bg = int(background)
            elif isinstance(background, str):
                self._bg = self.get_background()
                self._logger.info("Background: found %s genes" %
                                  (len(self._bg)))
            else:
                raise Exception("Unsupported background data type")
        else:
            # handle array object: nd.array, list, tuple, set, Series
            try:
                self._bg = set(background)
            except TypeError:
                # NOTE(review): execution continues with whatever self._bg
                # already holds (if anything); confirm this is intended.
                self._logger.error("Unsupported background data type")

        # statistical testing
        hgtest = list(
            calc_pvalues(query=self._gls, gene_sets=gmt, background=self._bg))
        if not hgtest:
            return None

        terms, pvals, olsz, gsetsz, genes = hgtest
        # The rejection flags are not part of the output table.
        fdrs, _rej = multiple_testing_correction(
            ps=pvals, alpha=self.cutoff, method='benjamini-hochberg')

        # save to a dataframe
        odict = OrderedDict()
        odict['Term'] = terms
        odict['Overlap'] = ["%s/%s" % (h, g) for h, g in zip(olsz, gsetsz)]
        odict['P-value'] = pvals
        odict['Adjusted P-value'] = fdrs
        odict['Genes'] = [";".join(g) for g in genes]
        return pd.DataFrame(odict)
Esempio n. 3
0
    def enrich(self, gmt):
        """Run the enrichment test locally and return a results table.

        p = p-value computed using the Fisher exact test (Hypergeometric test)

        Not implemented here:

            combine score = log(p)·z

        see here: http://amp.pharm.mssm.edu/Enrichr/help#background&q=4

        When ``self.background`` is a string, the background gene set is
        rebuilt from ``self.get_background()`` and stored in ``self._bg``.
        Otherwise ``self._bg`` is used as it is; it must have been set
        before this call.

        :param gmt: gene sets, forwarded to ``calc_pvalues`` as ``gene_sets``.
        :returns: a ``pandas.DataFrame`` with the columns ``Term``,
            ``Overlap``, ``P-value``, ``Adjusted P-value`` and ``Genes``,
            or ``None`` when ``calc_pvalues`` yields nothing.
        """
        if isinstance(self.background, str):
            df = self.get_background()
            # input id type: entrez or gene_name
            if self._isezid:
                bg = df['entrezgene'].astype(int)
            else:
                bg = df['external_gene_name']

            self._bg = set(bg.unique())
            # Report how many genes were found; interpolating the set itself
            # would dump every gene identifier into the log.
            self._logger.warning("Background: %s %s genes with GO_IDs. " %
                                 (len(self._bg), self.background))
            self._logger.warning(
                "If this is not you wanted, please give a number to background argument"
            )

        hgtest = list(
            calc_pvalues(query=self._gls, gene_sets=gmt, background=self._bg))
        if not hgtest:
            return None

        terms, pvals, olsz, gsetsz, genes = hgtest
        # The rejection flags are not part of the output table.
        fdrs, _rej = multiple_testing_correction(
            ps=pvals, alpha=self.cutoff, method='benjamini-hochberg')

        # save to a dataframe
        odict = OrderedDict()
        odict['Term'] = terms
        odict['Overlap'] = ["%s/%s" % (h, g) for h, g in zip(olsz, gsetsz)]
        odict['P-value'] = pvals
        odict['Adjusted P-value'] = fdrs
        odict['Genes'] = [";".join(g) for g in genes]
        return pd.DataFrame(odict)
Esempio n. 4
0
def query_single_gset(query, all_genes=[], gene_sets=[]):
    """Return the significant gene set with the largest fold change.

    ``calc_pvalues`` is run for ``query`` against ``gene_sets`` with
    ``all_genes`` as background and the p-values are corrected with
    ``multiple_testing_correction`` (called with its default settings;
    Benjamini-Hochberg according to the original author's note). Among the
    sets whose corrected value is below 0.05, the one with the highest
    observed/expected overlap ratio is selected, where the expected overlap
    is ``gset_size * len(query) / len(all_genes)``.

    Args:
        query: query genes, forwarded unchanged to ``calc_pvalues``; its
            length enters the expected-overlap formula.
        all_genes: background, forwarded as ``background=``; its length
            enters the expected-overlap formula. The default list is never
            mutated by this function.
        gene_sets: gene-set collection, forwarded unchanged.

    Returns:
        ``(score, set_name, log_fc)`` where ``score`` is ``-log10`` of the
        corrected value of the selected set and ``log_fc`` is ``log10`` of
        its fold change. ``(0, None, 0)`` when ``calc_pvalues`` yields
        nothing or no set has a positive fold change below the 0.05 cutoff.
    """
    # calc_pvalues may hand back a lazy iterator (e.g. a zip object), which
    # has no len(); materialise it before testing for emptiness.
    res = list(calc_pvalues(query, gene_sets, background=all_genes))
    if not res:
        return 0, None, 0

    set_names, p_vals, overlap_size, gset_size, _overlapped_genes = res
    q_vals, _rejs = multiple_testing_correction(p_vals)

    best_fc = 0
    best_hit_ndx = -1
    for i, (q_val, overlap, set_size) in enumerate(
            zip(q_vals, overlap_size, gset_size)):
        if q_val < 0.05:
            obs = float(overlap)
            # NOTE(review): raises ZeroDivisionError when the expected
            # overlap is 0 (e.g. an empty all_genes); confirm callers
            # always pass a non-empty background.
            expec = float(set_size) * len(query) / len(all_genes)
            fc = obs / expec
            if fc > best_fc:
                best_fc = fc
                best_hit_ndx = i

    if best_hit_ndx == -1:
        return 0, None, 0
    return (-np.log10(q_vals[best_hit_ndx]), set_names[best_hit_ndx],
            np.log10(best_fc))