Ejemplo n.º 1
0
    def _get_pval_uncorr(self, study, log=sys.stdout):
        """Calculate the uncorrected p-values for study items.

        A Fisher's exact test is run for every term that appears in either
        the study or the population, so a term missing from one side is
        still reported (with an empty item set and a count of zero).

        Args:
            study: collection of study items; its length is the study size.
            log: object with a ``write`` method that receives a one-line
                progress message.

        Returns:
            A list with one GOEnrichmentRecord per term, carrying the
            two-tailed p-value, the matching study/population items and the
            ``(count, total)`` ratios for study and population.
        """
        log.write(
            "Calculating uncorrected p-values using Fisher's exact test\n")
        go2studyitems = get_terms(study, self.assoc, self.obo_dag)
        pop_n, study_n = self.pop_n, len(study)
        # Union of the keys of both mappings.  Concatenating the two
        # ``keys()`` results only works on Python 2 (where they are lists);
        # on Python 3 they are views and ``+`` raises TypeError.
        allterms = set(go2studyitems).union(self.go2popitems)

        results = []
        for term in allterms:
            # A term absent from one mapping contributes an empty set.
            study_items = go2studyitems.get(term, set())
            study_count = len(study_items)
            pop_items = self.go2popitems.get(term, set())
            pop_count = len(pop_items)
            p = fisher.pvalue_population(study_count, study_n, pop_count,
                                         pop_n)

            results.append(GOEnrichmentRecord(
                GO=term,
                p_uncorrected=p.two_tail,
                study_items=study_items,
                pop_items=pop_items,
                ratio_in_study=(study_count, study_n),
                ratio_in_pop=(pop_count, pop_n)))

        return results
Ejemplo n.º 2
0
def calc_qval(study_n, pop_n, pop, assoc, term_pop, obo_dag, T=500):
    """Build an empirical distribution of minimum p-values by resampling.

    ``T`` random studies of ``study_n`` items are drawn from ``pop``.  For
    each of them the smallest two-tailed Fisher p-value over its terms is
    recorded (starting from 1, so the recorded value never exceeds 1).
    Progress is written to stderr for every tenth sample.

    Returns the list of the ``T`` recorded minima, in sampling order.
    """
    import fisher
    from goatools.ratio import count_terms

    banner = ("Generate p-value distribution for FDR "
              "based on resampling (this might take a while)")
    print(banner, file=sys.stderr)

    min_pvalues = []
    for sample_idx in range(T):
        resampled = random.sample(pop, study_n)
        term_counts = count_terms(resampled, assoc, obo_dag)

        lowest = 1
        for go_term, hits in term_counts.items():
            outcome = fisher.pvalue_population(hits, study_n,
                                               term_pop[go_term], pop_n)
            # min() keeps ``lowest`` unless the new value is strictly smaller.
            lowest = min(lowest, outcome.two_tail)
        min_pvalues.append(lowest)

        if not sample_idx % 10:
            progress = "Sample {0} / {1}: p-value {2}".format(
                sample_idx, T, lowest)
            print(progress, file=sys.stderr)

    return min_pvalues
Ejemplo n.º 3
0
def calc_qval(study_n, pop_n,
              pop, assoc, term_pop, obo_dag, T=500):
    """Resample the population to get a null distribution of p-values.

    Draws ``T`` random studies of size ``study_n`` from ``pop`` and, for
    each, keeps the smallest two-tailed Fisher p-value seen across the
    terms of that random study (1 when no term scores below 1).  Every
    tenth sample is reported on stderr.

    Returns the list of per-sample minimum p-values.
    """
    import fisher
    from goatools.ratio import count_terms

    print(("Generate p-value distribution for FDR "
           "based on resampling (this might take a while)"), file=sys.stderr)

    def two_tail_pvalues(term_counts):
        # One two-tailed p-value per term of the resampled study.
        for go_term, n_in_study in term_counts.items():
            n_in_pop = term_pop[go_term]
            yield fisher.pvalue_population(n_in_study, study_n,
                                           n_in_pop, pop_n).two_tail

    collected = []
    for round_no in range(T):
        fake_study = random.sample(pop, study_n)
        best = 1
        for pvalue in two_tail_pvalues(
                count_terms(fake_study, assoc, obo_dag)):
            if pvalue < best:
                best = pvalue
        collected.append(best)

        if round_no % 10 == 0:
            print("Sample {0} / {1}: p-value {2}".format(round_no, T, best),
                  file=sys.stderr)
    return collected
Ejemplo n.º 4
0
    def _get_pval_uncorr(self, study, log=sys.stdout):
        """Calculate the uncorrected p-values for study items.

        Every term present in the study or in the population gets one
        Fisher's exact test; a term missing on one side is counted as zero
        items there.

        Args:
            study: collection of study items; ``len(study)`` is the study
                size used in every test.
            log: object with a ``write`` method; receives one status line.

        Returns:
            List of GOEnrichmentRecord objects, one per term, each with the
            two-tailed p-value, the item sets and the ``(count, total)``
            ratios for the study and the population.
        """
        log.write("Calculating uncorrected p-values using Fisher's exact test\n")
        go2studyitems = get_terms(study, self.assoc, self.obo_dag)
        pop_n, study_n = self.pop_n, len(study)
        # ``keys() + keys()`` is Python 2 only (dict views cannot be added on
        # Python 3); a set union gives the same terms on both versions.
        allterms = set(go2studyitems) | set(self.go2popitems)

        results = []
        for term in allterms:
            study_items = go2studyitems.get(term, set())
            study_count = len(study_items)
            pop_items = self.go2popitems.get(term, set())
            pop_count = len(pop_items)
            p = fisher.pvalue_population(study_count, study_n,
                                         pop_count, pop_n)

            one_record = GOEnrichmentRecord(
                GO=term,
                p_uncorrected=p.two_tail,
                study_items=study_items,
                pop_items=pop_items,
                ratio_in_study=(study_count, study_n),
                ratio_in_pop=(pop_count, pop_n))

            results.append(one_record)

        return results
Ejemplo n.º 5
0
def calc_qval(study_count,
              study_n,
              pop_count,
              pop_n,
              pop,
              assoc,
              term_pop,
              obo_dag,
              T=500):
    """Return ``T`` resampled minimum p-values for an empirical FDR.

    For each of ``T`` rounds a random study of ``study_n`` items is drawn
    from ``pop`` and the smallest two-tailed Fisher p-value over its terms
    is stored (1 if no term scores below 1).

    The ``study_count`` and ``pop_count`` arguments are never read; they are
    kept only so existing callers keep working.  Nothing is printed:
    progress output was judged too noisy for notebook use.
    """
    minima = []
    for _ in range(T):
        drawn = random.sample(pop, study_n)
        counts_by_term = count_terms(drawn, assoc, obo_dag)

        lowest = 1
        for go_term, n_hits in counts_by_term.items():
            test = fisher.pvalue_population(n_hits, study_n,
                                            term_pop[go_term], pop_n)
            # Keeps ``lowest`` unless the new value is strictly smaller.
            lowest = min(lowest, test.two_tail)

        minima.append(lowest)
    return minima
Ejemplo n.º 6
0
def rank(targets, pathways, allmembers, outname):
    """Score each pathway for enrichment of targets and write a TSV file.

    For every pathway a two-tailed Fisher p-value is computed from the
    number of pathway members that are targets, and its base-10 logarithm
    is written out.  The sign is flipped (made positive) when the pathway's
    target fraction is below the overall target fraction.

    Args:
        targets: set of target identifiers.
        allmembers: set of all identifiers considered; only targets that
            are also in this set are counted.  Must be non-empty.
        pathways: iterable of ``(uid, name, members)`` tuples where
            ``members`` is a set of identifiers.
        outname: path of the tab-separated output file (overwritten).

    Each output row is: uid, pathway size, number of targets in the pathway,
    signed log score, name, comma-joined member ids, comma-joined symbols.
    """
    P = len(allmembers)
    realtargets = targets.intersection(allmembers)
    Pn = len(realtargets)
    Pr = float(Pn) / P

    # The context manager closes the file even if scoring raises midway.
    with open(outname, 'w') as out:
        for uid, name, members in pathways:
            # Drop every None id/symbol (not just the first one) so the
            # joins below cannot fail on a None entry.  The comprehension
            # variables are deliberately not called ``uid``: on Python 2 a
            # list-comprehension variable leaks and would overwrite the
            # pathway id written to the file.
            pmembers = [member
                        for member in members.intersection(realtargets)
                        if member is not None]
            symbols = (converter.handler.to_symbol(member)
                       for member in pmembers)
            pmember_names = [symbol for symbol in symbols
                             if symbol is not None]

            C = len(members)
            Cn = len(pmembers)
            score = fisher.pvalue_population(Cn, C, Pn, P).two_tail
            log_score = math.log(score, 10)
            if (float(Cn) / C) < Pr:
                # Under-represented pathway: report a positive score.
                log_score = -log_score
            out.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %
                      (uid, C, Cn, log_score, name, ','.join(pmembers),
                       ','.join(pmember_names)))
0
    def run_study(self, study):
        """Run the enrichment test for ``study`` and apply corrections.

        One GOEnrichmentRecord is appended to ``self.results`` for every
        term counted in the study, then each correction listed in
        ``self.methods`` is computed and handed to ``self.update_results``.

        Args:
            study: collection of study items; its length is the study size.

        Returns:
            ``self.results`` sorted in place by uncorrected p-value, after
            ``find_goterm`` has been called on every record.

        Raises:
            ValueError: if ``self.methods`` contains a name other than
                "bonferroni", "sidak", "holm" or "fdr".
        """
        # Same list object as self.results: new records are added to
        # whatever it already holds.
        results = self.results

        term_study = count_terms(study, self.assoc, self.obo_dag)

        pop_n, study_n = len(self.pop), len(study)

        # Init study_count and pop_count so they are defined for the
        # calc_qval call below even when term_study is empty.
        study_count = pop_count = 0
        for term, study_count in term_study.items():
            pop_count = self.term_pop[term]
            p = fisher.pvalue_population(study_count, study_n,
                                         pop_count, pop_n)

            one_record = GOEnrichmentRecord(
                id=term,
                p_uncorrected=p.two_tail,
                ratio_in_study=(study_count, study_n),
                ratio_in_pop=(pop_count, pop_n))

            results.append(one_record)

        # Calculate multiple corrections
        pvals = [r.p_uncorrected for r in results]
        all_methods = ("bonferroni", "sidak", "holm", "fdr")
        bonferroni, sidak, holm, fdr = None, None, None, None

        for method in self.methods:
            if method == "bonferroni":
                bonferroni = Bonferroni(pvals, self.alpha).corrected_pvals
            elif method == "sidak":
                sidak = Sidak(pvals, self.alpha).corrected_pvals
            elif method == "holm":
                holm = HolmBonferroni(pvals, self.alpha).corrected_pvals
            elif method == "fdr":
                # get the empirical p-value distributions for FDR
                p_val_distribution = calc_qval(study_count, study_n,
                                               pop_count, pop_n,
                                               self.pop, self.assoc,
                                               self.term_pop, self.obo_dag)
                fdr = FDR(p_val_distribution,
                          results, self.alpha).corrected_pvals
            else:
                # Wrap the tuple in a 1-tuple: "%s" % all_methods would
                # treat the four names as four format arguments and fail
                # with "not all arguments converted".
                raise ValueError("multiple test correction methods must be "
                                 "one of %s" % (all_methods,))

        all_corrections = (bonferroni, sidak, holm, fdr)

        # Methods that were not requested are passed along as None.
        for method, corrected_pvals in zip(all_methods, all_corrections):
            self.update_results(method, corrected_pvals)

        results.sort(key=lambda r: r.p_uncorrected)
        self.results = results

        for rec in results:
            # get go term for description and level
            rec.find_goterm(self.obo_dag)

        return results
Ejemplo n.º 8
0
    def run_study(self, study):
        """Test every study term for enrichment and correct the p-values.

        Appends one GOEnrichmentRecord per counted study term to
        ``self.results``, computes each multiple-testing correction named in
        ``self.methods`` and forwards the corrected values to
        ``self.update_results``.

        Args:
            study: collection of study items; its length is the study size.

        Returns:
            ``self.results``, sorted in place by uncorrected p-value, with
            ``find_goterm`` called on each record.

        Raises:
            ValueError: if ``self.methods`` names a correction other than
                "bonferroni", "sidak", "holm" or "fdr".
        """
        # Alias, not a copy: records accumulate on the existing list.
        results = self.results

        term_study = count_terms(study, self.assoc, self.obo_dag)

        pop_n, study_n = len(self.pop), len(study)

        # Init study_count and pop_count so the calc_qval call further down
        # has defined values even when term_study is empty.
        study_count = pop_count = 0
        for term, study_count in term_study.items():
            pop_count = self.term_pop[term]
            p = fisher.pvalue_population(study_count, study_n, pop_count,
                                         pop_n)

            one_record = GOEnrichmentRecord(id=term,
                                            p_uncorrected=p.two_tail,
                                            ratio_in_study=(study_count,
                                                            study_n),
                                            ratio_in_pop=(pop_count, pop_n))

            results.append(one_record)

        # Calculate multiple corrections
        pvals = [r.p_uncorrected for r in results]
        all_methods = ("bonferroni", "sidak", "holm", "fdr")
        bonferroni, sidak, holm, fdr = None, None, None, None

        for method in self.methods:
            if method == "bonferroni":
                bonferroni = Bonferroni(pvals, self.alpha).corrected_pvals
            elif method == "sidak":
                sidak = Sidak(pvals, self.alpha).corrected_pvals
            elif method == "holm":
                holm = HolmBonferroni(pvals, self.alpha).corrected_pvals
            elif method == "fdr":
                # get the empirical p-value distributions for FDR
                p_val_distribution = calc_qval(study_count, study_n, pop_count,
                                               pop_n, self.pop, self.assoc,
                                               self.term_pop, self.obo_dag)
                fdr = FDR(p_val_distribution, results,
                          self.alpha).corrected_pvals
            else:
                # A bare tuple on the right of % is unpacked into separate
                # format arguments, which fails for a single %s; pass it as
                # a 1-tuple so the whole tuple is rendered.
                raise ValueError("multiple test correction methods must be "
                                 "one of %s" % (all_methods,))

        all_corrections = (bonferroni, sidak, holm, fdr)

        # Corrections that were not requested stay None here.
        for method, corrected_pvals in zip(all_methods, all_corrections):
            self.update_results(method, corrected_pvals)

        results.sort(key=lambda r: r.p_uncorrected)
        self.results = results

        for rec in results:
            # get go term for description and level
            rec.find_goterm(self.obo_dag)

        return results
Ejemplo n.º 9
0
def calc_qval(study_count, study_n, pop_count, pop_n, pop, assoc, term_pop, obo_dag, T=1000):
    """Generate a p-value distribution for FDR calculation by resampling.

    Draws ``T`` random studies of ``study_n`` items from ``pop`` and records,
    for each, the smallest two-tailed Fisher p-value over its terms (1 when
    no term scores below 1).  A status line and one line per sample are
    written to stderr.

    Args:
        study_count, pop_count: never read; kept for call compatibility.
        study_n: size of each random study.
        pop_n: population size passed to the Fisher test.
        pop: population to sample from.
        assoc, obo_dag: forwarded to ``go_enrichment.count_terms``.
        term_pop: mapping from term to its count in the population.
        T: number of samples to draw (previously fixed at 1000).

    Returns:
        List of ``T`` minimum p-values, in sampling order.
    """
    # sys.stderr.write and range behave the same on Python 2 and 3, unlike
    # the ``print >>`` statement and ``xrange`` used before.
    sys.stderr.write("generating p-value distribution for FDR "
                     "calculation (this might take a while)\n")
    distribution = []
    for i in range(T):
        new_study = random.sample(pop, study_n)
        new_term_study = go_enrichment.count_terms(new_study, assoc, obo_dag)

        smallest_p = 1
        for term, term_count in new_term_study.items():
            p = fisher.pvalue_population(term_count, study_n,
                                         term_pop[term], pop_n)
            if p.two_tail < smallest_p:
                smallest_p = p.two_tail

        distribution.append(smallest_p)
        sys.stderr.write("%s %s\n" % (i, smallest_p))
    return distribution
Ejemplo n.º 10
0
def calc_qval(study_count, study_n, pop_count, pop_n, pop, assoc, term_pop, obo_dag, T=1000):
    """Resample the population to build a p-value distribution for FDR.

    For each of ``T`` samples a random study of ``study_n`` items is drawn
    from ``pop`` and the smallest two-tailed Fisher p-value over its terms
    is stored (1 when no term scores below 1).  A status line plus one
    ``index p-value`` line per sample go to stderr.

    Args:
        study_count, pop_count: never read; kept for call compatibility.
        study_n: size of each random study.
        pop_n: population size passed to the Fisher test.
        pop: population to sample from.
        assoc, obo_dag: forwarded to ``go_enrichment.count_terms``.
        term_pop: mapping from term to its count in the population.
        T: number of samples (was a hard-coded 1000).

    Returns:
        List with the minimum p-value of every sample.
    """
    # Written with sys.stderr.write / range so the function runs unchanged
    # on both Python 2 and Python 3.
    sys.stderr.write("generating p-value distribution for FDR calculation "
                     "(this might take a while)\n")
    distribution = []
    for i in range(T):
        new_study = random.sample(pop, study_n)
        new_term_study = go_enrichment.count_terms(new_study, assoc, obo_dag)

        smallest_p = 1
        for term, term_count in new_term_study.items():
            p = fisher.pvalue_population(term_count, study_n,
                                         term_pop[term], pop_n)
            if p.two_tail < smallest_p:
                smallest_p = p.two_tail

        distribution.append(smallest_p)
        sys.stderr.write("%s %s\n" % (i, smallest_p))
    return distribution
Ejemplo n.º 11
0
    def _get_pval_uncorr(self, study):
        """Calculate the uncorrected p-values for study items.

        Runs one Fisher's exact test per term found in the study counts or
        in ``self.term_pop``; a term missing on one side counts as zero.

        Args:
            study: collection of study items; its length is the study size.

        Returns:
            List of GOEnrichmentRecord objects, one per term, holding the
            two-tailed p-value and the ``(count, total)`` ratios for the
            study and the population.
        """
        term_study = count_terms(study, self.assoc, self.obo_dag)
        pop_n, study_n = self.pop_n, len(study)
        # Set union instead of ``keys() + keys()``: dict views cannot be
        # added on Python 3, while this form works on both 2 and 3.
        allterms = set(term_study).union(self.term_pop)

        results = []
        for term in allterms:
            study_count = term_study.get(term, 0)
            pop_count = self.term_pop.get(term, 0)
            p = fisher.pvalue_population(study_count, study_n,
                                         pop_count, pop_n)

            one_record = GOEnrichmentRecord(
                id=term,
                p_uncorrected=p.two_tail,
                ratio_in_study=(study_count, study_n),
                ratio_in_pop=(pop_count, pop_n))

            results.append(one_record)

        return results
Ejemplo n.º 12
0
 def __init__(self, set_count, pop_count, set_size, pop_size, data):
     """Store the counts and compute the right-tailed Fisher p-value.

     Each argument is kept as an attribute of the same name, and
     ``self.pvalue`` is the right tail of
     ``fisher.pvalue_population(set_count, set_size, pop_count, pop_size)``.
     """
     # Explicit assignments replace ``for k, v in locals().iteritems()``,
     # which is Python 2 only and also stored ``self.self`` (a reference
     # cycle), because ``self`` is one of the locals.
     self.set_count = set_count
     self.pop_count = pop_count
     self.set_size = set_size
     self.pop_size = pop_size
     self.data = data
     self.pvalue = fisher.pvalue_population(set_count, set_size, pop_count,
                                            pop_size).right_tail