def _get_pval_uncorr(self, study, log=sys.stdout):
    """Calculate the uncorrected pvalues for study items.

    Every term present in either the study mapping (built by ``get_terms``)
    or ``self.go2popitems`` is tested with ``fisher.pvalue_population`` and
    reported as one ``GOEnrichmentRecord``.

    Args:
        study: sized collection handed to ``get_terms``; ``len(study)`` is
            used as the study size.
        log: writable stream receiving a one-line progress message.

    Returns:
        list of ``GOEnrichmentRecord``, one per term, with the two-tailed
        p-value stored as ``p_uncorrected``.
    """
    log.write(
        "Calculating uncorrected p-values using Fisher's exact test\n")
    results = []
    go2studyitems = get_terms(study, self.assoc, self.obo_dag)
    pop_n, study_n = self.pop_n, len(study)
    # Union of both key sets.  ``keys() + keys()`` only works when keys()
    # returns a list (Python 2); dict views on Python 3 do not support ``+``.
    allterms = set(go2studyitems).union(self.go2popitems)

    for term in allterms:
        # A term missing from one side simply contributes an empty set.
        study_items = go2studyitems.get(term, set())
        study_count = len(study_items)
        pop_items = self.go2popitems.get(term, set())
        pop_count = len(pop_items)
        p = fisher.pvalue_population(study_count, study_n,
                                     pop_count, pop_n)

        one_record = GOEnrichmentRecord(
            GO=term,
            p_uncorrected=p.two_tail,
            study_items=study_items,
            pop_items=pop_items,
            ratio_in_study=(study_count, study_n),
            ratio_in_pop=(pop_count, pop_n))

        results.append(one_record)

    return results
def calc_qval(study_n, pop_n, pop, assoc, term_pop, obo_dag, T=500):
    """Generate p-value distribution for FDR based on resampling.

    Draws ``T`` random pseudo-studies of size ``study_n`` from ``pop`` and
    records, for each one, the smallest two-tailed Fisher p-value over all
    terms returned by ``count_terms``.

    Args:
        study_n: number of items to draw per resampled study.
        pop_n: population size passed to the Fisher test.
        pop: population to sample from (a sequence or a set).
        assoc: passed through to ``count_terms``.
        term_pop: mapping term -> population count; must contain every term
            that ``count_terms`` returns (a missing term raises KeyError).
        obo_dag: passed through to ``count_terms``.
        T: number of resampling rounds.

    Returns:
        list of length ``T`` holding the smallest p-value of each round
        (``1`` for a round in which no term scored below 1).
    """
    import fisher
    from goatools.ratio import count_terms
    print(("Generate p-value distribution for FDR "
           "based on resampling (this might take a while)"), file=sys.stderr)

    # random.sample() requires a sequence; sets are rejected outright from
    # Python 3.11 on.  Converting once up front keeps the sampling order the
    # older automatic conversion produced and avoids redoing it every round.
    if isinstance(pop, (set, frozenset)):
        pop = list(pop)

    distribution = []
    for i in range(T):
        new_study = random.sample(pop, study_n)
        new_term_study = count_terms(new_study, assoc, obo_dag)

        smallest_p = 1
        for term, study_count in new_term_study.items():
            pop_count = term_pop[term]
            p = fisher.pvalue_population(study_count, study_n,
                                         pop_count, pop_n)
            if p.two_tail < smallest_p:
                smallest_p = p.two_tail

        distribution.append(smallest_p)
        # Progress report every tenth round.
        if i % 10 == 0:
            print("Sample {0} / {1}: p-value {2}".format(i, T, smallest_p),
                  file=sys.stderr)
    return distribution
def _get_pval_uncorr(self, study, log=sys.stdout):
    """Calculate the uncorrected pvalues for study items.

    Runs ``fisher.pvalue_population`` once for each term found in the study
    mapping returned by ``get_terms`` or in ``self.go2popitems``.

    Args:
        study: sized collection handed to ``get_terms``; its length is the
            study size used in the test.
        log: writable stream that receives a single status line.

    Returns:
        list of ``GOEnrichmentRecord`` (one per term) whose
        ``p_uncorrected`` is the two-tailed p-value.
    """
    log.write("Calculating uncorrected p-values using Fisher's exact test\n")
    results = []
    go2studyitems = get_terms(study, self.assoc, self.obo_dag)
    pop_n, study_n = self.pop_n, len(study)
    # Build the union from the mappings themselves: adding two keys() results
    # with ``+`` is only valid for Python 2 lists and fails for Python 3
    # dict views.
    allterms = set(go2studyitems) | set(self.go2popitems)

    for term in allterms:
        # Terms absent from one mapping count as having no items there.
        study_items = go2studyitems.get(term, set())
        pop_items = self.go2popitems.get(term, set())
        study_count = len(study_items)
        pop_count = len(pop_items)
        p = fisher.pvalue_population(study_count, study_n,
                                     pop_count, pop_n)

        results.append(GOEnrichmentRecord(
            GO=term,
            p_uncorrected=p.two_tail,
            study_items=study_items,
            pop_items=pop_items,
            ratio_in_study=(study_count, study_n),
            ratio_in_pop=(pop_count, pop_n)))

    return results
def calc_qval(study_count, study_n, pop_count, pop_n,
              pop, assoc, term_pop, obo_dag, T=500):
    """Build an empirical distribution of minimum p-values by resampling.

    ``T`` pseudo-studies of ``study_n`` items are drawn from ``pop``; for
    each one the smallest two-tailed Fisher p-value across the terms
    returned by ``count_terms`` is recorded.

    Progress output is deliberately omitted so that notebooks are not
    flooded with lines.

    Args:
        study_count: accepted for call compatibility only; the value is
            overwritten inside the loop before it is read.
        study_n: number of items per resampled study.
        pop_count: accepted for call compatibility only; overwritten inside
            the loop before it is read.
        pop_n: population size passed to the Fisher test.
        pop: population to sample from (a sequence or a set).
        assoc: passed through to ``count_terms``.
        term_pop: mapping term -> population count; a term missing from it
            raises KeyError.
        obo_dag: passed through to ``count_terms``.
        T: number of resampling rounds.

    Returns:
        list with one smallest p-value per round (``1`` when no term scored
        below 1 in that round).
    """
    # random.sample() needs a sequence (sets raise TypeError from Python
    # 3.11).  One conversion up front reproduces the order the former
    # implicit conversion used.
    if isinstance(pop, (set, frozenset)):
        pop = list(pop)

    distribution = []
    for _ in range(T):
        new_study = random.sample(pop, study_n)
        new_term_study = count_terms(new_study, assoc, obo_dag)

        smallest_p = 1
        for term, study_count in new_term_study.items():
            pop_count = term_pop[term]
            p = fisher.pvalue_population(study_count, study_n,
                                         pop_count, pop_n)
            if p.two_tail < smallest_p:
                smallest_p = p.two_tail

        distribution.append(smallest_p)
    return distribution
def rank(targets, pathways, allmembers, outname):
    """Score every pathway against the target set and write a TSV report.

    For each ``(uid, name, members)`` entry of ``pathways`` the overlap with
    the targets (restricted to ``allmembers``) is tested with
    ``fisher.pvalue_population`` and one tab-separated line is written:
    uid, pathway size, overlap size, signed log10 p-value, name,
    comma-joined overlapping members, comma-joined member symbols.

    The log10 of the two-tailed p-value is written as-is when the pathway's
    hit fraction is at least the background fraction, and negated when the
    hit fraction is below it.

    Args:
        targets: set of target ids.
        pathways: iterable of ``(uid, name, members)`` with ``members`` a set.
        allmembers: set of all ids forming the background; must be non-empty.
        outname: path of the output file (overwritten).

    Raises:
        ZeroDivisionError: if ``allmembers`` or a pathway's ``members`` is
            empty.
        ValueError: if a p-value of 0 reaches ``math.log``.
    """
    P = len(allmembers)
    realtargets = targets.intersection(allmembers)
    Pn = len(realtargets)
    Pr = float(Pn) / P  # background fraction of targets

    # ``with`` guarantees the file is closed even if scoring raises midway.
    with open(outname, 'w') as out:
        for uid, name, members in pathways:
            pmembers = members.intersection(realtargets)
            # A distinct loop name keeps the pathway ``uid`` intact; on
            # Python 2 a list comprehension leaks its variable and would
            # overwrite it with the last member id.
            symbols = [converter.handler.to_symbol(member)
                       for member in pmembers]
            # Drop every unresolved symbol (not just the first) so that the
            # join below cannot hit a None.
            pmember_names = [sym for sym in symbols if sym is not None]
            pmembers.discard(None)

            C = len(members)
            Cn = len(pmembers)
            score = fisher.pvalue_population(Cn, C, Pn, P).two_tail
            log_score = math.log(score, 10)
            # Flip the sign when the pathway holds proportionally fewer
            # targets than the background.
            if (float(Cn) / C) < Pr:
                log_score = -log_score

            out.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %
                      (uid, C, Cn, log_score, name,
                       ','.join(pmembers), ','.join(pmember_names)))
def run_study(self, study):
    """Run the enrichment test for ``study`` and apply multiple-test corrections.

    One ``GOEnrichmentRecord`` is appended to ``self.results`` for every
    term returned by ``count_terms``; the corrections named in
    ``self.methods`` are then computed and stored via
    ``self.update_results``.

    Args:
        study: sized collection of study items, passed to ``count_terms``.

    Returns:
        The results list (the same object as ``self.results``), sorted by
        ascending uncorrected p-value.

    Raises:
        Exception: if ``self.methods`` contains a name other than
            "bonferroni", "sidak", "holm" or "fdr".
        KeyError: if a study term is missing from ``self.term_pop``.
    """
    # NOTE: this is the list stored on the instance, so records are appended
    # to whatever it already holds.
    results = self.results

    term_study = count_terms(study, self.assoc, self.obo_dag)

    pop_n, study_n = len(self.pop), len(study)

    # Init study_count and pop_count to handle empty sets
    study_count = pop_count = 0
    for term, study_count in term_study.items():
        pop_count = self.term_pop[term]
        p = fisher.pvalue_population(study_count, study_n,
                                     pop_count, pop_n)

        one_record = GOEnrichmentRecord(
            id=term,
            p_uncorrected=p.two_tail,
            ratio_in_study=(study_count, study_n),
            ratio_in_pop=(pop_count, pop_n))

        results.append(one_record)

    # Calculate multiple corrections
    pvals = [r.p_uncorrected for r in results]
    all_methods = ("bonferroni", "sidak", "holm", "fdr")
    bonferroni, sidak, holm, fdr = None, None, None, None

    for method in self.methods:
        if method == "bonferroni":
            bonferroni = Bonferroni(pvals, self.alpha).corrected_pvals
        elif method == "sidak":
            sidak = Sidak(pvals, self.alpha).corrected_pvals
        elif method == "holm":
            holm = HolmBonferroni(pvals, self.alpha).corrected_pvals
        elif method == "fdr":
            # get the empirical p-value distributions for FDR
            # (study_count / pop_count carry whatever the loop above left
            # in them, or 0 for an empty study)
            p_val_distribution = calc_qval(study_count, study_n,
                                           pop_count, pop_n,
                                           self.pop, self.assoc,
                                           self.term_pop, self.obo_dag)
            fdr = FDR(p_val_distribution,
                      results, self.alpha).corrected_pvals
        else:
            # Wrap the tuple in a 1-tuple: formatting "%s" with the bare
            # 4-tuple raises TypeError instead of this intended error.
            raise Exception("multiple test correction methods must be "
                            "one of %s" % (all_methods,))

    # Methods that were not requested are passed on as None.
    all_corrections = (bonferroni, sidak, holm, fdr)

    for method, corrected_pvals in zip(all_methods, all_corrections):
        self.update_results(method, corrected_pvals)

    results.sort(key=lambda r: r.p_uncorrected)
    self.results = results

    for rec in results:
        # get go term for description and level
        rec.find_goterm(self.obo_dag)

    return results
def run_study(self, study):
    """Test every study term for enrichment, then correct the p-values.

    Appends one ``GOEnrichmentRecord`` per term from ``count_terms`` to
    ``self.results``, computes each correction listed in ``self.methods``
    and hands all four correction slots to ``self.update_results``.

    Args:
        study: sized collection of study items, passed to ``count_terms``.

    Returns:
        The results list (also left in ``self.results``), ordered by
        ascending ``p_uncorrected``.

    Raises:
        Exception: for a method name outside "bonferroni", "sidak", "holm",
            "fdr".
        KeyError: if a study term has no entry in ``self.term_pop``.
    """
    # Same list object as on the instance: new records extend it in place.
    results = self.results
    term_study = count_terms(study, self.assoc, self.obo_dag)
    pop_n, study_n = len(self.pop), len(study)

    # Init study_count and pop_count to handle empty sets
    study_count = pop_count = 0
    for term, study_count in term_study.items():
        pop_count = self.term_pop[term]
        p = fisher.pvalue_population(study_count, study_n,
                                     pop_count, pop_n)
        results.append(GOEnrichmentRecord(
            id=term,
            p_uncorrected=p.two_tail,
            ratio_in_study=(study_count, study_n),
            ratio_in_pop=(pop_count, pop_n)))

    # Calculate multiple corrections
    pvals = [r.p_uncorrected for r in results]
    all_methods = ("bonferroni", "sidak", "holm", "fdr")
    # One slot per known method; stays None unless requested.
    corrections = dict.fromkeys(all_methods)

    for method in self.methods:
        if method == "bonferroni":
            corrections[method] = Bonferroni(pvals,
                                             self.alpha).corrected_pvals
        elif method == "sidak":
            corrections[method] = Sidak(pvals, self.alpha).corrected_pvals
        elif method == "holm":
            corrections[method] = HolmBonferroni(pvals,
                                                 self.alpha).corrected_pvals
        elif method == "fdr":
            # get the empirical p-value distributions for FDR; the count
            # arguments hold the last loop values (0 for an empty study).
            p_val_distribution = calc_qval(study_count, study_n,
                                           pop_count, pop_n,
                                           self.pop, self.assoc,
                                           self.term_pop, self.obo_dag)
            corrections[method] = FDR(p_val_distribution,
                                      results, self.alpha).corrected_pvals
        else:
            # ``% all_methods`` would unpack the 4-tuple against a single
            # %s and raise TypeError; the 1-tuple formats it as one value.
            raise Exception("multiple test correction methods must be "
                            "one of %s" % (all_methods,))

    # Report in the fixed order of all_methods, None for unused methods.
    for method in all_methods:
        self.update_results(method, corrections[method])

    results.sort(key=lambda r: r.p_uncorrected)
    self.results = results

    for rec in results:
        # get go term for description and level
        rec.find_goterm(self.obo_dag)

    return results
def calc_qval(study_count, study_n, pop_count, pop_n,
              pop, assoc, term_pop, obo_dag, T=1000):
    """Generate the p-value distribution used for the FDR calculation.

    Draws ``T`` random studies of ``study_n`` items from ``pop`` and keeps,
    for each, the smallest two-tailed Fisher p-value over the terms returned
    by ``go_enrichment.count_terms``.  A header line and one
    ``"<round> <p-value>"`` line per round are written to stderr.

    Args:
        study_count: accepted for call compatibility only; overwritten in
            the loop before it is read.
        study_n: number of items per resampled study.
        pop_count: accepted for call compatibility only; overwritten in the
            loop before it is read.
        pop_n: population size passed to the Fisher test.
        pop: population to sample from (a sequence or a set).
        assoc: passed through to ``count_terms``.
        term_pop: mapping term -> population count; a missing term raises
            KeyError.
        obo_dag: passed through to ``count_terms``.
        T: number of samples (default 1000, the former fixed value).

    Returns:
        list of ``T`` smallest p-values (``1`` for a round where no term
        scored below 1).
    """
    # sys.stderr.write emits the same bytes as the Python-2-only
    # ``print >>sys.stderr`` statement and also works on Python 3.
    sys.stderr.write("generating p-value distribution for FDR "
                     "calculation (this might take a while)\n")

    # random.sample() needs a sequence; sets are rejected from Python 3.11.
    if isinstance(pop, (set, frozenset)):
        pop = list(pop)

    distribution = []
    for i in range(T):
        new_study = random.sample(pop, study_n)
        new_term_study = go_enrichment.count_terms(new_study, assoc, obo_dag)

        smallest_p = 1
        for term, study_count in new_term_study.items():
            pop_count = term_pop[term]
            p = fisher.pvalue_population(study_count, study_n,
                                         pop_count, pop_n)
            if p.two_tail < smallest_p:
                smallest_p = p.two_tail

        distribution.append(smallest_p)
        sys.stderr.write("%s %s\n" % (i, smallest_p))
    return distribution
def calc_qval(study_count, study_n, pop_count, pop_n,
              pop, assoc, term_pop, obo_dag, T=1000):
    """Resample the population to get minimum p-values for FDR estimation.

    For each of ``T`` rounds a random study of ``study_n`` items is drawn
    from ``pop``, its terms are counted with ``go_enrichment.count_terms``
    and the smallest two-tailed Fisher p-value is stored.  Progress goes to
    stderr: one header line, then ``"<round> <p-value>"`` per round.

    Args:
        study_count: unused on entry; reassigned while looping over terms.
        study_n: size of each resampled study.
        pop_count: unused on entry; reassigned while looping over terms.
        pop_n: population size passed to the Fisher test.
        pop: population to sample from (a sequence or a set).
        assoc: passed through to ``count_terms``.
        term_pop: mapping term -> population count (KeyError if a term is
            missing).
        obo_dag: passed through to ``count_terms``.
        T: number of samples; defaults to the previously fixed 1000.

    Returns:
        list holding the smallest p-value of every round, ``1`` where no
        term went below 1.
    """
    # The Python 2 print-chevron statement and xrange fail at runtime on
    # Python 3; stderr.write and range behave the same on both.
    sys.stderr.write("generating p-value distribution for FDR calculation "
                     "(this might take a while)\n")

    # Sets cannot be sampled directly from Python 3.11 on; convert once.
    if isinstance(pop, (set, frozenset)):
        pop = list(pop)

    distribution = []
    for i in range(T):
        new_study = random.sample(pop, study_n)
        new_term_study = go_enrichment.count_terms(new_study, assoc, obo_dag)

        smallest_p = 1
        for term, study_count in new_term_study.items():
            pop_count = term_pop[term]
            two_tail = fisher.pvalue_population(study_count, study_n,
                                                pop_count, pop_n).two_tail
            if two_tail < smallest_p:
                smallest_p = two_tail

        distribution.append(smallest_p)
        sys.stderr.write("%s %s\n" % (i, smallest_p))
    return distribution
def _get_pval_uncorr(self, study):
    """Calculate the uncorrected pvalues for study items.

    Each term counted in the study (via ``count_terms``) or present in
    ``self.term_pop`` is tested with ``fisher.pvalue_population``.

    Args:
        study: sized collection handed to ``count_terms``; ``len(study)`` is
            the study size.

    Returns:
        list of ``GOEnrichmentRecord``, one per term, with the two-tailed
        p-value as ``p_uncorrected``.
    """
    results = []

    term_study = count_terms(study, self.assoc, self.obo_dag)
    pop_n, study_n = self.pop_n, len(study)

    # Union of both key sets; ``keys() + keys()`` is Python 2 only because
    # Python 3 dict views cannot be added.
    allterms = set(term_study).union(self.term_pop)

    for term in allterms:
        # A term missing from one mapping has a count of 0 there.
        study_count = term_study.get(term, 0)
        pop_count = self.term_pop.get(term, 0)
        p = fisher.pvalue_population(study_count, study_n,
                                     pop_count, pop_n)

        one_record = GOEnrichmentRecord(
            id=term,
            p_uncorrected=p.two_tail,
            ratio_in_study=(study_count, study_n),
            ratio_in_pop=(pop_count, pop_n))

        results.append(one_record)

    return results
def __init__(self, set_count, pop_count, set_size, pop_size, data):
    """Store the counts and compute the right-tailed Fisher p-value.

    Args:
        set_count: count within the set; first argument of the Fisher test.
        pop_count: count within the population; third argument of the test.
        set_size: size of the set; second argument of the test.
        pop_size: size of the population; fourth argument of the test.
        data: stored unchanged as ``self.data``.

    Every argument is kept as a same-named attribute; ``self.pvalue`` holds
    ``fisher.pvalue_population(...).right_tail``.
    """
    # Explicit assignments replace the ``locals().iteritems()`` loop, which
    # is Python 2 only and also stored a self-referential ``self.self``.
    self.set_count = set_count
    self.pop_count = pop_count
    self.set_size = set_size
    self.pop_size = pop_size
    self.data = data
    self.pvalue = fisher.pvalue_population(set_count, set_size,
                                           pop_count, pop_size).right_tail