def pairwiseGOEnrichment(results_per_genelist, labels, test_ontology, go2info,
                         options):
    '''compute pairwise enrichment between sets.

    The purpose of this method is to find if there are categories that
    are differently enriched in a pair of gene lists.

    For every GO category present in both gene lists of a pair, a 2x2
    contingency table is built from the foreground counts::

        (counts1 in category,     counts2 in category)
        (counts1 not in category, counts2 not in category)

    and tested with Fisher's exact test (``scipy.stats.fisher_exact``).
    The assumption is that the background set is the same in all gene
    lists.

    The workflow is thus::

       for each combination of two gene lists:
           for each GO category shared by both lists:
               get counts in foreground, total counts of foreground
               compute Fisher's exact test
               save P-value
       apply fdr - output all tested pairs.

    Two tables are written to the handles returned by ``getFileName``:
    a per-pair summary (section ``summary``) and one row per tested
    category (section ``pairs``).

    Arguments
    ---------
    results_per_genelist : sequence
        One entry per gene list. Each entry is converted with ``dict()``
        into a mapping of GO category to a result object providing
        ``mSampleCountsCategory``, ``mSampleCountsTotal``, ``mPValue``
        and ``mQValue``.
    labels : sequence
        Label of each gene list, in the same order as
        `results_per_genelist`.
    test_ontology :
        Passed to ``getFileName`` as ``go``.
    go2info : mapping
        GO category to an object providing ``mDescription``.
    options :
        Option object. The attributes read here are
        ``pairs_min_observed_counts``, ``output_filename_pattern``,
        ``fdr`` and ``qvalue_method``.
    '''

    dicts = [dict(x) for x in results_per_genelist]

    PairResult = collections.namedtuple(
        "PairResult",
        "goid set1 set2 counts1 total1 pvalue1 qvalue1 "
        "counts2 total2 pvalue2 qvalue2 pvalue qvalue description")

    outfile = getFileName(options,
                          go=test_ontology,
                          section='summary',
                          set="pairs")

    outfile.write(
        "set1\tset2\ttotal1\ttotal2\tshared\tskipped\ttested\t"
        "significant\tinsignificant\n")

    results = []

    # number of unordered pairs; integer division keeps this an int
    total = len(dicts) * (len(dicts) - 1) // 2

    iteration = 0

    min_observed_counts = options.pairs_min_observed_counts

    # Iterate in the order given: ``x`` and ``y`` index into ``labels``,
    # so the gene lists must not be reordered (and dicts cannot be
    # sorted in Python 3 anyway).
    for x, genelist1 in enumerate(dicts):

        x_go_categories = set(genelist1)

        # only pair with gene lists that come earlier, so that each
        # unordered pair is tested exactly once
        for y, genelist2 in enumerate(dicts[:x]):

            iteration += 1
            if iteration % 10 == 0:
                E.info("iteration: %i/%i (%5.2f%%)" %
                       (iteration, total, 100.0 * iteration / total))

            y_go_categories = set(genelist2)

            shared = x_go_categories.intersection(y_go_categories)

            c = E.Counter()

            for category in shared:
                c.shared += 1
                xx = genelist1[category]
                yy = genelist2[category]

                # discard all tests with few observations in the
                # observed counts of both gene lists
                if xx.mSampleCountsCategory < min_observed_counts and \
                        yy.mSampleCountsCategory < min_observed_counts:
                    c.skipped += 1
                    continue

                aa = xx.mSampleCountsCategory
                bb = yy.mSampleCountsCategory
                cc = xx.mSampleCountsTotal - xx.mSampleCountsCategory
                dd = yy.mSampleCountsTotal - yy.mSampleCountsCategory

                # every foreground gene of both lists is in the
                # category - there is nothing to compare against
                if cc == dd == 0:
                    c.skipped += 1
                    continue

                c.tested += 1

                # only the P-value is needed, the odds ratio is ignored
                _, pvalue = scipy.stats.fisher_exact(
                    numpy.array(((aa, bb), (cc, dd))))

                if pvalue < 0.05:
                    c.significant_pvalue += 1
                else:
                    c.insignificant_pvalue += 1

                # the q-value is a placeholder (1.0) until the FDR step
                results.append(
                    PairResult._make(
                        (category, labels[x], labels[y],
                         xx.mSampleCountsCategory, xx.mSampleCountsTotal,
                         xx.mPValue, xx.mQValue,
                         yy.mSampleCountsCategory, yy.mSampleCountsTotal,
                         yy.mPValue, yy.mQValue,
                         pvalue, 1.0,
                         go2info[category].mDescription)))

            outfile.write("\t".join(
                map(str, (labels[x], labels[y],
                          len(x_go_categories),
                          len(y_go_categories),
                          c.shared,
                          c.skipped,
                          c.tested,
                          c.significant_pvalue,
                          c.insignificant_pvalue))) + "\n")

    # The handle is only closed when a filename pattern is set
    # (presumably getFileName hands out a shared stream otherwise -
    # TODO confirm).
    if options.output_filename_pattern:
        outfile.close()

    # multiple testing correction across all tested categories of all
    # pairs; nothing to correct if no test was performed
    if options.fdr and results:
        pvalues = [result.pvalue for result in results]

        if options.qvalue_method == "storey":
            # compute fdr via Storey's method
            try:
                qvalues = Stats.doFDR(pvalues).mQValues
            except ValueError as msg:
                E.warn("failure in q-value computation: %s" % msg)
                E.warn("reverting to Bonferroni correction")
                ntests = float(len(pvalues))
                qvalues = [min(1.0, pvalue * ntests) for pvalue in pvalues]
        else:
            qvalues = R['p.adjust'](pvalues, method=options.qvalue_method)

        # update qvalues
        results = [result._replace(qvalue=qvalue)
                   for result, qvalue in zip(results, qvalues)]

    outfile = getFileName(options,
                          go=test_ontology,
                          section='pairs',
                          set="pairs")

    outfile.write("\t".join(PairResult._fields) + "\n")
    for result in results:
        outfile.write("\t".join(map(str, result)) + "\n")

    if options.output_filename_pattern:
        outfile.close()
def computeFDRs(go_results, foreground, background, options, test_ontology,
                gene2go, go2info):
    '''compute a false discovery rate for each tested GO category.

    The P-value of a category is the smaller of its over- and
    under-representation probabilities. The correction applied is
    selected by ``options.qvalue_method``:

    ``storey``
        ``Stats.doFDR``. If that raises :class:`ValueError`, the
        P-values are multiplied by the number of tests (capped at 1.0)
        and the returned method is ``"bonf"``.
    ``empirical``
        Sampling via ``getSamples``. For a category with P-value p the
        FDR is ``a / b`` (capped at 1.0), where ``a`` is the number of
        simulated P-values smaller than p divided by ``options.sample``
        and ``b`` is the number of observed P-values smaller than p.
    anything else
        The name is passed as ``method`` to R's ``p.adjust``.

    Arguments
    ---------
    go_results :
        Object whose ``mResults`` maps a GO category to a result
        providing ``mProbabilityOverRepresentation``,
        ``mProbabilityUnderRepresentation`` and ``mPValue``.
    foreground, background, gene2go, go2info, test_ontology :
        Only forwarded to ``getSamples`` (``empirical`` method).
    options :
        Option object. The attributes read here are ``qvalue_method``
        and ``sample``.

    Returns
    -------
    fdrs : dict
        GO category to a tuple ``(fdr, a, b)``. For all methods but
        ``empirical``, ``a`` and ``b`` are 1.0.
    samples :
        First value returned by ``getSamples`` for the ``empirical``
        method, otherwise None.
    method : str
        The method that was applied.

    Raises
    ------
    KeyError
        If a tested category is missing from the samples
        (``empirical`` method).
    '''
    import bisect

    pairs = sorted(go_results.mResults.items())

    E.info("calculating the FDRs using method `%s`" % options.qvalue_method)

    samples = None

    observed_min_pvalues = [
        min(result.mProbabilityOverRepresentation,
            result.mProbabilityUnderRepresentation)
        for _, result in pairs]

    method = options.qvalue_method

    if options.qvalue_method == "empirical":
        assert options.sample > 0, "requiring a sample size of > 0"

        ###############################################################
        # sampling
        samples, simulation_min_pvalues = getSamples(
            gene2go, foreground, background, options, test_ontology,
            go2info)

        # compute P-values from sampling. The observed P-values are
        # sorted so that the number of values below a threshold can be
        # found by binary search.
        observed_min_pvalues.sort()
        observed_min_pvalues = numpy.array(observed_min_pvalues)

        sample_size = options.sample
        nsimulated = len(simulation_min_pvalues)

        fdrs = {}
        for k, v in pairs:

            if k not in samples:
                raise KeyError("category %s not in samples" % k)

            ###########################################################
            # FDR:
            # For each p-Value p at node n:
            #   a = average number of nodes in each simulation run with
            #       P-Value < p. This can be obtained from the array of
            #       all p-values and all nodes simply divided by the
            #       number of samples.
            #       aka: expfpos=experimental false positive rate
            #   b = number of nodes in observed data, that have a
            #       P-Value of less than p.
            #       aka: pos=positives in observed data
            #   fdr = a/b
            pvalue = v.mPValue

            # Count the leading simulated P-values below pvalue.
            # NOTE(review): this equals the total number of smaller
            # values only if getSamples returns them sorted - confirm.
            # In that case this scan can become a bisect as well.
            a = 0
            while a < nsimulated and simulation_min_pvalues[a] < pvalue:
                a += 1
            a = float(a) / float(sample_size)

            # number of observed P-values strictly smaller than pvalue
            b = bisect.bisect_left(observed_min_pvalues, pvalue)

            if b > 0:
                fdr = min(1.0, float(a) / float(b))
            else:
                fdr = 1.0

            fdrs[k] = (fdr, a, b)

        return fdrs, samples, method

    if options.qvalue_method == "storey":
        # compute fdr via Storey's method
        try:
            qvalues = Stats.doFDR(observed_min_pvalues).mQValues
        except ValueError as msg:
            E.warn("failure in q-value computation: %s" % msg)
            E.warn("reverting to Bonferroni correction")
            method = "bonf"
            ntests = float(len(observed_min_pvalues))
            qvalues = [min(1.0, pvalue * ntests)
                       for pvalue in observed_min_pvalues]
    else:
        qvalues = list(R['p.adjust'](observed_min_pvalues,
                                     method=options.qvalue_method))

    # pairs and qvalues are in the same (sorted by category) order
    fdrs = {}
    for (category, _), qvalue in zip(pairs, qvalues):
        fdrs[category] = (qvalue, 1.0, 1.0)

    return fdrs, samples, method