def computeFDR(all_results, qvalue_method="storey"):
    '''compute FDR.

    update GOResult structure with field .fdr

    Arguments
    ---------
    all_results : dict
        Maps a key to an object whose ``mResults`` dictionary holds the
        individual results. Each result needs the attributes
        ``mProbabilityOverRepresentation`` and
        ``mProbabilityUnderRepresentation``.
    qvalue_method : string
        ``"storey"`` computes q-values with ``Stats.doFDR`` and stores
        them in the ``fdr`` attribute of each result. ``"empirical"``
        is not implemented. Any other value leaves the results
        untouched.

    Raises
    ------
    NotImplementedError
        If `qvalue_method` is ``"empirical"``.
    '''
    # flatten all_results - the keys are not needed
    results = []
    for data in all_results.values():
        results.extend(data.mResults.values())

    # the smaller of the two one-sided P-values is the one corrected
    observed_min_pvalues = [
        min(x.mProbabilityOverRepresentation,
            x.mProbabilityUnderRepresentation)
        for x in results]

    if qvalue_method == "storey":
        # compute fdr via Storey's method
        fdr_data = Stats.doFDR(observed_min_pvalues, vlambda=0.1)

        E.info("estimated proportion of true null hypotheses = %6.4f" %
               fdr_data.mPi0)

        if fdr_data.mPi0 < 0.1:
            E.warn("estimated proportion of true null hypotheses is "
                   "less than 10%% (%6.4f)" % fdr_data.mPi0)

        for result, qvalue in zip(results, fdr_data.mQValues):
            result.fdr = qvalue

    elif qvalue_method == "empirical":
        # The original tested ``options.qvalue_method`` and
        # ``options.sample`` here, but ``options`` is not an argument of
        # this function; dispatch on the argument that was passed in.
        raise NotImplementedError("empirical needs some work")
def checkFDR(self, pi0_method):
    """Check ``Stats.doFDR`` against the ``qvalue`` function of R.

    Both implementations are run on ``self.mPvalues`` with an FDR level
    of 0.05 and the given `pi0_method`. pi0, the number of lambda
    values, the q-values (to two decimal places) and the
    threshold-passed flags have to agree.
    """
    fdr_level = 0.05

    py_result = Stats.doFDR(self.mPvalues,
                            fdr_level=fdr_level,
                            pi0_method=pi0_method)

    R("""require ('qvalue')""")
    r_result = R.qvalue(ro.FloatVector(self.mPvalues),
                        fdr_level=fdr_level,
                        pi0_method=pi0_method)

    # the positional look-ups below rely on this layout of the R result
    expected_layout = ((1, "pi0"),
                       (2, "qvalues"),
                       (5, "significant"),
                       (6, "lambda"))
    for index, name in expected_layout:
        assert r_result.names[index] == name

    r_qvalues = r_result[2]
    r_pi0 = r_result[1][0]

    self.assertEqual(len(py_result.mQValues), len(r_result[2]))
    self.assertEqual(len(py_result.mLambda), len(r_result[6]))
    self.assertEqual(py_result.mPi0, r_pi0)

    for ours, theirs in zip(py_result.mQValues, r_qvalues):
        self.assertAlmostEqual(
            ours, theirs, 2, "unequal: %f != %f" % (ours, theirs))

    for ours, theirs in zip(py_result.mPassed, r_result[5]):
        self.assertEqual(
            ours, theirs,
            "threshold-passed flag not equal: %s != %s" % (ours, theirs))
def checkFDR(self, pi0_method):
    """Compare the Python q-value computation with R's ``qvalue``.

    ``Stats.doFDR`` and ``R.qvalue`` are both applied to
    ``self.mPvalues`` (FDR level 0.05, pi0 estimated with
    `pi0_method`). The test fails if pi0, the number of lambda values,
    any q-value (compared to two decimal places) or any
    threshold-passed flag differs.
    """
    shared_options = dict(fdr_level=0.05, pi0_method=pi0_method)

    python_fdr = Stats.doFDR(self.mPvalues, **shared_options)

    R("""require ('qvalue')""")
    r_fdr = R.qvalue(ro.FloatVector(self.mPvalues), **shared_options)

    # guard the hard-coded positions used to unpack the R result
    assert r_fdr.names[1] == "pi0"
    assert r_fdr.names[2] == "qvalues"
    assert r_fdr.names[5] == "significant"
    assert r_fdr.names[6] == "lambda"

    r_qvalues = r_fdr[2]
    r_pi0 = r_fdr[1][0]

    self.assertEqual(len(python_fdr.mQValues), len(r_fdr[2]))
    self.assertEqual(len(python_fdr.mLambda), len(r_fdr[6]))
    self.assertEqual(python_fdr.mPi0, r_pi0)

    for py_q, r_q in zip(python_fdr.mQValues, r_qvalues):
        message = "unequal: %f != %f" % (py_q, r_q)
        self.assertAlmostEqual(py_q, r_q, 2, message)

    for py_flag, r_flag in zip(python_fdr.mPassed, r_fdr[5]):
        message = "threshold-passed flag not equal: %s != %s" % (
            py_flag, r_flag)
        self.assertEqual(py_flag, r_flag, message)
def computeFDR(all_results, qvalue_method="storey"):
    '''compute FDR.

    update GOResult structure with field .fdr

    Arguments
    ---------
    all_results : dict
        Maps a key to an object whose ``mResults`` dictionary holds the
        individual results. Each result needs the attributes
        ``mProbabilityOverRepresentation`` and
        ``mProbabilityUnderRepresentation``.
    qvalue_method : string
        ``"storey"`` computes q-values with ``Stats.doFDR`` and stores
        them in the ``fdr`` attribute of each result. ``"empirical"``
        is not implemented. Any other value leaves the results
        untouched.

    Raises
    ------
    NotImplementedError
        If `qvalue_method` is ``"empirical"``.
    '''
    # flatten all_results - only the per-key result collections matter
    results = []
    for data in all_results.values():
        results.extend(data.mResults.values())

    # correct the smaller of the two one-sided P-values of each result
    observed_min_pvalues = [
        min(x.mProbabilityOverRepresentation,
            x.mProbabilityUnderRepresentation)
        for x in results]

    if qvalue_method == "storey":
        # compute fdr via Storey's method
        fdr_data = Stats.doFDR(observed_min_pvalues, vlambda=0.1)

        E.info("estimated proportion of true null hypotheses = %6.4f" %
               fdr_data.mPi0)

        if fdr_data.mPi0 < 0.1:
            E.warn(
                "estimated proportion of true null hypotheses is "
                "less than 10%% (%6.4f)" % fdr_data.mPi0)

        for result, qvalue in zip(results, fdr_data.mQValues):
            result.fdr = qvalue

    elif qvalue_method == "empirical":
        # ``options`` is not in scope in this function; the branch has
        # to be selected by the ``qvalue_method`` argument instead.
        raise NotImplementedError("empirical needs some work")
def doOldFDR(options, args):
    """apply fdr to output of annotator.

    Every file named in `args` is parsed with ``readAnnotator``. For
    each file q-values are computed with ``Stats.doFDR`` at the level
    ``options.fdr`` and a diagnostic plot is written to
    ``<filename>_diagnostics.png``. Files for which the computation
    fails with a ValueError are reported with a warning and skipped.
    """
    # read input; the context manager closes the file even if parsing fails
    annotators = []
    for filename in args:
        with open(filename, "r") as infile:
            annotators.append(readAnnotator(infile))

    # create diagnostic plots
    for filename, data in zip(args, annotators):
        pvalues = [x.mPValue for x in data]

        try:
            # max() raises ValueError on an empty data set; computing
            # the lambda grid inside the try reports that case with the
            # same warning instead of aborting the whole run.
            vlambda = numpy.arange(0, max(pvalues), 0.05)
            qvalues = Stats.doFDR(
                pvalues, vlambda=vlambda, fdr_level=options.fdr)
        except ValueError as msg:
            E.warn("%s: fdr could not be computed - no filtering: %s" %
                   (filename, msg))
            continue

        qvalues.plot(filename + "_diagnostics.png")

        # NOTE(review): the original built a list of the entries that
        # passed the FDR threshold here, but never used or returned it.
        # The dead assignment has been dropped; only the plot is produced.
def checkFDR(self, **kwargs):
    """Check that ``Stats.doFDRPython`` reproduces ``Stats.doFDR``.

    Both functions are called on ``self.pvalues`` with the same keyword
    arguments. The relative error of pi0 and of every q-value has to be
    below ``self.max_error``.
    """
    reference = Stats.doFDR(self.pvalues, **kwargs)
    candidate = Stats.doFDRPython(self.pvalues, **kwargs)

    pi0_error = getRelativeError(reference.mPi0, candidate.mPi0)
    self.assertTrue(pi0_error < self.max_error)

    triples = zip(self.pvalues, reference.mQValues, candidate.mQValues)
    for pvalue, expected, observed in triples:
        error = getRelativeError(expected, observed)
        message = "qvalues: relative error %f > %f (pvalue=%f, %f, %f)" % (
            error, self.max_error, pvalue, expected, observed)
        self.assertTrue(error < self.max_error, message)
def checkFDR(self, **kwargs):
    """Compare the two FDR implementations in ``Stats``.

    ``Stats.doFDR`` serves as the reference for ``Stats.doFDRPython``;
    both receive ``self.pvalues`` and `kwargs`. The test fails when the
    relative error of pi0 or of any q-value reaches ``self.max_error``.
    """
    limit = self.max_error

    old = Stats.doFDR(self.pvalues, **kwargs)
    new = Stats.doFDRPython(self.pvalues, **kwargs)

    self.assertTrue(getRelativeError(old.mPi0, new.mPi0) < limit)

    for index, pvalue in enumerate(self.pvalues):
        # zip() in the original stops at the shortest sequence
        if index >= len(old.mQValues) or index >= len(new.mQValues):
            break
        q_old = old.mQValues[index]
        q_new = new.mQValues[index]
        deviation = getRelativeError(q_old, q_new)
        self.assertTrue(
            deviation < limit,
            "qvalues: relative error %f > %f (pvalue=%f, %f, %f)" %
            (deviation, limit, pvalue, q_old, q_new))
def loadGOs(infiles, outfile, tablename):
    '''import GO results into a single table.

    This method also computes a global QValue over all tracks,
    genesets and annotation sets.

    For every input file the ``*.overall`` files in ``<infile>.dir``
    are read. Each data line is prefixed with the track, geneset and
    annotation set parsed from the input file name. The smaller of the
    ``pover`` and ``punder`` columns (columns 11 and 12) is collected,
    q-values are computed over all collected P-values with
    ``Stats.doFDR`` and appended as column ``global_qvalue`` before the
    table is loaded with ``P.load``.

    Arguments
    ---------
    infiles : string
        Output files of several runGO analyses
    outfile : string
        Output filename, contains log information
    tablename : string
        Table name for storing results.
    '''
    # raw string: ``\S`` and ``\.`` are regex escapes, not string escapes
    name_pattern = re.compile(r"^(\S+)_vs_(\S+)\.(\S+)")

    tempf1 = P.getTempFile()
    tempf2 = None

    try:
        header = False
        pvalues = []

        for infile in infiles:
            indir = infile + ".dir"
            if not os.path.exists(indir):
                continue

            track, geneset, annotationset = name_pattern.search(
                infile).groups()

            for filename in glob.glob(os.path.join(indir, "*.overall")):
                with open(filename, "r") as inf:
                    for line in inf:
                        if line.startswith("#"):
                            continue
                        # strip only the line terminator so that a last
                        # line without newline does not lose a character
                        line = line.rstrip("\n")
                        data = line.split("\t")
                        if line.startswith("code"):
                            # keep the header of the first file only
                            if header:
                                continue
                            tempf1.write(
                                "track\tgeneset\tannotationset\t%s\n" % line)
                            header = True
                            assert data[10] == "pover" and \
                                data[11] == "punder", \
                                "format error, expected pover-punder, got %s-%s" % (
                                    data[10], data[11])
                            continue
                        tempf1.write("%s\t%s\t%s\t%s\n" % (
                            track, geneset, annotationset, line))
                        pvalues.append(
                            min(float(data[10]), float(data[11])))

        tempf1.close()

        E.info("analysing %i pvalues" % len(pvalues))
        fdr = Stats.doFDR(pvalues)
        E.info("got %i qvalues" % len(fdr.mQValues))

        # the first entry lines up with the header line of tempf1
        qvalues = ["global_qvalue"] + fdr.mQValues

        tempf2 = P.getTempFile()
        with open(tempf1.name, "r") as inf:
            for line, qvalue in zip(inf, qvalues):
                tempf2.write("%s\t%s\n" % (line.rstrip("\n"), str(qvalue)))
        tempf2.close()

        P.load(tempf2.name, outfile,
               tablename=tablename,
               options="--allow-empty-file "
               "--add-index=category "
               "--add-index=track,geneset,annotationset "
               "--add-index=geneset "
               "--add-index=annotationset "
               "--add-index=goid ")
    finally:
        # remove the temporary files even if parsing or loading failed
        for tmp in (tempf1, tempf2):
            if tmp is None:
                continue
            tmp.close()
            if os.path.exists(tmp.name):
                os.unlink(tmp.name)
def loadGOs(infiles, outfile, tablename):
    '''import GO results into a single table.

    This method also computes a global QValue over all tracks,
    genesets and annotation sets.

    For every input file the ``*.overall`` files in ``<infile>.dir``
    are read. Each data line is prefixed with the track, geneset and
    annotation set parsed from the input file name. The smaller of the
    ``pover`` and ``punder`` columns (columns 11 and 12) is collected,
    q-values are computed over all collected P-values with
    ``Stats.doFDR`` and appended as column ``global_qvalue`` before the
    table is loaded with ``P.load``.

    Arguments
    ---------
    infiles : string
        Output files of several runGO analyses
    outfile : string
        Output filename, contains log information
    tablename : string
        Table name for storing results.
    '''
    # raw string: ``\S`` and ``\.`` are regex escapes, not string escapes
    name_pattern = re.compile(r"^(\S+)_vs_(\S+)\.(\S+)")

    tempf1 = P.getTempFile()
    tempf2 = None

    try:
        header = False
        pvalues = []

        for infile in infiles:
            indir = infile + ".dir"
            if not os.path.exists(indir):
                continue

            track, geneset, annotationset = name_pattern.search(
                infile).groups()

            for filename in glob.glob(os.path.join(indir, "*.overall")):
                with open(filename, "r") as inf:
                    for line in inf:
                        if line.startswith("#"):
                            continue
                        # strip only the line terminator so that a last
                        # line without newline does not lose a character
                        line = line.rstrip("\n")
                        data = line.split("\t")
                        if line.startswith("code"):
                            # keep the header of the first file only
                            if header:
                                continue
                            tempf1.write(
                                "track\tgeneset\tannotationset\t%s\n" % line)
                            header = True
                            assert data[10] == "pover" and \
                                data[11] == "punder", \
                                "format error, expected pover-punder, got %s-%s" % (
                                    data[10], data[11])
                            continue
                        tempf1.write("%s\t%s\t%s\t%s\n" % (
                            track, geneset, annotationset, line))
                        pvalues.append(
                            min(float(data[10]), float(data[11])))

        tempf1.close()

        E.info("analysing %i pvalues" % len(pvalues))
        fdr = Stats.doFDR(pvalues)
        E.info("got %i qvalues" % len(fdr.mQValues))

        # the first entry lines up with the header line of tempf1
        qvalues = ["global_qvalue"] + fdr.mQValues

        tempf2 = P.getTempFile()
        with open(tempf1.name, "r") as inf:
            for line, qvalue in zip(inf, qvalues):
                tempf2.write("%s\t%s\n" % (line.rstrip("\n"), str(qvalue)))
        tempf2.close()

        P.load(tempf2.name, outfile,
               tablename=tablename,
               options="--allow-empty-file "
               "--add-index=category "
               "--add-index=track,geneset,annotationset "
               "--add-index=geneset "
               "--add-index=annotationset "
               "--add-index=goid ")
    finally:
        # remove the temporary files even if parsing or loading failed
        for tmp in (tempf1, tempf2):
            if tmp is None:
                continue
            tmp.close()
            if os.path.exists(tmp.name):
                os.unlink(tmp.name)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Sequences are read in FASTA format from ``options.stdin`` and
    scanned with the selected Nubiscan matrix. One tab-separated line
    per reported match is written to ``options.stdout``. The option
    ``--fdr-control`` selects how q-values are obtained:

    all
        all sequences are handed to
        ``Nubiscan.MatcherRandomisationSequences`` in one call.
    xall
        each sequence is scanned separately without threshold and
        q-values are computed over all matches with ``Stats.doFDR``.
    per-sequence
        each sequence is scanned and thresholded separately.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: run_nubiscan.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-i", "--iterations", dest="iterations", type="int",
        help="number of iterations for sampling [default=%default].")

    parser.add_option(
        "-q", "--qvalue", dest="qvalue_threshold", type="float",
        help="qvalue threshold [default=%default].")

    # the help texts of the next three options used to be copies of
    # unrelated options and did not describe what the options do
    parser.add_option(
        "--without-combine", dest="combine", action="store_false",
        help="do not combine overlapping motifs "
        "[combine default=%default].")

    parser.add_option(
        "-f", "--fdr-control", dest="fdr_control", type="choice",
        choices=("per-sequence", "all", "xall"),
        help="scope of the FDR control [default=%default].")

    parser.add_option(
        "-m", "--motif", dest="motif", type="choice",
        choices=("rxrvdr", "rxrvdr1", "rxrvdr2", "nr"),
        help="motif matrix to scan with [default=%default].")

    parser.add_option(
        "-a", "--arrangements", dest="arrangements", type="string",
        help="',' separated list of repeat arrangements [default=%default]")

    parser.add_option(
        "-x", "--mask", dest="mask", type="choice",
        choices=("dust", "repeatmasker"),
        help="mask sequences before scanning [default=%default]")

    parser.add_option(
        "--output-stats", dest="output_stats", action="store_true",
        help="output stats [default=%default].")

    parser.add_option(
        "--add-sequence", dest="add_sequence", action="store_true",
        help="add sequence information [default=%default].")

    parser.set_defaults(
        iterations=100,
        qvalue_threshold=0.05,
        motif="rxrvdr",
        fdr_control="all",
        combine=True,
        arrangements=None,
        mask=None,
        output_stats=False,
        add_sequence=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if options.arrangements is None:
        options.arrangements = (
            ["DR%s" % x for x in range(0, 15)] +
            ["ER%s" % x for x in range(0, 15)])
    else:
        options.arrangements = options.arrangements.split(",")

    # header line
    options.stdout.write("%s" % "\t".join(Nubiscan.NubiscanMatch._fields))
    if options.add_sequence:
        options.stdout.write("\tsequence")
    options.stdout.write("\n")

    matrices = {
        "nr": NR,
        "rxrvdr": RXRVDR,
        "rxrvdr1": RXRVDR1,
        "rxrvdr2": RXRVDR2,
    }
    try:
        sense_matrix = matrices[options.motif]
    except KeyError:
        raise ValueError("unknown matrix %s" % options.motif)

    if options.fdr_control == "all":

        seqs = list(FastaIterator.iterate(options.stdin))

        if options.mask:
            masked_seqs = maskSequences(
                [x.sequence for x in seqs], options.mask)
        else:
            masked_seqs = [x.sequence for x in seqs]

        ninput = len(seqs)
        # matches refer to sequences by position; map back to the first
        # word of the title
        map_id2title = dict(
            enumerate([re.sub(r"\s.*", "", x.title) for x in seqs]))

        matcher = Nubiscan.MatcherRandomisationSequences(
            sense_matrix, samples=options.iterations)

        results = matcher.run(masked_seqs,
                              options.arrangements,
                              qvalue_threshold=options.qvalue_threshold)

        if options.combine:
            results = Nubiscan.combineMotifs(results)

        for r in results:

            if r.alternatives:
                alternatives = ",".join(
                    [x.arrangement for x in r.alternatives])
            else:
                alternatives = ""

            options.stdout.write("\t".join((
                map_id2title[r.id],
                "%i" % r.start,
                "%i" % r.end,
                r.strand,
                r.arrangement,
                "%6.4f" % r.score,
                "%6.4f" % r.zscore,
                "%6.4e" % r.pvalue,
                "%6.4e" % r.qvalue,
                alternatives)))

            if options.add_sequence:
                s = masked_seqs[int(r.id)][r.start:r.end]
                if r.strand == "-":
                    s = Genomics.complement(s)
                # upper-case the first and last six bases, lower-case
                # everything in between
                s = s[:6].upper() + s[6:-6].lower() + s[-6:].upper()
                options.stdout.write("\t%s" % s)

            options.stdout.write("\n")
            noutput += 1

        # output stats
        if options.output_stats:
            outfile = E.openOutputFile("fdr")
            outfile.write("bin\thist\tnobserved\n")
            for bin_edge, hist, nobs in zip(matcher.bin_edges,
                                            matcher.hist,
                                            matcher.nobservations):
                outfile.write("%f\t%f\t%f\n" % (bin_edge, hist, nobs))
            outfile.close()

    elif options.fdr_control == "xall":

        matcher = Nubiscan.MatcherRandomisationSequence(
            sense_matrix, samples=options.iterations)

        # collect all results
        matches = []
        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            mm = matcher.run(seq.sequence,
                             options.arrangements,
                             qvalue_threshold=None)
            for m in mm:
                matches.append(m._replace(sequence=seq.title))

        # estimate qvalues for all matches across all sequences
        pvalues = [x.pvalue for x in matches]
        fdr = Stats.doFDR(pvalues)
        qvalues = fdr.mQValues

        results = []
        for m, qvalue in zip(matches, qvalues):
            if qvalue > options.qvalue_threshold:
                continue
            results.append(m._replace(qvalue=qvalue))

        if options.combine:
            results = Nubiscan.combineMotifs(results)

        # output
        for r in results:
            options.stdout.write("\t".join((
                r.id,
                "%i" % r.start,
                "%i" % r.end,
                r.strand,
                r.arrangement,
                "%6.4f" % r.score,
                "%6.4f" % r.zscore,
                "%6.4e" % r.pvalue,
                "%6.4e" % r.qvalue)) + "\n")
            noutput += 1

    elif options.fdr_control == "per-sequence":

        matcher = Nubiscan.MatcherRandomisationSequence(
            sense_matrix, samples=options.iterations)

        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            result = matcher.run(seq.sequence,
                                 options.arrangements,
                                 qvalue_threshold=options.qvalue_threshold)

            if options.combine:
                result = Nubiscan.combineMotifs(result)

            t = re.sub(" .*", "", seq.title)
            for r in result:
                options.stdout.write("\t".join((
                    t,
                    "%i" % r.start,
                    "%i" % r.end,
                    r.strand,
                    r.arrangement,
                    "%6.4f" % r.score,
                    "%6.4f" % r.zscore,
                    "%f" % r.pvalue,
                    "%f" % r.qvalue)) + "\n")
                noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
def doFDR(options, args):
    """Apply FDR control to annotator output and write a table.

    Every file named in `args` is parsed with ``readAnnotator`` into a
    tuple ``(data, fdr, synonyms, input_files)``. One tab-separated row
    per remaining data point is written to ``options.stdout``, preceded
    by a header line. The keys of all ``input_files`` dictionaries
    become additional columns.

    ``options.fdr_method`` selects the procedure:

    annotator
        use the FDR statistics in ``fdr``: going from the largest to
        the smallest P-value, take the first one whose ratio
        ``mAverage / mObserved`` is below ``options.fdr_qvalue``.
    estimate
        estimate q-values from the observed P-values with
        ``Stats.doFDR``.
    annotator-estimate
        as ``annotator``, falling back to ``estimate`` if no suitable
        P-value was found.

    If ``options.fdr_qvalue`` is not None, data points that do not pass
    are removed; if no FDR could be computed, no data point is output.
    """
    # read input; the context manager closes the file even if parsing fails
    annotators = []
    for filename in args:
        with open(filename, "r") as infile:
            annotators.append(readAnnotator(infile))

    do_filter = options.fdr_qvalue is not None

    extra_headers = set()
    for data, fdr, synonyms, input_files in annotators:
        extra_headers.update(input_files.keys())
    extra_headers = sorted(extra_headers)

    # note: id used to be file
    options.stdout.write(
        "id\tover\tcategory\tpvalue\tfold\tobserved\texpected\tci95low\tci95high\tstddev\tfdr\tqvalue\t%s\n" %
        "\t".join(extra_headers))

    # apply filters and create diagnostic plots
    for filename, (data, fdr, synonyms, input_files) in zip(args, annotators):

        ninput = len(data)

        E.info("processing %s with %i data points" % (filename, ninput))
        no_fdr = False

        if options.fdr_method in ("annotator", "annotator-estimate"):
            # NOTE(review): the warning branch below is read as the
            # ``else`` of the for loop (no P-value satisfied the
            # threshold) - confirm against the original layout.
            last_checked = None
            for pvalue in sorted(fdr.keys(), reverse=True):
                try:
                    stats = fdr[pvalue]["Significant"]
                except KeyError:
                    continue

                if stats.mObserved == 0:
                    E.info("no data after fdr")
                    break

                actual_fdr = stats.mAverage / stats.mObserved
                if actual_fdr < options.fdr_qvalue:
                    E.info("filtering with P-value of %f" % pvalue)
                    if do_filter:
                        data = [x for x in data if x.mPValue < pvalue]
                    for d in data:
                        if d.mPValue < pvalue:
                            d.mFDR = 1
                            d.mQValue = options.fdr_qvalue
                    break

                last_checked = (pvalue, actual_fdr)
            else:
                if last_checked is None:
                    # no P-value carried FDR statistics at all; the
                    # original would have failed on an unbound variable
                    E.warn("fdr could not be computed - "
                           "no FDR statistics available")
                else:
                    E.warn("fdr could not be computed - compute more samples (at P = %f, actual fdr=%f)" %
                           last_checked)
                no_fdr = True

        if options.fdr_method == "estimate" or \
                (options.fdr_method == "annotator-estimate" and no_fdr):

            E.info("estimating FDR from observed P-Values")
            pvalues = [x.mPValue for x in data]
            try:
                # max() raises ValueError on an empty data set; keep it
                # inside the try so that case is reported, not fatal
                vlambda = numpy.arange(0, max(pvalues), 0.05)
                qvalues = Stats.doFDR(
                    pvalues, vlambda=vlambda, fdr_level=options.fdr_qvalue)
            except ValueError as msg:
                E.warn("fdr could not be computed - no output: %s" % msg)
                no_fdr = True
            else:
                # the estimate succeeded: clear a failure flag left by
                # the annotator stage, otherwise the fallback result
                # would be discarded below
                no_fdr = False
                for d, passed, qvalue in zip(data,
                                             qvalues.mPassed,
                                             qvalues.mQValues):
                    if passed:
                        d.mFDR = 1
                        d.mQValue = qvalue

                if do_filter:
                    data = [x[0] for x in zip(data, qvalues.mPassed)
                            if x[1]]

        if do_filter and no_fdr:
            data = []

        nremoved = ninput - len(data)

        E.info("%s: %i data points left, %i removed" %
               (filename, len(data), nremoved))

        extra_values = "\t".join(
            [str(input_files[key]) if key in input_files else ""
             for key in extra_headers])

        for d in data:
            code = "-" if d.mFoldChange < 1 else "+"

            # fall back to the file name if the pattern does not match
            try:
                row_id = re.search(options.regex_id, filename).groups()[0]
            except AttributeError:
                row_id = filename

            options.stdout.write(
                "%s\t%s\t%s\t%e\t%6.4f\t%f\t%f\t%f\t%f\t%f\t%i\t%e\t%s\n" %
                (row_id,
                 code,
                 d.mAnnotation,
                 d.mPValue,
                 d.mFoldChange,
                 d.mObserved,
                 d.mExpected,
                 d.mCI95[0],
                 d.mCI95[1],
                 d.mStdDev,
                 d.mFDR,
                 d.mQValue,
                 extra_values))
def _countLeadingBelow(values, threshold):
    '''return the number of leading entries of `values` below `threshold`.

    Counting stops at the first entry that is not smaller than
    `threshold`.
    '''
    count = 0
    for value in values:
        if not value < threshold:
            break
        count += 1
    return count


def computeFDRs(go_results, foreground, background, options,
                test_ontology, gene2go, go2info):
    '''compute q-values for all categories in `go_results`.

    The P-value of a category is the smaller of its
    ``mProbabilityOverRepresentation`` and
    ``mProbabilityUnderRepresentation``. ``options.qvalue_method``
    selects the correction:

    storey
        ``Stats.doFDR``; if that raises a ValueError, a Bonferroni
        correction is applied instead.
    empirical
        FDR from the sampling results of ``getSamples``. Requires
        ``options.sample > 0``.
    anything else
        passed as ``method`` to R's ``p.adjust``.

    Returns
    -------
    fdrs : dict
        Maps a category to a tuple ``(qvalue, a, b)``. ``a`` and ``b``
        are 1.0 except for the empirical method, where ``a`` is the
        number of simulated P-values below the category's P-value
        divided by the sample size and ``b`` is the number of observed
        P-values below it.
    samples
        The first value returned by ``getSamples`` for the empirical
        method, otherwise None.
    method : string
        The method that was applied; ``"bonf"`` if the Storey
        computation failed.

    Raises
    ------
    KeyError
        In the empirical method, if a category is missing in the
        samples.
    '''
    pairs = sorted(go_results.mResults.items())

    E.info("calculating the FDRs using method `%s`" %
           options.qvalue_method)

    samples = None

    observed_min_pvalues = [
        min(result.mProbabilityOverRepresentation,
            result.mProbabilityUnderRepresentation)
        for _, result in pairs]

    fdrs = {}

    method = options.qvalue_method

    if options.qvalue_method == "storey":

        # compute fdr via Storey's method
        try:
            qvalues = Stats.doFDR(observed_min_pvalues).mQValues
        except ValueError as msg:
            E.warn("failure in q-value computation: %s" % msg)
            E.warn("reverting to Bonferroni correction")
            method = "bonf"
            ntests = float(len(observed_min_pvalues))
            qvalues = [min(1.0, x * ntests) for x in observed_min_pvalues]

        for (category, _), qvalue in zip(pairs, qvalues):
            fdrs[category] = (qvalue, 1.0, 1.0)

    elif options.qvalue_method == "empirical":
        assert options.sample > 0, "requiring a sample size of > 0"

        # sampling
        samples, simulation_min_pvalues = getSamples(
            gene2go, foreground, background, options,
            test_ontology, go2info)

        # sorted array of observed P-values for fast rank look-up
        observed_min_pvalues.sort()
        observed_min_pvalues = numpy.array(observed_min_pvalues)

        sample_size = options.sample

        for k, v in pairs:

            if k not in samples:
                raise KeyError("category %s not in samples" % k)

            # FDR for the P-value p of a category:
            #   a = average number of categories per simulation run
            #       with a P-value < p (expected false positives)
            #   b = number of categories in the observed data with a
            #       P-value < p (positives)
            #   fdr = a / b
            pvalue = v.mPValue

            # presumably simulation_min_pvalues is sorted by getSamples;
            # that is not visible here, so the leading-run count of the
            # original is kept for it
            a = _countLeadingBelow(simulation_min_pvalues, pvalue)
            a = float(a) / float(sample_size)

            # the observed array was sorted above, so the number of
            # entries below pvalue is its left insertion point
            b = int(numpy.searchsorted(
                observed_min_pvalues, pvalue, side="left"))

            if b > 0:
                fdr = min(1.0, float(a) / float(b))
            else:
                fdr = 1.0

            fdrs[k] = (fdr, a, b)
    else:
        qvalues = list(R['p.adjust'](observed_min_pvalues,
                                     method=options.qvalue_method))
        for (category, _), qvalue in zip(pairs, qvalues):
            fdrs[category] = (qvalue, 1.0, 1.0)

    return fdrs, samples, method
def _polyphenQValues(pvalues, label):
    '''return q-values for `pvalues`, or the P-values if there are too few.

    ``Stats.doFDR`` is only applied if there are more than 10 P-values;
    otherwise a warning naming `label` is issued and the P-values are
    returned unchanged.
    '''
    if len(pvalues) > 10:
        return Stats.doFDR(pvalues).mQValues
    E.warn("no FDR computed for %s" % label)
    return pvalues


def analysePolyphen(infile, outfile):
    '''compute enrichment of SNPs within genes
    and deleterious SNPs within SNPs within genes.

    del: enrichment of deleterious snps within snps per gene
    len: enrichment of snps within genes
    com: enrichment of deleterious snps within gene

    Counts per gene are read from the table derived from `infile` and
    from ``polyphen_map``. For each gene, binomial P-values for the
    three tests are computed against the background probabilities over
    all genes and converted to q-values. One line per gene is written
    to `outfile`. The ``code`` column has one digit per test (del, len,
    com) that is 1 if the q-value is below ``PARAMS["polyphen_fdr"]``.
    Genes without SNPs are added with the code ``---``.
    '''

    table = P.toTable(infile)
    tablename_map = "polyphen_map"

    dbhandle = connect()
    cc = dbhandle.cursor()

    statement = '''
        SELECT i.gene_id,
               COUNT(DISTINCT map.locus_id) as nsnps,
               COUNT(DISTINCT case t.prediction
                     when 'possiblydamaging' then map.locus_id
                     when 'probablydamaging' then map.locus_id
                     else NULL end) AS ndeleterious,
               MAX(s.length)
        FROM %(table)s as t,
             %(tablename_map)s as map,
             annotations.protein_stats as s,
             annotations.transcript_info as i
        WHERE map.snp_id = t.snp_id AND
              i.transcript_id = map.transcript_id AND
              s.protein_id = map.protein_id
        GROUP BY i.gene_id
    ''' % {"table": table, "tablename_map": tablename_map}

    data = cc.execute(statement).fetchall()

    statement = '''SELECT DISTINCT i.gene_id, MAX(s.length)
                   FROM annotations.transcript_info AS i,
                        annotations.protein_stats AS s
                   WHERE s.protein_id = i.protein_id
                   GROUP BY i.gene_id'''

    gene_ids = cc.execute(statement).fetchall()

    total_nsnps = sum([x[1] for x in data])
    total_ndel = sum([x[2] for x in data])
    total_length = sum([x[1] for x in gene_ids])

    # background probabilities over all genes
    del_p = float(total_ndel) / total_nsnps
    len_p = float(total_nsnps) / total_length
    com_p = float(total_ndel) / total_length

    E.info("del: background probability: %i/%i = %f" %
           (total_ndel, total_nsnps, del_p))
    E.info("len: background probability: %i/%i = %f" %
           (total_nsnps, total_length, len_p))
    E.info("com: background probability: %i/%i = %f" %
           (total_ndel, total_length, com_p))

    del_pvalues, len_pvalues, com_pvalues = [], [], []
    for gene_id, nsnps, ndel, length in data:
        # use -1, because I need P( x >= X)
        # sf = 1 - cdf and cdf = P( x <= X ), thus
        # sf = 1 - P( x <= X ) = P (x > X ).
        del_pvalues.append(scipy.stats.binom.sf(ndel - 1, nsnps, del_p))
        len_pvalues.append(
            scipy.stats.binom.sf(nsnps - 1, int(round(length)), len_p))
        com_pvalues.append(
            scipy.stats.binom.sf(ndel - 1, int(round(length)), com_p))

    # The original stored the ``com`` q-values in a differently named
    # variable (NameError further down) and reported "del" for the
    # ``len`` test.
    del_qvalues = _polyphenQValues(del_pvalues, "del")
    len_qvalues = _polyphenQValues(len_pvalues, "len")
    com_qvalues = _polyphenQValues(com_pvalues, "com")

    fdr = PARAMS["polyphen_fdr"]

    found = set()

    with open(outfile, "w") as outf:
        outf.write("\t".join(("gene_id", "code",
                              "length", "nsnps", "ndel",
                              "del_p", "del_pvalue", "del_qvalue",
                              "len_p", "len_pvalue", "len_qvalue",
                              "com_p", "com_pvalue", "com_qvalue", )) + "\n")

        rows = zip(data,
                   del_pvalues, del_qvalues,
                   len_pvalues, len_qvalues,
                   com_pvalues, com_qvalues)

        for (row, del_pvalue, del_qvalue, len_pvalue, len_qvalue,
             com_pvalue, com_qvalue) in rows:
            gene_id, nsnps, ndel, length = row
            found.add(gene_id)

            # per-gene proportions
            gene_del_p = float(ndel) / nsnps
            gene_len_p = float(nsnps) / length
            # NOTE(review): the com_p column reports the background
            # probability for every gene, while del_p and len_p are
            # per-gene values - confirm whether that is intended.

            code = "".join([str(int(x < fdr))
                            for x in (del_qvalue, len_qvalue, com_qvalue)])

            outf.write("\t".join((gene_id,
                                  code,
                                  "%i" % int(round(length)),
                                  "%i" % int(nsnps),
                                  "%i" % int(ndel),
                                  "%6.4f" % gene_del_p,
                                  "%6.4g" % del_pvalue,
                                  "%6.4g" % del_qvalue,
                                  "%6.4f" % gene_len_p,
                                  "%6.4g" % len_pvalue,
                                  "%6.4g" % len_qvalue,
                                  "%6.4f" % com_p,
                                  "%6.4g" % com_pvalue,
                                  "%6.4g" % com_qvalue,
                                  )) + "\n")

        # add missing genes:
        code = "---"
        for gene_id, length in gene_ids:
            if gene_id in found:
                continue
            outf.write("\t".join((gene_id,
                                  code,
                                  "%i" % int(round(length)),
                                  "%i" % 0,
                                  "%i" % 0,
                                  "%6.4f" % 0,
                                  "%6.4g" % 1,
                                  "%6.4g" % 1,
                                  "%6.4f" % 0,
                                  "%6.4g" % 1,
                                  "%6.4g" % 1,
                                  "%6.4f" % 0,
                                  "%6.4g" % 1,
                                  "%6.4g" % 1,
                                  )) + "\n")
def loadGOs(infiles, outfile, tablename):
    '''import GO results into a single table.

    This method also computes a global QValue over all
    tracks, genesets and annotation sets.

    For every input file the ``*.overall`` files in ``<infile>.dir``
    are read. Each data line is prefixed with the track, geneset and
    annotation set parsed from the input file name. The smaller of the
    ``pover`` and ``punder`` columns (columns 11 and 12) is collected,
    q-values are computed over all collected P-values with
    ``Stats.doFDR`` and appended as column ``global_qvalue``. The
    resulting file is loaded by running ``csv2db.py`` through
    ``P.run()``.
    '''
    # raw string: ``\S`` and ``\.`` are regex escapes, not string escapes
    name_pattern = re.compile(r"^(\S+)_vs_(\S+)\.(\S+)")

    header = False

    tempf1 = P.getTempFile()

    pvalues = []

    for infile in infiles:
        indir = infile + ".dir"

        if not os.path.exists(indir):
            continue

        track, geneset, annotationset = name_pattern.search(infile).groups()

        for filename in glob.glob(os.path.join(indir, "*.overall")):
            with open(filename, "r") as inf:
                for line in inf:
                    if line.startswith("#"):
                        continue
                    # strip only the line terminator so that a last line
                    # without newline does not lose a character
                    line = line.rstrip("\n")
                    data = line.split("\t")
                    if line.startswith("code"):
                        # keep the header of the first file only
                        if header:
                            continue
                        tempf1.write(
                            "track\tgeneset\tannotationset\t%s\n" % line)
                        header = True
                        assert data[10] == "pover" and \
                            data[11] == "punder", \
                            "format error, expected pover-punder, got %s-%s" % (
                                data[10], data[11])
                        continue
                    tempf1.write("%s\t%s\t%s\t%s\n" % (
                        track, geneset, annotationset, line))
                    pvalues.append(min(float(data[10]), float(data[11])))

    tempf1.close()

    E.info("analysing %i pvalues" % len(pvalues))
    fdr = Stats.doFDR(pvalues)
    E.info("got %i qvalues" % len(fdr.mQValues))

    # the first entry lines up with the header line of tempf1
    qvalues = ["global_qvalue"] + fdr.mQValues

    tempf2 = P.getTempFile()

    with open(tempf1.name, "r") as inf:
        for line, qvalue in zip(inf, qvalues):
            tempf2.write("%s\t%s\n" % (line.rstrip("\n"), str(qvalue)))

    tempf2.close()

    # The statement below contains %(tempfilename)s, %(tablename)s and
    # %(outfile)s placeholders and P.run() is called without arguments,
    # so these local names must not be renamed or removed.
    # NOTE(review): presumably P.run() picks up ``statement`` and the
    # placeholder values from this function's locals - confirm.
    tempfilename = tempf2.name

    # kept from the original: report the temporary file names on stdout
    print(tempf1.name)
    print(tempf2.name)

    statement = '''
    python %(scriptsdir)s/csv2db.py %(csv2db_options)s
              --allow-empty
              --index=category
              --index=track,geneset,annotationset
              --index=geneset
              --index=annotationset
              --index=goid
              --table=%(tablename)s
    < %(tempfilename)s
    > %(outfile)s
    '''
    P.run()

    # NOTE(review): as in the original, neither temporary file is
    # removed afterwards.
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Sequences are read in FASTA format from ``options.stdin`` and
    scanned with the selected Nubiscan matrix. One tab-separated line
    per reported match is written to ``options.stdout``. The option
    ``--fdr-control`` selects how q-values are obtained:

    all
        all sequences are handed to
        ``Nubiscan.MatcherRandomisationSequences`` in one call.
    xall
        each sequence is scanned separately without threshold and
        q-values are computed over all matches with ``Stats.doFDR``.
    per-sequence
        each sequence is scanned and thresholded separately.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: run_nubiscan.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-i", "--iterations", dest="iterations", type="int",
        help="number of iterations for sampling [default=%default].")

    parser.add_option(
        "-q", "--qvalue", dest="qvalue_threshold", type="float",
        help="qvalue threshold [default=%default].")

    # the help texts of the next three options used to be copies of
    # unrelated options and did not describe what the options do
    parser.add_option(
        "--without-combine", dest="combine", action="store_false",
        help="do not combine overlapping motifs "
        "[combine default=%default].")

    parser.add_option(
        "-f", "--fdr-control", dest="fdr_control", type="choice",
        choices=("per-sequence", "all", "xall"),
        help="scope of the FDR control [default=%default].")

    parser.add_option(
        "-m", "--motif", dest="motif", type="choice",
        choices=("rxrvdr", "rxrvdr1", "rxrvdr2", "nr"),
        help="motif matrix to scan with [default=%default].")

    parser.add_option(
        "-a", "--arrangements", dest="arrangements", type="string",
        help="',' separated list of repeat arrangements [default=%default]")

    parser.add_option(
        "-x", "--mask", dest="mask", type="choice",
        choices=("dust", "repeatmasker"),
        help="mask sequences before scanning [default=%default]")

    parser.add_option(
        "--output-stats", dest="output_stats", action="store_true",
        help="output stats [default=%default].")

    parser.add_option(
        "--add-sequence", dest="add_sequence", action="store_true",
        help="add sequence information [default=%default].")

    parser.set_defaults(
        iterations=100,
        qvalue_threshold=0.05,
        motif="rxrvdr",
        fdr_control="all",
        combine=True,
        arrangements=None,
        mask=None,
        output_stats=False,
        add_sequence=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if options.arrangements is None:
        options.arrangements = (
            ["DR%s" % x for x in range(0, 15)] +
            ["ER%s" % x for x in range(0, 15)])
    else:
        options.arrangements = options.arrangements.split(",")

    # header line
    options.stdout.write("%s" % "\t".join(Nubiscan.NubiscanMatch._fields))
    if options.add_sequence:
        options.stdout.write("\tsequence")
    options.stdout.write("\n")

    matrices = {
        "nr": NR,
        "rxrvdr": RXRVDR,
        "rxrvdr1": RXRVDR1,
        "rxrvdr2": RXRVDR2,
    }
    try:
        sense_matrix = matrices[options.motif]
    except KeyError:
        raise ValueError("unknown matrix %s" % options.motif)

    if options.fdr_control == "all":

        seqs = list(FastaIterator.iterate(options.stdin))

        if options.mask:
            masked_seqs = maskSequences(
                [x.sequence for x in seqs], options.mask)
        else:
            masked_seqs = [x.sequence for x in seqs]

        ninput = len(seqs)
        # matches refer to sequences by position; map back to the first
        # word of the title
        map_id2title = dict(
            enumerate([re.sub(r"\s.*", "", x.title) for x in seqs]))

        matcher = Nubiscan.MatcherRandomisationSequences(
            sense_matrix, samples=options.iterations)

        results = matcher.run(masked_seqs,
                              options.arrangements,
                              qvalue_threshold=options.qvalue_threshold)

        if options.combine:
            results = Nubiscan.combineMotifs(results)

        for r in results:

            if r.alternatives:
                alternatives = ",".join(
                    [x.arrangement for x in r.alternatives])
            else:
                alternatives = ""

            options.stdout.write("\t".join((
                map_id2title[r.id],
                "%i" % r.start,
                "%i" % r.end,
                r.strand,
                r.arrangement,
                "%6.4f" % r.score,
                "%6.4f" % r.zscore,
                "%6.4e" % r.pvalue,
                "%6.4e" % r.qvalue,
                alternatives)))

            if options.add_sequence:
                s = masked_seqs[int(r.id)][r.start:r.end]
                if r.strand == "-":
                    s = Genomics.complement(s)
                # upper-case the first and last six bases, lower-case
                # everything in between
                s = s[:6].upper() + s[6:-6].lower() + s[-6:].upper()
                options.stdout.write("\t%s" % s)

            options.stdout.write("\n")
            noutput += 1

        # output stats
        if options.output_stats:
            outfile = E.openOutputFile("fdr")
            outfile.write("bin\thist\tnobserved\n")
            for bin_edge, hist, nobs in zip(matcher.bin_edges,
                                            matcher.hist,
                                            matcher.nobservations):
                outfile.write("%f\t%f\t%f\n" % (bin_edge, hist, nobs))
            outfile.close()

    elif options.fdr_control == "xall":

        matcher = Nubiscan.MatcherRandomisationSequence(
            sense_matrix, samples=options.iterations)

        # collect all results
        matches = []
        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            mm = matcher.run(seq.sequence,
                             options.arrangements,
                             qvalue_threshold=None)
            for m in mm:
                matches.append(m._replace(sequence=seq.title))

        # estimate qvalues for all matches across all sequences
        pvalues = [x.pvalue for x in matches]
        fdr = Stats.doFDR(pvalues)
        qvalues = fdr.mQValues

        results = []
        for m, qvalue in zip(matches, qvalues):
            if qvalue > options.qvalue_threshold:
                continue
            results.append(m._replace(qvalue=qvalue))

        if options.combine:
            results = Nubiscan.combineMotifs(results)

        # output
        for r in results:
            options.stdout.write("\t".join((
                r.id,
                "%i" % r.start,
                "%i" % r.end,
                r.strand,
                r.arrangement,
                "%6.4f" % r.score,
                "%6.4f" % r.zscore,
                "%6.4e" % r.pvalue,
                "%6.4e" % r.qvalue)) + "\n")
            noutput += 1

    elif options.fdr_control == "per-sequence":

        matcher = Nubiscan.MatcherRandomisationSequence(
            sense_matrix, samples=options.iterations)

        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            result = matcher.run(seq.sequence,
                                 options.arrangements,
                                 qvalue_threshold=options.qvalue_threshold)

            if options.combine:
                result = Nubiscan.combineMotifs(result)

            t = re.sub(" .*", "", seq.title)
            for r in result:
                options.stdout.write("\t".join((
                    t,
                    "%i" % r.start,
                    "%i" % r.end,
                    r.strand,
                    r.arrangement,
                    "%6.4f" % r.score,
                    "%6.4f" % r.zscore,
                    "%f" % r.pvalue,
                    "%f" % r.qvalue)) + "\n")
                noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
def pairwiseGOEnrichment(results_per_genelist, labels, test_ontology, go2info,
                         options):
    '''compute pairwise enrichment between sets.

    The purpose of this method is to find if there are categories
    that are differently enriched in a pair of gene lists.

    The counts of a category in two gene lists are compared with
    Fisher's exact test (:func:`scipy.stats.fisher_exact`) on the 2x2
    table of in-category / not-in-category foreground counts. The
    assumption is that the background set is the same in all gene
    lists.

    The workflow is thus::

       for each combination of two gene lists:
           for each GO category present in both lists:
               get counts in foreground, total counts of foreground
               compute Fisher's exact test
               save P-value
       apply fdr - output all tested pairs with their q-value.

    Two files are obtained through ``getFileName``: a per-pair
    summary (section ``summary``) and the per-category results
    (section ``pairs``).

    Arguments
    ---------
    results_per_genelist : list
        One entry per gene list. Each entry is converted with
        ``dict()`` and maps a GO category to a result object that
        provides ``mSampleCountsCategory``, ``mSampleCountsTotal``,
        ``mPValue`` and ``mQValue``.
    labels : list
        Label of each gene list, in the same order as
        `results_per_genelist`.
    test_ontology :
        Passed as ``go`` to ``getFileName``.
    go2info : dict
        Maps a GO category to an object with ``mDescription``.
    options :
        Option object. The attributes ``pairs_min_observed_counts``,
        ``output_filename_pattern``, ``fdr`` and ``qvalue_method``
        are read.
    '''

    dicts = [dict(x) for x in results_per_genelist]

    PairResult = collections.namedtuple(
        "PairResult",
        "goid set1 set2 counts1 total1 pvalue1 qvalue1 "
        "counts2 total2 pvalue2 qvalue2 pvalue qvalue description")

    outfile = getFileName(options,
                          go=test_ontology,
                          section='summary',
                          set="pairs")

    outfile.write(
        "set1\tset2\ttotal1\ttotal2\tshared\tskipped\ttested\t"
        "significant\tinsignificant\n")

    results = []

    # number of unordered pairs; integer division keeps this an int
    npairs = len(dicts) * (len(dicts) - 1) // 2
    iteration = 0

    min_observed_counts = options.pairs_min_observed_counts

    # Iterate in input order: the indices x and y are used to look up
    # ``labels``, so the gene lists must not be re-ordered here.
    for x, genelist1 in enumerate(dicts):

        x_go_categories = set(genelist1)

        for y, genelist2 in enumerate(dicts[:x]):

            iteration += 1
            if iteration % 10 == 0:
                E.info("iteration: %i/%i (%5.2f%%)" %
                       (iteration, npairs, 100.0 * iteration / npairs))

            y_go_categories = set(genelist2)

            shared = x_go_categories.intersection(y_go_categories)

            c = E.Counter()

            for category in shared:
                c.shared += 1
                xx = genelist1[category]
                yy = genelist2[category]

                # discard all tests with few observations in the
                # observed counts (only if both lists are below the
                # threshold)
                if (xx.mSampleCountsCategory < min_observed_counts and
                        yy.mSampleCountsCategory < min_observed_counts):
                    c.skipped += 1
                    continue

                # 2x2 contingency table: in category / not in category
                aa = xx.mSampleCountsCategory
                bb = yy.mSampleCountsCategory
                cc = xx.mSampleCountsTotal - xx.mSampleCountsCategory
                dd = yy.mSampleCountsTotal - yy.mSampleCountsCategory

                # both lists consist of this category only - no test
                if cc == dd == 0:
                    c.skipped += 1
                    continue

                c.tested += 1

                # only the P-value is used, the odds ratio is ignored
                _, pvalue = scipy.stats.fisher_exact(
                    numpy.array(((aa, bb), (cc, dd))))

                if pvalue < 0.05:
                    c.significant_pvalue += 1
                else:
                    c.insignificant_pvalue += 1

                # qvalue is set to the placeholder 1.0 and replaced
                # below if fdr computation is requested.
                results.append(PairResult._make(
                    (category,
                     labels[x],
                     labels[y],
                     xx.mSampleCountsCategory,
                     xx.mSampleCountsTotal,
                     xx.mPValue,
                     xx.mQValue,
                     yy.mSampleCountsCategory,
                     yy.mSampleCountsTotal,
                     yy.mPValue,
                     yy.mQValue,
                     pvalue,
                     1.0,
                     go2info[category].mDescription)))

            outfile.write("\t".join(map(
                str,
                (labels[x],
                 labels[y],
                 len(x_go_categories),
                 len(y_go_categories),
                 c.shared,
                 c.skipped,
                 c.tested,
                 c.significant_pvalue,
                 c.insignificant_pvalue))) + "\n")

    if options.output_filename_pattern:
        outfile.close()

    if options.fdr:
        pvalues = [x.pvalue for x in results]

        if options.qvalue_method == "storey":

            # compute fdr via Storey's method
            try:
                fdr_data = Stats.doFDR(pvalues)

            except ValueError as msg:
                E.warn("failure in q-value computation: %s" % msg)
                E.warn("reverting to Bonferroni correction")
                fdr_data = Stats.FDRResult()
                ntests = float(len(pvalues))
                fdr_data.mQValues = [min(1.0, x * ntests) for x in pvalues]

            qvalues = fdr_data.mQValues
        else:
            qvalues = R['p.adjust'](pvalues, method=options.qvalue_method)

        # update qvalues
        results = [x._replace(qvalue=y) for x, y in zip(results, qvalues)]

    outfile = getFileName(options,
                          go=test_ontology,
                          section='pairs',
                          set="pairs")

    outfile.write("\t".join(PairResult._fields) + "\n")
    for result in results:
        outfile.write("\t".join(map(str, result)) + "\n")

    if options.output_filename_pattern:
        outfile.close()
def computeFDRs(go_results, foreground, background, options, test_ontology,
                gene2go, go2info):
    '''compute an FDR for each category in `go_results`.

    The method is selected by ``options.qvalue_method``:

    ``storey``
        q-values from ``Stats.doFDR``. If that raises a ValueError,
        a Bonferroni correction is applied instead and the returned
        method is ``bonf``.
    ``empirical``
        FDR estimated from the samples returned by ``getSamples``;
        requires ``options.sample > 0``.
    anything else
        passed as ``method`` to R's ``p.adjust``.

    The P-value used for a category is the smaller one of
    ``mProbabilityOverRepresentation`` and
    ``mProbabilityUnderRepresentation``.

    Returns
    -------
    fdrs : dict
        Maps category to a tuple ``(fdr, a, b)``. For the empirical
        method, ``a`` is the count of simulated P-values below the
        category's P-value divided by the sample size, and ``b`` the
        count of observed P-values below it. For all other methods
        ``a`` and ``b`` are set to 1.0.
    samples :
        The samples returned by ``getSamples`` (empirical method) or
        None.
    method : str
        The method that was effectively applied.
    '''

    def _count_leading_below(values, threshold):
        # number of leading entries in `values` that are smaller than
        # `threshold`; counting stops at the first entry that is not.
        count = 0
        for value in values:
            if not value < threshold:
                break
            count += 1
        return count

    def _store_qvalues(qvalues):
        # record one q-value per category, no empirical counts.
        for (category, _), qvalue in zip(pairs, qvalues):
            fdrs[category] = (qvalue, 1.0, 1.0)

    pairs = sorted(go_results.mResults.items())

    E.info("calculating the FDRs using method `%s`" % options.qvalue_method)

    samples = None

    observed_min_pvalues = [
        min(result.mProbabilityOverRepresentation,
            result.mProbabilityUnderRepresentation)
        for _, result in pairs]

    fdrs = {}

    method = options.qvalue_method

    if options.qvalue_method == "storey":

        # compute fdr via Storey's method
        try:
            fdr_data = Stats.doFDR(observed_min_pvalues)

        except ValueError as msg:
            E.warn("failure in q-value computation: %s" % msg)
            E.warn("reverting to Bonferroni correction")
            method = "bonf"
            fdr_data = Stats.FDRResult()
            ntests = float(len(observed_min_pvalues))
            fdr_data.mQValues = [
                min(1.0, pvalue * ntests) for pvalue in observed_min_pvalues]

        _store_qvalues(fdr_data.mQValues)

    elif options.qvalue_method == "empirical":
        assert options.sample > 0, "requiring a sample size of > 0"

        # sample counts per GO category and collect the P-values
        # obtained in the simulations
        samples, simulation_min_pvalues = getSamples(
            gene2go, foreground, background, options, test_ontology,
            go2info)

        # compute P-values from sampling
        observed_min_pvalues = numpy.array(sorted(observed_min_pvalues))

        sample_size = options.sample

        for category, result in pairs:

            if category not in samples:
                raise KeyError("category %s not in samples" % category)
            sampled = samples[category]

            # z-score of the observed counts against the samples;
            # computed here but not part of the return value.
            if sampled.mStddev > 0:
                zscore = abs(
                    float(result.mSampleCountsCategory) -
                    sampled.mMean) / sampled.mStddev
            else:
                zscore = 0.0

            # FDR for P-value p of a category:
            # a = number of simulated P-values < p divided by the
            #     number of samples
            # b = number of observed P-values < p
            # fdr = a / b
            pvalue = result.mPValue

            expected = float(_count_leading_below(
                simulation_min_pvalues, pvalue)) / float(sample_size)

            npositives = _count_leading_below(observed_min_pvalues, pvalue)

            if npositives > 0:
                fdr = min(1.0, float(expected) / float(npositives))
            else:
                fdr = 1.0

            fdrs[category] = (fdr, expected, npositives)

    else:
        qvalues = R['p.adjust'](
            observed_min_pvalues, method=options.qvalue_method)
        fdr_data = Stats.FDRResult()
        fdr_data.mQValues = list(qvalues)
        _store_qvalues(fdr_data.mQValues)

    return fdrs, samples, method
def pairwiseGOEnrichment(results_per_genelist, labels, test_ontology, go2info,
                         options):
    '''compute pairwise enrichment between sets.

    The purpose of this method is to find if there are categories
    that are differently enriched in a pair of gene lists.

    The counts of a category in two gene lists are compared with
    Fisher's exact test (:func:`scipy.stats.fisher_exact`) on the 2x2
    table of in-category / not-in-category foreground counts. The
    assumption is that the background set is the same in all gene
    lists.

    The workflow is thus::

       for each combination of two gene lists:
           for each GO category present in both lists:
               get counts in foreground, total counts of foreground
               compute Fisher's exact test
               save P-value
       apply fdr - output all tested pairs with their q-value.

    Two files are obtained through ``getFileName``: a per-pair
    summary (section ``summary``) and the per-category results
    (section ``pairs``).

    Arguments
    ---------
    results_per_genelist : list
        One entry per gene list. Each entry is converted with
        ``dict()`` and maps a GO category to a result object that
        provides ``mSampleCountsCategory``, ``mSampleCountsTotal``,
        ``mPValue`` and ``mQValue``.
    labels : list
        Label of each gene list, in the same order as
        `results_per_genelist`.
    test_ontology :
        Passed as ``go`` to ``getFileName``.
    go2info : dict
        Maps a GO category to an object with ``mDescription``.
    options :
        Option object. The attributes ``pairs_min_observed_counts``,
        ``output_filename_pattern``, ``fdr`` and ``qvalue_method``
        are read.
    '''

    dicts = [dict(x) for x in results_per_genelist]

    PairResult = collections.namedtuple(
        "PairResult",
        "goid set1 set2 counts1 total1 pvalue1 qvalue1 "
        "counts2 total2 pvalue2 qvalue2 pvalue qvalue description")

    outfile = getFileName(options,
                          go=test_ontology,
                          section='summary',
                          set="pairs")

    outfile.write(
        "set1\tset2\ttotal1\ttotal2\tshared\tskipped\ttested\t"
        "significant\tinsignificant\n")

    results = []

    # number of unordered pairs; integer division keeps this an int
    npairs = len(dicts) * (len(dicts) - 1) // 2
    iteration = 0

    min_observed_counts = options.pairs_min_observed_counts

    # Iterate in input order: the indices x and y are used to look up
    # ``labels``, so the gene lists must not be re-ordered here.
    for x, genelist1 in enumerate(dicts):

        x_go_categories = set(genelist1)

        for y, genelist2 in enumerate(dicts[:x]):

            iteration += 1
            if iteration % 10 == 0:
                E.info("iteration: %i/%i (%5.2f%%)" %
                       (iteration, npairs, 100.0 * iteration / npairs))

            y_go_categories = set(genelist2)

            shared = x_go_categories.intersection(y_go_categories)

            c = E.Counter()

            for category in shared:
                c.shared += 1
                xx = genelist1[category]
                yy = genelist2[category]

                # discard all tests with few observations in the
                # observed counts (only if both lists are below the
                # threshold)
                if (xx.mSampleCountsCategory < min_observed_counts and
                        yy.mSampleCountsCategory < min_observed_counts):
                    c.skipped += 1
                    continue

                # 2x2 contingency table: in category / not in category
                aa = xx.mSampleCountsCategory
                bb = yy.mSampleCountsCategory
                cc = xx.mSampleCountsTotal - xx.mSampleCountsCategory
                dd = yy.mSampleCountsTotal - yy.mSampleCountsCategory

                # both lists consist of this category only - no test
                if cc == dd == 0:
                    c.skipped += 1
                    continue

                c.tested += 1

                # only the P-value is used, the odds ratio is ignored
                _, pvalue = scipy.stats.fisher_exact(
                    numpy.array(((aa, bb), (cc, dd))))

                if pvalue < 0.05:
                    c.significant_pvalue += 1
                else:
                    c.insignificant_pvalue += 1

                # qvalue is set to the placeholder 1.0 and replaced
                # below if fdr computation is requested.
                results.append(PairResult._make(
                    (category,
                     labels[x],
                     labels[y],
                     xx.mSampleCountsCategory,
                     xx.mSampleCountsTotal,
                     xx.mPValue,
                     xx.mQValue,
                     yy.mSampleCountsCategory,
                     yy.mSampleCountsTotal,
                     yy.mPValue,
                     yy.mQValue,
                     pvalue,
                     1.0,
                     go2info[category].mDescription)))

            outfile.write("\t".join(map(
                str,
                (labels[x],
                 labels[y],
                 len(x_go_categories),
                 len(y_go_categories),
                 c.shared,
                 c.skipped,
                 c.tested,
                 c.significant_pvalue,
                 c.insignificant_pvalue))) + "\n")

    if options.output_filename_pattern:
        outfile.close()

    if options.fdr:
        pvalues = [x.pvalue for x in results]

        if options.qvalue_method == "storey":

            # compute fdr via Storey's method
            try:
                fdr_data = Stats.doFDR(pvalues)

            except ValueError as msg:
                E.warn("failure in q-value computation: %s" % msg)
                E.warn("reverting to Bonferroni correction")
                fdr_data = Stats.FDRResult()
                ntests = float(len(pvalues))
                fdr_data.mQValues = [min(1.0, x * ntests) for x in pvalues]

            qvalues = fdr_data.mQValues
        else:
            qvalues = R['p.adjust'](pvalues, method=options.qvalue_method)

        # update qvalues
        results = [x._replace(qvalue=y) for x, y in zip(results, qvalues)]

    outfile = getFileName(options,
                          go=test_ontology,
                          section='pairs',
                          set="pairs")

    outfile.write("\t".join(PairResult._fields) + "\n")
    for result in results:
        outfile.write("\t".join(map(str, result)) + "\n")

    if options.output_filename_pattern:
        outfile.close()
def Collect(infile,
            with_headers=False,
            annotator_format=False,
            use_annotator_fdr=False,
            delims="",
            ignore="",
            max_pvalue=1.0,
            max_qvalue=None):
    """read input table.

    Lines starting with ``#`` are ignored. Each remaining data line is
    converted into a ``DataPoint`` with the attributes ``mAnnotation``,
    ``mPValue`` and ``mFoldChange``. The data points are then filtered
    either by q-value (if `max_qvalue` is given) or by P-value.

    Arguments
    ---------
    infile : file
        Input, read completely with ``readlines()``.
    with_headers : bool
        If set, the first non-comment line is dropped.
    annotator_format : bool
        If set, parse Annotator output (9 tab-separated columns plus
        ``--`` prefixed false discovery summaries). Otherwise lines
        with 14 tab-separated columns are expected.
    use_annotator_fdr : bool
        If set, use the false discovery summaries found in the
        input to choose a P-value threshold for `max_qvalue`.
    delims : str
        Annotations are truncated at the first occurrence of each
        character.
    ignore : str
        Each character is removed from the annotations.
    max_pvalue : float
        Keep data points with a P-value below this value. Only
        applied if `max_qvalue` is None.
    max_qvalue : float
        FDR threshold.

    Returns
    -------
    data : list
        The data points that passed the filters.
    nremoved : int
        Number of data points removed by the filters.
    no_fdr : bool
        True if the FDR could not be computed; `data` is empty in
        that case.
    """

    data = []

    # Defined up-front so that the fdr filter below never meets an
    # unbound name if `use_annotator_fdr` is set without
    # `annotator_format`.
    annotator_fdr = {}

    lines = [x for x in infile.readlines() if x[0] != "#"]

    if len(lines) == 0:
        # same shape as the regular return value at the end
        return data, 0, False

    if with_headers:
        del lines[0]

    if annotator_format:

        lines = [line for line in lines if not line.startswith("Iteration")]
        annotator_level = None

        for line in lines:
            if len(line) == 1:
                continue  # skip trailing blank lines

            if line.startswith("--"):
                if line.startswith("-- False"):
                    # start of the summary for a new P-value level
                    annotator_level = float(re.search(
                        "-- False Discovery summary for p-value (.+):",
                        line).groups()[0])
                    annotator_fdr[annotator_level] = {}
                elif line.startswith("-- Category"):
                    pass
                else:
                    if re.search("insufficiently", line):
                        continue
                    dd = re.split(r"\s+", line[4:-1])
                    d = DataFDR()
                    d.mObserved, d.mAverage, d.mMedian, d.m95 = list(
                        map(float, dd[1:]))
                    annotator_fdr[annotator_level][dd[0]] = d
                continue

            if line[0] == "Z":
                continue  # skip header

            fields = line[:-1].split('\t')
            if len(fields) != 9:
                continue  # HACK: accounts for a bug in Annotator output

            # columns: z, percentchange, pvalue, observed, expected,
            # low95, up95, stddev, description
            percentchange, pvalue, description = \
                fields[1], fields[2], fields[8]

            d = DataPoint()
            d.mAnnotation = description
            d.mPValue = float(pvalue)
            d.mFoldChange = 1.0 + float(percentchange) / 100.0
            data.append(d)
    else:

        for line in lines:
            # columns: code, goid, scount, stotal, spercent, bcount,
            # btotal, bpercent, ratio, pover, punder, goid, category,
            # description
            fields = line[:-1].split("\t")[:14]
            if len(fields) != 14:
                raise ValueError("# parsing error in line: %s" % line[:-1])

            code, spercent, bpercent = fields[0], fields[4], fields[7]
            pover, punder, description = fields[9], fields[10], fields[13]

            if code == "+":
                p = pover
            else:
                p = punder

            d = DataPoint()
            d.mAnnotation = description
            d.mPValue = float(p)
            d.mFoldChange = float(spercent) / float(bpercent)
            data.append(d)

    # clean up annotations
    for c in delims:
        for d in data:
            d.mAnnotation = d.mAnnotation.split(c)[0]
    for c in ignore:
        for d in data:
            d.mAnnotation = d.mAnnotation.replace(c, '')

    ninput = len(data)
    no_fdr = False

    # apply filters
    if ninput > 0:
        if max_qvalue is not None:
            if use_annotator_fdr:
                # last level for which a "Significant" summary was
                # found, used for the warning below
                last_checked = None

                # walk from the largest to the smallest P-value level
                # and stop at the first one with an acceptable fdr
                for level in sorted(annotator_fdr, reverse=True):
                    try:
                        summary = annotator_fdr[level]["Significant"]
                    except KeyError:
                        continue
                    last_checked = (level, summary)
                    if summary.mObserved == 0:
                        E.info("no data remaining after fdr filtering")
                        data = []
                        break
                    elif summary.mAverage / summary.mObserved < max_qvalue:
                        E.info("filtering with P-value of %f" % level)
                        data = [x for x in data if x.mPValue < level]
                        break
                else:
                    # no level satisfied the requested fdr
                    if last_checked is None:
                        E.warn("fdr could not be computed - no false "
                               "discovery summary in input")
                    else:
                        level, summary = last_checked
                        E.warn("fdr could not be computed - compute more "
                               "samples (at P = %f, actual fdr=%f)" %
                               (level, summary.mAverage / summary.mObserved))
                    no_fdr = True

            if no_fdr:
                if use_annotator_fdr:
                    E.info("estimating FDR from observed P-Values")
                pvalues = [x.mPValue for x in data]
                vlambda = numpy.arange(0, max(pvalues), 0.05)
                try:
                    qvalues = Stats.doFDR(
                        pvalues, vlambda=vlambda, fdr_level=max_qvalue)
                except ValueError as msg:
                    E.warn(
                        "fdr could not be computed - no filtering: %s" % msg)
                    no_fdr = True
                else:
                    data = [x[0] for x in zip(data, qvalues.mPassed) if x[1]]
        elif max_pvalue is not None:
            data = [x for x in data if x.mPValue < max_pvalue]

    # NOTE(review): no_fdr stays set even if the estimate from the
    # observed P-values succeeded, so those results are discarded as
    # well - confirm that this is intended.
    if no_fdr:
        data = []

    nremoved = ninput - len(data)

    return data, nremoved, no_fdr
def Collect(infile,
            with_headers=False,
            annotator_format=False,
            use_annotator_fdr=False,
            delims="",
            ignore="",
            max_pvalue=1.0,
            max_qvalue=None):
    """read input table.

    Lines starting with ``#`` are ignored. Each remaining data line is
    converted into a ``DataPoint`` with the attributes ``mAnnotation``,
    ``mPValue`` and ``mFoldChange``. The data points are then filtered
    either by q-value (if `max_qvalue` is given) or by P-value.

    Arguments
    ---------
    infile : file
        Input, read completely with ``readlines()``.
    with_headers : bool
        If set, the first non-comment line is dropped.
    annotator_format : bool
        If set, parse Annotator output (9 tab-separated columns plus
        ``--`` prefixed false discovery summaries). Otherwise lines
        with 14 tab-separated columns are expected.
    use_annotator_fdr : bool
        If set, use the false discovery summaries found in the
        input to choose a P-value threshold for `max_qvalue`.
    delims : str
        Annotations are truncated at the first occurrence of each
        character.
    ignore : str
        Each character is removed from the annotations.
    max_pvalue : float
        Keep data points with a P-value below this value. Only
        applied if `max_qvalue` is None.
    max_qvalue : float
        FDR threshold.

    Returns
    -------
    data : list
        The data points that passed the filters.
    nremoved : int
        Number of data points removed by the filters.
    no_fdr : bool
        True if the FDR could not be computed; `data` is empty in
        that case.
    """

    data = []

    # Defined up-front so that the fdr filter below never meets an
    # unbound name if `use_annotator_fdr` is set without
    # `annotator_format`.
    annotator_fdr = {}

    lines = [x for x in infile.readlines() if x[0] != "#"]

    if len(lines) == 0:
        # same shape as the regular return value at the end
        return data, 0, False

    if with_headers:
        del lines[0]

    if annotator_format:

        lines = [line for line in lines if not line.startswith("Iteration")]
        annotator_level = None

        for line in lines:
            if len(line) == 1:
                continue  # skip trailing blank lines

            if line.startswith("--"):
                if line.startswith("-- False"):
                    # start of the summary for a new P-value level
                    annotator_level = float(re.search(
                        "-- False Discovery summary for p-value (.+):",
                        line).groups()[0])
                    annotator_fdr[annotator_level] = {}
                elif line.startswith("-- Category"):
                    pass
                else:
                    if re.search("insufficiently", line):
                        continue
                    dd = re.split(r"\s+", line[4:-1])
                    d = DataFDR()
                    d.mObserved, d.mAverage, d.mMedian, d.m95 = list(
                        map(float, dd[1:]))
                    annotator_fdr[annotator_level][dd[0]] = d
                continue

            if line[0] == "Z":
                continue  # skip header

            fields = line[:-1].split('\t')
            if len(fields) != 9:
                continue  # HACK: accounts for a bug in Annotator output

            # columns: z, percentchange, pvalue, observed, expected,
            # low95, up95, stddev, description
            percentchange, pvalue, description = \
                fields[1], fields[2], fields[8]

            d = DataPoint()
            d.mAnnotation = description
            d.mPValue = float(pvalue)
            d.mFoldChange = 1.0 + float(percentchange) / 100.0
            data.append(d)
    else:

        for line in lines:
            # columns: code, goid, scount, stotal, spercent, bcount,
            # btotal, bpercent, ratio, pover, punder, goid, category,
            # description
            fields = line[:-1].split("\t")[:14]
            if len(fields) != 14:
                raise ValueError("# parsing error in line: %s" % line[:-1])

            code, spercent, bpercent = fields[0], fields[4], fields[7]
            pover, punder, description = fields[9], fields[10], fields[13]

            if code == "+":
                p = pover
            else:
                p = punder

            d = DataPoint()
            d.mAnnotation = description
            d.mPValue = float(p)
            d.mFoldChange = float(spercent) / float(bpercent)
            data.append(d)

    # clean up annotations
    for c in delims:
        for d in data:
            d.mAnnotation = d.mAnnotation.split(c)[0]
    for c in ignore:
        for d in data:
            d.mAnnotation = d.mAnnotation.replace(c, '')

    ninput = len(data)
    no_fdr = False

    # apply filters
    if ninput > 0:
        if max_qvalue is not None:
            if use_annotator_fdr:
                # last level for which a "Significant" summary was
                # found, used for the warning below
                last_checked = None

                # walk from the largest to the smallest P-value level
                # and stop at the first one with an acceptable fdr
                for level in sorted(annotator_fdr, reverse=True):
                    try:
                        summary = annotator_fdr[level]["Significant"]
                    except KeyError:
                        continue
                    last_checked = (level, summary)
                    if summary.mObserved == 0:
                        E.info("no data remaining after fdr filtering")
                        data = []
                        break
                    elif summary.mAverage / summary.mObserved < max_qvalue:
                        E.info("filtering with P-value of %f" % level)
                        data = [x for x in data if x.mPValue < level]
                        break
                else:
                    # no level satisfied the requested fdr
                    if last_checked is None:
                        E.warn("fdr could not be computed - no false "
                               "discovery summary in input")
                    else:
                        level, summary = last_checked
                        E.warn("fdr could not be computed - compute more "
                               "samples (at P = %f, actual fdr=%f)" %
                               (level, summary.mAverage / summary.mObserved))
                    no_fdr = True

            if no_fdr:
                if use_annotator_fdr:
                    E.info("estimating FDR from observed P-Values")
                pvalues = [x.mPValue for x in data]
                vlambda = numpy.arange(0, max(pvalues), 0.05)
                try:
                    qvalues = Stats.doFDR(
                        pvalues, vlambda=vlambda, fdr_level=max_qvalue)
                except ValueError as msg:
                    E.warn(
                        "fdr could not be computed - no filtering: %s" % msg)
                    no_fdr = True
                else:
                    data = [x[0] for x in zip(data, qvalues.mPassed) if x[1]]
        elif max_pvalue is not None:
            data = [x for x in data if x.mPValue < max_pvalue]

    # NOTE(review): no_fdr stays set even if the estimate from the
    # observed P-values succeeded, so those results are discarded as
    # well - confirm that this is intended.
    if no_fdr:
        data = []

    nremoved = ninput - len(data)

    return data, nremoved, no_fdr
def loadGOs(infiles, outfile, tablename):
    '''import GO results into a single table.

    This method also computes a global QValue over all tracks,
    genesets and annotation sets.

    For each entry in `infiles`, all ``*.overall`` files in the
    directory ``<infile>.dir`` are concatenated into a temporary
    file. The columns ``track``, ``geneset`` and ``annotationset``
    (parsed from the name of `infile`) are prepended to each row and
    a column ``global_qvalue`` is appended. The resulting table is
    loaded with ``csv2db.py``.

    Arguments
    ---------
    infiles : list
        Filenames of the form ``<track>_vs_<geneset>.<annotationset>``.
        Entries without an accompanying ``.dir`` directory are
        skipped.
    outfile : string
        Receives the output of ``csv2db.py``.
    tablename : string
        Name of the table to create.
    '''

    header = False

    tempf1 = P.getTempFile()

    pvalues = []

    for infile in infiles:
        indir = infile + ".dir"

        if not os.path.exists(indir):
            continue

        track, geneset, annotationset = re.search(
            r"^(\S+)_vs_(\S+)\.(\S+)", infile).groups()

        for filename in glob.glob(os.path.join(indir, "*.overall")):
            # context manager: make sure each input file is closed
            with open(filename, "r") as inf:
                for line in inf:
                    if line.startswith("#"):
                        continue
                    data = line[:-1].split("\t")
                    if line.startswith("code"):
                        # column header - written only once
                        if header:
                            continue
                        tempf1.write(
                            "track\tgeneset\tannotationset\t%s" % line)
                        header = True
                        assert data[10] == "pover" and \
                            data[11] == "punder", \
                            "format error, expected pover-punder, " \
                            "got %s-%s" % (data[10], data[11])
                        continue
                    tempf1.write("%s\t%s\t%s\t%s" %
                                 (track, geneset, annotationset, line))
                    pvalues.append(min(float(data[10]), float(data[11])))

    tempf1.close()

    E.info("analysing %i pvalues" % len(pvalues))
    fdr = Stats.doFDR(pvalues)
    E.info("got %i qvalues" % len(fdr.mQValues))

    # first entry pairs up with the header line of the temporary file
    qvalues = ["global_qvalue"] + list(fdr.mQValues)

    tempf2 = P.getTempFile()

    with open(tempf1.name, "r") as inf:
        for line, qvalue in zip(inf, qvalues):
            tempf2.write("%s\t%s\n" % (line[:-1], str(qvalue)))

    tempf2.close()
    tempfilename = tempf2.name

    print(tempf1.name)
    print(tempf2.name)

    # NOTE(review): P.run() is called without arguments, so it
    # presumably picks up ``statement`` and the names interpolated
    # into it (tablename, tempfilename, outfile) from this scope -
    # do not rename these locals without checking.
    statement = (
        " python %(scriptsdir)s/csv2db.py %(csv2db_options)s"
        " --allow-empty"
        " --index=category"
        " --index=track,geneset,annotationset"
        " --index=geneset"
        " --index=annotationset"
        " --index=goid"
        " --table=%(tablename)s"
        " < %(tempfilename)s"
        " > %(outfile)s ")

    P.run()