def checkFDR(self, pi0_method):
    result = Stats.doFDR(
        self.mPvalues, fdr_level=0.05, pi0_method=pi0_method)
    R("""require('qvalue')""")
    qvalues = R.qvalue(ro.FloatVector(self.mPvalues),
                       fdr_level=0.05,
                       pi0_method=pi0_method)

    assert qvalues.names[1] == "pi0"
    assert qvalues.names[2] == "qvalues"
    assert qvalues.names[5] == "significant"
    assert qvalues.names[6] == "lambda"

    r_qvalues = qvalues[2]
    r_pi0 = qvalues[1][0]

    self.assertEqual(len(result.mQValues), len(qvalues[2]))
    self.assertEqual(len(result.mLambda), len(qvalues[6]))
    self.assertEqual(result.mPi0, r_pi0)

    for a, b in zip(result.mQValues, r_qvalues):
        self.assertAlmostEqual(a, b, 2, "unequal: %f != %f" % (a, b))

    for a, b in zip(result.mPassed, qvalues[5]):
        self.assertEqual(
            a, b, "threshold-passed flag not equal: %s != %s" % (a, b))
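# The positional lookups above (qvalues[1], qvalues[2], ...) depend on the
# component order of the list returned by R's qvalue(). A sketch of
# name-based access, assuming the rpy2 ListVector interface is in use;
# rx2() is rpy2's binding of R's `[[` operator and extracts a component
# by name, so the test would not break if the package reordered its output:

def get_qvalue_components(qvalues):
    """extract pi0 and the qvalue vector from an rpy2 ListVector by name."""
    r_pi0 = qvalues.rx2("pi0")[0]
    r_qvalues = list(qvalues.rx2("qvalues"))
    return r_pi0, r_qvalues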
def check(self, method):
    '''check for length equality and elementwise equality.'''
    a = R['p.adjust'](self.pvalues, method=method)
    b = Stats.adjustPValues(self.pvalues, method=method)
    self.assertEqual(len(a), len(b))
    for x, y in zip(a, b):
        self.assertAlmostEqual(x, y)
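# For reference, a self-contained sketch of the Benjamini-Hochberg step-up
# adjustment that R's p.adjust(..., method="BH") performs; Stats.adjustPValues
# is assumed to mirror this behaviour for method="BH" (an assumption, not a
# documented part of the tested API):

import numpy


def bh_adjust(pvalues):
    """return BH-adjusted p-values in the original input order."""
    p = numpy.asarray(pvalues, dtype=float)
    n = len(p)
    # sort descending, apply the step-up correction, then undo the sort
    order = numpy.argsort(p)[::-1]
    ranked = p[order] * n / numpy.arange(n, 0, -1)
    adjusted = numpy.minimum.accumulate(ranked)
    result = numpy.empty(n)
    result[order] = numpy.minimum(adjusted, 1.0)
    return result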
def checkFDR(self, **kwargs):
    old = Stats.doFDR(self.pvalues, **kwargs)
    # print old.mQValues[:10]
    # print old.mPi0
    new = Stats.doFDRPython(self.pvalues, **kwargs)
    # print new.mQValues[:10]
    # print new.mPi0
    # self.assertAlmostEqual(old.mPi0, new.mPi0, places=3)
    self.assertTrue(getRelativeError(old.mPi0, new.mPi0) < self.max_error)

    for pvalue, a, b in zip(self.pvalues, old.mQValues, new.mQValues):
        self.assertTrue(
            getRelativeError(a, b) < self.max_error,
            "qvalues: relative error %f > %f (pvalue=%f, %f, %f)" %
            (getRelativeError(a, b), self.max_error, pvalue, a, b))
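# getRelativeError is used by these checks but defined elsewhere; a minimal
# sketch of one plausible definition (hypothetical, for illustration only;
# the real helper may scale differently or guard against zeros):

def getRelativeError(a, b):
    """relative difference of two values, scaled by the larger magnitude."""
    return abs(a - b) / max(abs(a), abs(b))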
def testAgainstQValue(self):
    R.assign("pvalues", self.pvalues)
    qvalue = R('''qvalue(pvalues)''')
    r_qvalues = qvalue[2]
    r_pi0 = qvalue[1][0]

    new = Stats.doFDRPython(self.pvalues)
    self.assertTrue(getRelativeError(r_pi0, new.mPi0) < self.max_error)

    for a, b in zip(r_qvalues, new.mQValues):
        self.assertAlmostEqual(a, b, places=self.nplaces)
def __str__(self):
    single_exon_transcripts = 0
    exons_per_transcript = []
    intron_sizes = []
    transcript_lengths = []
    exon_sizes = []

    for x in list(self.counts_exons_per_transcript.values()):
        x.sort()
        x = Intervals.combine(x)
        transcript_lengths.append(x[-1][1] - x[0][0])
        exons_per_transcript.append(len(x))

        for start, end in x:
            exon_sizes.append(end - start)

        if len(x) == 1:
            single_exon_transcripts += 1
            continue

        last_end = x[0][1]
        for start, end in x[1:]:
            intron_sizes.append(start - last_end)
            last_end = end

    return "\t".join(map(str, (len(self.counts_gene_ids),
                               len(self.counts_transcript_ids),
                               single_exon_transcripts,
                               Stats.Summary(exons_per_transcript),
                               Stats.Summary(exon_sizes),
                               Stats.Summary(intron_sizes),
                               Stats.Summary(transcript_lengths),
                               )))
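# Intervals.combine above is assumed to merge overlapping (start, end) pairs
# into a sorted, non-overlapping list; a minimal sketch of that behaviour
# (an assumption about the helper, not its actual implementation):

def combine_intervals(intervals):
    """merge overlapping (start, end) intervals into a sorted list."""
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            # overlap with the previous interval: extend it
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

# e.g. combine_intervals([(0, 10), (5, 20), (30, 40)]) -> [(0, 20), (30, 40)],
# so intron sizes are the gaps between consecutive merged exons.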
def testLRT(self):
    """test that the false positive rate is of the same order as
    mSignificance.

    Sample from a normal distribution and compare two models:

    1. mean estimated = complex model (1 df)
    2. mean given     = simple model (0 df)

    Likelihood = P(data | model)
    """
    simple_np = 0
    complex_np = 1

    npassed = 0

    for replicate in range(0, self.mNumReplicates):
        sample = scipy.stats.norm.rvs(size=self.mNumSamples,
                                      loc=0.0, scale=1.0)
        mean = numpy.mean(sample)

        complex_ll = numpy.sum(
            numpy.log(scipy.stats.norm.pdf(sample, loc=mean, scale=1.0)))
        simple_ll = numpy.sum(
            numpy.log(scipy.stats.norm.pdf(sample, loc=0.0, scale=1.0)))

        a = Stats.doLogLikelihoodTest(
            complex_ll, complex_np,
            simple_ll, simple_np,
            significance_threshold=self.mSignificance)

        if a.mPassed:
            npassed += 1

    r = float(npassed) / self.mNumReplicates

    self.assertAlmostEqual(self.mSignificance, r, places=self.nplaces)
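# The likelihood ratio test applied above: D = 2 * (complex_ll - simple_ll)
# is asymptotically chi-square distributed with df = complex_np - simple_np
# under the null hypothesis. A minimal sketch of such a test
# (Stats.doLogLikelihoodTest is assumed to implement something equivalent):

import scipy.stats


def log_likelihood_ratio_test(complex_ll, complex_np, simple_ll, simple_np,
                              significance_threshold=0.05):
    """return (pvalue, passed) for a likelihood ratio test of nested models."""
    D = 2.0 * (complex_ll - simple_ll)
    df = complex_np - simple_np
    pvalue = scipy.stats.chi2.sf(D, df)
    return pvalue, pvalue < significance_threshold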
def loadGOs(infiles, outfile, tablename):
    '''import GO results into a single table.

    This method also computes a global QValue over all
    tracks, genesets and annotation sets.

    Arguments
    ---------
    infiles : list
       Output files of several runGO analyses
    outfile : string
       Output filename, contains log information
    tablename : string
       Table name for storing results.
    '''

    header = False

    tempf1 = P.get_temp_file()

    pvalues = []

    for infile in infiles:
        indir = infile + ".dir"

        if not os.path.exists(indir):
            continue

        track, geneset, annotationset = re.search(
            r"^(\S+)_vs_(\S+)\.(\S+)", infile).groups()

        for filename in glob.glob(os.path.join(indir, "*.overall")):
            for line in open(filename, "r"):
                if line.startswith("#"):
                    continue

                data = line[:-1].split("\t")

                if line.startswith("code"):
                    # only write the header once
                    if header:
                        continue
                    tempf1.write("track\tgeneset\tannotationset\t%s" % line)
                    header = True
                    assert data[10] == "pover" and data[11] == "punder", \
                        "format error, expected pover-punder, " \
                        "got %s-%s" % (data[10], data[11])
                    continue

                tempf1.write("%s\t%s\t%s\t%s" %
                             (track, geneset, annotationset, line))
                pvalues.append(min(float(data[10]), float(data[11])))

    tempf1.close()

    E.info("analysing %i pvalues" % len(pvalues))
    fdr = Stats.doFDR(pvalues)
    E.info("got %i qvalues" % len(fdr.mQValues))

    # prepend the column name so that the qvalues line up with the header
    qvalues = ["global_qvalue"] + fdr.mQValues

    tempf2 = P.get_temp_file()

    for line, qvalue in zip(open(tempf1.name, "r"), qvalues):
        tempf2.write("%s\t%s\n" % (line[:-1], str(qvalue)))
    tempf2.close()

    P.load(tempf2.name, outfile,
           tablename=tablename,
           options="--allow-empty-file "
           "--add-index=category "
           "--add-index=track,geneset,annotationset "
           "--add-index=geneset "
           "--add-index=annotationset "
           "--add-index=goid ")

    os.unlink(tempf1.name)
    os.unlink(tempf2.name)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The default behaviour of the script is to guess the quality "
        "format of the input fastq file. The user can specify the "
        "quality format of the input file using the --guess-format option. "
        "The script will use this format if the "
        "sequence qualities are ambiguous [default=%default].")

    parser.add_option(
        "--target-format", dest="target_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The script will convert quality scores to the destination "
        "format [default=%default].")

    parser.set_defaults(
        target_format=None,
        guess_format=None,
        min_quality=10,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    c = E.Counter()

    if options.target_format:
        iterator = Fastq.iterate_convert(options.stdin,
                                         format=options.target_format,
                                         guess=options.guess_format)
    else:
        iterator = Fastq.iterate_guess(options.stdin,
                                       guess=options.guess_format)

    options.stdout.write("read\tnfailed\tnN\t%s\n" %
                         ("\t".join(Stats.Summary().getHeaders())))

    min_quality = options.min_quality

    for record in iterator:
        c.input += 1
        quals = record.toPhred()
        nfailed = len([x for x in quals if x < min_quality])
        nns = record.seq.count("N") + record.seq.count(".")
        options.stdout.write(
            "%s\t%i\t%i\t%s\n" %
            (record.identifier, nfailed, nns, str(Stats.Summary(quals))))
        c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.stop()
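# The quality encodings distinguished above differ chiefly in their ASCII
# offset (solexa additionally uses a different score definition). A minimal
# sketch of the decoding that record.toPhred() is assumed to perform for the
# offset-based formats:

def to_phred(quality_string, offset=33):
    """decode an ASCII quality string into a list of Phred scores.

    offset is 33 for sanger/illumina-1.8 and 64 for phred64.
    """
    return [ord(c) - offset for c in quality_string]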
def decorator_median_score(values, start, end, contig):
    """compute median of values."""
    d = Stats.DistributionalParameters(values)
    return d['median'], str(d)


def decorator_median_length(intervals, start, end, contig, fasta):
    """compute median length of intervals."""
    d = Stats.DistributionalParameters([x[1] - x[0] for x in intervals])
    return d['median'], str(d)


def decorator_percent_coverage(intervals, start, end, contig, fasta):
    """compute percentage of the region covered by intervals."""
    d = Stats.DistributionalParameters([x[1] - x[0] for x in intervals])
    return 100.0 * float(d['sum']) / (end - start), str(d)


def decorator_max_score(values, start, end, contig):
    """compute maximum of values."""
    d = Stats.DistributionalParameters(values)
    return d['max'], str(d)


def decorator_stddev_score(values, start, end, contig):
    """compute standard deviation of values."""
    d = Stats.DistributionalParameters(values)
    return d['stddev'], str(d)
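# Each decorator above reduces a genomic window (contig, start, end) plus its
# associated values to a single summary statistic and a string of
# distributional parameters. A hypothetical usage sketch (the caller and the
# windows/values structures are illustrative names, not part of the module):

def summarize_windows(windows, values_per_window, decorator):
    """apply a decorator to each window, collecting (window, value, stats)."""
    results = []
    for (contig, start, end), values in zip(windows, values_per_window):
        value, distribution = decorator(values, start, end, contig)
        results.append(((contig, start, end), value, distribution))
    return results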
def analysePolyphen(infile, outfile):
    '''compute enrichment of SNPs within genes and of deleterious SNPs
    among the SNPs within genes.

    del: enrichment of deleterious snps within snps per gene
    len: enrichment of snps within genes
    com: enrichment of deleterious snps within genes
    '''

    table = P.toTable(infile)
    tablename_map = "polyphen_map"

    dbhandle = connect()
    cc = dbhandle.cursor()

    statement = '''
    SELECT i.gene_id,
           COUNT(DISTINCT map.locus_id) AS nsnps,
           COUNT(DISTINCT CASE t.prediction
                 WHEN 'possiblydamaging' THEN map.locus_id
                 WHEN 'probablydamaging' THEN map.locus_id
                 ELSE NULL END) AS ndeleterious,
           MAX(s.length)
    FROM %(table)s AS t,
         %(tablename_map)s AS map,
         annotations.protein_stats AS s,
         annotations.transcript_info AS i
    WHERE map.snp_id = t.snp_id AND
          i.transcript_id = map.transcript_id AND
          s.protein_id = map.protein_id
    GROUP BY i.gene_id
    ''' % locals()

    data = cc.execute(statement).fetchall()

    statement = '''SELECT DISTINCT i.gene_id, MAX(s.length)
    FROM annotations.transcript_info AS i, annotations.protein_stats AS s
    WHERE s.protein_id = i.protein_id
    GROUP BY i.gene_id'''
    gene_ids = cc.execute(statement).fetchall()

    total_nsnps = sum([x[1] for x in data])
    total_ndel = sum([x[2] for x in data])
    total_length = sum([x[1] for x in gene_ids])
    del_p = float(total_ndel) / total_nsnps
    len_p = float(total_nsnps) / total_length
    com_p = float(total_ndel) / total_length

    E.info("del: background probability: %i/%i = %f" %
           (total_ndel, total_nsnps, del_p))
    E.info("len: background probability: %i/%i = %f" %
           (total_nsnps, total_length, len_p))
    E.info("com: background probability: %i/%i = %f" %
           (total_ndel, total_length, com_p))

    outf = open(outfile, "w")
    outf.write("\t".join(("gene_id", "code",
                          "length", "nsnps", "ndel",
                          "del_p", "del_pvalue", "del_qvalue",
                          "len_p", "len_pvalue", "len_qvalue",
                          "com_p", "com_pvalue", "com_qvalue",
                          )) + "\n")

    del_pvalues, len_pvalues, com_pvalues = [], [], []
    for gene_id, nsnps, ndel, length in data:
        # use ndel - 1, because we need P(x >= X):
        # sf = 1 - cdf and cdf = P(x <= X),
        # thus sf = 1 - P(x <= X) = P(x > X).
        del_pvalues.append(scipy.stats.binom.sf(ndel - 1, nsnps, del_p))
        len_pvalues.append(
            scipy.stats.binom.sf(nsnps - 1, int(round(length)), len_p))
        com_pvalues.append(
            scipy.stats.binom.sf(ndel - 1, int(round(length)), com_p))

    if len(del_pvalues) > 10:
        del_qvalues = Stats.doFDR(del_pvalues).mQValues
    else:
        E.warn("no FDR computed for del")
        del_qvalues = del_pvalues

    if len(len_pvalues) > 10:
        len_qvalues = Stats.doFDR(len_pvalues).mQValues
    else:
        E.warn("no FDR computed for len")
        len_qvalues = len_pvalues

    if len(com_pvalues) > 10:
        com_qvalues = Stats.doFDR(com_pvalues).mQValues
    else:
        E.warn("no FDR computed for com")
        com_qvalues = com_pvalues

    fdr = PARAMS["polyphen_fdr"]

    found = set()

    for a, del_pvalue, del_qvalue, len_pvalue, len_qvalue, \
            com_pvalue, com_qvalue in \
            zip(data,
                del_pvalues, del_qvalues,
                len_pvalues, len_qvalues,
                com_pvalues, com_qvalues):
        gene_id, nsnps, ndel, length = a
        found.add(gene_id)

        del_p = float(ndel) / nsnps
        len_p = float(nsnps) / length

        code = "".join([str(int(x < fdr))
                        for x in (del_qvalue, len_qvalue, com_qvalue)])

        outf.write("\t".join((gene_id,
                              code,
                              "%i" % int(round(length)),
                              "%i" % int(nsnps),
                              "%i" % int(ndel),
                              "%6.4f" % del_p,
                              "%6.4g" % del_pvalue,
                              "%6.4g" % del_qvalue,
                              "%6.4f" % len_p,
                              "%6.4g" % len_pvalue,
                              "%6.4g" % len_qvalue,
                              "%6.4f" % com_p,
                              "%6.4g" % com_pvalue,
                              "%6.4g" % com_qvalue,
                              )) + "\n")

    # add missing genes:
    code = "---"
    for gene_id, length in gene_ids:
        if gene_id in found:
            continue
        outf.write("\t".join((gene_id,
                              code,
                              "%i" % int(round(length)),
                              "%i" % 0,
                              "%i" % 0,
                              "%6.4f" % 0,
                              "%6.4g" % 1,
                              "%6.4g" % 1,
                              "%6.4f" % 0,
                              "%6.4g" % 1,
                              "%6.4g" % 1,
                              "%6.4f" % 0,
                              "%6.4g" % 1,
                              "%6.4g" % 1,
                              )) + "\n")

    outf.close()
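# The "- 1" in the binom.sf calls above encodes P(X >= k) for a discrete
# distribution: sf(k) = P(X > k), hence sf(k - 1) = P(X >= k). A small
# self-contained illustration of that identity:

import scipy.stats


def prob_at_least(k, n, p):
    """P(X >= k) for X ~ Binomial(n, p)."""
    return scipy.stats.binom.sf(k - 1, n, p)

# e.g. prob_at_least(0, 10, 0.1) == 1.0, and prob_at_least(1, 10, 0.1)
# equals 1 - 0.9 ** 10, the chance of at least one success in ten trials.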
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b", "--bin-size", dest="bin_size", type="string",
                      help="bin size.")

    parser.add_option("--min-value", dest="min_value", type="float",
                      help="minimum value for histogram.")

    parser.add_option("--max-value", dest="max_value", type="float",
                      help="maximum value for histogram.")

    parser.add_option("--no-empty-bins", dest="no_empty_bins",
                      action="store_true",
                      help="do not display empty bins.")

    parser.add_option("--with-empty-bins", dest="no_empty_bins",
                      action="store_false",
                      help="display empty bins.")

    parser.add_option(
        "--ignore-out-of-range", dest="ignore_out_of_range",
        action="store_true",
        help="ignore values that are out of range (as opposed to truncating "
        "them to the range border).")

    parser.add_option("--missing-value", dest="missing_value", type="string",
                      help="entry for missing values [%default].")

    parser.add_option("--use-dynamic-bins", dest="dynamic_bins",
                      action="store_true",
                      help="each value constitutes its own bin.")

    parser.add_option("--format", dest="format", type="choice",
                      choices=("gff", "gtf", "bed"),
                      help="input file format [%default].")

    parser.add_option("--method", dest="methods", type="choice",
                      action="append",
                      choices=("all", "hist", "stats", "overlaps", "values"),
                      help="methods to apply [%default].")

    parser.add_option("--output-section", dest="output_section",
                      type="choice",
                      choices=("all", "size", "distance"),
                      help="data to compute [%default].")

    parser.set_defaults(
        no_empty_bins=True,
        bin_size=None,
        dynamic_bins=False,
        ignore_out_of_range=False,
        min_value=None,
        max_value=None,
        nonull=None,
        missing_value="na",
        output_filename_pattern="%s",
        methods=[],
        output_section="all",
        format="gff",
    )

    (options, args) = E.start(parser, add_output_options=True)

    if "all" in options.methods:
        options.methods = ("hist", "stats", "overlaps")

    if not options.output_filename_pattern:
        options.output_filename_pattern = "%s"

    if len(options.methods) == 0:
        raise ValueError(
            "please provide counting method using --method option")

    if options.format in ("gff", "gtf"):
        gffs = GTF.iterator(options.stdin)
    elif options.format == "bed":
        gffs = Bed.iterator(options.stdin)

    values_between = []
    values_within = []
    values_overlaps = []

    if "overlaps" in options.methods:
        if not options.output_filename_pattern:
            options.output_filename_pattern = "%s"
        outfile_overlaps = E.open_output_file("overlaps")
    else:
        outfile_overlaps = None

    last = None
    ninput, noverlaps = 0, 0
    for this in gffs:
        ninput += 1
        values_within.append(this.end - this.start)

        if last and last.contig == this.contig:
            if this.start < last.end:
                noverlaps += 1

                if outfile_overlaps:
                    outfile_overlaps.write("%s\t%s\n" % (str(last),
                                                         str(this)))

                values_overlaps.append(
                    min(this.end, last.end) - max(last.start, this.start))
                if this.end > last.end:
                    last = this
                continue
            else:
                values_between.append(this.start - last.end)
                # if this.start - last.end < 10:
                #     print str(last)
                #     print str(this)
                #     print "=="
                values_overlaps.append(0)

        last = this

    if "hist" in options.methods:
        outfile = E.open_output_file("hist")
        h_within = Histogram.Calculate(
            values_within,
            no_empty_bins=options.no_empty_bins,
            increment=options.bin_size,
            min_value=options.min_value,
            max_value=options.max_value,
            dynamic_bins=options.dynamic_bins,
            ignore_out_of_range=options.ignore_out_of_range)

        h_between = Histogram.Calculate(
            values_between,
            no_empty_bins=options.no_empty_bins,
            increment=options.bin_size,
            min_value=options.min_value,
            max_value=options.max_value,
            dynamic_bins=options.dynamic_bins,
            ignore_out_of_range=options.ignore_out_of_range)

        if "all" == options.output_section:
            outfile.write("residues\tsize\tdistance\n")
            combined_histogram = Histogram.Combine(
                [h_within, h_between], missing_value=options.missing_value)
            Histogram.Write(outfile, combined_histogram,
                            nonull=options.nonull)
        elif options.output_section == "size":
            outfile.write("residues\tsize\n")
            Histogram.Write(outfile, h_within, nonull=options.nonull)
        elif options.output_section == "distance":
            outfile.write("residues\tdistance\n")
            Histogram.Write(outfile, h_between, nonull=options.nonull)

        outfile.close()

    if "stats" in options.methods:
        outfile = E.open_output_file("stats")
        outfile.write("data\t%s\n" % Stats.Summary().getHeader())
        if options.output_section in ("size", "all"):
            outfile.write("size\t%s\n" % str(Stats.Summary(values_within)))
        if options.output_section in ("distance", "all"):
            outfile.write("distance\t%s\n" %
                          str(Stats.Summary(values_between)))
        outfile.close()

    if "values" in options.methods:
        outfile = E.open_output_file("distances")
        outfile.write("distance\n%s\n" % "\n".join(map(str, values_between)))
        outfile.close()
        outfile = E.open_output_file("sizes")
        outfile.write("size\n%s\n" % "\n".join(map(str, values_within)))
        outfile.close()
        outfile = E.open_output_file("overlaps")
        outfile.write("overlap\n%s\n" % "\n".join(map(str, values_overlaps)))
        outfile.close()

    E.info("ninput=%i, ndistance=%i, nsize=%i, noverlap=%i" %
           (ninput,
            len(values_between),
            len(values_within),
            noverlaps))

    E.stop()
def __call__(self, track, slice=None):

    result = odict()

    merged = None
    rocs = []

    for field in self.mFields:

        data = []
        for replicate in EXPERIMENTS.getTracks(track):
            statement = "SELECT contig, start, end, %(field)s " \
                "FROM %(replicate)s_intervals" % locals()
            data.append(self.get(statement))

        idx = []
        for x in range(len(data)):
            i = IndexedGenome.IndexedGenome()
            for contig, start, end, peakval in data[x]:
                i.add(contig, start, end, peakval)
            idx.append(i)

        def _iter(all):
            all.sort()
            last_contig, first_start, last_end, last_value = all[0]
            for contig, start, end, value in all[1:]:
                if contig != last_contig or last_end < start:
                    yield (last_contig, first_start, last_end)
                    last_contig, first_start, last_end = contig, start, end
                else:
                    last_end = max(last_end, end)
            yield (last_contig, first_start, last_end)

        if not merged:
            all = [x for x in itertools.chain(*data)]
            merged = list(_iter(all))

        roc_data = []
        for contig, start, end in merged:
            intervals = []
            for i in idx:
                try:
                    intervals.append(list(i.get(contig, start, end)))
                except KeyError:
                    continue

            if len(intervals) == 0:
                continue

            is_repro = len([x for x in intervals if x != []]) == len(data)
            value = max([x[2] for x in itertools.chain(*intervals)])

            # fpr, tpr
            roc_data.append((value, is_repro))

        roc_data.sort()
        roc_data.reverse()

        roc = list(zip(*Stats.computeROC(roc_data)))
        result[field] = odict((("FPR", roc[0]), (field, roc[1])))

    return result
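# Stats.computeROC above is assumed to sweep a threshold down the descending
# (score, is_reproducible) pairs and emit (FPR, TPR) points; a minimal sketch
# of that computation under this assumption (requires both classes to occur
# in the input):

def compute_roc(sorted_pairs):
    """return [(fpr, tpr), ...] for pairs sorted by descending score."""
    n_pos = sum(1 for _, label in sorted_pairs if label)
    n_neg = len(sorted_pairs) - n_pos
    tp = fp = 0
    points = []
    for score, label in sorted_pairs:
        if label:
            tp += 1
        else:
            fp += 1
        points.append((fp / float(n_neg), tp / float(n_pos)))
    return points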