예제 #1
0
    def checkFDR(self, pi0_method):
        """Check Stats.doFDR against R's ``qvalue`` for one pi0 method.

        Both implementations are run on ``self.mPvalues`` with an FDR
        level of 0.05 and the results are compared field by field.

        Arguments
        ---------
        pi0_method : passed through unchanged to both implementations
            as ``pi0_method``.
        """
        fdr = Stats.doFDR(self.mPvalues,
                          fdr_level=0.05,
                          pi0_method=pi0_method)

        R("""require ('qvalue')""")
        r_result = R.qvalue(ro.FloatVector(self.mPvalues),
                            fdr_level=0.05,
                            pi0_method=pi0_method)

        # The positional indexing used below is only valid if the R
        # result has its fields in exactly this layout.
        expected_layout = ((1, "pi0"),
                           (2, "qvalues"),
                           (5, "significant"),
                           (6, "lambda"))
        for position, field_name in expected_layout:
            assert r_result.names[position] == field_name

        r_qvalues = r_result[2]
        r_pi0 = r_result[1][0]

        self.assertEqual(len(fdr.mQValues), len(r_qvalues))
        self.assertEqual(len(fdr.mLambda), len(r_result[6]))
        self.assertEqual(fdr.mPi0, r_pi0)

        # q-values only need to agree to two decimal places
        for ours, theirs in zip(fdr.mQValues, r_qvalues):
            self.assertAlmostEqual(
                ours, theirs, 2, "unequal: %f != %f" % (ours, theirs))

        for ours, theirs in zip(fdr.mPassed, r_result[5]):
            self.assertEqual(
                ours, theirs,
                "threshold-passed flag not equal: %s != %s" % (ours, theirs))
예제 #2
0
 def check(self, method):
     '''Compare R's ``p.adjust`` with Stats.adjustPValues.

     Both are applied to ``self.pvalues`` with the given `method`;
     the results must have equal length and be almost equal
     element by element.
     '''
     expected = R['p.adjust'](self.pvalues, method=method)
     observed = Stats.adjustPValues(self.pvalues, method=method)

     self.assertEqual(len(expected), len(observed))
     for r_value, py_value in zip(expected, observed):
         self.assertAlmostEqual(r_value, py_value)
예제 #3
0
    def checkFDR(self, **kwargs):
        """Check that Stats.doFDR and Stats.doFDRPython agree.

        Both functions are called on ``self.pvalues`` with the same
        keyword arguments. The estimated pi0 and every q-value must
        agree within a relative error of ``self.max_error``.
        """
        reference = Stats.doFDR(self.pvalues, **kwargs)
        candidate = Stats.doFDRPython(self.pvalues, **kwargs)

        pi0_error = getRelativeError(reference.mPi0, candidate.mPi0)
        self.assertTrue(pi0_error < self.max_error)

        triples = zip(self.pvalues, reference.mQValues, candidate.mQValues)
        for pvalue, q_reference, q_candidate in triples:
            q_error = getRelativeError(q_reference, q_candidate)
            self.assertTrue(
                q_error < self.max_error,
                "qvalues: relative error %f > %f (pvalue=%f, %f, %f)" %
                (q_error, self.max_error, pvalue, q_reference, q_candidate))
예제 #4
0
    def testAgainstQValue(self):
        """Check Stats.doFDRPython against R's ``qvalue`` with defaults.

        pi0 must agree within a relative error of ``self.max_error``
        and each q-value must agree to ``self.nplaces`` places.
        """
        # run the R reference implementation on the same p-values
        R.assign("pvalues", self.pvalues)
        r_result = R('''qvalue( pvalues )''')
        expected_qvalues = r_result[2]
        expected_pi0 = r_result[1][0]

        fdr = Stats.doFDRPython(self.pvalues)

        pi0_error = getRelativeError(expected_pi0, fdr.mPi0)
        self.assertTrue(pi0_error < self.max_error)

        for expected, observed in zip(expected_qvalues, fdr.mQValues):
            self.assertAlmostEqual(expected, observed, places=self.nplaces)
예제 #5
0
    def __str__(self):
        """Return the collected counts as one tab-separated line.

        Columns, in order: number of gene ids, number of transcript
        ids, number of single-exon transcripts, followed by
        Stats.Summary of exons per transcript, exon sizes, intron
        sizes and transcript lengths.

        Note that the exon lists stored in
        ``self.counts_exons_per_transcript`` are sorted in place.
        """
        n_single_exon = 0
        n_exons = []
        exon_lengths = []
        intron_lengths = []
        transcript_spans = []

        for exons in self.counts_exons_per_transcript.values():

            exons.sort()
            exons = Intervals.combine(exons)

            # span from the first exon start to the last exon end
            transcript_spans.append(exons[-1][1] - exons[0][0])
            n_exons.append(len(exons))
            exon_lengths.extend(end - start for start, end in exons)

            if len(exons) == 1:
                n_single_exon += 1
                continue

            # an intron is the gap between two consecutive exons
            for (_, previous_end), (next_start, _) in zip(exons, exons[1:]):
                intron_lengths.append(next_start - previous_end)

        columns = (len(self.counts_gene_ids),
                   len(self.counts_transcript_ids),
                   n_single_exon,
                   Stats.Summary(n_exons),
                   Stats.Summary(exon_lengths),
                   Stats.Summary(intron_lengths),
                   Stats.Summary(transcript_spans))

        return "\t".join(str(column) for column in columns)
예제 #6
0
    def testLRT(self):
        """Test that the false positive rate is in the same order as
        ``self.mSignificance``.

        For each of ``self.mNumReplicates`` replicates,
        ``self.mNumSamples`` values are drawn from a standard normal
        distribution and two nested models are compared:

        1. mean estimated = complex model (1 free parameter)
        2. mean given     = simple model  (0 free parameters)

        The likelihood of a model is P(data | model). As the data is
        generated under the simple model, every replicate in which the
        test passes is a false positive. The observed fraction of
        passes must equal ``self.mSignificance`` to ``self.nplaces``
        places.
        """
        simple_np = 0
        complex_np = 1

        npassed = 0

        for replicate in range(0, self.mNumReplicates):
            sample = scipy.stats.norm.rvs(size=self.mNumSamples,
                                          loc=0.0,
                                          scale=1.0)
            # numpy.mean instead of scipy.mean: the NumPy aliases in
            # the top-level scipy namespace are deprecated.
            mean = numpy.mean(sample)

            # log-likelihood of the sample under each model
            complex_ll = numpy.sum(
                numpy.log(scipy.stats.norm.pdf(sample, loc=mean, scale=1.0)))
            simple_ll = numpy.sum(
                numpy.log(scipy.stats.norm.pdf(sample, loc=0.0, scale=1.0)))

            a = Stats.doLogLikelihoodTest(
                complex_ll,
                complex_np,
                simple_ll,
                simple_np,
                significance_threshold=self.mSignificance)

            if a.mPassed:
                npassed += 1

        r = float(npassed) / self.mNumReplicates

        self.assertAlmostEqual(self.mSignificance, r, places=self.nplaces)
예제 #7
0
파일: go.py 프로젝트: tw7649116/cgat-flow
def loadGOs(infiles, outfile, tablename):
    '''import GO results into a single table.

    This method also computes a global QValue over all
    tracks, genesets and annotation sets. The p-value used for
    each row is the smaller of its ``pover`` and ``punder`` columns.

    Arguments
    ---------
    infiles : list of strings
       Output files of several runGO analyses. For each file, the
       results are read from ``*.overall`` files in the directory
       ``<infile>.dir``; files without such a directory are skipped.
       Each name must match ``<track>_vs_<geneset>.<annotationset>``.
    outfile : string
       Output filename, contains log information
    tablename : string
       Table name for storing results.
    '''

    header = False

    tempf1 = P.get_temp_file()

    pvalues = []

    for infile in infiles:
        indir = infile + ".dir"

        if not os.path.exists(indir):
            continue

        # raw string: "\S" and "\." are not valid string escapes
        track, geneset, annotationset = re.search(
            r"^(\S+)_vs_(\S+)\.(\S+)", infile).groups()

        for filename in glob.glob(os.path.join(indir, "*.overall")):
            with open(filename, "r") as inf:
                for line in inf:
                    if line.startswith("#"):
                        continue
                    data = line[:-1].split("\t")
                    if line.startswith("code"):
                        # column header line: only the first one
                        # encountered is written to the output
                        if header:
                            continue
                        tempf1.write(
                            "track\tgeneset\tannotationset\t%s" % line)
                        header = True
                        assert data[10] == "pover" and data[
                            11] == "punder", "format error, expected pover-punder, got %s-%s" % (
                                data[10], data[11])
                        continue
                    tempf1.write("%s\t%s\t%s\t%s" %
                                 (track, geneset, annotationset, line))
                    pvalues.append(min(float(data[10]), float(data[11])))

    tempf1.close()

    E.info("analysing %i pvalues" % len(pvalues))
    fdr = Stats.doFDR(pvalues)
    E.info("got %i qvalues" % len(fdr.mQValues))
    # the first entry is the column title, matching the header line
    # at the top of tempf1
    qvalues = ["global_qvalue"] + fdr.mQValues

    tempf2 = P.get_temp_file()

    # append the q-value column line by line
    with open(tempf1.name, "r") as inf:
        for line, qvalue in zip(inf, qvalues):
            tempf2.write("%s\t%s\n" % (line[:-1], str(qvalue)))

    tempf2.close()

    P.load(tempf2.name,
           outfile,
           tablename=tablename,
           options="--allow-empty-file "
           "--add-index=category "
           "--add-index=track,geneset,annotationset "
           "--add-index=geneset "
           "--add-index=annotationset "
           "--add-index=goid ")

    os.unlink(tempf1.name)
    os.unlink(tempf2.name)
예제 #8
0
def main(argv=None):
    """script main.

    Reads fastq records from stdin and writes one line per read to
    stdout: the read identifier, the number of bases with a quality
    below the minimum quality, the number of ``N``/``.`` bases and a
    Stats.Summary of the read's quality scores.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    quality_formats = (
        'sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer')

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--guess-format",
        dest="guess_format",
        type="choice",
        choices=quality_formats,
        help="The default behaviour of the script is to guess the quality "
        "format of the input fastq file. The user can specify the "
        "quality format of the input file using the --guess-format option. "
        "The script will use this format if the "
        "sequence qualities are ambiguous.[default=%default].")

    parser.add_option(
        "--target-format",
        dest="target_format",
        type="choice",
        choices=quality_formats,
        help="The script will convert quality scores to the destination "
        "format unless [default=%default].")

    parser.set_defaults(
        target_format=None,
        guess_format=None,
        min_quality=10,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    counter = E.Counter()

    # only convert qualities if a target format was requested
    if options.target_format:
        records = Fastq.iterate_convert(options.stdin,
                                        format=options.target_format,
                                        guess=options.guess_format)
    else:
        records = Fastq.iterate_guess(options.stdin,
                                      guess=options.guess_format)

    summary_columns = "\t".join(Stats.Summary().getHeaders())
    options.stdout.write("read\tnfailed\tnN\t%s\n" % summary_columns)

    threshold = options.min_quality

    for record in records:
        counter.input += 1
        qualities = record.toPhred()
        nfailed = sum(1 for quality in qualities if quality < threshold)
        nns = record.seq.count("N") + record.seq.count(".")
        summary = str(Stats.Summary(qualities))
        options.stdout.write(
            "%s\t%i\t%i\t%s\n" % (record.identifier, nfailed, nns, summary))
        counter.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(counter))
    E.stop()
예제 #9
0
def decorator_median_score(values, start, end, contig):
    """Return the median of `values` and the full parameter summary.

    `start`, `end` and `contig` are not used here.
    """
    params = Stats.DistributionalParameters(values)
    summary = str(params)
    return params['median'], summary
예제 #10
0
def decorator_median_length(intervals, start, end, contig, fasta):
    """Return the median interval length and the full length summary.

    The length of an interval is its second element minus its first.
    `start`, `end`, `contig` and `fasta` are not used here.
    """
    lengths = [interval[1] - interval[0] for interval in intervals]
    params = Stats.DistributionalParameters(lengths)
    summary = str(params)
    return params['median'], summary
예제 #11
0
def decorator_percent_coverage(intervals, start, end, contig, fasta):
    """Return the summed interval length as a percentage of
    ``end - start``, together with the full length summary.

    The length of an interval is its second element minus its first.
    `contig` and `fasta` are not used here.
    """
    lengths = [interval[1] - interval[0] for interval in intervals]
    params = Stats.DistributionalParameters(lengths)
    covered = float(params['sum'])
    return 100.0 * covered / (end - start), str(params)
예제 #12
0
def decorator_max_score(values, start, end, contig):
    """Return the maximum of `values` and the full parameter summary.

    `start`, `end` and `contig` are not used here.
    """
    params = Stats.DistributionalParameters(values)
    summary = str(params)
    return params['max'], summary
예제 #13
0
def decorator_stddev_score(values, start, end, contig):
    """Return the standard deviation of `values` and the full
    parameter summary.

    `start`, `end` and `contig` are not used here.
    """
    params = Stats.DistributionalParameters(values)
    summary = str(params)
    return params['stddev'], summary
예제 #14
0
def analysePolyphen(infile, outfile):
    '''compute enrichment of SNPs within genes
    and deleterious SNPs within SNPs within genes.

    del: enrichment of deleterious snps within snps per gene
    len: enrichment of snps within genes
    com: enrichment of deleterious snps within gene

    For each gene, three one-sided binomial p-values P(x >= observed)
    are computed against background probabilities estimated over all
    genes. The p-values are converted to q-values with Stats.doFDR if
    there are more than 10 of them; otherwise the raw p-values are
    used in place of q-values.

    Arguments
    ---------
    infile : string
       Converted to the name of the table with the polyphen
       predictions via P.toTable.
    outfile : string
       Filename of the tab-separated output table. Genes without any
       mapped SNP are appended with the code ``---``.
    '''

    table = P.toTable(infile)
    tablename_map = "polyphen_map"

    dbhandle = connect()
    cc = dbhandle.cursor()

    # per gene: number of snps, number of deleterious snps, protein length
    statement = '''
        SELECT i.gene_id,
               COUNT(DISTINCT map.locus_id) as nsnps,
               COUNT(DISTINCT case t.prediction when 'possiblydamaging' then map.locus_id when 'probablydamaging' then map.locus_id else NULL end) AS ndeleterious,
               MAX(s.length)
               FROM %(table)s as t,
                    %(tablename_map)s as map,
                    annotations.protein_stats as s,
                    annotations.transcript_info as i
        WHERE map.snp_id = t.snp_id AND
              i.transcript_id = map.transcript_id AND
              s.protein_id = map.protein_id
        GROUP BY i.gene_id
     ''' % locals()

    data = cc.execute(statement).fetchall()

    # all genes with their protein length, including those without snps
    statement = '''SELECT DISTINCT i.gene_id, MAX(s.length)
                   FROM annotations.transcript_info AS i, annotations.protein_stats AS s
                   WHERE s.protein_id = i.protein_id
                   GROUP BY i.gene_id'''
    gene_ids = cc.execute(statement).fetchall()

    # background probabilities
    total_nsnps = sum([x[1] for x in data])
    total_ndel = sum([x[2] for x in data])
    total_length = sum([x[1] for x in gene_ids])
    del_p = float(total_ndel) / total_nsnps
    len_p = float(total_nsnps) / total_length
    com_p = float(total_ndel) / total_length

    E.info("del: background probability: %i/%i = %f" %
           (total_ndel, total_nsnps, del_p))
    E.info("len: background probability: %i/%i = %f" %
           (total_nsnps, total_length, len_p))
    E.info("com: background probability: %i/%i = %f" %
           (total_ndel, total_length, com_p))

    del_pvalues, len_pvalues, com_pvalues = [], [], []
    for gene_id, nsnps, ndel, length in data:

        # use -1, because I need P( x >= X)
        # sf = 1 - cdf and cdf = P( x <= X ), thus sf = 1 - P( x <= X ) = P (x
        # > X ).
        del_pvalues.append(scipy.stats.binom.sf(ndel - 1, nsnps, del_p))
        len_pvalues.append(
            scipy.stats.binom.sf(nsnps - 1, int(round(length)), len_p))
        com_pvalues.append(
            scipy.stats.binom.sf(ndel - 1, int(round(length)), com_p))

    def _qvalues(pvalues, label):
        # q-values for pvalues; falls back to the p-values themselves
        # if there are too few for an FDR computation.
        if len(pvalues) > 10:
            return Stats.doFDR(pvalues).mQValues
        E.warn("no FDR computed for %s" % label)
        return pvalues

    del_qvalues = _qvalues(del_pvalues, "del")
    len_qvalues = _qvalues(len_pvalues, "len")
    com_qvalues = _qvalues(com_pvalues, "com")

    fdr = PARAMS["polyphen_fdr"]

    found = set()

    with open(outfile, "w") as outf:
        outf.write("\t".join(("gene_id", "code",
                              "length", "nsnps", "ndel",
                              "del_p", "del_pvalue", "del_qvalue",
                              "len_p", "len_pvalue", "len_qvalue",
                              "com_p", "com_pvalue", "com_qvalue", )) + "\n")

        for a, del_pvalue, del_qvalue, len_pvalue, len_qvalue, com_pvalue, com_qvalue in \
                zip(data,
                    del_pvalues, del_qvalues,
                    len_pvalues, len_qvalues,
                    com_pvalues, com_qvalues,
                    ):
            gene_id, nsnps, ndel, length = a
            found.add(gene_id)

            # per-gene proportions for the del_p and len_p columns
            gene_del_p = float(ndel) / nsnps
            gene_len_p = float(nsnps) / length

            # one digit per test (del, len, com): 1 if q-value < fdr
            code = "".join([str(int(x < fdr))
                            for x in (del_qvalue, len_qvalue, com_qvalue)])

            # NOTE(review): the com_p column holds the background
            # probability, unlike del_p and len_p which are per gene -
            # confirm this is intended.
            outf.write("\t".join((gene_id,
                                  code,
                                  "%i" % int(round(length)),
                                  "%i" % int(nsnps),
                                  "%i" % int(ndel),
                                  "%6.4f" % gene_del_p,
                                  "%6.4g" % del_pvalue,
                                  "%6.4g" % del_qvalue,
                                  "%6.4f" % gene_len_p,
                                  "%6.4g" % len_pvalue,
                                  "%6.4g" % len_qvalue,
                                  "%6.4f" % com_p,
                                  "%6.4g" % com_pvalue,
                                  "%6.4g" % com_qvalue,
                                  )) + "\n")

        # add missing genes:
        code = "---"
        for gene_id, length in gene_ids:
            if gene_id in found:
                continue
            outf.write("\t".join((gene_id,
                                  code,
                                  "%i" % int(round(length)),
                                  "%i" % 0,
                                  "%i" % 0,
                                  "%6.4f" % 0,
                                  "%6.4g" % 1,
                                  "%6.4g" % 1,
                                  "%6.4f" % 0,
                                  "%6.4g" % 1,
                                  "%6.4g" % 1,
                                  "%6.4f" % 0,
                                  "%6.4g" % 1,
                                  "%6.4g" % 1,
                                  )) + "\n")
예제 #15
0
def main(argv=None):
    """script main.

    Reads intervals in gff, gtf or bed format from stdin and, depending
    on the selected methods, outputs histograms, summary statistics
    and/or raw values of interval sizes, of distances between
    consecutive intervals and of overlaps between them.

    parses command line options in sys.argv, unless *argv* is given.

    Raises
    ------
    ValueError
        If no method has been selected with ``--method``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b",
                      "--bin-size",
                      dest="bin_size",
                      type="string",
                      help="bin size.")

    parser.add_option("--min-value",
                      dest="min_value",
                      type="float",
                      help="minimum value for histogram.")

    parser.add_option("--max-value",
                      dest="max_value",
                      type="float",
                      help="maximum value for histogram.")

    parser.add_option("--no-empty-bins",
                      dest="no_empty_bins",
                      action="store_true",
                      help="do not display empty bins.")

    parser.add_option("--with-empty-bins",
                      dest="no_empty_bins",
                      action="store_false",
                      help="display empty bins.")

    parser.add_option(
        "--ignore-out-of-range",
        dest="ignore_out_of_range",
        action="store_true",
        help="ignore values that are out of range (as opposed to truncating "
        "them to range border.")

    parser.add_option("--missing-value",
                      dest="missing_value",
                      type="string",
                      help="entry for missing values [%default].")

    parser.add_option("--use-dynamic-bins",
                      dest="dynamic_bins",
                      action="store_true",
                      help="each value constitutes its own bin.")

    parser.add_option("--format",
                      dest="format",
                      type="choice",
                      choices=("gff", "gtf", "bed"),
                      help="input file format [%default].")

    parser.add_option("--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("all", "hist", "stats", "overlaps", "values"),
                      help="methods to apply [%default].")

    parser.add_option("--output-section",
                      dest="output_section",
                      type="choice",
                      choices=("all", "size", "distance"),
                      help="data to compute [%default].")

    parser.set_defaults(
        no_empty_bins=True,
        bin_size=None,
        dynamic_bins=False,
        ignore_out_of_range=False,
        min_value=None,
        max_value=None,
        nonull=None,
        missing_value="na",
        output_filename_pattern="%s",
        methods=[],
        output_section="all",
        format="gff",
    )

    # pass argv on, otherwise an explicitly given argv is ignored
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if "all" in options.methods:
        options.methods = ("hist", "stats", "overlaps")
        if not options.output_filename_pattern:
            options.output_filename_pattern = "%s"

    if len(options.methods) == 0:
        raise ValueError(
            "please provide counting method using --method option")

    if options.format in ("gff", "gtf"):
        gffs = GTF.iterator(options.stdin)
    elif options.format == "bed":
        gffs = Bed.iterator(options.stdin)

    values_between = []
    values_within = []
    values_overlaps = []

    if "overlaps" in options.methods:
        if not options.output_filename_pattern:
            options.output_filename_pattern = "%s"
        outfile_overlaps = E.open_output_file("overlaps")
    else:
        outfile_overlaps = None

    # Each interval is compared with the previous one on the same
    # contig. NOTE(review): this presumably expects the input to be
    # sorted by contig and start - confirm against the script's usage.
    last = None
    ninput, noverlaps = 0, 0
    for this in gffs:
        ninput += 1
        values_within.append(this.end - this.start)

        if last and last.contig == this.contig:
            if this.start < last.end:
                noverlaps += 1
                if outfile_overlaps:
                    outfile_overlaps.write("%s\t%s\n" % (str(last), str(this)))
                values_overlaps.append(
                    min(this.end, last.end) - max(last.start, this.start))
                # keep the interval that extends furthest as reference
                if this.end > last.end:
                    last = this
                continue
            else:
                values_between.append(this.start - last.end)
                values_overlaps.append(0)

        last = this

    # release the handle so that the pairs are flushed to disk
    if outfile_overlaps:
        outfile_overlaps.close()

    if "hist" in options.methods:
        outfile = E.open_output_file("hist")
        h_within = Histogram.Calculate(
            values_within,
            no_empty_bins=options.no_empty_bins,
            increment=options.bin_size,
            min_value=options.min_value,
            max_value=options.max_value,
            dynamic_bins=options.dynamic_bins,
            ignore_out_of_range=options.ignore_out_of_range)

        h_between = Histogram.Calculate(
            values_between,
            no_empty_bins=options.no_empty_bins,
            increment=options.bin_size,
            min_value=options.min_value,
            max_value=options.max_value,
            dynamic_bins=options.dynamic_bins,
            ignore_out_of_range=options.ignore_out_of_range)

        if "all" == options.output_section:
            outfile.write("residues\tsize\tdistance\n")
            combined_histogram = Histogram.Combine(
                [h_within, h_between], missing_value=options.missing_value)
            Histogram.Write(outfile, combined_histogram, nonull=options.nonull)
        elif options.output_section == "size":
            outfile.write("residues\tsize\n")
            Histogram.Write(outfile, h_within, nonull=options.nonull)
        elif options.output_section == "distance":
            outfile.write("residues\tdistance\n")
            Histogram.Write(outfile, h_between, nonull=options.nonull)

        outfile.close()

    if "stats" in options.methods:
        outfile = E.open_output_file("stats")
        outfile.write("data\t%s\n" % Stats.Summary().getHeader())
        if options.output_section in ("size", "all"):
            outfile.write("size\t%s\n" % str(Stats.Summary(values_within)))
        if options.output_section in ("distance", "all"):
            outfile.write("distance\t%s\n" %
                          str(Stats.Summary(values_between)))
        outfile.close()

    if "values" in options.methods:
        outfile = E.open_output_file("distances")
        outfile.write("distance\n%s\n" % "\n".join(map(str, values_between)))
        outfile.close()
        outfile = E.open_output_file("sizes")
        outfile.write("size\n%s\n" % "\n".join(map(str, values_within)))
        outfile.close()
        # NOTE(review): this uses the same output section name as the
        # "overlaps" method above; if both methods are selected the
        # two outputs presumably go to the same file - confirm.
        outfile = E.open_output_file("overlaps")
        outfile.write("overlap\n%s\n" % "\n".join(map(str, values_overlaps)))
        outfile.close()

    E.info("ninput=%i, ndistance=%i, nsize=%i, noverlap=%i" %
           (ninput, len(values_between), len(values_within), noverlaps))

    E.stop()
예제 #16
0
    def __call__(self, track, slice=None):
        """Compute ROC data for each field in ``self.mFields``.

        For every field, the intervals of all replicates of `track`
        are loaded and scored against a set of merged regions. A
        region counts as reproducible if every replicate has at least
        one interval in it; its score is the largest field value of
        any interval in it. The merged regions are built once, from
        the intervals loaded for the first field.

        Returns an ordered dictionary that maps each field to an
        ordered dictionary with the entries ``FPR`` and the field
        name, taken from the first two columns of Stats.computeROC.
        `slice` is not used here.
        """

        def merge_regions(rows):
            # Sort rows in place and yield (contig, start, end) for
            # each run of overlapping or adjacent intervals.
            rows.sort()
            current_contig, region_start, region_end, _ = rows[0]
            for contig, start, end, _ in rows[1:]:
                if contig == current_contig and start <= region_end:
                    region_end = max(region_end, end)
                    continue
                yield (current_contig, region_start, region_end)
                current_contig, region_start, region_end = contig, start, end
            yield (current_contig, region_start, region_end)

        result = odict()
        merged = None

        for field in self.mFields:
            # one list of (contig, start, end, value) rows per replicate
            data = []
            for replicate in EXPERIMENTS.getTracks(track):
                statement = "SELECT contig, start, end,%(field)s FROM %(replicate)s_intervals" % {
                    "field": field, "replicate": replicate}
                data.append(self.get(statement))

            # one interval index per replicate
            indices = []
            for rows in data:
                index = IndexedGenome.IndexedGenome()
                for contig, start, end, peakval in rows:
                    index.add(contig, start, end, peakval)
                indices.append(index)

            if not merged:
                pooled = list(itertools.chain(*data))
                merged = list(merge_regions(pooled))

            roc_data = []
            for contig, start, end in merged:
                hits_per_replicate = []
                for index in indices:
                    try:
                        hits = list(index.get(contig, start, end))
                    except KeyError:
                        continue
                    hits_per_replicate.append(hits)

                if len(hits_per_replicate) == 0:
                    continue

                n_with_hits = sum(
                    1 for hits in hits_per_replicate if hits != [])
                is_repro = n_with_hits == len(data)
                value = max(
                    hit[2]
                    for hit in itertools.chain.from_iterable(
                        hits_per_replicate))

                roc_data.append((value, is_repro))

            # highest values first
            roc_data.sort()
            roc_data.reverse()

            roc = list(zip(*Stats.computeROC(roc_data)))
            result[field] = odict((("FPR", roc[0]), (field, roc[1])))

        return result