Code example #1
File: genelist_analysis.py Project: santayana/cgat
def computeFDR(all_results, qvalue_method="storey"):
    '''compute FDR.

    update GOResult structure with field .fdr
    '''

    # flatten all_results
    results = []
    for key, data in all_results.iteritems():
        results.extend(data.mResults.values())

    observed_min_pvalues = [
        min(x.mProbabilityOverRepresentation,
            x.mProbabilityUnderRepresentation) for x in results
    ]

    if qvalue_method == "storey":

        # compute fdr via Storey's method
        fdr_data = Stats.doFDR(observed_min_pvalues, vlambda=0.1)

        E.info("estimated proportion of true null hypotheses = %6.4f" %
               fdr_data.mPi0)

        if fdr_data.mPi0 < 0.1:
            E.warn("estimated proportion of true null hypotheses is "
                   "less than 10%%  (%6.4f)" % fdr_data.mPi0)

        for result, qvalue in zip(results, fdr_data.mQValues):
            result.fdr = qvalue

    elif qvalue_method == "empirical":
        # note: `options` is not defined in this function; it is
        # presumably supplied by the enclosing script
        assert options.sample > 0, "requiring a sample size of > 0"
        raise NotImplementedError("empirical needs some work")
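
All of these examples consume the same result object returned by Stats.doFDR. The sketch below condenses the calling pattern; the import path and the input p-values are illustrative assumptions, while the result fields (mQValues, mPi0, mPassed) are exactly the ones the surrounding examples read.

# Minimal sketch of the Stats.doFDR calling pattern.  The import path
# and p-values are assumptions for illustration; the result fields are
# the ones used throughout the examples on this page.
from CGAT import Stats

pvalues = [0.001, 0.02, 0.2, 0.5, 0.9]            # made-up p-values
fdr_data = Stats.doFDR(pvalues, vlambda=0.1, fdr_level=0.05)

print(fdr_data.mPi0)      # estimated proportion of true null hypotheses
print(fdr_data.mQValues)  # one q-value per input p-value
print(fdr_data.mPassed)   # True where the q-value passes fdr_level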
Code example #2
File: Stats_test.py Project: logust79/cgat-apps
    def checkFDR(self, pi0_method):

        result = Stats.doFDR(self.mPvalues,
                             fdr_level=0.05,
                             pi0_method=pi0_method)
        R("""require ('qvalue')""")
        qvalues = R.qvalue(ro.FloatVector(self.mPvalues),
                           fdr_level=0.05,
                           pi0_method=pi0_method)

        assert qvalues.names[1] == "pi0"
        assert qvalues.names[2] == "qvalues"
        assert qvalues.names[5] == "significant"
        assert qvalues.names[6] == "lambda"

        r_qvalues = qvalues[2]
        r_pi0 = qvalues[1][0]

        self.assertEqual(len(result.mQValues), len(qvalues[2]))
        self.assertEqual(len(result.mLambda), len(qvalues[6]))
        self.assertEqual(result.mPi0, r_pi0)
        for a, b in zip(result.mQValues, r_qvalues):
            self.assertAlmostEqual(a, b, 2, "unequal: %f != %f" % (a, b))

        for a, b in zip(result.mPassed, qvalues[5]):
            self.assertEqual(
                a, b, "threshold-passed flag not equal: %s != %s" % (a, b))
Code example #3
File: Stats_test.py Project: lesheng/cgat
    def checkFDR(self, pi0_method):

        result = Stats.doFDR(
            self.mPvalues, fdr_level=0.05, pi0_method=pi0_method)
        R("""require ('qvalue')""")
        qvalues = R.qvalue(ro.FloatVector(self.mPvalues),
                           fdr_level=0.05,
                           pi0_method=pi0_method)

        assert qvalues.names[1] == "pi0"
        assert qvalues.names[2] == "qvalues"
        assert qvalues.names[5] == "significant"
        assert qvalues.names[6] == "lambda"

        r_qvalues = qvalues[2]
        r_pi0 = qvalues[1][0]

        self.assertEqual(len(result.mQValues), len(qvalues[2]))
        self.assertEqual(len(result.mLambda), len(qvalues[6]))
        self.assertEqual(result.mPi0, r_pi0)
        for a, b in zip(result.mQValues, r_qvalues):
            self.assertAlmostEqual(a, b, 2, "unequal: %f != %f" % (a, b))

        for a, b in zip(result.mPassed, qvalues[5]):
            self.assertEqual(
                a, b, "threshold-passed flag not equal: %s != %s" % (a, b))
Code example #4
def computeFDR(all_results,
               qvalue_method="storey"):
    '''compute FDR.

    update GOResult structure with field .fdr
    '''

    # flatten all_results
    results = []
    for key, data in all_results.iteritems():
        results.extend(data.mResults.values())

    observed_min_pvalues = [min(
        x.mProbabilityOverRepresentation,
        x.mProbabilityUnderRepresentation) for x in results]

    if qvalue_method == "storey":

        # compute fdr via Storey's method
        fdr_data = Stats.doFDR(observed_min_pvalues, vlambda=0.1)

        E.info("estimated proportion of true null hypotheses = %6.4f" %
               fdr_data.mPi0)

        if fdr_data.mPi0 < 0.1:
            E.warn(
                "estimated proportion of true null hypotheses is "
                "less than 10%%  (%6.4f)" % fdr_data.mPi0)

        for result, qvalue in zip(results, fdr_data.mQValues):
            result.fdr = qvalue

    elif qvalue_method == "empirical":
        # note: `options` is not defined in this function; it is
        # presumably supplied by the enclosing script
        assert options.sample > 0, "requiring a sample size of > 0"
        raise NotImplementedError("empirical needs some work")
Code example #5
File: annotator2tsv.py Project: lesheng/cgat
def doOldFDR(options, args):
    """apply fdr to output of annotator."""

    # read input
    annotators = []
    for filename in args:
        infile = open(filename, "r")
        annotators.append(readAnnotator(infile))
        infile.close()

    # apply filters and create diagnostic plots
    for filename, data in zip(args, annotators):
        ninput = len(data)
        pvalues = [x.mPValue for x in data]
        vlambda = numpy.arange(0, max(pvalues), 0.05)
        try:
            qvalues = Stats.doFDR(
                pvalues, vlambda=vlambda, fdr_level=options.fdr)
        except ValueError, msg:
            E.warn("%s: fdr could not be computed - no filtering: %s" %
                   (filename, msg))
            continue

        qvalues.plot(filename + "_diagnostics.png")

        data = [x[0] for x in zip(data, qvalues.mPassed) if x[1]]
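
doOldFDR builds a lambda grid with numpy.arange and hands it to Stats.doFDR as vlambda, the tuning parameter of Storey's pi0 estimate. The sketch below shows the textbook form of that estimate evaluated over such a grid; it is illustrative only and not necessarily how Stats.doFDR computes pi0 internally.

# Storey's pi0(lambda) = #{p > lambda} / (m * (1 - lambda)), evaluated
# over the same kind of grid that doOldFDR builds.
import numpy

def pi0_on_grid(pvalues, vlambda):
    pvalues = numpy.asarray(pvalues)
    m = float(len(pvalues))
    return [(pvalues > lam).sum() / (m * (1.0 - lam)) for lam in vlambda]

pvalues = numpy.random.uniform(size=1000)         # made-up p-values
vlambda = numpy.arange(0, max(pvalues), 0.05)
print(pi0_on_grid(pvalues, vlambda))              # values near 1.0 here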
Code example #6
File: Stats_test.py Project: logust79/cgat-apps
    def checkFDR(self, **kwargs):

        old = Stats.doFDR(self.pvalues, **kwargs)
        # print old.mQValues[:10]
        # print old.mPi0
        new = Stats.doFDRPython(self.pvalues, **kwargs)
        # print new.mQValues[:10]
        # print new.mPi0
        # self.assertAlmostEqual( old.mPi0, new.mPi0, places=3)
        self.assertTrue(getRelativeError(old.mPi0, new.mPi0) < self.max_error)

        for pvalue, a, b in zip(self.pvalues, old.mQValues, new.mQValues):
            self.assertTrue(
                getRelativeError(a, b) < self.max_error,
                "qvalues: relative error %f > %f (pvalue=%f, %f, %f)" %
                (getRelativeError(a, b), self.max_error, pvalue, a, b))
Code example #7
File: Stats_test.py Project: lesheng/cgat
    def checkFDR(self, **kwargs):

        old = Stats.doFDR(self.pvalues, **kwargs)
        # print old.mQValues[:10]
        # print old.mPi0
        new = Stats.doFDRPython(self.pvalues, **kwargs)
        # print new.mQValues[:10]
        # print new.mPi0
        # self.assertAlmostEqual( old.mPi0, new.mPi0, places=3)
        self.assertTrue(getRelativeError(old.mPi0, new.mPi0) < self.max_error)

        for pvalue, a, b in zip(self.pvalues, old.mQValues, new.mQValues):
            self.assertTrue(getRelativeError(a, b) < self.max_error,
                            "qvalues: relative error %f > %f (pvalue=%f, %f, %f)" %
                            (getRelativeError(a, b),
                             self.max_error,
                             pvalue, a, b))
Code example #8
File: PipelineGO.py Project: ichobits/CGATPipelines
def loadGOs(infiles, outfile, tablename):
    '''import GO results into a single table.

    This method also computes a global QValue over all
    tracks, genesets and annotation sets.

    Arguments
    ---------
    infiles : string
       Output files of several runGO analyses
    outfile : string
       Output filename, contains log information
    tablename : string
       Table name for storing results.
    '''

    header = False

    tempf1 = P.getTempFile()

    pvalues = []

    for infile in infiles:
        indir = infile + ".dir"

        if not os.path.exists(indir):
            continue

        track, geneset, annotationset = re.search("^(\S+)_vs_(\S+)\.(\S+)",
                                                  infile).groups()

        for filename in glob.glob(os.path.join(indir, "*.overall")):
            for line in open(filename, "r"):
                if line.startswith("#"):
                    continue
                data = line[:-1].split("\t")
                if line.startswith("code"):
                    if header:
                        continue
                    tempf1.write("track\tgeneset\tannotationset\t%s" % line)
                    header = True
                    assert data[10] == "pover" and data[
                        11] == "punder", "format error, expected pover-punder, got %s-%s" % (
                            data[10], data[11])
                    continue
                tempf1.write("%s\t%s\t%s\t%s" %
                             (track, geneset, annotationset, line))
                pvalues.append(min(float(data[10]), float(data[11])))

    tempf1.close()

    E.info("analysing %i pvalues" % len(pvalues))
    fdr = Stats.doFDR(pvalues)
    E.info("got %i qvalues" % len(fdr.mQValues))
    qvalues = ["global_qvalue"] + fdr.mQValues

    tempf2 = P.getTempFile()

    for line, qvalue in zip(open(tempf1.name, "r"), qvalues):
        tempf2.write("%s\t%s\n" % (line[:-1], str(qvalue)))

    tempf2.close()

    P.load(tempf2.name,
           outfile,
           tablename=tablename,
           options="--allow-empty-file "
           "--add-index=category "
           "--add-index=track,geneset,annotationset "
           "--add-index=geneset "
           "--add-index=annotationset "
           "--add-index=goid ")

    os.unlink(tempf1.name)
    os.unlink(tempf2.name)
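
One detail in loadGOs is easy to miss: prepending the literal column name "global_qvalue" to the q-value list means that, when the list is zipped against the lines of the temporary file, the header row picks up the column name and every data row picks up its q-value. A tiny illustration with made-up rows:

# The header-alignment trick: element 0 labels the header row, the
# remaining elements annotate the data rows.
lines = ["code\tgoid", "+\tGO:0016301", "+\tGO:0005524"]
qvalues = ["global_qvalue", 0.01, 0.2]
for line, qvalue in zip(lines, qvalues):
    print("%s\t%s" % (line, str(qvalue)))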
Code example #9
def loadGOs(infiles, outfile, tablename):
    '''import GO results into a single table.

    This method also computes a global QValue over all
    tracks, genesets and annotation sets.

    Arguments
    ---------
    infiles : string
       Output files of several runGO analyses
    outfile : string
       Output filename, contains log information
    tablename : string
       Table name for storing results.
    '''

    header = False

    tempf1 = P.getTempFile()

    pvalues = []

    for infile in infiles:
        indir = infile + ".dir"

        if not os.path.exists(indir):
            continue

        track, geneset, annotationset = re.search(
            "^(\S+)_vs_(\S+)\.(\S+)", infile).groups()

        for filename in glob.glob(os.path.join(indir, "*.overall")):
            for line in open(filename, "r"):
                if line.startswith("#"):
                    continue
                data = line[:-1].split("\t")
                if line.startswith("code"):
                    if header:
                        continue
                    tempf1.write("track\tgeneset\tannotationset\t%s" % line)
                    header = True
                    assert data[10] == "pover" and data[
                        11] == "punder", "format error, expected pover-punder, got %s-%s" % (data[10], data[11])
                    continue
                tempf1.write("%s\t%s\t%s\t%s" %
                             (track, geneset, annotationset, line))
                pvalues.append(min(float(data[10]), float(data[11])))

    tempf1.close()

    E.info("analysing %i pvalues" % len(pvalues))
    fdr = Stats.doFDR(pvalues)
    E.info("got %i qvalues" % len(fdr.mQValues))
    qvalues = ["global_qvalue"] + fdr.mQValues

    tempf2 = P.getTempFile()

    for line, qvalue in zip(open(tempf1.name, "r"), qvalues):
        tempf2.write("%s\t%s\n" % (line[:-1], str(qvalue)))

    tempf2.close()

    P.load(tempf2.name, outfile,
           tablename=tablename,
           options="--allow-empty-file "
           "--add-index=category "
           "--add-index=track,geneset,annotationset "
           "--add-index=geneset "
           "--add-index=annotationset "
           "--add-index=goid ")

    os.unlink(tempf1.name)
    os.unlink(tempf2.name)
Code example #10
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser( version = "%prog version: $Id: run_nubiscan.py 2861 2010-02-23 17:36:32Z andreas $", usage = globals()["__doc__"] )

    parser.add_option("-i", "--iterations", dest="iterations", type="int",
                      help="number of iterations for sampling [default=%default]."  )

    parser.add_option("-q", "--qvalue", dest="qvalue_threshold", type="float",
                      help="qvalue threshold [default=%default]."  )

    parser.add_option("--without-combine", dest="combine", action = "store_false",
                      help="combine overlapping motifs [default=%default]."  )

    parser.add_option("-f", "--fdr-control", dest="fdr_control", type="choice",
                      choices = ("per-sequence", "all", "xall"),
                      help="qvalue threshold [default=%default]."  )

    parser.add_option("-m", "--motif", dest="motif", type="choice",
                      choices=("rxrvdr", "rxrvdr1", "rxrvdr2", "nr"),
                      help="qvalue threshold [default=%default]."  )

    parser.add_option("-a", "--arrangements", dest="arrangements", type="string",
                      help ="',' separated list of repeat arrangements [default=%default]")

    parser.add_option("-x", "--mask", dest="mask", type="choice",
                      choices=("dust","repeatmasker"),
                      help ="mask sequences before scanning [default=%default]")

    parser.add_option("--output-stats", dest="output_stats", action = "store_true",
                      help="output stats [default=%default]."  )

    parser.add_option("--add-sequence", dest="add_sequence", action = "store_true",
                      help="add sequence information [default=%default]."  )

    parser.set_defaults(
        iterations = 100,
        qvalue_threshold = 0.05,
        motif = "rxrvdr",
        fdr_control = "all",
        combine = True,
        arrangements = None,
        mask = None,
        output_stats = False,
        add_sequence = False,
        )

    ## add common options (-h/--help, ...) and parse command line 
    (options, args) = E.Start( parser, argv = argv, add_output_options = True )

    ## do sth
    ninput, nskipped, noutput = 0, 0, 0

    if options.arrangements is None:
        options.arrangements = [ "DR%s" % x for x in range(0,15) ] + [ "ER%s" % x for x in range(0,15) ]
    else:
        options.arrangements = options.arrangements.split(",")
        
    options.stdout.write( "%s" % "\t".join(Nubiscan.NubiscanMatch._fields) )
    if options.add_sequence: options.stdout.write( "\tsequence" )
    options.stdout.write("\n")

    if options.motif == 'nr': sense_matrix = NR
    elif options.motif == "rxrvdr": sense_matrix = RXRVDR
    elif options.motif == "rxrvdr1": sense_matrix = RXRVDR1
    elif options.motif == "rxrvdr2": sense_matrix = RXRVDR2
    else:
        raise ValueError("unknown matrix %s" % options.motif)

    if options.fdr_control == "all":

        seqs = list(FastaIterator.iterate(options.stdin))

        if options.mask:
            masked_seqs = maskSequences( [x.sequence for x in seqs], options.mask )
        else:
            masked_seqs = [x.sequence for x in seqs]
            
        ninput = len(seqs)
        map_id2title = dict( enumerate( [re.sub("\s.*", "", x.title) for x in seqs] ) )
        matcher = Nubiscan.MatcherRandomisationSequences( sense_matrix,
                                                          samples = options.iterations )
        
        results = matcher.run( masked_seqs,
                               options.arrangements,
                               qvalue_threshold = options.qvalue_threshold )

        if options.combine:
            results =  Nubiscan.combineMotifs( results )
        
        for r in results:

            if r.alternatives:
                alternatives = ",".join( [x.arrangement for x in r.alternatives ] )
            else:
                alternatives = ""

            options.stdout.write( "\t".join( ( 
                map_id2title[r.id],
                "%i" % r.start,
                "%i" % r.end,
                r.strand,
                r.arrangement,
                "%6.4f" % r.score,
                "%6.4f" % r.zscore,
                "%6.4e" % r.pvalue,
                "%6.4e" % r.qvalue,
                alternatives) ) )

            if options.add_sequence:
                s = masked_seqs[int(r.id)][r.start:r.end]
                if r.strand == "-": s = Genomics.complement( s )
                s = s[:6].upper() + s[6:-6].lower() + s[-6:].upper()
                options.stdout.write( "\t%s" % s )
                
            options.stdout.write("\n")
            noutput += 1

        # output stats
        if options.output_stats:
            outfile = E.openOutputFile( "fdr" )
            outfile.write("bin\thist\tnobserved\n" )
            for bin, hist, nobs in zip(matcher.bin_edges, matcher.hist, matcher.nobservations):
                outfile.write( "%f\t%f\t%f\n" % (bin, hist, nobs))
            outfile.close()


    elif options.fdr_control == "xall":

        matcher = Nubiscan.MatcherRandomisationSequence( sense_matrix,
                                                         samples = options.iterations )
    

        # collect all results
        matches = []
        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            mm = matcher.run( seq.sequence,
                              options.arrangements,
                              qvalue_threshold = None )
            for m in mm:
                matches.append( m._replace( sequence = seq.title ) )

        # estimate qvalues for all matches across all sequences
        pvalues = [ x.pvalue for x in matches ]
        fdr = Stats.doFDR( pvalues )
        qvalues = fdr.mQValues
        results = []
        for m, qvalue in zip(matches, qvalues):
            if qvalue > options.qvalue_threshold: continue
            results.append( m._replace( qvalue = qvalue ) )

        if options.combine:            
            results =  Nubiscan.combineMotifs( results )

        # output
        for r in results:
            options.stdout.write( "\t".join( ( 
                r.id,
                "%i" % r.start,
                "%i" % r.end,
                r.strand,
                r.arrangement,
                "%6.4f" % r.score,
                "%6.4f" % r.zscore,
                "%6.4e" % r.pvalue,
                "%6.4e" % r.qvalue ) ) + "\n" )

            noutput += 1

    elif options.fdr_control == "per-sequence":
        matcher = Nubiscan.MatcherRandomisationSequence( sense_matrix,
                                                         samples = options.iterations )
    

        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            result = matcher.run( seq.sequence,
                                  options.arrangements,
                                  qvalue_threshold = options.qvalue_threshold )
            
            if options.combine:
                result =  Nubiscan.combineMotifs( result )

            t = re.sub(" .*","",  seq.title)
            for r in result:
                options.stdout.write( "\t".join( ( 
                    t,
                    "%i" % r.start,
                    "%i" % r.end,
                    r.strand,
                    r.arrangement,
                    "%6.4f" % r.score,
                    "%6.4f" % r.zscore,
                    "%f" % r.pvalue,
                    "%f" % r.qvalue ) ) + "\n" )

            noutput += 1
    
    E.info( "ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput,nskipped) )

    ## write footer and output benchmark information.
    E.Stop()
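
The output step above re-cases each matched sequence as s[:6].upper() + s[6:-6].lower() + s[-6:].upper(), upper-casing the two 6-bp ends (presumably the repeat half-sites) and lower-casing the spacer. A one-line illustration with a made-up 15-mer:

# Case-masking of a match: 6-bp ends upper-case, spacer lower-case.
s = "AGGTCATGAAGGTCA"                             # made-up DR3-style match
print(s[:6].upper() + s[6:-6].lower() + s[-6:].upper())
# -> AGGTCAtgaAGGTCA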
Code example #11
File: annotator2tsv.py Project: lesheng/cgat
def doFDR(options, args):

    # read input
    annotators = []
    for filename in args:
        infile = open(filename, "r")
        annotators.append(readAnnotator(infile))
        infile.close()

    do_filter = options.fdr_qvalue is not None

    extra_headers = set()
    for data, fdr, synonyms, input_files in annotators:
        for key, value in input_files.iteritems():
            extra_headers.add(key)
    extra_headers = sorted(list(extra_headers))

    # note: id used to be file
    options.stdout.write("id\tover\tcategory\tpvalue\tfold\tobserved\texpected\tci95low\tci95high\tstddev\tfdr\tqvalue\t%s\n" %
                         "\t".join(extra_headers))

    # apply filters and create diagnostic plots
    for filename, vv in zip(args, annotators):
        data, fdr, synonyms, input_files = vv

        ninput = len(data)

        E.info("processing %s with %i data points" % (filename, ninput))
        no_fdr = False

        if options.fdr_method in ("annotator", "annotator-estimate"):
            pvalues = fdr.keys()
            pvalues.sort()
            pvalues.reverse()
            for pvalue in pvalues:
                try:
                    d = fdr[pvalue]["Significant"]
                except KeyError:
                    continue

                if d.mObserved == 0:
                    E.info("no data after fdr")
                    break

                elif d.mAverage / d.mObserved < options.fdr_qvalue:
                    E.info("filtering with P-value of %f" % pvalue)
                    if do_filter:
                        data = [x for x in data if x.mPValue < pvalue]
                    for d in data:
                        if d.mPValue < pvalue:
                            d.mFDR = 1
                            d.mQValue = options.fdr_qvalue
                    break
                else:
                    E.warn("fdr could not be computed - compute more samples (at P = %f, actual fdr=%f)" %
                           (pvalue, d.mAverage / d.mObserved))
                    no_fdr = True

        if options.fdr_method == "estimate" or (options.fdr_method == "annotator-estimate" and no_fdr):

            E.info("estimating FDR from observed P-Values")
            pvalues = [x.mPValue for x in data]
            vlambda = numpy.arange(0, max(pvalues), 0.05)
            try:
                qvalues = Stats.doFDR(
                    pvalues, vlambda=vlambda, fdr_level=options.fdr_qvalue)
            except ValueError, msg:
                E.warn("fdr could not be computed - no output: %s" % msg)
                no_fdr = True
            else:
                for d, p, q in zip(data, qvalues.mPassed, qvalues.mQValues):
                    if p:
                        d.mFDR = 1
                    d.mQValue = q

                if do_filter:
                    data = [x[0] for x in zip(data, qvalues.mPassed) if x[1]]

        if do_filter and no_fdr:
            data = []

        nremoved = ninput - len(data)

        E.info("%s: %i data points left, %i removed" %
               (filename, len(data), nremoved))

        extra_values = []
        for key in extra_headers:
            if key in input_files:
                extra_values.append(input_files[key])
            else:
                extra_values.append("")

        extra_values = "\t".join(map(str, extra_values))

        for d in data:
            if d.mFoldChange < 1:
                code = "-"
            else:
                code = "+"

            try:
                id = re.search(options.regex_id, filename).groups()[0]
            except AttributeError:
                id = filename
            options.stdout.write("%s\t%s\t%s\t%e\t%6.4f\t%f\t%f\t%f\t%f\t%f\t%i\t%e\t%s\n" %
                                 (id,
                                  code,
                                  d.mAnnotation,
                                  d.mPValue,
                                  d.mFoldChange,
                                  d.mObserved,
                                  d.mExpected,
                                  d.mCI95[0],
                                  d.mCI95[1],
                                  d.mStdDev,
                                  d.mFDR,
                                  d.mQValue,
                                  extra_values))
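
The "annotator" branch above scans p-value thresholds from largest to smallest and filters at the first threshold whose estimated FDR (average false positives divided by observed positives) drops below the requested q-value. A compact sketch of that scan with made-up numbers:

# Threshold scan of the annotator-style FDR.  Each tuple is
# (pvalue, expected false positives, observed positives); all made up.
target = 0.05
table = [(0.05, 4.0, 30), (0.01, 0.8, 20), (0.001, 0.05, 8)]
for pvalue, expfpos, pos in sorted(table, reverse=True):
    if pos and expfpos / pos < target:
        print("filtering with P-value of %f (fdr=%f)" % (pvalue, expfpos / pos))
        break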
Code example #12
File: GO.py Project: wangdi2014/cgat
def computeFDRs(go_results, foreground, background, options, test_ontology,
                gene2go, go2info):

    pairs = sorted(go_results.mResults.items())

    E.info("calculating the FDRs using method `%s`" % options.qvalue_method)

    samples = None

    observed_min_pvalues = [
        min(x[1].mProbabilityOverRepresentation,
            x[1].mProbabilityUnderRepresentation) for x in pairs
    ]

    fdrs = {}

    method = options.qvalue_method

    if options.qvalue_method == "storey":

        # compute fdr via Storey's method
        try:
            fdr_data = Stats.doFDR(observed_min_pvalues)

        except ValueError as msg:
            E.warn("failure in q-value computation: %s" % msg)
            E.warn("reverting to Bonferroni correction")
            method = "bonf"
            fdr_data = Stats.FDRResult()
            l = float(len(observed_min_pvalues))
            fdr_data.mQValues = [min(1.0, x * l) for x in observed_min_pvalues]

        for pair, qvalue in zip(pairs, fdr_data.mQValues):
            fdrs[pair[0]] = (qvalue, 1.0, 1.0)

    elif options.qvalue_method == "empirical":
        assert options.sample > 0, "requiring a sample size of > 0"

        #######################################################################
        # sampling
        # for each GO-category:
        # get maximum and minimum counts in x samples -> calculate minimum/maximum significance
        # get average and stdev counts in x samples -> calculate z-scores for
        # test set
        samples, simulation_min_pvalues = getSamples(gene2go, foreground,
                                                     background, options,
                                                     test_ontology, go2info)

        # compute P-values from sampling
        observed_min_pvalues.sort()
        observed_min_pvalues = numpy.array(observed_min_pvalues)

        sample_size = options.sample

        for k, v in pairs:

            if k in samples:
                s = samples[k]
            else:
                raise KeyError("category %s not in samples" % k)

            # calculate values for z-score
            if s.mStddev > 0:
                zscore = abs(float(v.mSampleCountsCategory) -
                             s.mMean) / s.mStddev
            else:
                zscore = 0.0

            #############################################################
            # FDR:
            # For each p-Value p at node n:
            #   a = average number of nodes in each simulation run with P-Value < p
            #           this can be obtained from the array of all p-values and all nodes
            #           simply divided by the number of samples.
            #      aka: expfpos=experimental false positive rate
            #   b = number of nodes in observed data, that have a P-Value of less than p.
            #      aka: pos=positives in observed data
            #   fdr = a/b
            pvalue = v.mPValue

            # calculate values for FDR:
            # nfdr = number of entries with P-Value better than node.
            a = 0
            while a < len(simulation_min_pvalues) and \
                    simulation_min_pvalues[a] < pvalue:
                a += 1
            a = float(a) / float(sample_size)
            b = 0
            while b < len(observed_min_pvalues) and \
                    observed_min_pvalues[b] < pvalue:
                b += 1

            if b > 0:
                fdr = min(1.0, float(a) / float(b))
            else:
                fdr = 1.0

            fdrs[k] = (fdr, a, b)
    else:
        qvalues = R['p.adjust'](observed_min_pvalues,
                                method=options.qvalue_method)
        fdr_data = Stats.FDRResult()
        fdr_data.mQValues = list(qvalues)
        for pair, qvalue in zip(pairs, fdr_data.mQValues):
            fdrs[pair[0]] = (qvalue, 1.0, 1.0)

    return fdrs, samples, method
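
The two while loops in the empirical branch count how many simulated and how many observed minimum p-values fall strictly below each observed p-value. Since both arrays are sorted, the same counts can be written with bisect; a sketch under that assumption:

# The a/b counting from the empirical branch via bisect.bisect_left,
# which returns the number of elements strictly less than pvalue.
import bisect

def empirical_fdr(pvalue, simulation_min_pvalues,
                  observed_min_pvalues, sample_size):
    a = bisect.bisect_left(simulation_min_pvalues, pvalue) / float(sample_size)
    b = bisect.bisect_left(observed_min_pvalues, pvalue)
    return min(1.0, a / b) if b > 0 else 1.0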
Code example #13
def analysePolyphen(infile, outfile):
    '''compute enrichment of SNPs within genes
    and deleterious SNPs within SNPs within genes.

    del: enrichment of deleterious snps within snps per gene
    len: enrichment of snps within genes
    com: enrichment of deleterious snps within gene
    '''

    table = P.toTable(infile)
    tablename_map = "polyphen_map"

    dbhandle = connect()
    cc = dbhandle.cursor()

    statement = '''
        SELECT i.gene_id,
               COUNT(DISTINCT map.locus_id) as nsnps, 
               COUNT(DISTINCT case t.prediction when 'possiblydamaging' then map.locus_id when 'probablydamaging' then map.locus_id else NULL end) AS ndeleterious,
               MAX(s.length)
               FROM %(table)s as t, 
                    %(tablename_map)s as map, 
                    annotations.protein_stats as s,
                    annotations.transcript_info as i 
        WHERE map.snp_id = t.snp_id AND 
              i.transcript_id = map.transcript_id AND
              s.protein_id = map.protein_id
        GROUP BY i.gene_id
     ''' % locals()

    data = cc.execute(statement).fetchall()

    statement = '''SELECT DISTINCT i.gene_id, MAX(s.length) 
                   FROM annotations.transcript_info AS i, annotations.protein_stats AS s 
                   WHERE s.protein_id = i.protein_id 
                   GROUP BY i.gene_id'''
    gene_ids = cc.execute(statement).fetchall()

    total_nsnps = sum([x[1] for x in data])
    total_ndel = sum([x[2] for x in data])
    total_length = sum([x[1] for x in gene_ids])
    del_p = float(total_ndel) / total_nsnps
    len_p = float(total_nsnps) / total_length
    com_p = float(total_ndel) / total_length

    E.info("del: background probability: %i/%i = %f" %
           (total_ndel, total_nsnps, del_p))
    E.info("len: background probability: %i/%i = %f" %
           (total_nsnps, total_length, len_p))
    E.info("com: background probability: %i/%i = %f" %
           (total_ndel, total_length, com_p))

    outf = open(outfile, "w")
    outf.write("\t".join(("gene_id", "code",
                          "length", "nsnps", "ndel",
                          "del_p", "del_pvalue", "del_qvalue",
                          "len_p", "len_pvalue", "len_qvalue",
                          "com_p", "com_pvalue", "com_qvalue", )) + "\n")

    del_pvalues, len_pvalues, com_pvalues = [], [], []
    for gene_id, nsnps, ndel, length in data:

        # use -1, because I need P( x >= X)
        # sf = 1 - cdf and cdf = P( x <= X ), thus sf = 1 - P( x <= X ) = P (x
        # > X ).
        del_pvalues.append(scipy.stats.binom.sf(ndel - 1, nsnps, del_p))
        len_pvalues.append(
            scipy.stats.binom.sf(nsnps - 1, int(round(length)), len_p))
        com_pvalues.append(
            scipy.stats.binom.sf(ndel - 1, int(round(length)), com_p))

    if len(del_pvalues) > 10:
        del_qvalues = Stats.doFDR(del_pvalues).mQValues
    else:
        E.warn("no FDR computed for del")
        del_qvalues = del_pvalues

    if len(len_pvalues) > 10:
        len_qvalues = Stats.doFDR(len_pvalues).mQValues
    else:
        E.warn("no FDR computed for del")
        len_qvalues = len_pvalues

    if len(com_pvalues) > 10:
        com_qvalues = Stats.doFDR(com_pvalues).mQValues
    else:
        E.warn("no FDR computed for com")
        com_qvalues = com_pvalues

    fdr = PARAMS["polyphen_fdr"]

    found = set()

    for a, del_pvalue, del_qvalue, len_pvalue, len_qvalue, com_pvalue, com_qvalue in \
            zip(data,
                del_pvalues, del_qvalues,
                len_pvalues, len_qvalues,
                com_pvalues, com_qvalues,
                ):
        gene_id, nsnps, ndel, length = a
        found.add(gene_id)

        del_p = float(ndel) / nsnps
        len_p = float(nsnps) / length

        code = "".join([str(int(x < fdr))
                        for x in (del_qvalue, len_qvalue, com_qvalue)])

        outf.write("\t".join((gene_id,
                              code,
                              "%i" % int(round(length)),
                              "%i" % int(nsnps),
                              "%i" % int(ndel),
                              "%6.4f" % del_p,
                              "%6.4g" % del_pvalue,
                              "%6.4g" % del_qvalue,
                              "%6.4f" % len_p,
                              "%6.4g" % len_pvalue,
                              "%6.4g" % len_qvalue,
                              "%6.4f" % com_p,
                              "%6.4g" % com_pvalue,
                              "%6.4g" % com_qvalue,
                              )) + "\n")

    # add missing genes:
    code = "---"
    for gene_id, length in gene_ids:
        if gene_id in found:
            continue
        outf.write("\t".join((gene_id,
                              code,
                              "%i" % int(round(length)),
                              "%i" % 0,
                              "%i" % 0,
                              "%6.4f" % 0,
                              "%6.4g" % 1,
                              "%6.4g" % 1,
                              "%6.4f" % 0,
                              "%6.4g" % 1,
                              "%6.4g" % 1,
                              "%6.4f" % 0,
                              "%6.4g" % 1,
                              "%6.4g" % 1,
                              )) + "\n")

    outf.close()
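
The comment in analysePolyphen explains the "- 1" offsets: for a discrete distribution, scipy's sf(k, n, p) is P(X > k), so sf(k - 1, n, p) yields the upper tail P(X >= k) that the enrichment test needs. A quick self-contained check with made-up counts:

# binom.sf(k - 1, n, p) == P(X >= k): sf gives P(X > k) for discrete
# distributions, so shifting the argument by one includes k itself.
import scipy.stats

n, p, k = 100, 0.1, 15                            # made-up counts
upper_tail = scipy.stats.binom.sf(k - 1, n, p)
by_hand = sum(scipy.stats.binom.pmf(x, n, p) for x in range(k, n + 1))
assert abs(upper_tail - by_hand) < 1e-9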
Code example #14
File: PipelineGO.py Project: santayana/cgat
def loadGOs(infiles, outfile, tablename):
    '''import GO results into a single table.

    This method also computes a global QValue over all
    tracks, genesets and annotation sets.
    '''

    header = False

    tempf1 = P.getTempFile()

    pvalues = []

    for infile in infiles:
        indir = infile + ".dir"

        if not os.path.exists(indir):
            continue

        track, geneset, annotationset = re.search("^(\S+)_vs_(\S+)\.(\S+)",
                                                  infile).groups()

        for filename in glob.glob(os.path.join(indir, "*.overall")):
            for line in open(filename, "r"):
                if line.startswith("#"):
                    continue
                data = line[:-1].split("\t")
                if line.startswith("code"):
                    if header:
                        continue
                    tempf1.write("track\tgeneset\tannotationset\t%s" % line)
                    header = True
                    assert data[10] == "pover" and data[
                        11] == "punder", "format error, expected pover-punder, got %s-%s" % (
                            data[10], data[11])
                    continue
                tempf1.write("%s\t%s\t%s\t%s" %
                             (track, geneset, annotationset, line))
                pvalues.append(min(float(data[10]), float(data[11])))

    tempf1.close()

    E.info("analysing %i pvalues" % len(pvalues))
    fdr = Stats.doFDR(pvalues)
    E.info("got %i qvalues" % len(fdr.mQValues))
    qvalues = ["global_qvalue"] + fdr.mQValues

    tempf2 = P.getTempFile()

    for line, qvalue in zip(open(tempf1.name, "r"), qvalues):
        tempf2.write("%s\t%s\n" % (line[:-1], str(qvalue)))

    tempf2.close()
    tempfilename = tempf2.name
    print tempf1.name
    print tempf2.name

    statement = '''
   python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
              --allow-empty 
              --index=category 
              --index=track,geneset,annotationset
              --index=geneset
              --index=annotationset
              --index=goid 
              --table=%(tablename)s 
    < %(tempfilename)s
    > %(outfile)s
    '''
    P.run()
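
The statement/P.run() pair at the end relies on the CGAT pipeline convention that P.run() expands %(name)s placeholders in the statement string from the caller's variables (note how tablename, tempfilename and outfile are never passed explicitly). A hedged sketch of that expansion, mimicked with plain string formatting:

# Mimics the placeholder expansion that P.run() performs on
# `statement` (an assumption about the CGAT pipeline convention;
# names and values here are made up).
tablename = "go_summary"
tempfilename = "go.tsv"
outfile = "load.log"
statement = "csv2db.py --table=%(tablename)s < %(tempfilename)s > %(outfile)s"
print(statement % locals())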
Code example #15
File: run_nubiscan.py Project: lesheng/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: run_nubiscan.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-i",
        "--iterations",
        dest="iterations",
        type="int",
        help="number of iterations for sampling [default=%default].")

    parser.add_option("-q",
                      "--qvalue",
                      dest="qvalue_threshold",
                      type="float",
                      help="qvalue threshold [default=%default].")

    parser.add_option("--without-combine",
                      dest="combine",
                      action="store_false",
                      help="combine overlapping motifs [default=%default].")

    parser.add_option("-f",
                      "--fdr-control",
                      dest="fdr_control",
                      type="choice",
                      choices=("per-sequence", "all", "xall"),
                      help="qvalue threshold [default=%default].")

    parser.add_option("-m",
                      "--motif",
                      dest="motif",
                      type="choice",
                      choices=("rxrvdr", "rxrvdr1", "rxrvdr2", "nr"),
                      help="qvalue threshold [default=%default].")

    parser.add_option(
        "-a",
        "--arrangements",
        dest="arrangements",
        type="string",
        help="',' separated list of repeat arrangements [default=%default]")

    parser.add_option("-x",
                      "--mask",
                      dest="mask",
                      type="choice",
                      choices=("dust", "repeatmasker"),
                      help="mask sequences before scanning [default=%default]")

    parser.add_option("--output-stats",
                      dest="output_stats",
                      action="store_true",
                      help="output stats [default=%default].")

    parser.add_option("--add-sequence",
                      dest="add_sequence",
                      action="store_true",
                      help="add sequence information [default=%default].")

    parser.set_defaults(
        iterations=100,
        qvalue_threshold=0.05,
        motif="rxrvdr",
        fdr_control="all",
        combine=True,
        arrangements=None,
        mask=None,
        output_stats=False,
        add_sequence=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    # do sth
    ninput, nskipped, noutput = 0, 0, 0

    if options.arrangements is None:
        options.arrangements = ["DR%s" % x for x in range(0, 15)
                                ] + ["ER%s" % x for x in range(0, 15)]
    else:
        options.arrangements = options.arrangements.split(",")

    options.stdout.write("%s" % "\t".join(Nubiscan.NubiscanMatch._fields))
    if options.add_sequence:
        options.stdout.write("\tsequence")
    options.stdout.write("\n")

    if options.motif == 'nr':
        sense_matrix = NR
    elif options.motif == "rxrvdr":
        sense_matrix = RXRVDR
    elif options.motif == "rxrvdr1":
        sense_matrix = RXRVDR1
    elif options.motif == "rxrvdr2":
        sense_matrix = RXRVDR2
    else:
        raise ValueError("unknown matrix %s" % options.motif)

    if options.fdr_control == "all":

        seqs = list(FastaIterator.iterate(options.stdin))

        if options.mask:
            masked_seqs = maskSequences([x.sequence for x in seqs],
                                        options.mask)
        else:
            masked_seqs = [x.sequence for x in seqs]

        ninput = len(seqs)
        map_id2title = dict(
            enumerate([re.sub("\s.*", "", x.title) for x in seqs]))
        matcher = Nubiscan.MatcherRandomisationSequences(
            sense_matrix, samples=options.iterations)

        results = matcher.run(masked_seqs,
                              options.arrangements,
                              qvalue_threshold=options.qvalue_threshold)

        if options.combine:
            results = Nubiscan.combineMotifs(results)

        for r in results:

            if r.alternatives:
                alternatives = ",".join(
                    [x.arrangement for x in r.alternatives])
            else:
                alternatives = ""

            options.stdout.write("\t".join(
                (map_id2title[r.id], "%i" % r.start, "%i" % r.end, r.strand,
                 r.arrangement, "%6.4f" % r.score, "%6.4f" % r.zscore,
                 "%6.4e" % r.pvalue, "%6.4e" % r.qvalue, alternatives)))

            if options.add_sequence:
                s = masked_seqs[int(r.id)][r.start:r.end]
                if r.strand == "-":
                    s = Genomics.complement(s)
                s = s[:6].upper() + s[6:-6].lower() + s[-6:].upper()
                options.stdout.write("\t%s" % s)

            options.stdout.write("\n")
            noutput += 1

        # output stats
        if options.output_stats:
            outfile = E.openOutputFile("fdr")
            outfile.write("bin\thist\tnobserved\n")
            for bin, hist, nobs in zip(matcher.bin_edges, matcher.hist,
                                       matcher.nobservations):
                outfile.write("%f\t%f\t%f\n" % (bin, hist, nobs))
            outfile.close()

    elif options.fdr_control == "xall":

        matcher = Nubiscan.MatcherRandomisationSequence(
            sense_matrix, samples=options.iterations)

        # collect all results
        matches = []
        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            mm = matcher.run(seq.sequence,
                             options.arrangements,
                             qvalue_threshold=None)
            for m in mm:
                matches.append(m._replace(sequence=seq.title))

        # estimate qvalues for all matches across all sequences
        pvalues = [x.pvalue for x in matches]
        fdr = Stats.doFDR(pvalues)
        qvalues = fdr.mQValues
        results = []
        for m, qvalue in zip(matches, qvalues):
            if qvalue > options.qvalue_threshold:
                continue
            results.append(m._replace(qvalue=qvalue))

        if options.combine:
            results = Nubiscan.combineMotifs(results)

        # output
        for r in results:
            options.stdout.write("\t".join(
                (r.id, "%i" % r.start, "%i" % r.end, r.strand, r.arrangement,
                 "%6.4f" % r.score, "%6.4f" % r.zscore, "%6.4e" % r.pvalue,
                 "%6.4e" % r.qvalue)) + "\n")

            noutput += 1

    elif options.fdr_control == "per-sequence":
        matcher = Nubiscan.MatcherRandomisationSequence(
            sense_matrix, samples=options.iterations)

        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            result = matcher.run(seq.sequence,
                                 options.arrangements,
                                 qvalue_threshold=options.qvalue_threshold)

            if options.combine:
                result = Nubiscan.combineMotifs(result)

            t = re.sub(" .*", "", seq.title)
            for r in result:
                options.stdout.write("\t".join(
                    (t, "%i" % r.start, "%i" % r.end, r.strand, r.arrangement,
                     "%6.4f" % r.score, "%6.4f" % r.zscore, "%f" % r.pvalue,
                     "%f" % r.qvalue)) + "\n")

            noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
Code example #16
File: GO.py Project: CGATOxford/cgat
def pairwiseGOEnrichment(results_per_genelist, labels, test_ontology, go2info,
                         options):
    '''compute pairwise enrichment between sets.

    The purpose of this method is to find categories that are
    differentially enriched between a pair of gene lists.

    The appropriate test here is the Chi-Squared test. 

    The assumption is that the background set is the same in all gene lists.

    The workflow is thus::

       for each combination of two gene lists:
           for each GO category:
               get counts in foreground, total counts of foreground
               compute chi-square enrichment output
               save P-value
           apply fdr - output significant differences.
    '''

    dicts = [dict(x) for x in results_per_genelist]

    PairResult = collections.namedtuple("PairResult",
                                        "goid set1 set2 counts1 total1 pvalue1 qvalue1 counts2 total2 pvalue2 qvalue2 pvalue qvalue description")

    outfile = getFileName(options,
                          go=test_ontology,
                          section='summary',
                          set="pairs")

    outfile.write(
        "set1\tset2\ttotal1\ttotal2\tshared\tskipped\ttested\tsignificant\tinsignificant\n")

    results = []

    total = len(dicts) * (len(dicts) - 1) / 2

    iteration = 0

    min_observed_counts = options.pairs_min_observed_counts

    for x, genelist1 in enumerate(sorted(dicts)):

        x_go_categories = set(genelist1.keys())
        for y, genelist2 in enumerate(sorted(dicts[:x])):

            iteration += 1
            if iteration % 10 == 0:
                E.info("iteration: %i/%i (%5.2f%%)" %
                       (iteration, total, 100.0 * iteration / total))

            y_go_categories = set(genelist2.keys())

            shared = x_go_categories.intersection(y_go_categories)

            c = E.Counter()

            for category in shared:
                c.shared += 1
                xx = genelist1[category]
                yy = genelist2[category]

                # discard all tests with few observations in the observed
                # counts
                if xx.mSampleCountsCategory < min_observed_counts and yy.mSampleCountsCategory < min_observed_counts:
                    c.skipped += 1
                    continue

                observed = (xx.mSampleCountsCategory, yy.mSampleCountsCategory)

                aa, bb, cc, dd = \
                    (xx.mSampleCountsCategory,
                     yy.mSampleCountsCategory,
                     xx.mSampleCountsTotal - xx.mSampleCountsCategory,
                     yy.mSampleCountsTotal - yy.mSampleCountsCategory)

                if cc == dd == 0:
                    c.skipped += 1
                    continue

                c.tested += 1

                fisher, pvalue = scipy.stats.fisher_exact(numpy.array(
                    ((aa, bb),
                     (cc, dd))))

                if pvalue < 0.05:
                    c.significant_pvalue += 1
                else:
                    c.insignificant_pvalue += 1

                results.append(PairResult._make((category,
                                                 labels[x],
                                                 labels[y],
                                                 xx.mSampleCountsCategory,
                                                 xx.mSampleCountsTotal,
                                                 xx.mPValue,
                                                 xx.mQValue,
                                                 yy.mSampleCountsCategory,
                                                 yy.mSampleCountsTotal,
                                                 yy.mPValue,
                                                 yy.mQValue,
                                                 pvalue,
                                                 1.0,
                                                 go2info[category].mDescription)))

            outfile.write("\t".join(map(str,
                                        (labels[x], labels[y],
                                         len(x_go_categories),
                                         len(y_go_categories),
                                         c.shared,
                                         c.skipped,
                                         c.tested,
                                         c.significant_pvalue,
                                         c.insignificant_pvalue))) + "\n")
    if options.output_filename_pattern:
        outfile.close()

    if options.fdr:
        pvalues = [x.pvalue for x in results]

        if options.qvalue_method == "storey":

            # compute fdr via Storey's method
            try:
                fdr_data = Stats.doFDR(pvalues)

            except ValueError as msg:
                E.warn("failure in q-value computation: %s" % msg)
                E.warn("reverting to Bonferroni correction")
                method = "bonf"
                fdr_data = Stats.FDRResult()
                l = float(len(pvalues))
                fdr_data.mQValues = [min(1.0, x * l) for x in pvalues]

            qvalues = fdr_data.mQValues
        else:
            qvalues = R['p.adjust'](pvalues, method=options.qvalue_method)

        # update qvalues
        results = [x._replace(qvalue=y) for x, y in zip(results, qvalues)]

    outfile = getFileName(options,
                          go=test_ontology,
                          section='pairs',
                          set="pairs")

    outfile.write("\t".join(PairResult._fields) + "\n")
    for result in results:
        outfile.write("\t".join(map(str, result)) + "\n")

    if options.output_filename_pattern:
        outfile.close()
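
Two notes on pairwiseGOEnrichment: despite the docstring's mention of the Chi-Squared test, the 2x2 test actually applied is scipy.stats.fisher_exact; and when Storey's estimator raises a ValueError, the fallback q-values min(1.0, p * m) are plain Bonferroni-adjusted p-values, equivalent to R's p.adjust(method="bonferroni"). A minimal sketch of that fallback:

# The Bonferroni fallback used when Stats.doFDR fails: scale each
# p-value by the number of tests and cap at 1.
pvalues = [0.001, 0.02, 0.2]                      # made-up p-values
m = float(len(pvalues))
qvalues = [min(1.0, x * m) for x in pvalues]
print(qvalues)                                    # ~[0.003, 0.06, 0.6]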
Code example #17
File: GO.py Project: CGATOxford/cgat
def computeFDRs(go_results,
                foreground,
                background,
                options,
                test_ontology,
                gene2go,
                go2info):

    pairs = sorted(go_results.mResults.items())

    E.info("calculating the FDRs using method `%s`" % options.qvalue_method)

    samples = None

    observed_min_pvalues = [min(x[1].mProbabilityOverRepresentation,
                                x[1].mProbabilityUnderRepresentation) for x in pairs]

    fdrs = {}

    method = options.qvalue_method

    if options.qvalue_method == "storey":

        # compute fdr via Storey's method
        try:
            fdr_data = Stats.doFDR(observed_min_pvalues)

        except ValueError as msg:
            E.warn("failure in q-value computation: %s" % msg)
            E.warn("reverting to Bonferroni correction")
            method = "bonf"
            fdr_data = Stats.FDRResult()
            l = float(len(observed_min_pvalues))
            fdr_data.mQValues = [min(1.0, x * l) for x in observed_min_pvalues]

        for pair, qvalue in zip(pairs, fdr_data.mQValues):
            fdrs[pair[0]] = (qvalue, 1.0, 1.0)

    elif options.qvalue_method == "empirical":
        assert options.sample > 0, "requiring a sample size of > 0"

        #######################################################################
        # sampling
        # for each GO-category:
        # get maximum and minimum counts in x samples -> calculate minimum/maximum significance
        # get average and stdev counts in x samples -> calculate z-scores for
        # test set
        samples, simulation_min_pvalues = getSamples(gene2go,
                                                     foreground,
                                                     background,
                                                     options,
                                                     test_ontology,
                                                     go2info)

        # compute P-values from sampling
        observed_min_pvalues.sort()
        observed_min_pvalues = numpy.array(observed_min_pvalues)

        sample_size = options.sample

        for k, v in pairs:

            if k in samples:
                s = samples[k]
            else:
                raise KeyError("category %s not in samples" % k)

            # calculate values for z-score
            if s.mStddev > 0:
                zscore = abs(
                    float(v.mSampleCountsCategory) - s.mMean) / s.mStddev
            else:
                zscore = 0.0

            #############################################################
            # FDR:
            # For each p-Value p at node n:
            #   a = average number of nodes in each simulation run with P-Value < p
            #           this can be obtained from the array of all p-values and all nodes
            #           simply divided by the number of samples.
            #      aka: expfpos=experimental false positive rate
            #   b = number of nodes in observed data, that have a P-Value of less than p.
            #      aka: pos=positives in observed data
            #   fdr = a/b
            pvalue = v.mPValue

            # calculate values for FDR:
            # nfdr = number of entries with P-Value better than node.
            a = 0
            while a < len(simulation_min_pvalues) and \
                    simulation_min_pvalues[a] < pvalue:
                a += 1
            a = float(a) / float(sample_size)
            b = 0
            while b < len(observed_min_pvalues) and \
                    observed_min_pvalues[b] < pvalue:
                b += 1

            if b > 0:
                fdr = min(1.0, float(a) / float(b))
            else:
                fdr = 1.0

            fdrs[k] = (fdr, a, b)
    else:
        qvalues = R['p.adjust'](
            observed_min_pvalues, method=options.qvalue_method)
        fdr_data = Stats.FDRResult()
        fdr_data.mQValues = list(qvalues)
        for pair, qvalue in zip(pairs, fdr_data.mQValues):
            fdrs[pair[0]] = (qvalue, 1.0, 1.0)

    return fdrs, samples, method
Code example #18
File: GO.py Project: wangdi2014/cgat
def pairwiseGOEnrichment(results_per_genelist, labels, test_ontology, go2info,
                         options):
    '''compute pairwise enrichment between sets.

    The purpose of this method is to find categories that are
    differentially enriched between a pair of gene lists.

    The appropriate test here is the Chi-Squared test.

    The assumption is that the background set is the same in all gene lists.

    The workflow is thus::

       for each combination of two gene lists:
           for each GO category:
               get counts in foreground, total counts of foreground
               compute chi-square enrichment output
               save P-value
           apply fdr - output significant differences.
    '''

    dicts = [dict(x) for x in results_per_genelist]

    PairResult = collections.namedtuple(
        "PairResult",
        "goid set1 set2 counts1 total1 pvalue1 qvalue1 counts2 total2 pvalue2 qvalue2 pvalue qvalue description"
    )

    outfile = getFileName(options,
                          go=test_ontology,
                          section='summary',
                          set="pairs")

    outfile.write(
        "set1\tset2\ttotal1\ttotal2\tshared\tskipped\ttested\tsignificant\tinsignificant\n"
    )

    results = []

    total = len(dicts) * (len(dicts) - 1) / 2

    iteration = 0

    min_observed_counts = options.pairs_min_observed_counts

    for x, genelist1 in enumerate(sorted(dicts)):

        x_go_categories = set(genelist1.keys())
        for y, genelist2 in enumerate(sorted(dicts[:x])):

            iteration += 1
            if iteration % 10 == 0:
                E.info("iteration: %i/%i (%5.2f%%)" %
                       (iteration, total, 100.0 * iteration / total))

            y_go_categories = set(genelist2.keys())

            shared = x_go_categories.intersection(y_go_categories)

            c = E.Counter()

            for category in shared:
                c.shared += 1
                xx = genelist1[category]
                yy = genelist2[category]

                # discard all tests with few observations in the observed
                # counts
                if xx.mSampleCountsCategory < min_observed_counts and yy.mSampleCountsCategory < min_observed_counts:
                    c.skipped += 1
                    continue

                observed = (xx.mSampleCountsCategory, yy.mSampleCountsCategory)

                aa, bb, cc, dd = \
                    (xx.mSampleCountsCategory,
                     yy.mSampleCountsCategory,
                     xx.mSampleCountsTotal - xx.mSampleCountsCategory,
                     yy.mSampleCountsTotal - yy.mSampleCountsCategory)

                if cc == dd == 0:
                    c.skipped += 1
                    continue

                c.tested += 1

                fisher, pvalue = scipy.stats.fisher_exact(
                    numpy.array(((aa, bb), (cc, dd))))

                if pvalue < 0.05:
                    c.significant_pvalue += 1
                else:
                    c.insignificant_pvalue += 1

                results.append(
                    PairResult._make(
                        (category, labels[x], labels[y],
                         xx.mSampleCountsCategory, xx.mSampleCountsTotal,
                         xx.mPValue, xx.mQValue, yy.mSampleCountsCategory,
                         yy.mSampleCountsTotal, yy.mPValue, yy.mQValue, pvalue,
                         1.0, go2info[category].mDescription)))

            outfile.write("\t".join(
                map(str, (labels[x], labels[y], len(x_go_categories),
                          len(y_go_categories), c.shared, c.skipped, c.tested,
                          c.significant_pvalue, c.insignificant_pvalue))) + "\n")
    if options.output_filename_pattern:
        outfile.close()

    if options.fdr:
        pvalues = [x.pvalue for x in results]

        if options.qvalue_method == "storey":

            # compute fdr via Storey's method
            try:
                fdr_data = Stats.doFDR(pvalues)

            except ValueError as msg:
                E.warn("failure in q-value computation: %s" % msg)
                E.warn("reverting to Bonferroni correction")
                method = "bonf"
                fdr_data = Stats.FDRResult()
                n = float(len(pvalues))
                fdr_data.mQValues = [min(1.0, x * n) for x in pvalues]

            qvalues = fdr_data.mQValues
        else:
            qvalues = R['p.adjust'](pvalues, method=options.qvalue_method)

        # update qvalues
        results = [x._replace(qvalue=y) for x, y in zip(results, qvalues)]

    outfile = getFileName(options,
                          go=test_ontology,
                          section='pairs',
                          set="pairs")

    outfile.write("\t".join(PairResult._fields) + "\n")
    for result in results:
        outfile.write("\t".join(map(str, result)) + "\n")

    if options.output_filename_pattern:
        outfile.close()
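
The heart of the pairwise comparison above is a single 2x2 Fisher's exact test per shared category. A stripped-down illustration (all counts are invented for the example):

import numpy
import scipy.stats

# hypothetical foreground counts for one GO category in two gene lists
counts1, total1 = 30, 500
counts2, total2 = 10, 480

# rows: in category / not in category; columns: list 1 / list 2
table = numpy.array(((counts1, counts2),
                     (total1 - counts1, total2 - counts2)))

odds_ratio, pvalue = scipy.stats.fisher_exact(table)
print("odds ratio=%5.2f, P=%6.4g" % (odds_ratio, pvalue))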
Code example #19
File: go2svg.py Project: CGATOxford/cgat
def Collect(infile,
            with_headers=False,
            annotator_format=False,
            use_annotator_fdr=False,
            delims="",
            ignore="",
            max_pvalue=1.0,
            max_qvalue=None):
    """read input table."""

    data = []

    lines = [x for x in infile.readlines() if x[0] != "#"]

    if len(lines) == 0:
        # keep the arity consistent with the final return statement
        return data, 0, False

    if with_headers:
        del lines[0]

    if annotator_format:

        lines = [line for line in lines if not line.startswith("Iteration")]
        annotator_fdr = {}
        annotator_level = None
        for line in lines:
            if len(line) == 1:
                continue                  # skip trailing blank lines

            if line.startswith("--"):
                if line.startswith("-- False"):
                    annotator_level = float(
                        re.search("-- False Discovery summary for p-value (.+):", line).groups()[0])
                    annotator_fdr[annotator_level] = {}
                elif line.startswith("--  Category"):
                    pass
                else:
                    if re.search("insufficiently", line):
                        continue
                    dd = re.split(r"\s+", line[4:-1])
                    d = DataFDR()
                    d.mObserved, d.mAverage, d.mMedian, d.m95 = list(map(
                        float, dd[1:]))
                    annotator_fdr[annotator_level][dd[0]] = d
                continue
            else:
                if line[0] == "Z":
                    continue  # skip header
                if len(line[:-1].split('\t')) != 9:
                    continue  # HACK: accounts for a bug in Annotator output

                try:
                    (z, percentchange, pvalue, observed, expected, low95,
                     up95, stddev, description) = line[:-1].split('\t')[:9]
                except ValueError:
                    raise ValueError("# parsing error in line: %s" % line[:-1])

            d = DataPoint()
            d.mAnnotation = description
            d.mPValue = float(pvalue)
            d.mFoldChange = 1.0 + float(percentchange) / 100.0
            data.append(d)
    else:

        for line in lines:
            try:
                (code, goid, scount, stotal, spercent, bcount, btotal, bpercent, ratio,
                 pover, punder, goid, category, description) = line[:-1].split("\t")[:14]
            except ValueError:
                raise ValueError("# parsing error in line: %s" % line[:-1])

            if code == "+":
                p = pover
            else:
                p = punder

            d = DataPoint()
            d.mAnnotation = description
            d.mPValue = float(p)
            d.mFoldChange = float(spercent) / float(bpercent)
            data.append(d)

    # apply filters
    for c in delims:
        for d in data:
            d.mAnnotation = d.mAnnotation.split(c)[0]
    for c in ignore:
        for d in data:
            d.mAnnotation = d.mAnnotation.replace(c, '')

    ninput = len(data)
    no_fdr = False
    # apply FDR / P-value filtering

    if ninput > 0:
        if max_qvalue is not None:
            if use_annotator_fdr:
                pvalues = list(annotator_fdr.keys())
                pvalues.sort()
                pvalues.reverse()
                for pvalue in pvalues:
                    try:
                        d = annotator_fdr[pvalue]["Significant"]
                    except KeyError:
                        continue
                    if d.mObserved == 0:
                        E.info("no data remaining after fdr filtering")
                        data = []
                        break
                    elif d.mAverage / d.mObserved < max_qvalue:
                        E.info("filtering with P-value of %f" % pvalue)
                        data = [x for x in data if x.mPValue < pvalue]
                        break
                else:
                    E.warn("fdr could not be computed - compute more "
                           "samples (at P = %f, actual fdr=%f)" %
                           (pvalue, d.mAverage / d.mObserved))
                    no_fdr = True

            if no_fdr:
                if use_annotator_fdr:
                    E.info("estimating FDR from observed P-Values")

                pvalues = [x.mPValue for x in data]
                vlambda = numpy.arange(0, max(pvalues), 0.05)
                try:
                    qvalues = Stats.doFDR(
                        pvalues, vlambda=vlambda, fdr_level=max_qvalue)
                except ValueError as msg:
                    E.warn(
                        "fdr could not be computed - no filtering: %s" % msg)
                    no_fdr = True
                else:
                    # the fallback estimate succeeded, so do not discard
                    # the data wholesale below
                    no_fdr = False
                    data = [x[0] for x in zip(data, qvalues.mPassed) if x[1]]
        elif max_pvalue is not None:
            data = [x for x in data if x.mPValue < max_pvalue]

    if no_fdr:
        data = []

    nremoved = ninput - len(data)

    return data, nremoved, no_fdr
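
The annotator branch of Collect scans the precomputed significance levels from most to least permissive and keeps the first whose empirical FDR (average expected hits / observed hits) drops below max_qvalue; the for...else clause fires only if no level qualifies. A toy standalone version of that scan (the dictionary layout mimics annotator_fdr; all numbers are invented):

# P-value level -> (average expected hits, observed hits)
annotator_fdr = {0.05: (8.0, 20.0), 0.01: (1.5, 12.0), 0.001: (0.2, 5.0)}
max_qvalue = 0.10

chosen = None
for level in sorted(annotator_fdr, reverse=True):
    average, observed = annotator_fdr[level]
    if observed > 0 and average / observed < max_qvalue:
        chosen = level  # most permissive level meeting the FDR bound
        break
else:
    print("fdr could not be computed - compute more samples")

if chosen is not None:
    print("filtering with P-value of %f" % chosen)  # prints 0.001000 here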
Code example #20
File: go2svg.py Project: gsc0107/cgat
def Collect(infile,
            with_headers=False,
            annotator_format=False,
            use_annotator_fdr=False,
            delims="",
            ignore="",
            max_pvalue=1.0,
            max_qvalue=None):
    """read input table."""

    data = []

    lines = [x for x in infile.readlines() if x[0] != "#"]

    if len(lines) == 0:
        # keep the arity consistent with the final return statement
        return data, 0, False

    if with_headers:
        del lines[0]

    if annotator_format:

        lines = [line for line in lines if not line.startswith("Iteration")]
        annotator_fdr = {}
        annotator_level = None
        for line in lines:
            if len(line) == 1:
                continue  # skip trailing blank lines

            if line.startswith("--"):
                if line.startswith("-- False"):
                    annotator_level = float(
                        re.search(
                            "-- False Discovery summary for p-value (.+):",
                            line).groups()[0])
                    annotator_fdr[annotator_level] = {}
                elif line.startswith("--  Category"):
                    pass
                else:
                    if re.search("insufficiently", line):
                        continue
                    dd = re.split(r"\s+", line[4:-1])
                    d = DataFDR()
                    d.mObserved, d.mAverage, d.mMedian, d.m95 = list(
                        map(float, dd[1:]))
                    annotator_fdr[annotator_level][dd[0]] = d
                continue
            else:
                if line[0] == "Z":
                    continue  # skip header
                if len(line[:-1].split('\t')) != 9:
                    continue  # HACK: accounts for a bug in Annotator output

                try:
                    (z, percentchange, pvalue, observed, expected, low95, up95,
                     stddev, description) = line[:-1].split('\t')[:9]
                except ValueError:
                    raise ValueError("# parsing error in line: %s" % line[:-1])

            d = DataPoint()
            d.mAnnotation = description
            d.mPValue = float(pvalue)
            d.mFoldChange = 1.0 + float(percentchange) / 100.0
            data.append(d)
    else:

        for line in lines:
            try:
                (code, goid, scount, stotal, spercent, bcount, btotal,
                 bpercent, ratio, pover, punder, goid, category,
                 description) = line[:-1].split("\t")[:14]
            except ValueError:
                raise ValueError("# parsing error in line: %s" % line[:-1])

            if code == "+":
                p = pover
            else:
                p = punder

            d = DataPoint()
            d.mAnnotation = description
            d.mPValue = float(p)
            d.mFoldChange = float(spercent) / float(bpercent)
            data.append(d)

    # apply filters
    for c in delims:
        for d in data:
            d.mAnnotation = d.mAnnotation.split(c)[0]
    for c in ignore:
        for d in data:
            d.mAnnotation = d.mAnnotation.replace(c, '')

    ninput = len(data)
    no_fdr = False
    # apply FDR / P-value filtering

    if ninput > 0:
        if max_qvalue is not None:
            if use_annotator_fdr:
                pvalues = list(annotator_fdr.keys())
                pvalues.sort()
                pvalues.reverse()
                for pvalue in pvalues:
                    try:
                        d = annotator_fdr[pvalue]["Significant"]
                    except KeyError:
                        continue
                    if d.mObserved == 0:
                        E.info("no data remaining after fdr filtering")
                        data = []
                        break
                    elif d.mAverage / d.mObserved < max_qvalue:
                        E.info("filtering with P-value of %f" % pvalue)
                        data = [x for x in data if x.mPValue < pvalue]
                        break
                else:
                    E.warn("fdr could not be computed - compute more "
                           "samples (at P = %f, actual fdr=%f)" %
                           (pvalue, d.mAverage / d.mObserved))
                    no_fdr = True

            if no_fdr:
                if use_annotator_fdr:
                    E.info("estimating FDR from observed P-Values")

                pvalues = [x.mPValue for x in data]
                vlambda = numpy.arange(0, max(pvalues), 0.05)
                try:
                    qvalues = Stats.doFDR(pvalues,
                                          vlambda=vlambda,
                                          fdr_level=max_qvalue)
                except ValueError as msg:
                    E.warn("fdr could not be computed - no filtering: %s" %
                           msg)
                    no_fdr = True
                else:
                    # the fallback estimate succeeded, so do not discard
                    # the data wholesale below
                    no_fdr = False
                    data = [x[0] for x in zip(data, qvalues.mPassed) if x[1]]
        elif max_pvalue is not None:
            data = [x for x in data if x.mPValue < max_pvalue]

    if no_fdr:
        data = []

    nremoved = ninput - len(data)

    return data, nremoved, no_fdr
Code example #21
def analysePolyphen(infile, outfile):
    '''compute enrichment of SNPs within genes
    and of deleterious SNPs among the SNPs of each gene.

    del: enrichment of deleterious snps among the snps of a gene
    len: enrichment of snps within genes
    com: enrichment of deleterious snps within a gene
    '''

    table = P.toTable(infile)
    tablename_map = "polyphen_map"

    dbhandle = connect()
    cc = dbhandle.cursor()

    statement = '''
        SELECT i.gene_id,
               COUNT(DISTINCT map.locus_id) as nsnps,
               COUNT(DISTINCT case t.prediction when 'possiblydamaging' then map.locus_id when 'probablydamaging' then map.locus_id else NULL end) AS ndeleterious,
               MAX(s.length)
               FROM %(table)s as t,
                    %(tablename_map)s as map,
                    annotations.protein_stats as s,
                    annotations.transcript_info as i
        WHERE map.snp_id = t.snp_id AND
              i.transcript_id = map.transcript_id AND
              s.protein_id = map.protein_id
        GROUP BY i.gene_id
     ''' % locals()

    data = cc.execute(statement).fetchall()

    statement = '''SELECT DISTINCT i.gene_id, MAX(s.length)
                   FROM annotations.transcript_info AS i, annotations.protein_stats AS s
                   WHERE s.protein_id = i.protein_id
                   GROUP BY i.gene_id'''
    gene_ids = cc.execute(statement).fetchall()

    total_nsnps = sum([x[1] for x in data])
    total_ndel = sum([x[2] for x in data])
    total_length = sum([x[1] for x in gene_ids])
    del_p = float(total_ndel) / total_nsnps
    len_p = float(total_nsnps) / total_length
    com_p = float(total_ndel) / total_length

    E.info("del: background probability: %i/%i = %f" %
           (total_ndel, total_nsnps, del_p))
    E.info("len: background probability: %i/%i = %f" %
           (total_nsnps, total_length, len_p))
    E.info("com: background probability: %i/%i = %f" %
           (total_ndel, total_length, com_p))

    outf = open(outfile, "w")
    outf.write("\t".join(("gene_id", "code",
                          "length", "nsnps", "ndel",
                          "del_p", "del_pvalue", "del_qvalue",
                          "len_p", "len_pvalue", "len_qvalue",
                          "com_p", "com_pvalue", "com_qvalue", )) + "\n")

    del_pvalues, len_pvalues, com_pvalues = [], [], []
    for gene_id, nsnps, ndel, length in data:

        # use ndel - 1 etc., because we need P(X >= k):
        # sf(x) = 1 - cdf(x) = P(X > x), hence sf(k - 1) = P(X >= k)
        del_pvalues.append(scipy.stats.binom.sf(ndel - 1, nsnps, del_p))
        len_pvalues.append(
            scipy.stats.binom.sf(nsnps - 1, int(round(length)), len_p))
        com_pvalues.append(
            scipy.stats.binom.sf(ndel - 1, int(round(length)), com_p))

    if len(del_pvalues) > 10:
        del_qvalues = Stats.doFDR(del_pvalues).mQValues
    else:
        E.warn("no FDR computed for del")
        del_qvalues = del_pvalues

    if len(len_pvalues) > 10:
        len_qvalues = Stats.doFDR(len_pvalues).mQValues
    else:
        E.warn("no FDR computed for len")
        len_qvalues = len_pvalues

    if len(com_pvalues) > 10:
        com_qvalues = Stats.doFDR(com_pvalues).mQValues
    else:
        E.warn("no FDR computed for com")
        com_qvalues = com_pvalues

    fdr = PARAMS["polyphen_fdr"]

    found = set()

    for a, del_pvalue, del_qvalue, len_pvalue, len_qvalue, com_pvalue, com_qvalue in \
            zip(data,
                del_pvalues, del_qvalues,
                len_pvalues, len_qvalues,
                com_pvalues, com_qvalues,
                ):
        gene_id, nsnps, ndel, length = a
        found.add(gene_id)

        # per-gene rates (overwrite the background rates logged above)
        del_p = float(ndel) / nsnps
        len_p = float(nsnps) / length
        com_p = float(ndel) / length

        code = "".join([str(int(x < fdr))
                        for x in (del_qvalue, len_qvalue, com_qvalue)])

        outf.write("\t".join((gene_id,
                              code,
                              "%i" % int(round(length)),
                              "%i" % int(nsnps),
                              "%i" % int(ndel),
                              "%6.4f" % del_p,
                              "%6.4g" % del_pvalue,
                              "%6.4g" % del_qvalue,
                              "%6.4f" % len_p,
                              "%6.4g" % len_pvalue,
                              "%6.4g" % len_qvalue,
                              "%6.4f" % com_p,
                              "%6.4g" % com_pvalue,
                              "%6.4g" % com_qvalue,
                              )) + "\n")

    # add missing genes:
    code = "---"
    for gene_id, length in gene_ids:
        if gene_id in found:
            continue
        outf.write("\t".join((gene_id,
                              code,
                              "%i" % int(round(length)),
                              "%i" % 0,
                              "%i" % 0,
                              "%6.4f" % 0,
                              "%6.4g" % 1,
                              "%6.4g" % 1,
                              "%6.4f" % 0,
                              "%6.4g" % 1,
                              "%6.4g" % 1,
                              "%6.4f" % 0,
                              "%6.4g" % 1,
                              "%6.4g" % 1,
                              )) + "\n")

    outf.close()
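
The P-values above rely on the identity sf(k - 1, n, p) == P(X >= k) for the binomial distribution. A quick numerical check (all numbers invented):

import scipy.stats

# probability of observing >= 5 deleterious SNPs among 20 SNPs when the
# background rate of deleterious SNPs is 10%
k, n, p = 5, 20, 0.1

# sf(x) = 1 - cdf(x) = P(X > x), hence sf(k - 1) = P(X >= k)
pvalue = scipy.stats.binom.sf(k - 1, n, p)

# the equivalent, more explicit form agrees
check = 1.0 - scipy.stats.binom.cdf(k - 1, n, p)
assert abs(pvalue - check) < 1e-12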
Code example #22
def loadGOs(infiles, outfile, tablename):
    '''import GO results into a single table.

    This method also computes a global QValue over all
    tracks, genesets and annotation sets.
    '''

    header = False

    tempf1 = P.getTempFile()

    pvalues = []

    for infile in infiles:
        indir = infile + ".dir"

        if not os.path.exists(indir):
            continue

        track, geneset, annotationset = re.search(
            r"^(\S+)_vs_(\S+)\.(\S+)", infile).groups()

        for filename in glob.glob(os.path.join(indir, "*.overall")):
            for line in open(filename, "r"):
                if line.startswith("#"):
                    continue
                data = line[:-1].split("\t")
                if line.startswith("code"):
                    if header:
                        continue
                    tempf1.write("track\tgeneset\tannotationset\t%s" % line)
                    header = True
                    assert data[10] == "pover" and data[11] == "punder", \
                        "format error, expected pover-punder, got %s-%s" % \
                        (data[10], data[11])
                    continue
                tempf1.write("%s\t%s\t%s\t%s" %
                             (track, geneset, annotationset, line))
                pvalues.append(min(float(data[10]), float(data[11])))

    tempf1.close()

    E.info("analysing %i pvalues" % len(pvalues))
    fdr = Stats.doFDR(pvalues)
    E.info("got %i qvalues" % len(fdr.mQValues))
    qvalues = ["global_qvalue"] + fdr.mQValues

    tempf2 = P.getTempFile()

    for line, qvalue in zip(open(tempf1.name, "r"), qvalues):
        tempf2.write("%s\t%s\n" % (line[:-1], str(qvalue)))

    tempf2.close()
    tempfilename = tempf2.name
    E.info("temporary files: %s, %s" % (tempf1.name, tempf2.name))

    statement = '''
   python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
              --allow-empty 
              --index=category 
              --index=track,geneset,annotationset
              --index=geneset
              --index=annotationset
              --index=goid 
              --table=%(tablename)s 
    < %(tempfilename)s
    > %(outfile)s
    '''
    P.run()
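
Stats.doFDR above is CGAT's implementation of Storey's q-value method. Where only a plain Benjamini-Hochberg correction is needed (for example, if the R-backed routine is unavailable), a dependency-free sketch of the step-up procedure would be:

def bh_qvalues(pvalues):
    '''Benjamini-Hochberg q-values: q_i = min over j >= i of p_(j) * m / j.'''
    m = len(pvalues)
    # indices of the P-values in ascending order
    order = sorted(range(m), key=lambda i: pvalues[i])
    qvalues = [0.0] * m
    minimum = 1.0
    # walk from the largest P-value down, carrying the running minimum
    for rank in range(m, 0, -1):
        i = order[rank - 1]
        minimum = min(minimum, pvalues[i] * m / float(rank))
        qvalues[i] = minimum
    return qvalues

# usage: bh_qvalues([0.01, 0.20, 0.04]) returns [0.03, 0.20, 0.06]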