Example #1
def readAndGroupTable( infile, options ):
    """read table from infile and group.
    """
    fields, table = CSV.ReadTable( infile, with_header = options.has_headers, as_rows = True )
    options.columns = getColumns( fields, options.columns )
    assert options.group_column not in options.columns

    converter = float
    new_fields = [ fields[options.group_column] ] + [ fields[x] for x in options.columns ]

    if options.group_function == "min":
        f = min
    elif options.group_function == "max":
        f = max
    elif options.group_function == "sum":
        f = lambda z: reduce( lambda x,y: x+y, z)
    elif options.group_function == "mean":
        f = scipy.mean
    elif options.group_function == "cat":
        f = lambda x: ";".join( [ y for y in x if y != "" ] )
        converter = str
    elif options.group_function == "uniq":
        f = lambda x: ";".join( [ y for y in set(x) if y != "" ] )
        converter = str
    elif options.group_function == "stats":
        f = lambda x: str(Stats.DistributionalParameters(x))
        # update headers
        new_fields = [ fields[options.group_column] ]
        for c in options.columns:
            new_fields += list( map(lambda x: "%s_%s" % (fields[c], x), Stats.DistributionalParameters().getHeaders() ) )

    ## convert values to floats (except for the group column);
    ## skip rows with unconvertible values in options.columns
    new_table = []
    for row in table:
        skip = False
        new_row = [ row[options.group_column] ]

        for c in options.columns:
            if row[c] == options.missing_value:
                new_row.append(row[c])
            else:
                try:
                    new_row.append( converter(row[c]) )
                except ValueError:
                    skip = True
                    break
        if not skip: new_table.append(new_row)
    table = new_table

    new_rows = CSV.GroupTable( table,
                               group_column = 0,
                               group_function = f )

    options.stdout.write("\t".join(new_fields) + "\n")        
    for row in new_rows:
        options.stdout.write( "\t".join( map(str,row) ) + "\n")
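
CSV.GroupTable is not shown in these examples. A rough standalone sketch of the group-and-reduce step it is used for above (behavior inferred from the call site, not the CGAT API):

import collections

def group_table(table, group_column=0, group_function=min):
    """Group rows on the key in group_column and reduce every other
    column with group_function; a guess at what CSV.GroupTable does."""
    groups = collections.OrderedDict()
    for row in table:
        groups.setdefault(row[group_column], []).append(row)
    for key, rows in groups.items():
        columns = zip(*rows)  # transpose the group's rows into columns
        yield [key] + [group_function(column)
                       for x, column in enumerate(columns)
                       if x != group_column]

With group_function=min each key collapses to the per-column minimum, matching the "min" branch above; the "stats" branch instead expands each value column into one output column per distributional parameter.
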
Example #2
    def printHeightsPerTree(values, section, options, prefix_header,
                            prefix_row):

        if not values: return

        outfile, is_new = TreeReconciliation.getFile(options, section)
        if is_new:
            outfile.write("%s%s\theights\n" % (prefix_header, "\t".join(
                Stats.DistributionalParameters().getHeaders())))

        s = Stats.DistributionalParameters(values)
        s.setFormat(options.format_branch_length)
        outfile.write("%s%s\t%s\n" % (prefix_row, str(s), ",".join(
            map(lambda x: options.format_branch_length % x, values))))
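
Stats.DistributionalParameters is used throughout these examples via getHeaders(), setFormat(), str() and dict-style access. A minimal stand-in with assumed header names and statistics (illustrative, not the CGAT class):

import numpy

class DistributionalParameters(object):
    """Illustrative stand-in for the CGAT class, not its actual code."""

    def __init__(self, values=()):
        self.values = numpy.asarray(values, dtype=float)
        self.format = "%6.4f"

    def getHeaders(self):
        return ("nval", "min", "max", "mean", "median", "stddev")

    def setFormat(self, format):
        self.format = format

    def _stats(self):
        v = self.values  # assumed non-empty when statistics are requested
        return (len(v), v.min(), v.max(), v.mean(),
                numpy.median(v), v.std())

    def __getitem__(self, key):
        # dict-style access, e.g. d['max'] in a later example
        return dict(zip(self.getHeaders(), self._stats()))[key]

    def __str__(self):
        stats = self._stats()
        return "\t".join(["%i" % stats[0]] +
                         [self.format % x for x in stats[1:]])
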
Example #3
    def checkFDR(self, pi0_method):

        result = Stats.doFDR(self.mPvalues,
                             fdr_level=0.05,
                             pi0_method=pi0_method)
        R("""require ('qvalue')""")
        qvalues = R.qvalue(ro.FloatVector(self.mPvalues),
                           fdr_level=0.05,
                           pi0_method=pi0_method)

        assert qvalues.names[1] == "pi0"
        assert qvalues.names[2] == "qvalues"
        assert qvalues.names[5] == "significant"
        assert qvalues.names[6] == "lambda"

        r_qvalues = qvalues[2]
        r_pi0 = qvalues[1][0]

        self.assertEqual(len(result.mQValues), len(qvalues[2]))
        self.assertEqual(len(result.mLambda), len(qvalues[6]))
        self.assertEqual(result.mPi0, r_pi0)
        for a, b in zip(result.mQValues, r_qvalues):
            self.assertAlmostEqual(a, b, 2, "unequal: %f != %f" % (a, b))

        for a, b in zip(result.mPassed, qvalues[5]):
            self.assertEqual(
                a, b, "threshold-passed flag not equal: %s != %s" % (a, b))
Example #4
    def checkFDR(self, **kwargs):

        old = Stats.doFDR(self.pvalues, **kwargs)
        # print old.mQValues[:10]
        # print old.mPi0
        new = Stats.doFDRPython(self.pvalues, **kwargs)
        # print new.mQValues[:10]
        # print new.mPi0
        # self.assertAlmostEqual( old.mPi0, new.mPi0, places=3)
        self.assertTrue(getRelativeError(old.mPi0, new.mPi0) < self.max_error)

        for pvalue, a, b in zip(self.pvalues, old.mQValues, new.mQValues):
            self.assertTrue(
                getRelativeError(a, b) < self.max_error,
                "qvalues: relative error %f > %f (pvalue=%f, %f, %f)" %
                (getRelativeError(a, b), self.max_error, pvalue, a, b))
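
Both Stats.doFDR and Stats.doFDRPython implement Storey's q-value procedure (other examples compare them against R's qvalue package), hence the relative-error comparison above. Its core, simplified here to a single fixed lambda instead of the vlambda grid and pi0_method the CGAT routines accept:

import numpy

def storey_qvalues(pvalues, vlambda=0.5):
    """Simplified Storey q-values with one fixed lambda; illustrative only."""
    p = numpy.asarray(pvalues, dtype=float)
    m = len(p)
    # pi0: estimated proportion of true null hypotheses
    pi0 = min(1.0, numpy.mean(p > vlambda) / (1.0 - vlambda))
    order = numpy.argsort(p)
    q = pi0 * m * p[order] / numpy.arange(1, m + 1)
    # enforce monotonicity of q-values along the sorted p-values
    q = numpy.minimum.accumulate(q[::-1])[::-1]
    qvalues = numpy.empty(m)
    qvalues[order] = numpy.minimum(q, 1.0)
    return pi0, qvalues
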
Example #5
    def __call__(self, track, slice=None):

        result = odict()

        merged = None
        rocs = []

        for field in self.mFields:
            data = []
            for replicate in EXPERIMENTS.getTracks(track):
                statement = "SELECT contig, start, end,%(field)s FROM %(replicate)s_intervals" % locals()
                data.append(self.get(statement))

            idx = []
            for x in range(len(data)):
                i = IndexedGenome.IndexedGenome()
                for contig, start, end, peakval in data[x]:
                    i.add(contig, start, end, peakval)
                idx.append(i)

            def _iter(all):
                all.sort()
                last_contig, first_start, last_end, last_value = all[0]
                for contig, start, end, value in all[1:]:
                    if contig != last_contig or last_end < start:
                        yield (last_contig, first_start, last_end)
                        last_contig, first_start, last_end = contig, start, end
                    else:
                        last_end = max(last_end, end)
                yield (last_contig, first_start, last_end)

            if not merged:
                all = [x for x in itertools.chain(*data)]
                merged = list(_iter(all))

            roc_data = []
            for contig, start, end in merged:
                intervals = []
                for i in idx:
                    try:
                        intervals.append(list(i.get(contig, start, end)))
                    except KeyError:
                        continue

                if len(intervals) == 0:
                    continue

                is_repro = len([x for x in intervals if x != []]) == len(data)
                value = max([x[2] for x in itertools.chain(*intervals)])

                # fpr, tpr
                roc_data.append((value, is_repro))

            roc_data.sort()
            roc_data.reverse()

            roc = zip(*Stats.computeROC(roc_data))
            result[field] = odict((("FPR", roc[0]), (field, roc[1])))

        return result
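
Stats.computeROC is not shown; judging from the zip into "FPR" and per-field columns, it turns (value, is_repro) pairs sorted by decreasing value into (fpr, tpr) points. A plausible sketch:

def compute_roc(pairs):
    """(fpr, tpr) points from (score, is_positive) pairs sorted by
    decreasing score; a guess at Stats.computeROC. Assumes at least
    one positive and one negative example."""
    positives = sum(1 for _, flag in pairs if flag)
    negatives = len(pairs) - positives
    tp, fp, points = 0, 0, []
    for score, flag in pairs:
        if flag:
            tp += 1
        else:
            fp += 1
        points.append((float(fp) / negatives, float(tp) / positives))
    return points
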
Example #6
    def check(self, method):
        '''check for length equality and elementwise equality.'''
        a = R['p.adjust'](self.pvalues, method=method)
        b = Stats.adjustPValues(self.pvalues, method=method)
        self.assertEqual(len(a), len(b))
        for x, y in zip(a, b):
            self.assertAlmostEqual(x, y)
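
Stats.adjustPValues mirrors R's p.adjust family of corrections. For reference, a self-contained version of the most common of those methods, Benjamini-Hochberg (what p.adjust(..., method="BH") computes):

import numpy

def bh_adjust(pvalues):
    """Benjamini-Hochberg adjusted p-values, as in R's p.adjust(method="BH")."""
    p = numpy.asarray(pvalues, dtype=float)
    m = len(p)
    order = numpy.argsort(p)[::-1]  # largest p-value first
    adjusted = numpy.minimum.accumulate(p[order] * m / numpy.arange(m, 0, -1))
    result = numpy.empty(m)
    result[order] = numpy.minimum(adjusted, 1.0)
    return result
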
Example #7
def computeFDR(all_results,
               qvalue_method="storey"):
    '''compute FDR.

    update GOResult structure with field .fdr
    '''

    # flatten all_results
    results = []
    for key, data in all_results.iteritems():
        results.extend(data.mResults.values())

    observed_min_pvalues = [min(
        x.mProbabilityOverRepresentation,
        x.mProbabilityUnderRepresentation) for x in results]

    if qvalue_method == "storey":

        # compute fdr via Storey's method
        fdr_data = Stats.doFDR(observed_min_pvalues, vlambda=0.1)

        E.info("estimated proportion of true null hypotheses = %6.4f" %
               fdr_data.mPi0)

        if fdr_data.mPi0 < 0.1:
            E.warn(
                "estimated proportion of true null hypotheses is "
                "less than 10%%  (%6.4f)" % fdr_data.mPi0)

        for result, qvalue in zip(results, fdr_data.mQValues):
            result.fdr = qvalue

    elif qvalue_method == "empirical":
        # would require a sample size of > 0; not implemented
        raise NotImplementedError("empirical needs some work")
Example #8
def doOldFDR(options, args):
    """apply fdr to output of annotator."""

    # read input
    annotators = []
    for filename in args:
        infile = open(filename, "r")
        annotators.append(readAnnotator(infile))
        infile.close()

    # apply filters and create diagnostic plots
    for filename, data in zip(args, annotators):
        ninput = len(data)
        pvalues = [x.mPValue for x in data]
        vlambda = numpy.arange(0, max(pvalues), 0.05)
        try:
            qvalues = Stats.doFDR(
                pvalues, vlambda=vlambda, fdr_level=options.fdr)
        except ValueError, msg:
            E.warn("%s: fdr could not be computed - no filtering: %s" %
                   (filename, msg))
            continue

        qvalues.plot(filename + "_diagnostics.png")

        data = [x[0] for x in zip(data, qvalues.mPassed) if x[1]]
Example #9
    def __call__(self, track, slice = None ):

        result = odict()

        merged = None
        rocs = []

        for field in self.mFields:
            data = []
            for replicate in EXPERIMENTS.getTracks( track ):
                statement = "SELECT contig, start, end,%(field)s FROM %(replicate)s_intervals" % locals()
                data.append( self.get( statement) )

            idx = []
            for x in range(len(data)):
                i = IndexedGenome.IndexedGenome()
                for contig, start, end, peakval in data[x]:
                    i.add( contig, start, end, peakval )
                idx.append( i )

            def _iter( all ):
                all.sort()
                last_contig, first_start, last_end, last_value = all[0]
                for contig, start, end, value in all[1:]:
                    if contig != last_contig or last_end < start:
                        yield (last_contig, first_start, last_end) 
                        last_contig, first_start, last_end = contig, start, end
                    else:
                        last_end = max(last_end, end )
                yield (last_contig, first_start, last_end) 

            if not merged:
                all =  [ x for x in itertools.chain( *data ) ]
                merged = list( _iter(all) )

            roc_data = []
            for contig, start, end in merged:
                intervals = []
                for i in idx:
                    try:
                        intervals.append( list(i.get( contig, start, end )) )
                    except KeyError:
                        continue

                if len(intervals) == 0:
                    continue

                is_repro = len( [ x for x in intervals if x != [] ] ) == len(data)
                value = max( [ x[2] for x in itertools.chain( *intervals )] )

                # fpr, tpr
                roc_data.append( (value, is_repro) )

            roc_data.sort()
            roc_data.reverse()
            
            roc = zip(*Stats.computeROC( roc_data ))
            result[field] = odict( (("FPR", roc[0]), (field,roc[1])) )
            
        return result
Example #10
    def checkFDR(self, pi0_method):

        result = Stats.doFDR(
            self.mPvalues, fdr_level=0.05, pi0_method=pi0_method)
        R("""require ('qvalue')""")
        qvalues = R.qvalue(ro.FloatVector(self.mPvalues),
                           fdr_level=0.05,
                           pi0_method=pi0_method)

        assert qvalues.names[1] == "pi0"
        assert qvalues.names[2] == "qvalues"
        assert qvalues.names[5] == "significant"
        assert qvalues.names[6] == "lambda"

        r_qvalues = qvalues[2]
        r_pi0 = qvalues[1][0]

        self.assertEqual(len(result.mQValues), len(qvalues[2]))
        self.assertEqual(len(result.mLambda), len(qvalues[6]))
        self.assertEqual(result.mPi0, r_pi0)
        for a, b in zip(result.mQValues, r_qvalues):
            self.assertAlmostEqual(a, b, 2, "unequal: %f != %f" % (a, b))

        for a, b in zip(result.mPassed, qvalues[5]):
            self.assertEqual(
                a, b, "threshold-passed flag not equal: %s != %s" % (a, b))
Example #11
def computeFDR(all_results, qvalue_method="storey"):
    '''compute FDR.

    update GOResult structure with field .fdr
    '''

    # flatten all_results
    results = []
    for key, data in all_results.iteritems():
        results.extend(data.mResults.values())

    observed_min_pvalues = [
        min(x.mProbabilityOverRepresentation,
            x.mProbabilityUnderRepresentation) for x in results
    ]

    if qvalue_method == "storey":

        # compute fdr via Storey's method
        fdr_data = Stats.doFDR(observed_min_pvalues, vlambda=0.1)

        E.info("estimated proportion of true null hypotheses = %6.4f" %
               fdr_data.mPi0)

        if fdr_data.mPi0 < 0.1:
            E.warn("estimated proportion of true null hypotheses is "
                   "less than 10%%  (%6.4f)" % fdr_data.mPi0)

        for result, qvalue in zip(results, fdr_data.mQValues):
            result.fdr = qvalue

    elif qvalue_method == "empirical":
        # would require a sample size of > 0; not implemented
        raise NotImplementedError("empirical needs some work")
Example #12
    def check(self, method):
        '''check for length equality and elementwise equality.'''
        a = R['p.adjust'](self.pvalues, method=method)
        b = Stats.adjustPValues(self.pvalues, method=method)
        self.assertEqual(len(a), len(b))
        for x, y in zip(a, b):
            self.assertAlmostEqual(x, y)
Example #13
    def testLRT(self):
        """test that the false positive rate is in the same order as mSignificance.

        Sample from a normal distribution and compare two models:

        1. mean estimated = complex model (1 df)
        2. mean given     = simple model  (0 df)

        Likelihood = P(model | data)
        """
        simple_np = 0
        complex_np = 1

        npassed = 0

        for replicate in range(0, self.mNumReplicates):
            sample = scipy.stats.norm.rvs(
                size=self.mNumSamples, loc=0.0, scale=1.0)
            mean = scipy.mean(sample)

            complex_ll = numpy.sum(
                numpy.log(scipy.stats.norm.pdf(sample, loc=mean, scale=1.0)))
            simple_ll = numpy.sum(
                numpy.log(scipy.stats.norm.pdf(sample, loc=0.0, scale=1.0)))

            a = Stats.doLogLikelihoodTest(complex_ll, complex_np,
                                          simple_ll, simple_np,
                                          significance_threshold=self.mSignificance)

            if a.mPassed:
                npassed += 1

        r = float(npassed) / self.mNumReplicates

        self.assertAlmostEqual(self.mSignificance, r, places=self.nplaces)
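
Stats.doLogLikelihoodTest is a standard likelihood-ratio test: by Wilks' theorem, 2 * (lnL_complex - lnL_simple) is asymptotically chi-squared distributed with degrees of freedom equal to the difference in parameter counts. A minimal sketch returning just the p-value and pass flag (the CGAT result object carries more fields):

from scipy.stats import chi2

def log_likelihood_test(complex_ll, complex_np, simple_ll, simple_np,
                        significance_threshold=0.05):
    """Likelihood-ratio test of a nested simple model against a complex one."""
    statistic = 2.0 * (complex_ll - simple_ll)
    df = complex_np - simple_np
    probability = chi2.sf(statistic, df)  # upper-tail p-value
    return probability, probability < significance_threshold
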
Example #14
    def process( self, contig, start, end, reads, qualities ):

        self.mOutFile.write( "%s\t%s\t%i\t%i\t%i\t%i\t%i\t%s\n" % (self.mOutputId, 
                                                                   contig, start, end, end - start, 
                                                                   len(reads),
                                                                   len(qualities),
                                                                   str(Stats.DistributionalParameters( qualities ) )))
Example #15
    def process(self, contig, start, end, reads, qualities):

        aligned = filter(lambda x: x > 0, reads)
        self.mOutFile.write(
            "%s\t%s\t%i\t%i\t%i\t%i\t%i\t%s\n" %
            (self.mOutputId, contig, start, end, end - start, len(reads),
             len(aligned), str(Stats.DistributionalParameters(aligned))))
Example #16
    def checkFDR(self, **kwargs):

        old = Stats.doFDR(self.pvalues, **kwargs)
        # print old.mQValues[:10]
        # print old.mPi0
        new = Stats.doFDRPython(self.pvalues, **kwargs)
        # print new.mQValues[:10]
        # print new.mPi0
        # self.assertAlmostEqual( old.mPi0, new.mPi0, places=3)
        self.assertTrue(getRelativeError(old.mPi0, new.mPi0) < self.max_error)

        for pvalue, a, b in zip(self.pvalues, old.mQValues, new.mQValues):
            self.assertTrue(getRelativeError(a, b) < self.max_error,
                            "qvalues: relative error %f > %f (pvalue=%f, %f, %f)" %
                            (getRelativeError(a, b),
                             self.max_error,
                             pvalue, a, b))
Example #17
    def process(self, contig, start, end, reads, qualities):

        aligned = [x for x in reads if x > 0]
        self.mOutFile.write("%s\t%s\t%i\t%i\t%i\t%i\t%i\t%s\n" % (self.mOutputId,
                                                                  contig, start, end, end -
                                                                  start,
                                                                  len(reads),
                                                                  len(aligned),
                                                                  str(Stats.DistributionalParameters(aligned))))
Example #18
def writeResults(outfile, results):
    fields = ("wall", "user", "sys", "cuser", "csys", "nchunks")

    outfile.write("host\t%s\n" % "\t".join([
        "%s_%s" % (x, y)
        for x, y in itertools.product(fields,
                                      Stats.Summary().getHeaders())
    ]))

    hosts = results.keys()
    hosts.sort()

    for host in hosts:
        result = results[host]
        outfile.write("%s" % host)
        for f in fields:
            d = [y.__getitem__(f) for y in result]
            outfile.write("\t%s" % Stats.Summary(d))
        outfile.write("\n")
Example #19
    def printHeightsPerSpecies(values, section, options, prefix_header,
                               prefix_row):

        if not values: return

        ## distributions of distance to node
        outfile, is_new = TreeReconciliation.getFile(options, section)
        if is_new:
            outfile.write("%sspecies\t%s\theights\n" %
                          (prefix_header, "\t".join(
                              Stats.DistributionalParameters().getHeaders())))

        for species in sorted(values.keys()):
            s = Stats.DistributionalParameters(values[species])
            s.setFormat(options.format_branch_length)
            outfile.write("%s%s\t%s\t%s\n" %
                          (prefix_row, species, str(s), ",".join(
                              map(lambda x: options.format_branch_length % x,
                                  values[species]))))
Example #20
    def testAgainstQValue(self):

        R.assign("pvalues", self.pvalues)
        qvalue = R('''qvalue( pvalues )''')
        r_qvalues = qvalue[2]
        r_pi0 = qvalue[1][0]

        new = Stats.doFDRPython(self.pvalues)
        self.assertTrue(getRelativeError(r_pi0, new.mPi0) < self.max_error)

        for a, b in zip(r_qvalues, new.mQValues):
            self.assertAlmostEqual(a, b, places=self.nplaces)
Example #21
    def testAgainstQValue(self):

        R.assign("pvalues", self.pvalues)
        qvalue = R('''qvalue( pvalues )''')
        r_qvalues = qvalue[2]
        r_pi0 = qvalue[1][0]

        new = Stats.doFDRPython(self.pvalues)
        self.assertTrue(getRelativeError(r_pi0, new.mPi0) < self.max_error)

        for a, b in zip(r_qvalues, new.mQValues):
            self.assertAlmostEqual(a, b, places=self.nplaces)
Example #22
    def __str__(self):

        single_exon_transcripts = 0
        exons_per_transcript = []
        intron_sizes = []
        transcript_lengths = []
        exon_sizes = []

        for x in self.counts_exons_per_transcript.values():

            x.sort()
            x = Intervals.combine(x)
            transcript_lengths.append(x[-1][1] - x[0][0])

            exons_per_transcript.append(len(x))

            for start, end in x:
                exon_sizes.append(end - start)

            if len(x) == 1:
                single_exon_transcripts += 1
                continue

            last_end = x[0][1]
            for start, end in x[1:]:
                intron_sizes.append(start - last_end)
                last_end = end

        return "\t".join(
            map(str, (
                len(self.counts_gene_ids),
                len(self.counts_transcript_ids),
                single_exon_transcripts,
                Stats.Summary(exons_per_transcript),
                Stats.Summary(exon_sizes),
                Stats.Summary(intron_sizes),
                Stats.Summary(transcript_lengths),
            )))
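
The exon/intron bookkeeping above relies on Intervals.combine returning sorted, non-overlapping intervals, so that x[-1][1] - x[0][0] is the transcript span and the gaps between consecutive intervals are the introns. A plausible sketch of such a merge (not the CGAT implementation):

def combine(intervals):
    """Merge overlapping or adjacent (start, end) intervals into a
    sorted, non-overlapping list."""
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            # overlaps or touches the previous interval: extend it
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged
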
Example #23
    def process(self, contig, start, end, reads, qualities):

        entry = GTF.Entry()
        entry.start, entry.end = start, end
        entry.gene_id = self.mIdFormat % id
        entry.transcript_id = entry.gene_id
        entry.contig = contig
        entry.feature = "exon"
        entry.source = "maq"

        read_stats = Stats.Summary(reads)

        entry.score = "%5.2f" % read_stats['mean']

        self.mOutFile.write(str(entry) + "\n")
Example #24
    def testLRT(self):
        """test that the false positive rate is in the same order as mSignificance.

        Sample from a normal distribution and compare two models:

        1. mean estimated = complex model (1 df)
        2. mean given     = simple model  (0 df)

        Likelihood = P(model | data)
        """
        simple_np = 0
        complex_np = 1

        npassed = 0

        for replicate in range(0, self.mNumReplicates):
            sample = scipy.stats.norm.rvs(size=self.mNumSamples,
                                          loc=0.0,
                                          scale=1.0)
            mean = scipy.mean(sample)

            complex_ll = numpy.sum(
                numpy.log(scipy.stats.norm.pdf(sample, loc=mean, scale=1.0)))
            simple_ll = numpy.sum(
                numpy.log(scipy.stats.norm.pdf(sample, loc=0.0, scale=1.0)))

            a = Stats.doLogLikelihoodTest(
                complex_ll,
                complex_np,
                simple_ll,
                simple_np,
                significance_threshold=self.mSignificance)

            if a.mPassed:
                npassed += 1

        r = float(npassed) / self.mNumReplicates

        self.assertAlmostEqual(self.mSignificance, r, places=self.nplaces)
Example #25
def decorator_max_score(values, start, end, contig):
    """compute minumum of values."""
    d = Stats.DistributionalParameters(values)
    return d['max'], str(d)
Example #26
                    reference_id = x - 1
                elif options.mode == "1xn":
                    reference_result = first_result
                    reference_id = 0

                if reference_result.mNumParameters >= result.mNumParameters:
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "number of parameters of full model not increased (null=%i, full=%i).\n"
                            % (reference_result.mNumParameters,
                               result.mNumParameters))
                    continue

                lrt = Stats.doLogLikelihoodTest(
                    result.mLogLikelihood, result.mNumParameters,
                    reference_result.mLogLikelihood,
                    reference_result.mNumParameters,
                    options.significance_threshold)

                if lrt.mPassed:
                    c = "passed"
                else:
                    c = "failed"

                options.stdout.write("%s%i\t%i\t%s\t%f\t%i\t%f\t%i\t%5.2e\n" %
                                     (
                                         prefix_row,
                                         reference_id,
                                         x,
                                         c,
                                         lrt.mFullLogLikelihood,
Example #27
            lnl_simple = float(row['%s:lnL' % b])
            df_complex = map_model2params[a]
            df_simple = map_model2params[b]
            if options.loglevel >= 3:
                options.stdlog.write("# testing %s: ll=%f,df=%i versus %s:lnl=%f,df=%i\n" %\
                                         (a,
                                          lnl_complex,df_complex, 
                                          b, lnl_simple,
                                          df_simple))

            if lnl_complex < lnl_simple:
                nerrors += 1
                options.stdout.write( "\tna\tna" )
                continue

            lrt = Stats.doLogLikelihoodTest( lnl_complex, df_complex, lnl_simple, df_simple )
            if lrt.mPassed: stats[(a,b)] += 1
            
            options.stdout.write( "\t%s\t%5.2e" % (
                    Stats.getSignificance( lrt.mProbability), 
                    lrt.mProbability ) )
            
        options.stdout.write( "\n" )

        noutput += 1

    options.stdout.write( "npassed" )
    for a, b in tests:
        options.stdout.write( "\t%i\t%5.2f" % (stats[(a, b)], 100.0 * stats[(a,b)] / noutput ) )
    options.stdout.write( "\n" )
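
Stats.getSignificance turns a probability into a printable code written next to the raw p-value. A conventional guess at its behavior (the thresholds and codes here are assumptions):

def get_significance(probability):
    """Star code for a p-value; thresholds follow the usual convention."""
    for threshold, code in ((0.001, "***"), (0.01, "**"), (0.05, "*")):
        if probability < threshold:
            return code
    return ""
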
Example #28
                
                options.stdout.write("%i" % nmethod)
                options.stdout.write("\t%i" % (result.mNumSequences ))
                                         
                npassed = 0
                
                for model in options.models:

                    sites = result.mSites[model]

                    ## do significance test
                    full_model, null_model = model, map_nested_models[model]
                    
                    lrt = Stats.doLogLikelihoodTest(
                        result.mSites[full_model].mLogLikelihood, 
                        result.mSites[full_model].mNumParameters, 
                        result.mSites[null_model].mLogLikelihood, 
                        result.mSites[null_model].mNumParameters, 
                        options.significance_threshold )

                    x = 0
                    for analysis in options.analysis:
                        
                        if analysis == "neb":
                            s = set(map( extract_f, filter( filter_f, sites.mNEB.mPositiveSites)))
                            
                        elif analysis == "beb":
                            s = set(map( extract_f, filter( filter_f, sites.mBEB.mPositiveSites)))                            
                            
                        options.stdout.write("\t%i" % ( len(s) ) )

                        if not lrt.mPassed:
Example #29
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: table2table.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m", "--method", dest="methods", type="choice", action="append",
                      choices=("transpose", "normalize-by-max", "normalize-by-value", "multiply-by-value",
                               "percentile", "remove-header", "normalize-by-table",
                               "upper-bound", "lower-bound", "kullback-leibler",
                               "expand", "compress", "fdr", "grep"),
                      help="""actions to perform on table.""")

    parser.add_option("-s", "--scale", dest="scale", type="float",
                      help="factor to scale matrix by.")

    parser.add_option("-f", "--format", dest="format", type="string",
                      help="output number format.")

    parser.add_option("-p", "--parameters", dest="parameters", type="string",
                      help="Parameters for various functions.")

    parser.add_option("-t", "--headers", dest="has_headers", action="store_true",
                      help="matrix has row/column headers.")

    parser.add_option("--transpose", dest="transpose", action="store_true",
                      help="transpose table.")

    parser.add_option("--set-transpose-field", dest="set_transpose_field", type="string",
                      help="set first field (row 1 and col 1) to this value [%default].")

    parser.add_option("--transpose-format", dest="transpose_format", type="choice",
                      choices=("default", "separated", ),
                      help="input format of un-transposed table")

    parser.add_option("--expand", dest="expand_table", action="store_true",
                      help="expand table - multi-value cells with be expanded over several rows.")

    parser.add_option("--no-headers", dest="has_headers", action="store_false",
                      help="matrix has no row/column headers.")

    parser.add_option("--columns", dest="columns", type="string",
                      help="columns to use.")

    parser.add_option("--file", dest="file", type="string",
                      help="columns to test from table.",
                      metavar="FILE")

    parser.add_option("-d", "--delimiter", dest="delimiter", type="string",
                      help="delimiter of columns.",
                      metavar="DELIM")

    parser.add_option("-V", "--invert-match", dest="invert_match", action="store_true",
                      help="invert match.")

    parser.add_option("--sort-by-rows", dest="sort_rows", type="string",
                      help="output order for rows.")

    parser.add_option("-a", "--value", dest="value", type="float",
                      help="value to use for various algorithms.")

    parser.add_option("--group", dest="group_column", type="int",
                      help="group values by column. Supply an integer column [default=%default]")

    parser.add_option("--group-function", dest="group_function", type="choice",
                      choices=(
                          "min", "max", "sum", "mean", "stats", "cat", "uniq"),
                      help="function to group values by.")

    parser.add_option("--join-table", dest="join_column", type="int",
                      help="join rows in a table by columns.")

    parser.add_option("--collapse-table", dest="collapse_table", type="string",
                      help="collapse a table. Value determines the missing variable [%default].")

    parser.add_option("--join-column-name", dest="join_column_name", type="int",
                      help="use this column as a prefix.")

    parser.add_option("--flatten-table", dest="flatten_table", action="store_true",
                      help="flatten a table [%default].")

    parser.add_option("--as-column", dest="as_column", action="store_true",
                      help="output table as a single column.")

    parser.add_option("--split-fields", dest="split_fields", action="store_true",
                      help="split fields.")

    parser.add_option("--separator", dest="separator", type="string",
                      help="separator for multi-valued fields [default=%default].")

    parser.add_option("--fdr-method", dest="fdr_method", type="choice",
                      choices=(
                          "BH", "bonferroni", "holm", "hommel", "hochberg", "BY"),
                      help="method to perform multiple testing correction by controlling the fdr [default=%default].")

    parser.add_option("--fdr-add-column", dest="fdr_add_column", type="string",
                      help="add new column instead of replacing existing columns. "
                      "The value of the option will be used as prefix if there are multiple columns [%default]")

    # IMS: add option to use a column as the row id in flatten
    parser.add_option("--id-column", dest="id_column", type="string",
                      help="list of column(s) to use as the row id when flattening the table. "
                      "If None, then row number is used. [default=%default].")

    parser.add_option("--variable-name", dest="variable_name", type="string",
                      help="the column header for the 'variable' column when flattening [default=%default].")

    parser.add_option("--value-name", dest="value_name", type="string",
                      help="the column header for the 'value' column when flattening [default=%default].")

    parser.set_defaults(
        methods=[],
        scale=1.0,
        has_headers=True,
        format="%5.2f",
        value=0.0,
        parameters="",
        columns="all",
        transpose=False,
        set_transpose_field=None,
        transpose_format="default",
        group=False,
        group_column=0,
        group_function="mean",
        missing_value="na",
        sort_rows=None,
        flatten_table=False,
        collapse_table=None,
        separator=";",
        expand=False,
        join_column=None,
        join_column_name=None,
        compute_fdr=None,
        as_column=False,
        fdr_method="BH",
        fdr_add_column=None,
        id_column=None,
        variable_name="column",
        value_name="value",
        file=None,
        delimiter="\t",
        invert_match=False,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    options.parameters = options.parameters.split(",")

    if options.group_column:
        options.group = True
        options.group_column -= 1

    ######################################################################
    ######################################################################
    ######################################################################
    # if only to remove header, do this quickly
    if options.methods == ["remove-header"]:

        first = True
        for line in options.stdin:
            if line[0] == "#":
                continue
            if first:
                first = False
                continue
            options.stdout.write(line)

    elif options.transpose or "transpose" in options.methods:

        readAndTransposeTable(options.stdin, options)

    elif options.flatten_table:
        # IMS: bug fixed to make this work. Also added options for keying on a
        # particular column and adding custom column headings

        fields, table = CSV.ReadTable(
            options.stdin, with_header=options.has_headers, as_rows=True)

        options.columns = getColumns(fields, options.columns)

        if options.id_column:
            id_columns = map(
                lambda x: int(x) - 1, options.id_column.split(","))
            id_header = "\t".join([fields[id_column]
                                   for id_column in id_columns])
            options.columns = [
                x for x in options.columns if x not in id_columns]
        else:
            id_header = "row"

        options.stdout.write(
            "%s\t%s\t%s\n" % (id_header, options.variable_name, options.value_name))

        for x, row in enumerate(table):

            if options.id_column:
                row_id = "\t".join([row[int(x) - 1]
                                    for x in options.id_column.split(",")])
            else:
                row_id = str(x)

            for y in options.columns:
                options.stdout.write(
                    "%s\t%s\t%s\n" % (row_id, fields[y], row[y]))

    elif options.as_column:

        fields, table = CSV.ReadTable(
            options.stdin, with_header=options.has_headers, as_rows=True)
        options.columns = getColumns(fields, options.columns)
        table = zip(*table)

        options.stdout.write("value\n")

        for column in options.columns:
            options.stdout.write("\n".join(table[column]) + "\n")

    elif options.split_fields:

        # split comma separated fields
        fields, table = CSV.ReadTable(options.stdin,
                                      with_header=options.has_headers,
                                      as_rows=True)

        options.stdout.write("%s\n" % ("\t".join(fields)))

        for row in table:
            row = [x.split(options.separator) for x in row]
            for d in itertools.product(*row):
                options.stdout.write("%s\n" % "\t".join(d))

    elif options.group:
        readAndGroupTable(options.stdin, options)

    elif options.join_column:
        readAndJoinTable(options.stdin, options)

    elif options.expand_table:
        readAndExpandTable(options.stdin, options)

    elif options.collapse_table is not None:
        readAndCollapseTable(options.stdin, options, options.collapse_table)

    elif "grep" in options.methods:

        options.columns = map(lambda x: int(x) - 1, options.columns.split(","))

        patterns = []

        if options.file:
            infile = open(options.file, "r")
            for line in infile:
                if line[0] == "#":
                    continue
                patterns.append(line[:-1].split(options.delimiter)[0])
        else:
            patterns = args

        for line in options.stdin:

            data = line[:-1].split(options.delimiter)
            found = False

            for c in options.columns:

                if data[c] in patterns:
                    found = True
                    break

            if (not found and options.invert_match) or (found and not options.invert_match):
                print line[:-1]
    else:

        ######################################################################
        ######################################################################
        ######################################################################
        # Apply remainder of transformations
        fields, table = CSV.ReadTable(
            options.stdin, with_header=options.has_headers, as_rows=False)
        # convert columns to list
        table = [list(x) for x in table]

        ncols = len(fields)
        if len(table) == 0:
            raise ValueError("table is empty")

        nrows = len(table[0])

        E.info("processing table with %i rows and %i columns" % (nrows, ncols))

        options.columns = getColumns(fields, options.columns)

        # convert all values to float
        for c in options.columns:
            for r in range(nrows):
                try:
                    table[c][r] = float(table[c][r])
                except ValueError:
                    continue

        for method in options.methods:

            if method == "normalize-by-value":

                value = float(options.parameters[0])
                del options.parameters[0]

                for c in options.columns:
                    table[c] = map(lambda x: x / value, table[c])

            elif method == "multiply-by-value":

                value = float(options.parameters[0])
                del options.parameters[0]

                for c in options.columns:
                    table[c] = map(lambda x: x * value, table[c])

            elif method == "normalize-by-max":

                for c in options.columns:
                    m = max(table[c])
                    table[c] = map(lambda x: x / m, table[c])

            elif method == "kullback-leibler":
                options.stdout.write("category1\tcategory2\tkl1\tkl2\tmean\n")
                for x in range(0, len(options.columns) - 1):
                    for y in range(x + 1, len(options.columns)):
                        c1 = options.columns[x]
                        c2 = options.columns[y]
                        e1 = 0
                        e2 = 0
                        for z in range(nrows):
                            p = table[c1][z]
                            q = table[c2][z]
                            e1 += p * math.log(p / q)
                            e2 += q * math.log(q / p)

                        options.stdout.write("%s\t%s\t%s\t%s\t%s\n" % (fields[c1], fields[c2],
                                                                       options.format % e1,
                                                                       options.format % e2,
                                                                       options.format % ((e1 + e2) / 2)))
                E.Stop()
                sys.exit(0)

            elif method == "rank":

                for c in options.columns:
                    tt = table[c]
                    t = zip(tt, range(nrows))
                    t.sort()
                    for i, n in zip(map(lambda x: x[1], t), range(nrows)):
                        tt[i] = n

            elif method in ("lower-bound", "upper-bound"):

                boundary = float(options.parameters[0])
                del options.parameters[0]
                new_value = float(options.parameters[0])
                del options.parameters[0]

                if method == "upper-bound":
                    for c in options.columns:
                        for r in range(nrows):
                            if isinstance(table[c][r], float) and \
                                    table[c][r] > boundary:
                                table[c][r] = new_value
                else:
                    for c in options.columns:
                        for r in range(nrows):
                            if isinstance(table[c][r], float) and \
                                    table[c][r] < boundary:
                                table[c][r] = new_value

            elif method == "fdr":
                pvalues = []
                for c in options.columns:
                    pvalues.extend(table[c])

                assert max(pvalues) <= 1.0, "pvalues > 1 in table: max=%s" % str(
                    max(pvalues))
                assert min(pvalues) >= 0, "pvalue < 0 in table: min=%s" % str(
                    min(pvalues))

                # convert to str to avoid test for float downstream
                qvalues = map(
                    str, Stats.adjustPValues(pvalues, method=options.fdr_method))

                if options.fdr_add_column is None:
                    x = 0
                    for c in options.columns:
                        table[c] = qvalues[x:x + nrows]
                        x += nrows
                else:
                    # add new column headers

                    if len(options.columns) == 1:
                        fields.append(options.fdr_add_column)
                    else:
                        for co in options.columns:
                            fields.append(options.fdr_add_column + fields[co])

                    x = 0
                    for c in options.columns:
                        # add a new column
                        table.append(qvalues[x:x + nrows])
                        x += nrows
                    ncols += len(options.columns)

            elif method == "normalize-by-table":

                other_table_name = options.parameters[0]
                del options.parameters[0]
                other_fields, other_table = CSV.ReadTable(
                    open(other_table_name, "r"),
                    with_header=options.has_headers,
                    as_rows=False)

                # convert all values to float
                for c in options.columns:
                    for r in range(nrows):
                        try:
                            other_table[c][r] = float(other_table[c][r])
                        except ValueError:
                            continue

                # set 0s to 1 in the other matrix
                for c in options.columns:
                    for r in range(nrows):
                        if isinstance(table[c][r], float) and \
                                isinstance(other_table[c][r], float) and \
                                other_table[c][r] != 0:
                            table[c][r] /= other_table[c][r]
                        else:
                            table[c][r] = options.missing_value

        # convert back
        for c in options.columns:
            for r in range(nrows):
                if isinstance(table[c][r], float):
                    table[c][r] = options.format % table[c][r]

        options.stdout.write("\t".join(fields) + "\n")
        if options.sort_rows:
            old2new = {}
            for r in range(nrows):
                old2new[table[0][r]] = r
            for x in options.sort_rows.split(","):
                if x not in old2new:
                    continue
                r = old2new[x]
                options.stdout.write(
                    "\t".join([table[c][r] for c in range(ncols)]) + "\n")
        else:
            for r in range(nrows):
                options.stdout.write(
                    "\t".join([table[c][r] for c in range(ncols)]) + "\n")

    E.Stop()
Example #30
def doFDR(options, args):

    # read input
    annotators = []
    for filename in args:
        infile = open(filename, "r")
        annotators.append(readAnnotator(infile))
        infile.close()

    do_filter = options.fdr_qvalue is not None

    extra_headers = set()
    for data, fdr, synonyms, input_files in annotators:
        for key, value in input_files.iteritems():
            extra_headers.add(key)
    extra_headers = sorted(list(extra_headers))

    # note: id used to be file
    options.stdout.write("id\tover\tcategory\tpvalue\tfold\tobserved\texpected\tci95low\tci95high\tstddev\tfdr\tqvalue\t%s\n" %
                         "\t".join(extra_headers))

    # apply filters and create diagnostic plots
    for filename, vv in zip(args, annotators):
        data, fdr, synonyms, input_files = vv

        ninput = len(data)

        E.info("processing %s with %i data points" % (filename, ninput))
        no_fdr = False

        if options.fdr_method in ("annotator", "annotator-estimate"):
            pvalues = fdr.keys()
            pvalues.sort()
            pvalues.reverse()
            for pvalue in pvalues:
                try:
                    d = fdr[pvalue]["Significant"]
                except KeyError:
                    continue

                if d.mObserved == 0:
                    E.info("no data after fdr")
                    break

                elif d.mAverage / d.mObserved < options.fdr_qvalue:
                    E.info("filtering with P-value of %f" % pvalue)
                    if do_filter:
                        data = [x for x in data if x.mPValue < pvalue]
                    for d in data:
                        if d.mPValue < pvalue:
                            d.mFDR = 1
                            d.mQValue = options.fdr_qvalue
                    break
                else:
                    E.warn("fdr could not be computed - compute more samples (at P = %f, actual fdr=%f)" %
                           (pvalue, d.mAverage / d.mObserved))
                    no_fdr = True

        if options.fdr_method == "estimate" or (options.fdr_method == "annotator-estimate" and no_fdr):

            E.info("estimating FDR from observed P-Values")
            pvalues = [x.mPValue for x in data]
            vlambda = numpy.arange(0, max(pvalues), 0.05)
            try:
                qvalues = Stats.doFDR(
                    pvalues, vlambda=vlambda, fdr_level=options.fdr_qvalue)
            except ValueError, msg:
                E.warn("fdr could not be computed - no output: %s" % msg)
                no_fdr = True
            else:
                for d, p, q in zip(data, qvalues.mPassed, qvalues.mQValues):
                    if p:
                        d.mFDR = 1
                    d.mQValue = q

                if do_filter:
                    data = [x[0] for x in zip(data, qvalues.mPassed) if x[1]]

        if do_filter and no_fdr:
            data = []

        nremoved = ninput - len(data)

        E.info("%s: %i data points left, %i removed" %
               (filename, len(data), nremoved))

        extra_values = []
        for key in extra_headers:
            if key in input_files:
                extra_values.append(input_files[key])
            else:
                extra_values.append("")

        extra_values = "\t".join(map(str, extra_values))

        for d in data:
            if d.mFoldChange < 1:
                code = "-"
            else:
                code = "+"

            try:
                id = re.search(options.regex_id, filename).groups()[0]
            except AttributeError:
                id = filename
            options.stdout.write("%s\t%s\t%s\t%e\t%6.4f\t%f\t%f\t%f\t%f\t%f\t%i\t%e\t%s\n" %
                                 (id,
                                  code,
                                  d.mAnnotation,
                                  d.mPValue,
                                  d.mFoldChange,
                                  d.mObserved,
                                  d.mExpected,
                                  d.mCI95[0],
                                  d.mCI95[1],
                                  d.mStdDev,
                                  d.mFDR,
                                  d.mQValue,
                                  extra_values))
Example #31
def pairwiseGOEnrichment(results_per_genelist, labels, test_ontology, go2info,
                         options):
    '''compute pairwise enrichment between sets.

    The purpose of this method is to find categories that are differentially
    enriched between a pair of gene lists.

    The test used below is Fisher's exact test on a 2x2 contingency table
    per category (a chi-squared test would also be appropriate).

    The assumption is that the background set is the same in all gene lists.

    The workflow is thus::

       for each combination of two gene lists:
           for each GO category:
               get counts in foreground, total counts of foreground
               compute chi-square enrichment output
               save P-value
           apply fdr - output significant differences.
    '''

    dicts = [dict(x) for x in results_per_genelist]

    PairResult = collections.namedtuple("PairResult",
                                        "goid set1 set2 counts1 total1 pvalue1 qvalue1 counts2 total2 pvalue2 qvalue2 pvalue qvalue description")

    outfile = getFileName(options,
                          go=test_ontology,
                          section='summary',
                          set="pairs")

    outfile.write(
        "set1\tset2\ttotal1\ttotal2\tshared\tskipped\ttested\tsignificant\tinsignificant\n")

    results = []

    total = len(dicts) * (len(dicts) - 1) / 2

    iteration = 0

    min_observed_counts = options.pairs_min_observed_counts

    for x, genelist1 in enumerate(sorted(dicts)):

        x_go_categories = set(genelist1.keys())
        for y, genelist2 in enumerate(sorted(dicts[:x])):

            iteration += 1
            if iteration % 10 == 0:
                E.info("iteration: %i/%i (%5.2f%%)" %
                       (iteration, total, 100.0 * iteration / total))

            y_go_categories = set(genelist2.keys())

            shared = x_go_categories.intersection(y_go_categories)

            c = E.Counter()

            for category in shared:
                c.shared += 1
                xx = genelist1[category]
                yy = genelist2[category]

                # discard all tests with few observations in the observed
                # counts
                if xx.mSampleCountsCategory < min_observed_counts and yy.mSampleCountsCategory < min_observed_counts:
                    c.skipped += 1
                    continue

                observed = (xx.mSampleCountsCategory, yy.mSampleCountsCategory)

                aa, bb, cc, dd = \
                    (xx.mSampleCountsCategory,
                     yy.mSampleCountsCategory,
                     xx.mSampleCountsTotal - xx.mSampleCountsCategory,
                     yy.mSampleCountsTotal - yy.mSampleCountsCategory)

                if cc == dd == 0:
                    c.skipped += 1
                    continue

                c.tested += 1

                fisher, pvalue = scipy.stats.fisher_exact(numpy.array(
                    ((aa, bb),
                     (cc, dd))))

                if pvalue < 0.05:
                    c.significant_pvalue += 1
                else:
                    c.insignificant_pvalue += 1

                results.append(PairResult._make((category,
                                                 labels[x],
                                                 labels[y],
                                                 xx.mSampleCountsCategory,
                                                 xx.mSampleCountsTotal,
                                                 xx.mPValue,
                                                 xx.mQValue,
                                                 yy.mSampleCountsCategory,
                                                 yy.mSampleCountsTotal,
                                                 yy.mPValue,
                                                 yy.mQValue,
                                                 pvalue,
                                                 1.0,
                                                 go2info[category].mDescription)))

            outfile.write("\t".join(map(str,
                                        (labels[x], labels[y],
                                         len(x_go_categories),
                                         len(y_go_categories),
                                         c.shared,
                                         c.skipped,
                                         c.tested,
                                         c.significant_pvalue,
                                         c.insignificant_pvalue))) + "\n")
    if options.output_filename_pattern:
        outfile.close()

    if options.fdr:
        pvalues = [x.pvalue for x in results]

        if options.qvalue_method == "storey":

            # compute fdr via Storey's method
            try:
                fdr_data = Stats.doFDR(pvalues)

            except ValueError as msg:
                E.warn("failure in q-value computation: %s" % msg)
                E.warn("reverting to Bonferroni correction")
                method = "bonf"
                fdr_data = Stats.FDRResult()
                l = float(len(pvalues))
                fdr_data.mQValues = [min(1.0, x * l) for x in pvalues]

            qvalues = fdr_data.mQValues
        else:
            qvalues = R['p.adjust'](pvalues, method=options.qvalue_method)

        # update qvalues
        results = [x._replace(qvalue=y) for x, y in zip(results, qvalues)]

    outfile = getFileName(options,
                          go=test_ontology,
                          section='pairs',
                          set="pairs")

    outfile.write("\t".join(PairResult._fields) + "\n")
    for result in results:
        outfile.write("\t".join(map(str, result)) + "\n")

    if options.output_filename_pattern:
        outfile.close()
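
The pairwise comparison above tests each shared category with Fisher's exact test on a 2x2 contingency table: counts inside and outside the category for each of the two gene lists. In isolation, with made-up counts:

import numpy
import scipy.stats

# rows: in category / outside category; columns: gene list 1 / gene list 2
table = numpy.array([[12, 5],
                     [88, 95]])
oddsratio, pvalue = scipy.stats.fisher_exact(table)
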
Example #32
def Collect(infile,
            with_headers=False,
            annotator_format=False,
            use_annotator_fdr=False,
            delims="",
            ignore="",
            max_pvalue=1.0,
            max_qvalue=None):
    """read input table."""

    data = []

    lines = [x for x in infile.readlines() if x[0] != "#"]

    if len(lines) == 0:
        return data

    if with_headers:
        del lines[0]

    if annotator_format:

        lines = [line for line in lines if not line.startswith("Iteration")]
        annotator_fdr = {}
        annotator_level = None
        for line in lines:
            if len(line) == 1:
                continue                  # skip trailing blank lines

            if line.startswith("--"):
                if line.startswith("-- False"):
                    annotator_level = float(
                        re.search("-- False Discovery summary for p-value (.+):", line).groups()[0])
                    annotator_fdr[annotator_level] = {}
                elif line.startswith("--  Category"):
                    pass
                else:
                    if re.search("insufficiently", line):
                        continue
                    dd = re.split("\s+", line[4:-1])
                    d = DataFDR()
                    d.mObserved, d.mAverage, d.mMedian, d.m95 = list(map(
                        float, dd[1:]))
                    annotator_fdr[annotator_level][dd[0]] = d
                continue
            else:
                if line[0] == "Z":
                    continue  # skip header
                if len(line[:-1].split('\t')) != 9:
                    continue  # HACK: accounts for a bug in Annotator output

                try:
                    (z, percentchange, pvalue, observed, expected, low95,
                     up95, stddev, description) = line[:-1].split('\t')[:9]
                except ValueError:
                    raise ValueError("# parsing error in line: %s" % line[:-1])

            d = DataPoint()
            d.mAnnotation = description
            d.mPValue = float(pvalue)
            d.mFoldChange = 1.0 + float(percentchange) / 100.0
            data.append(d)
    else:

        for line in lines:
            try:
                (code, goid, scount, stotal, spercent, bcount, btotal, bpercent, ratio,
                 pover, punder, goid, category, description) = line[:-1].split("\t")[:14]
            except ValueError:
                raise ValueError("# parsing error in line: %s" % line[:-1])

            if code == "+":
                p = pover
            else:
                p = punder

            d = DataPoint()
            d.mAnnotation = description
            d.mPValue = float(p)
            d.mFoldChange = float(spercent) / float(bpercent)
            data.append(d)

    # apply filters
    for c in delims:
        for d in data:
            d.mAnnotation = d.mAnnotation.split(c)[0]
    for c in ignore:
        for d in data:
            d.mAnnotation = d.mAnnotation.replace(c, '')

    ninput = len(data)
    no_fdr = False
    # apply filters

    if ninput > 0:
        if max_qvalue is not None:
            if use_annotator_fdr:
                pvalues = list(annotator_fdr.keys())
                pvalues.sort()
                pvalues.reverse()
                for pvalue in pvalues:
                    try:
                        d = annotator_fdr[pvalue]["Significant"]
                    except KeyError:
                        continue
                    if d.mObserved == 0:
                        E.info("no data remaining after fdr filtering")
                        data = []
                        break
                    elif d.mAverage / d.mObserved < max_qvalue:
                        E.info("filtering with P-value of %f" % pvalue)
                        data = [x for x in data if x.mPValue < pvalue]
                        break
                else:
                    E.warn("fdr could not be computed - compute more "
                           "samples (at P = %f, actual fdr=%f)" %
                           (pvalue, d.mAverage / d.mObserved))
                    no_fdr = True

            if no_fdr:
                if use_annotator_fdr:
                    E.info("estimating FDR from observed P-Values")

                pvalues = [x.mPValue for x in data]
                vlambda = numpy.arange(0, max(pvalues), 0.05)
                try:
                    qvalues = Stats.doFDR(
                        pvalues, vlambda=vlambda, fdr_level=max_qvalue)
                except ValueError as msg:
                    E.warn(
                        "fdr could not be computed - no filtering: %s" % msg)
                    no_fdr = True
                else:
                    data = [x[0] for x in zip(data, qvalues.mPassed) if x[1]]
        elif max_pvalue is not None:
            data = [x for x in data if x.mPValue < max_pvalue]

    if no_fdr:
        data = []

    nremoved = ninput - len(data)

    return data, nremoved, no_fdr
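A hypothetical call to Collect(); the filename and thresholds below are placeholders, not taken from the example:

# assumes an Annotator output file; 'annotator_output.txt' is a placeholder
with open("annotator_output.txt") as infile:
    data, nremoved, no_fdr = Collect(infile,
                                     annotator_format=True,
                                     use_annotator_fdr=True,
                                     max_qvalue=0.05)
print("kept %i annotations, removed %i" % (len(data), nremoved))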
Exemple #33
0
                else:
                    for c in options.columns:                
                        for r in range(nrows):
                            if isinstance(table[c][r], float) and \
                                   table[c][r] < boundary:
                                table[c][r] = new_value

            elif method == "fdr":
                pvalues = []
                for c in options.columns: pvalues.extend( table[c] )

                assert max(pvalues) <= 1.0, "pvalues > 1 in table"
                assert min(pvalues) >= 0, "pvalue < 0 in table"

                # convert to str to avoid test for float downstream
                qvalues = list(map(str, Stats.adjustPValues( pvalues, method = options.fdr_method )))

                x = 0
                for c in options.columns: 
                    table[c] = qvalues[x:x+nrows]
                    x += nrows

            elif method == "normalize-by-table":

                other_table_name = options.parameters[0]
                del options.parameters[0]
                other_fields, other_table  = CSV.ReadTable( open(other_table_name, "r"), with_header = options.has_headers, as_rows = False )

                # convert all values to float
                for c in options.columns:
                    for r in range(nrows):
Exemple #34
0
def decorator_median_length(intervals, start, end, contig, fasta):
    """compute length distribution."""
    d = Stats.DistributionalParameters([x[1] - x[0] for x in intervals])
    return d['median'], str(d)
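What the decorator returns can be sanity-checked with plain numpy; the intervals below are made up:

import numpy

# each interval is (start, end); the lengths feed the distribution
intervals = [(0, 10), (5, 25), (100, 103)]
lengths = [end - start for start, end in intervals]
print(numpy.median(lengths))  # matches d['median'] above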
Exemple #35
0
                if options.mode == "pairs":
                    reference_result = last_result
                    reference_id = x - 1
                elif options.mode == "1xn":
                    reference_result = first_result
                    reference_id = 0

                if reference_result.mNumParameters >= result.mNumParameters:
                    if options.loglevel >= 1:
                        options.stdlog.write("number of parameters of full model not increased (null=%i, full=%i).\n" % (
                            reference_result.mNumParameters, result.mNumParameters))
                    continue

                lrt = Stats.doLogLikelihoodTest(
                    result.mLogLikelihood, result.mNumParameters,
                    reference_result.mLogLikelihood, reference_result.mNumParameters,
                    options.significance_threshold)

                if lrt.mPassed:
                    c = "passed"
                else:
                    c = "failed"

                options.stdout.write("%s%i\t%i\t%s\t%f\t%i\t%f\t%i\t%5.2e\n" % (prefix_row, reference_id, x, c,
                                                                                lrt.mFullLogLikelihood, lrt.mFullNumParameters,
                                                                                lrt.mNullLogLikelihood, lrt.mNullNumParameters,
                                                                                lrt.mProbability,
                                                                                ))

                last_result = result
                x += 1
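A sketch of the likelihood-ratio test that Stats.doLogLikelihoodTest() performs: the statistic D = 2 * (lnL_full - lnL_null) is compared against a chi-squared distribution with df = k_full - k_null degrees of freedom. The numbers below are illustrative only:

from scipy.stats import chi2

lnl_full, k_full = -1200.5, 5    # full model: higher likelihood, more parameters
lnl_null, k_null = -1210.2, 3    # nested null model

D = 2.0 * (lnl_full - lnl_null)  # 19.4
df = k_full - k_null             # 2
pvalue = chi2.sf(D, df)
passed = pvalue < 0.05           # analogous to lrt.mPassed above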
Exemple #36
0
def computeFDRs(go_results,
                foreground,
                background,
                options,
                test_ontology,
                gene2go,
                go2info):

    pairs = sorted(go_results.mResults.items())

    E.info("calculating the FDRs using method `%s`" % options.qvalue_method)

    samples = None

    observed_min_pvalues = [min(x[1].mProbabilityOverRepresentation,
                                x[1].mProbabilityUnderRepresentation) for x in pairs]

    fdrs = {}

    method = options.qvalue_method

    if options.qvalue_method == "storey":

        # compute fdr via Storey's method
        try:
            fdr_data = Stats.doFDR(observed_min_pvalues)

        except ValueError as msg:
            E.warn("failure in q-value computation: %s" % msg)
            E.warn("reverting to Bonferroni correction")
            method = "bonf"
            fdr_data = Stats.FDRResult()
            l = float(len(observed_min_pvalues))
            fdr_data.mQValues = [min(1.0, x * l) for x in observed_min_pvalues]

        for pair, qvalue in zip(pairs, fdr_data.mQValues):
            fdrs[pair[0]] = (qvalue, 1.0, 1.0)

    elif options.qvalue_method == "empirical":
        assert options.sample > 0, "requiring a sample size of > 0"

        #######################################################################
        # sampling
        # for each GO-category:
        # get maximum and minimum counts in x samples -> calculate minimum/maximum significance
        # get average and stdev counts in x samples -> calculate z-scores for
        # test set
        samples, simulation_min_pvalues = getSamples(gene2go,
                                                     foreground,
                                                     background,
                                                     options,
                                                     test_ontology,
                                                     go2info)

        # compute P-values from sampling
        observed_min_pvalues.sort()
        observed_min_pvalues = numpy.array(observed_min_pvalues)

        sample_size = options.sample

        for k, v in pairs:

            if k in samples:
                s = samples[k]
            else:
                raise KeyError("category %s not in samples" % k)

            # calculate values for z-score
            if s.mStddev > 0:
                zscore = abs(
                    float(v.mSampleCountsCategory) - s.mMean) / s.mStddev
            else:
                zscore = 0.0

            #############################################################
            # FDR:
            # For each p-Value p at node n:
            #   a = average number of nodes in each simulation run with P-Value < p
            #           this can be obtained from the array of all p-values and all nodes
            #           simply divided by the number of samples.
            #      aka: expfpos=experimental false positive rate
            #   b = number of nodes in observed data, that have a P-Value of less than p.
            #      aka: pos=positives in observed data
            #   fdr = a/b
            pvalue = v.mPValue

            # calculate values for FDR:
            # nfdr = number of entries with P-Value better than node.
            a = 0
            while a < len(simulation_min_pvalues) and \
                    simulation_min_pvalues[a] < pvalue:
                a += 1
            a = float(a) / float(sample_size)
            b = 0
            while b < len(observed_min_pvalues) and \
                    observed_min_pvalues[b] < pvalue:
                b += 1

            if b > 0:
                fdr = min(1.0, float(a) / float(b))
            else:
                fdr = 1.0

            fdrs[k] = (fdr, a, b)
    else:
        qvalues = R['p.adjust'](
            observed_min_pvalues, method=options.qvalue_method)
        fdr_data = Stats.FDRResult()
        fdr_data.mQValues = list(qvalues)
        for pair, qvalue in zip(pairs, fdr_data.mQValues):
            fdrs[pair[0]] = (qvalue, 1.0, 1.0)

    return fdrs, samples, method
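The two while-loops in the "empirical" branch count entries strictly below each p-value in sorted arrays; numpy.searchsorted with side="left" yields the same counts directly. A sketch with made-up inputs:

import numpy

simulation_min_pvalues = numpy.sort([0.001, 0.003, 0.04, 0.2])
observed_min_pvalues = numpy.sort([0.002, 0.01, 0.05])
sample_size = 2
pvalue = 0.05

# a: average simulated hits below pvalue; b: observed hits below pvalue
a = numpy.searchsorted(simulation_min_pvalues, pvalue, side="left") / float(sample_size)
b = int(numpy.searchsorted(observed_min_pvalues, pvalue, side="left"))
fdr = min(1.0, a / b) if b > 0 else 1.0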
Exemple #37
0
def loadGOs(infiles, outfile, tablename):
    '''import GO results into a single table.

    This method also computes a global QValue over all
    tracks, genesets and annotation sets.

    Arguments
    ---------
    infiles : string
       Output files of several runGO analyses
    outfile : string
       Output filename, contains log information
    tablename : string
       Table name for storing results.
    '''

    header = False

    tempf1 = P.getTempFile()

    pvalues = []

    for infile in infiles:
        indir = infile + ".dir"

        if not os.path.exists(indir):
            continue

        track, geneset, annotationset = re.search(r"^(\S+)_vs_(\S+)\.(\S+)",
                                                  infile).groups()

        for filename in glob.glob(os.path.join(indir, "*.overall")):
            for line in open(filename, "r"):
                if line.startswith("#"):
                    continue
                data = line[:-1].split("\t")
                if line.startswith("code"):
                    if header:
                        continue
                    tempf1.write("track\tgeneset\tannotationset\t%s" % line)
                    header = True
                    assert data[10] == "pover" and data[11] == "punder", \
                        "format error, expected pover-punder, got %s-%s" % (
                            data[10], data[11])
                    continue
                tempf1.write("%s\t%s\t%s\t%s" %
                             (track, geneset, annotationset, line))
                pvalues.append(min(float(data[10]), float(data[11])))

    tempf1.close()

    E.info("analysing %i pvalues" % len(pvalues))
    fdr = Stats.doFDR(pvalues)
    E.info("got %i qvalues" % len(fdr.mQValues))
    qvalues = ["global_qvalue"] + fdr.mQValues

    tempf2 = P.getTempFile()

    for line, qvalue in zip(open(tempf1.name, "r"), qvalues):
        tempf2.write("%s\t%s\n" % (line[:-1], str(qvalue)))

    tempf2.close()

    P.load(tempf2.name,
           outfile,
           tablename=tablename,
           options="--allow-empty-file "
           "--add-index=category "
           "--add-index=track,geneset,annotationset "
           "--add-index=geneset "
           "--add-index=annotationset "
           "--add-index=goid ")

    os.unlink(tempf1.name)
    os.unlink(tempf2.name)
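loadGOs() deliberately pools p-values across all tracks, genesets and annotation sets before a single FDR pass; correcting each file separately would use a different number of tests per file and yield different q-values. A sketch of that difference using statsmodels, which is an assumption of this sketch (loadGOs itself uses Stats.doFDR):

from statsmodels.stats.multitest import multipletests

file_a = [0.001, 0.04]   # made-up p-values from one run
file_b = [0.03, 0.8]     # ... and from another

pooled = multipletests(file_a + file_b, method="fdr_bh")[1]
separate = (list(multipletests(file_a, method="fdr_bh")[1]) +
            list(multipletests(file_b, method="fdr_bh")[1]))
# pooled and separate disagree, e.g. on the 0.04 entry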
Exemple #38
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_test.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      help="method to use [t-test=t-test,wilcox=wilcox]",
                      choices=("t-test", "wilcox"))
    parser.add_option("-1",
                      "--infile",
                      dest="filename_input",
                      type="string",
                      help="input filename with vector of values.")
    parser.add_option("-2",
                      "--infile2",
                      dest="filename_input2",
                      type="string",
                      help="input filename with vector of values.")
    parser.add_option("--header",
                      dest="header",
                      type="string",
                      help="""header of value column [default=%default].""")

    parser.set_defaults(
        method="t-test",
        filename_input=None,
        header="value",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.filename_input:
        infile = open(options.filename_input, "r")
    else:
        infile = sys.stdin

    values, errors = IOTools.ReadList(infile, map_function=float)
    if options.filename_input:
        infile.close()

    if errors:
        E.warn("errors in input: %s" % ";".join(map(str, errors)))

    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            key, value = arg.split("=")
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.filename_input2:
        infile = open(options.filename_input2, "r")
        values2, errors2 = IOTools.ReadList(infile, map_function=float)
        infile.close()
    else:
        values2 = None

    stat = Stats.Summary(values)

    power, diff_at_power95 = None, None
    if options.method == "t-test":
        if values2:
            result = R.t_test(values, values2, *xargs, **kwargs)
        else:
            result = R.t_test(values, *xargs, **kwargs)
            # compute power of test
            power = R.power_t_test(n=len(values),
                                   delta=abs(stat["mean"]),
                                   sd=stat["stddev"],
                                   sig_level=0.05)['power']
            diff_at_power95 = R.power_t_test(n=len(values),
                                             power=0.95,
                                             sd=stat["stddev"],
                                             sig_level=0.05)['delta']

    if options.method == "wilcox":
        result = R.wilcox_test(values, *xargs, **kwargs)

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    for key, value in sorted(result.items()):
        if key == "data.name":
            continue
        if key == "p.value":
            options.stdout.write("%s\t%5.2e\n" % (str(key), value))
        else:
            options.stdout.write("%s\t%s\n" % (str(key), str(value)))

    for key, value in stat.items():
        options.stdout.write("%s\t%s\n" % (str(key), str(value)))

    if power:
        options.stdout.write("1-power\t%5.2e\n" % (1.0 - power))
        options.stdout.write("diff_at_power95\t%f\n" % diff_at_power95)

    E.Stop()
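The power calculation above goes through R's power.t.test. A rough equivalent with statsmodels, which is an assumption of this sketch (the script itself does not use statsmodels); effect_size is Cohen's d, i.e. delta / sd:

from statsmodels.stats.power import TTestPower

n, mean, sd = 50, 0.8, 2.0   # illustrative sample
power = TTestPower().power(effect_size=mean / sd, nobs=n, alpha=0.05)
print("1-power\t%5.2e" % (1.0 - power))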
Exemple #39
0
def analysePolyphen(infile, outfile):
    '''compute enrichment of SNPs within genes
    and deleterious SNPs within SNPs within genes.

    del: enrichment of deleterious snps within snps per gene
    len: enrichment of snps within genes
    com: enrichment of deleterious snps within gene
    '''

    table = P.toTable(infile)
    tablename_map = "polyphen_map"

    dbhandle = connect()
    cc = dbhandle.cursor()

    statement = '''
        SELECT i.gene_id,
               COUNT(DISTINCT map.locus_id) as nsnps, 
               COUNT(DISTINCT case t.prediction when 'possiblydamaging' then map.locus_id when 'probablydamaging' then map.locus_id else NULL end) AS ndeleterious,
               MAX(s.length)
               FROM %(table)s as t, 
                    %(tablename_map)s as map, 
                    annotations.protein_stats as s,
                    annotations.transcript_info as i 
        WHERE map.snp_id = t.snp_id AND 
              i.transcript_id = map.transcript_id AND
              s.protein_id = map.protein_id
        GROUP BY i.gene_id
     ''' % locals()

    data = cc.execute(statement).fetchall()

    statement = '''SELECT DISTINCT i.gene_id, MAX(s.length) 
                   FROM annotations.transcript_info AS i, annotations.protein_stats AS s 
                   WHERE s.protein_id = i.protein_id 
                   GROUP BY i.gene_id'''
    gene_ids = cc.execute(statement).fetchall()

    total_nsnps = sum([x[1] for x in data])
    total_ndel = sum([x[2] for x in data])
    total_length = sum([x[1] for x in gene_ids])
    del_p = float(total_ndel) / total_nsnps
    len_p = float(total_nsnps) / total_length
    com_p = float(total_ndel) / total_length

    E.info("del: background probability: %i/%i = %f" %
           (total_ndel, total_nsnps, del_p))
    E.info("len: background probability: %i/%i = %f" %
           (total_nsnps, total_length, len_p))
    E.info("com: background probability: %i/%i = %f" %
           (total_ndel, total_length, com_p))

    outf = open(outfile, "w")
    outf.write("\t".join(("gene_id", "code",
                          "length", "nsnps", "ndel",
                          "del_p", "del_pvalue", "del_qvalue",
                          "len_p", "len_pvalue", "len_qvalue",
                          "com_p", "com_pvalue", "com_qvalue", )) + "\n")

    del_pvalues, len_pvalues, com_pvalues = [], [], []
    for gene_id, nsnps, ndel, length in data:

        # use -1, because I need P( x >= X)
        # sf = 1 - cdf and cdf = P( x <= X ), thus sf = 1 - P( x <= X ) = P (x
        # > X ).
        del_pvalues.append(scipy.stats.binom.sf(ndel - 1, nsnps, del_p))
        len_pvalues.append(
            scipy.stats.binom.sf(nsnps - 1, int(round(length)), len_p))
        com_pvalues.append(
            scipy.stats.binom.sf(ndel - 1, int(round(length)), com_p))

    if len(del_pvalues) > 10:
        del_qvalues = Stats.doFDR(del_pvalues).mQValues
    else:
        E.warn("no FDR computed for del")
        del_qvalues = del_pvalues

    if len(len_pvalues) > 10:
        len_qvalues = Stats.doFDR(len_pvalues).mQValues
    else:
        E.warn("no FDR computed for len")
        len_qvalues = len_pvalues

    if len(com_pvalues) > 10:
        com_qvalues = Stats.doFDR(com_pvalues).mQValues
    else:
        E.warn("no FDR computed for com")
        com_qvalues = com_pvalues

    fdr = PARAMS["polyphen_fdr"]

    found = set()

    for a, del_pvalue, del_qvalue, len_pvalue, len_qvalue, com_pvalue, com_qvalue in \
            zip(data,
                del_pvalues, del_qvalues,
                len_pvalues, len_qvalues,
                com_pvalues, com_qvalues,
                ):
        gene_id, nsnps, ndel, length = a
        found.add(gene_id)

        del_p = float(ndel) / nsnps
        len_p = float(nsnps) / length

        code = "".join([str(int(x < fdr))
                        for x in (del_qvalue, len_qvalue, com_qvalue)])

        outf.write("\t".join((gene_id,
                              code,
                              "%i" % int(round(length)),
                              "%i" % int(nsnps),
                              "%i" % int(ndel),
                              "%6.4f" % del_p,
                              "%6.4g" % del_pvalue,
                              "%6.4g" % del_qvalue,
                              "%6.4f" % len_p,
                              "%6.4g" % len_pvalue,
                              "%6.4g" % len_qvalue,
                              "%6.4f" % com_p,
                              "%6.4g" % com_pvalue,
                              "%6.4g" % com_qvalue,
                              )) + "\n")

    # add missing genes:
    code = "---"
    for gene_id, length in gene_ids:
        if gene_id in found:
            continue
        outf.write("\t".join((gene_id,
                              code,
                              "%i" % int(round(length)),
                              "%i" % 0,
                              "%i" % 0,
                              "%6.4f" % 0,
                              "%6.4g" % 1,
                              "%6.4g" % 1,
                              "%6.4f" % 0,
                              "%6.4g" % 1,
                              "%6.4g" % 1,
                              "%6.4f" % 0,
                              "%6.4g" % 1,
                              "%6.4g" % 1,
                              )) + "\n")

    outf.close()
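The comment in analysePolyphen() on sf(ndel - 1, ...) generalizes: scipy's survival function sf(k) is P(X > k), so the enrichment tail P(X >= k) is sf(k - 1). A quick check with made-up numbers:

from scipy.stats import binom

k, n, p = 5, 100, 0.02
pvalue = binom.sf(k - 1, n, p)   # P(X >= 5)
assert abs(pvalue - (1.0 - binom.cdf(k - 1, n, p))) < 1e-12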
Exemple #40
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None: argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: codemls2tsv.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("--methods",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("summary-numbers", "jalview",
                               "positive-site-table", "positive-site-list",
                               "count-positive-sites"),
                      help="methods for analysis.")

    parser.add_option("--selection-mode",
                      dest="selection_mode",
                      type="choice",
                      choices=("all", "consistent", "emes"),
                      help="how to select positive sites.")

    parser.add_option("--prefix",
                      dest="prefix",
                      type="string",
                      help="prefix for rows.")

    parser.add_option("--pattern-input-filenames",
                      dest="pattern_input_filenames",
                      type="string",
                      help="input pattern.")

    parser.add_option(
        "--filter-probability",
        dest="filter_probability",
        type="float",
        help="threshold for probability above which to include "
        "positive sites [default=%default].")

    parser.add_option(
        "--filter-omega",
        dest="filter_omega",
        type="float",
        help="threshold for omega above which to include "
        "positive sites [default=%default].")

    parser.add_option("--models",
                      dest="models",
                      type="string",
                      help="restrict output to set of site specific models.")

    parser.add_option("--analysis",
                      dest="analysis",
                      type="string",
                      help="restrict output to set of analysis [beb|neb].")

    parser.add_option("--significance-threshold",
                      dest="significance_threshold",
                      type="float",
                      help="significance threshold for log-likelihood test.")

    parser.add_option("--filter-mali",
                      dest="filter_mali",
                      type="choice",
                      choices=("none", "gaps"),
                      help="filter by mali to remove gapped positions.")

    parser.add_option(
        "--filename-mali",
        dest="filename_mali",
        type="string",
        help="filename with multiple alignment used for calculating "
        "sites - used for filtering")

    parser.add_option(
        "--filename-map-mali",
        dest="filename_map_mali",
        type="string",
        help="filename with multiple alignment to map sites onto.")

    parser.add_option(
        "--jalview-titles",
        dest="jalview_titles",
        type="string",
        help="comma separated list of jalview annotation titles.")

    parser.add_option("--jalview-symbol",
                      dest="jalview_symbol",
                      type="string",
                      help="symbol to use in jalview.")

    parser.set_defaults(
        methods=[],
        prefix=None,
        filter_probability=0,
        filter_omega=0,
        models="",
        analysis="",
        significance_threshold=0.05,
        selection_mode="consistent",
        filename_mali=None,
        filename_map_mali=None,
        jalview_symbol="*",
        jalview_titles="",
        filter_mali=None,
    )

    (options, args) = E.Start(parser)

    if options.jalview_titles:
        options.jalview_titles = options.jalview_titles.split(",")
    else:
        options.jalview_titles = args

    options.models = options.models.split(",")
    options.analysis = options.analysis.split(",")

    for a in options.analysis:
        if a not in ("beb", "neb"):
            raise "unknown analysis section: '%s', possible values are 'beb' and/or 'neb'" % a

    for a in options.models:
        if a not in ("8", "2", "3"):
            raise "unknown model: '%s', possible values are 2, 3, 8" % a

    codeml = WrapperCodeML.CodeMLSites()

    ## filter and extract functions
    filter_f = lambda x: x.mProbability >= options.filter_probability and x.mOmega >= options.filter_omega
    extract_f = lambda x: x.mResidue

    ## read multiple results
    results = []
    ninput, noutput, nskipped = 0, 0, 0

    headers = []
    for f in args:
        ninput += 1
        try:
            results.append(codeml.parseOutput(open(f, "r").readlines()))
        except WrapperCodeML.UsageError:
            if options.loglevel >= 1:
                options.stdlog.write("# no input from %s\n" % f)
            nskipped += 1
            continue
        noutput += 1
        headers.append(f)

    ## map of nested model (key) to more general model
    map_nested_models = {'8': '7', '2': '1', '3': '0'}

    if options.filename_mali:
        mali = Mali.Mali()
        mali.readFromFile(open(options.filename_mali, "r"))
    else:
        mali = None

    ###############################################################
    ###############################################################
    ###############################################################
    ## use multiple alignment to map residues to a reference mali
    ## or a sequence.
    ###############################################################
    if options.filename_map_mali:

        if not mali:
            raise "please supply the input multiple alignment, if residues are to be mapped."

        ## translate the alignments
        def translate(s):
            sequence = s.mString
            seq = []
            for codon in [
                    sequence[x:x + 3] for x in range(0, len(sequence), 3)
            ]:
                aa = Genomics.MapCodon2AA(codon)
                seq.append(aa)

            s.mString = "".join(seq)

        tmali = Mali.Mali()
        tmali.readFromFile(open(options.filename_mali, "r"))
        tmali.apply(translate)

        tmap_mali = Mali.Mali()
        tmap_mali.readFromFile(open(options.filename_map_mali, "r"))

        if tmap_mali.getAlphabet() == "na":
            tmap_mali.apply(translate)

        map_old2new = alignlib_lite.py_makeAlignmentVector()

        mali1 = alignlib_lite.py_makeProfileFromMali(convertMali2Mali(tmali))

        if tmap_mali.getLength() == 1:

            s = tmap_mali.values()[0].mString
            mali2 = alignlib_lite.py_makeSequence(s)
            ## see if you can find an identical subsequence and then align to thisD
            for x in tmali.values():
                if s in re.sub("[- .]+", "", x.mString):
                    mali1 = alignlib_lite.py_makeSequence(x.mString)
                    break
        else:
            mali2 = alignlib_lite.py_makeProfileFromMali(
                convertMali2Mali(tmap_mali))

        alignator = alignlib_lite.py_makeAlignatorDPFull(
            alignlib_lite.py_ALIGNMENT_LOCAL, -10.0, -2.0)
        alignator.align(map_old2new, mali1, mali2)

        consensus = tmap_mali.getConsensus()

        if options.loglevel >= 4:
            options.stdlog.write("# alphabet: %s\n" % tmap_mali.getAlphabet())
            options.stdlog.write("# orig  : %s\n" % tmali.getConsensus())
            options.stdlog.write("# mapped: %s\n" % consensus)
            options.stdlog.write("# alignment: %s\n" % map_old2new.Write())
    else:
        map_old2new = None

    for method in options.methods:

        if method == "summary-numbers":

            options.stdlog.write( \
"""# Numbers of positive sites.
#
# The consistent row/column contains positive sites that are significant
# (above thresholds for probability and omega) for all models/analysis
# that have been selected (label: cons).
#
# The log-likelihood ratio test is performed for model pairs, depending
# on the output chosen.
# Significance threshold: %6.4f
# The pairs are 8 versus 7 and 2 versus 1 and 3 versus 0.
#
""" % options.significance_threshold )

            ## write header
            if options.prefix: options.stdout.write("prefix\t")

            options.stdout.write("method\tnseq\t")
            h = []
            for model in options.models:
                for analysis in options.analysis:
                    h.append("%s%s" % (analysis, model))
                h.append("p%s" % (model))
                h.append("df%s" % (model))
                h.append("chi%s" % (model))
                h.append("lrt%s" % (model))

            options.stdout.write("\t".join(h))
            options.stdout.write("\tcons\tpassed\tfilename\n")

            nmethod = 0

            consistent_cols = [None for x in range(len(options.analysis))]
            passed_tests = {}
            for m in options.models:
                passed_tests[m] = 0

            for result in results:

                row_consistent = None

                if options.prefix:
                    options.stdout.write("%s" % (options.prefix))

                options.stdout.write("%i" % nmethod)
                options.stdout.write("\t%i" % (result.mNumSequences))

                npassed = 0

                for model in options.models:

                    sites = result.mSites[model]

                    ## do significance test
                    full_model, null_model = model, map_nested_models[model]

                    lrt = Stats.doLogLikelihoodTest(
                        result.mSites[full_model].mLogLikelihood,
                        result.mSites[full_model].mNumParameters,
                        result.mSites[null_model].mLogLikelihood,
                        result.mSites[null_model].mNumParameters,
                        options.significance_threshold)

                    x = 0
                    for analysis in options.analysis:

                        if analysis == "neb":
                            s = set(
                                map(
                                    extract_f,
                                    filter(filter_f,
                                           sites.mNEB.mPositiveSites)))

                        elif analysis == "beb":
                            s = set(
                                map(
                                    extract_f,
                                    filter(filter_f,
                                           sites.mBEB.mPositiveSites)))

                        options.stdout.write("\t%i" % (len(s)))

                        if not lrt.mPassed:
                            s = set()

                        if row_consistent is None:
                            row_consistent = s
                        else:
                            row_consistent = row_consistent.intersection(s)

                        if consistent_cols[x] is None:
                            consistent_cols[x] = s
                        else:
                            consistent_cols[x] = consistent_cols[
                                x].intersection(s)

                        x += 1

                    if lrt.mPassed:
                        c = "passed"
                        passed_tests[model] += 1
                        npassed += 1
                    else:
                        c = "failed"

                    options.stdout.write("\t%5.2e\t%i\t%5.2f\t%s" %\
                                         (lrt.mProbability,
                                          lrt.mDegreesFreedom,
                                          lrt.mChiSquaredValue,
                                          c))

                options.stdout.write(
                    "\t%i\t%i\t%s\n" %
                    (len(row_consistent), npassed, headers[nmethod]))

                nmethod += 1

            if options.prefix:
                options.stdout.write("%s\t" % options.prefix)

            options.stdout.write("cons")

            row_consistent = None
            total_passed = 0
            for model in options.models:

                x = 0

                for analysis in options.analysis:

                    s = consistent_cols[x]
                    if s is None:
                        s = set()

                    options.stdout.write("\t%i" % (len(s)))

                    if row_consistent is None:
                        row_consistent = s
                    else:
                        row_consistent = row_consistent.intersection(s)

                    x += 1

                options.stdout.write("\tna\t%i" % passed_tests[model])
                total_passed += passed_tests[model]

            options.stdout.write("\t%i\t%i\n" %
                                 (len(row_consistent), total_passed))

        elif method == "jalview":

            options.stdout.write("JALVIEW_ANNOTATION\n")
            options.stdout.write("# Created: %s\n\n" %
                                 (time.asctime(time.localtime(time.time()))))

            l = 1
            x = 0
            for result in results:

                sites, significance = selectPositiveSites(
                    [result], options.selection_mode, options, mali)

                codes = [""] * result.mLength

                # advance the title index even if a result has no positive
                # sites, so options.jalview_titles stays aligned with results
                if len(sites) == 0:
                    x += 1
                    continue

                for site in sites:
                    codes[site - 1] = options.jalview_symbol

                options.stdout.write(
                    "NO_GRAPH\t%s\t%s\n" %
                    (options.jalview_titles[x], "|".join(codes)))
                x += 1

        elif method == "count-positive-sites":

            sites, significance = selectPositiveSites(results,
                                                      options.selection_mode,
                                                      options, mali)

            options.stdout.write("%i\n" % (len(sites)))

        elif method in ("positive-site-table", ):

            sites, significance = selectPositiveSites(results,
                                                      options.selection_mode,
                                                      options, mali)

            headers = ["site", "P"]
            if map_old2new:
                headers.append("mapped")
                headers.append("Pm")

            options.stdout.write("\t".join(headers) + "\n")

            sites = list(sites)
            sites.sort()
            nmapped, nunmapped = 0, 0
            for site in sites:
                values = [site, "%6.4f" % significance[site]]

                if map_old2new:
                    r = map_old2new.mapRowToCol(site)
                    if r == 0:
                        values.append("na")
                        values.append("")
                        nunmapped += 1
                        if options.loglevel >= 2:
                            options.stdlog.write("# unmapped residue: %i\n" %
                                                 site)
                    else:
                        values.append(r)
                        values.append(consensus[r - 1])
                        nmapped += 1

                options.stdout.write("\t".join(map(str, (values))) + "\n")

            if options.loglevel >= 1:
                options.stdlog.write(
                    "# sites: ninput=%i, noutput=%i, nskipped=%i\n" %
                    (len(sites), nmapped, nunmapped))

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
Exemple #41
0
def decorator_median_length(intervals, start, end, contig, fasta):
    """compute length distribution."""
    d = Stats.DistributionalParameters(map(lambda x: x[1] - x[0], intervals))
    return d['median'], str(d)
Exemple #42
0
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser( version = "%prog version: $Id: run_nubiscan.py 2861 2010-02-23 17:36:32Z andreas $", usage = globals()["__doc__"] )

    parser.add_option("-i", "--iterations", dest="iterations", type="int",
                      help="number of iterations for sampling [default=%default]."  )

    parser.add_option("-q", "--qvalue", dest="qvalue_threshold", type="float",
                      help="qvalue threshold [default=%default]."  )

    parser.add_option("--without-combine", dest="combine", action = "store_false",
                      help="combine overlapping motifs [default=%default]."  )

    parser.add_option("-f", "--fdr-control", dest="fdr_control", type="choice",
                      choices = ("per-sequence", "all", "xall"),
                      help="qvalue threshold [default=%default]."  )

    parser.add_option("-m", "--motif", dest="motif", type="choice",
                      choices=("rxrvdr", "rxrvdr1", "rxrvdr2", "nr"),
                      help="qvalue threshold [default=%default]."  )

    parser.add_option("-a", "--arrangements", dest="arrangements", type="string",
                      help ="',' separated list of repeat arrangements [default=%default]")

    parser.add_option("-x", "--mask", dest="mask", type="choice",
                      choices=("dust","repeatmasker"),
                      help ="mask sequences before scanning [default=%default]")

    parser.add_option("--output-stats", dest="output_stats", action = "store_true",
                      help="output stats [default=%default]."  )

    parser.add_option("--add-sequence", dest="add_sequence", action = "store_true",
                      help="add sequence information [default=%default]."  )

    parser.set_defaults(
        iterations = 100,
        qvalue_threshold = 0.05,
        motif = "rxrvdr",
        fdr_control = "all",
        combine = True,
        arrangements = None,
        mask = None,
        output_stats = False,
        add_sequence = False,
        )

    ## add common options (-h/--help, ...) and parse command line 
    (options, args) = E.Start( parser, argv = argv, add_output_options = True )

    ## do sth
    ninput, nskipped, noutput = 0, 0, 0

    if options.arrangements is None:
        options.arrangements = [ "DR%s" % x for x in range(0,15) ] + [ "ER%s" % x for x in range(0,15) ]
    else:
        options.arrangements = options.arrangements.split(",")
        
    options.stdout.write( "%s" % "\t".join(Nubiscan.NubiscanMatch._fields) )
    if options.add_sequence: options.stdout.write( "\tsequence" )
    options.stdout.write("\n")

    if options.motif == 'nr': sense_matrix = NR
    elif options.motif == "rxrvdr": sense_matrix = RXRVDR
    elif options.motif == "rxrvdr1": sense_matrix = RXRVDR1
    elif options.motif == "rxrvdr2": sense_matrix = RXRVDR2
    else:
        raise ValueError("unknown matrix %s" % options.motif)

    if options.fdr_control == "all":

        seqs = list(FastaIterator.iterate(options.stdin))

        if options.mask:
            masked_seqs = maskSequences( [x.sequence for x in seqs], options.mask )
        else:
            masked_seqs = [x.sequence for x in seqs]
            
        ninput = len(seqs)
        map_id2title = dict( enumerate( [re.sub(r"\s.*", "", x.title) for x in seqs] ) )
        matcher = Nubiscan.MatcherRandomisationSequences( sense_matrix,
                                                          samples = options.iterations )
        
        results = matcher.run( masked_seqs,
                               options.arrangements,
                               qvalue_threshold = options.qvalue_threshold )

        if options.combine:
            results =  Nubiscan.combineMotifs( results )
        
        for r in results:

            if r.alternatives:
                alternatives = ",".join( [x.arrangement for x in r.alternatives ] )
            else:
                alternatives = ""

            options.stdout.write( "\t".join( ( 
                map_id2title[r.id],
                "%i" % r.start,
                "%i" % r.end,
                r.strand,
                r.arrangement,
                "%6.4f" % r.score,
                "%6.4f" % r.zscore,
                "%6.4e" % r.pvalue,
                "%6.4e" % r.qvalue,
                alternatives) ) )

            if options.add_sequence:
                s = masked_seqs[int(r.id)][r.start:r.end]
                if r.strand == "-": s = Genomics.complement( s )
                s = s[:6].upper() + s[6:-6].lower() + s[-6:].upper()
                options.stdout.write( "\t%s" % s )
                
            options.stdout.write("\n")
            noutput += 1

        # output stats
        if options.output_stats:
            outfile = E.openOutputFile( "fdr" )
            outfile.write("bin\thist\tnobserved\n" )
            for bin, hist, nobs in zip(matcher.bin_edges, matcher.hist, matcher.nobservations):
                outfile.write( "%f\t%f\t%f\n" % (bin, hist, nobs))
            outfile.close()


    elif options.fdr_control == "xall":

        matcher = Nubiscan.MatcherRandomisationSequence( sense_matrix,
                                                         samples = options.iterations )
    

        # collect all results
        matches = []
        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            mm = matcher.run( seq.sequence,
                              options.arrangements,
                              qvalue_threshold = None )
            for m in mm:
                matches.append( m._replace( sequence = seq.title ) )

        # estimate qvalues for all matches across all sequences
        pvalues = [ x.pvalue for x in matches ]
        fdr = Stats.doFDR( pvalues )
        qvalues = fdr.mQValues
        results = []
        for m, qvalue in zip(matches, qvalues):
            if qvalue > options.qvalue_threshold: continue
            results.append( m._replace( qvalue = qvalue ) )

        if options.combine:            
            results =  Nubiscan.combineMotifs( results )

        # output
        for r in results:
            options.stdout.write( "\t".join( ( 
                r.id,
                "%i" % r.start,
                "%i" % r.end,
                r.strand,
                r.arrangement,
                "%6.4f" % r.score,
                "%6.4f" % r.zscore,
                "%6.4e" % r.pvalue,
                "%6.4e" % r.qvalue ) ) + "\n" )

            noutput += 1

    elif options.fdr_control == "per-sequence":
        matcher = Nubiscan.MatcherRandomisationSequence( sense_matrix,
                                                         samples = options.iterations )
    

        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            result = matcher.run( seq.sequence,
                                  options.arrangements,
                                  qvalue_threshold = options.qvalue_threshold )
            
            if options.combine:
                result =  Nubiscan.combineMotifs( result )

            t = re.sub(" .*", "", seq.title)
            for r in result:
                options.stdout.write( "\t".join( ( 
                    t,
                    "%i" % r.start,
                    "%i" % r.end,
                    r.strand,
                    r.arrangement,
                    "%6.4f" % r.score,
                    "%6.4f" % r.zscore,
                    "%f" % r.pvalue,
                    "%f" % r.qvalue ) ) + "\n" )

            noutput += 1
    
    E.info( "ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput,nskipped) )

    ## write footer and output benchmark information.
    E.Stop()
Exemple #43
0
                for row2 in range( row1+1, len(row_headers) ):
                    pairs.append( (row1, row2) )
        elif options.iteration == "all-vs-all":
            pairs = []
            for row1 in range( 0, len(row_headers) ):
                for row2 in range( 0, len(row_headers) ):
                    if row1 == row2: continue
                    pairs.append( (row1, row2) )
    
        if options.method == "chi-squared":
            
            for row1, row2 in pairs:
                row_header1 = row_headers[row1]
                row_header2 = row_headers[row2]
                try:
                    result = Stats.doChiSquaredTest( numpy.vstack( (matrix[row1], matrix[row2] ) ) )
                except ValueError:
                    nskipped += 1
                    continue

                noutput += 1
                options.stdout.write( "\t".join( ( "%s" % row_header1,
                                                   "%s" % row_header2,
                                                   "%i" % result.mSampleSize,
                                                   "%i" % min(matrix.flat),
                                                   "%i" % max(matrix.flat),
                                                   options.value_format % result.mChiSquaredValue,
                                                   "%i" % result.mDegreesFreedom,
                                                   options.pvalue_format % result.mProbability,
                                                   "%s" % result.mSignificance,
                                                   options.value_format % result.mPhi ) ) + "\n" )
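For reference, scipy can reproduce the core of Stats.doChiSquaredTest() on a two-row contingency matrix; this is a sketch with made-up counts, not the example's own API. For a 2 x n table, phi equals sqrt(chi2 / N):

import numpy
from scipy.stats import chi2_contingency

table = numpy.array([[10, 20, 30],
                     [15, 15, 40]])
chi2, pvalue, dof, expected = chi2_contingency(table)
phi = numpy.sqrt(chi2 / table.sum())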
Exemple #44
0
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_table2scatter.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-c", "--columns", dest="columns", type="string",
                      help="columns to take from table. Choices are 'all', 'all-but-first' or a ','-separated list of columns.")

    parser.add_option("--logscale", dest="logscale", type="string",
                      help="log-transform one or both axes [default=%Default].")

    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file [default=%default].",
                      metavar="FILE")

    parser.add_option("-f", "--file", dest="input_filename", type="string",
                      help="filename with table data [default=%default].",
                      metavar="FILE")

    parser.add_option("-2", "--file2", dest="input_filename2", type="string",
                      help="additional data file [default=%default].",
                      metavar="FILE")

    parser.add_option("-s", "--stats", dest="statistics", type="choice",
                      choices=("correlation", "spearman", "pearson", "count"),
                      help="statistical quantities to compute [default=%default]",
                      action="append")

    parser.add_option("-p", "--plot", dest="plot", type="choice",
                      choices=("scatter", "pairs", "panel", "bar", "bar-stacked",
                               "bar-besides", "1_vs_x", "matched", "boxplot", "scatter+marginal",
                               "scatter-regression"),
                      help="plots to plot [default=%default]",
                      action="append")

    parser.add_option("-t", "--threshold", dest="threshold", type="float",
                      help="min threshold to use for counting method [default=%default].")

    parser.add_option("-o", "--colours", dest="colours", type="int",
                      help="column with colour information [default=%default].")

    parser.add_option("-l", "--plot-labels", dest="labels", type="string",
                      help="column labels for x and y in matched plots [default=%default].")

    parser.add_option("-d", "--add-diagonal", dest="add_diagonal", action="store_true",
                      help="add diagonal to plot [default=%default].")

    parser.add_option("-e", "--plot-legend", dest="legend", type="int",
                      help="column with legend [default=%default].")

    parser.add_option("-r", "--options", dest="r_options", type="string",
                      help="R plotting options [default=%default].")

    parser.add_option("--format", dest="format", type="choice",
                      choices=("full", "sparse"),
                      help="output format [default=%default].")

    parser.add_option("--title", dest="title", type="string",
                      help="""plot title [default=%default].""")

    parser.add_option("", "--xrange", dest="xrange", type="string",
                      help="x viewing range of plot [default=%default].")

    parser.add_option("", "--yrange", dest="yrange", type="string",
                      help="y viewing range of plot[default=%default].")

    parser.add_option("--allow-empty-file", dest="fail_on_empty", action="store_false",
                      help="do not fail on empty input [default=%default].")

    parser.add_option("--fail-on-empty", dest="fail_on_empty", action="store_true",
                      help="fail on empty input [default=%default].")

    parser.set_defaults(
        hardcopy=None,
        input_filename="",
        input_filename2=None,
        columns="all",
        logscale=None,
        statistics=[],
        plot=[],
        threshold=0.0,
        labels="x,y",
        colours=None,
        diagonal=False,
        legend=None,
        title=None,
        xrange=None,
        yrange=None,
        r_options="",
        fail_on_empty=True,
        format="full")

    (options, args) = E.Start(parser)

    if len(args) == 1 and not options.input_filename:
        options.input_filename = args[0]

    if options.columns not in ("all", "all-but-first"):
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    if options.colours:
        options.colours -= 1
    if options.legend:
        options.legend -= 1

    table = {}
    headers = []

    # read data matrix
    if options.input_filename:
        lines = IOTools.openFile(options.input_filename, "r").readlines()
    else:
        # note: this will not work for interactive viewing, but
        # creating hardcopy plots works.
        lines = sys.stdin.readlines()

    lines = [x for x in lines if x[0] != "#"]

    if len(lines) == 0:
        if options.fail_on_empty:
            raise IOError("no input")
        E.warn("empty input")
        E.Stop()
        return

    matrix, headers, colours, legend = readTable(lines,
                                                 "matrix",
                                                 take_columns=options.columns,
                                                 headers=True,
                                                 colours=options.colours,
                                                 row_names=options.legend)

    if options.input_filename2:
        # read another matrix (should be of the same format).
        lines2 = [x for x in IOTools.openFile(options.input_filename2, "r").readlines()
                  if x[0] != "#"]
        matrix2, headers2, colours2, legend2 = readTable(
            lines2,
            "matrix2",
            take_columns=options.columns,
            headers=True,
            colours=options.colours,
            row_names=options.legend)

    R.assign("headers", headers)

    ndata = R("""length( matrix[,1] )""")[0]

    if options.loglevel >= 1:
        options.stdlog.write("# read matrix: %ix%i\n" % (len(headers), ndata))

    if colours:
        R.assign("colours", colours)

    for method in options.statistics:

        if method == "correlation":
            cor = R.cor(matrix, use="pairwise.complete.obs")
            writeMatrix(sys.stdout, cor, headers=headers, format="%5.2f")

        elif method == "pearson":
            options.stdout.write("\t".join(("var1",
                                            "var2",
                                            "coeff",
                                            "passed",
                                            "pvalue",
                                            "n",
                                            "method",
                                            "alternative")) + "\n")
            for x in range(len(headers) - 1):
                for y in range(x + 1, len(headers)):
                    try:
                        result = R(
                            """cor.test( matrix[,%i], matrix[,%i] )""" % (x + 1, y + 1))
                    except rpy.RPyException as msg:
                        E.warn("correlation not computed for columns %i(%s) and %i(%s): %s" % (
                            x, headers[x], y, headers[y], msg))
                        options.stdout.write("%s\t%s\t%s\t%s\t%s\t%i\t%s\t%s\n" %
                                             (headers[x], headers[y],
                                              "na",
                                              "na",
                                              "na",
                                              0,
                                              "na",
                                              "na"))

                    else:
                        options.stdout.write(
                            "%s\t%s\t%6.4f\t%s\t%e\t%i\t%s\t%s\n" %
                            (headers[x], headers[y],
                             result.rx2('estimate').rx2(
                                 'cor')[0],
                             Stats.getSignificance(
                                 float(result.rx2('p.value')[0])),
                             result.rx2('p.value')[0],
                             result.rx2('parameter').rx2(
                                 'df')[0],
                             result.rx2('method')[0],
                             result.rx2('alternative')[0]))

        elif method == "spearman":
            options.stdout.write("\t".join(("var1", "var2",
                                            "coeff",
                                            "passed",
                                            "pvalue",
                                            "method",
                                            "alternative")) + "\n")
            for x in range(len(headers) - 1):
                for y in range(x + 1, len(headers)):
                    result = R(
                        """cor.test( matrix[,%i], matrix[,%i], method='spearman')""" % (x + 1, y + 1))
                    options.stdout.write(
                        "%s\t%s\t%6.4f\t%s\t%e\t%i\t%s\t%s\n" %
                        (headers[x], headers[y],
                         result['estimate']['rho'],
                         Stats.getSignificance(float(result['p.value'])),
                         result['p.value'],
                         result['parameter']['df'],
                         result['method'],
                         result['alternative']))

        elif method == "count":
            # number of shared elements > threshold
            m, r, c = MatlabTools.ReadMatrix(open(options.input_filename, "r"),
                                             take=options.columns,
                                             headers=True)
            mask = numpy.greater(m, options.threshold)
            counts = numpy.dot(numpy.transpose(mask), mask)
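            # counts[i, j] is the number of rows in which both column i and
            # column j exceed the threshold: the dot product of the 0/1
            # mask with itself accumulates the shared rows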
            writeMatrix(options.stdout, counts, headers=c, format="%i")

    if options.plot:

        # remove columns that are completely empty
        if "pairs" in options.plot:
            colsums = R('''colSums( is.na(matrix ))''')
            # a column is empty when all of its ndata entries are NA
            take = [x for x in range(len(colsums)) if colsums[x] != ndata]
            if len(take) < len(colsums):
                removed = [x for x in range(len(colsums)) if x not in take]
                E.warn("removing empty columns %s before plotting" %
                       str(removed))
                matrix = R.subset(matrix, select=[x + 1 for x in take])
                R.assign("""matrix""", matrix)
                # subset legend before headers is re-indexed below
                if legend:
                    legend = [headers[x] for x in take]
                headers = [headers[x] for x in take]

        if options.r_options:
            extra_options = ", %s" % options.r_options
        else:
            extra_options = ""

        if options.legend is not None and len(legend):
            extra_options += ", legend=c('%s')" % "','".join(legend)

        if options.labels:
            xlabel, ylabel = options.labels.split(",")
            extra_options += ", xlab='%s', ylab='%s'" % (xlabel, ylabel)
        else:
            xlabel, ylabel = "", ""

        if options.colours:
            extra_options += ", col=colours"

        if options.logscale:
            extra_options += ", log='%s'" % options.logscale

        if options.xrange:
            extra_options += ", xlim=c(%f,%f)" % tuple(
                map(float, options.xrange.split(",")))

        if options.yrange:
            extra_options += ", ylim=c(%f,%f)" % tuple(
                map(float, options.yrange.split(",")))

        if options.hardcopy:
            if options.hardcopy.endswith(".eps"):
                R.postscript(options.hardcopy)
            elif options.hardcopy.endswith(".png"):
                R.png(options.hardcopy, width=1024, height=768, type="cairo")
            elif options.hardcopy.endswith(".jpg"):
                R.jpg(options.hardcopy, width=1024, height=768, type="cairo")

        for method in options.plot:

            if ndata < 1000:
                point_size = "1"
                pch = "o"
            else:
                point_size = "0.5"
                pch = "."

            if method == "scatter":
                R("""plot( matrix[,1], matrix[,2], cex=%s, pch="o" %s)""" % (
                    point_size, extra_options))

            if method == "scatter-regression":
                R("""plot( matrix[,1], matrix[,2], cex=%s, pch="o" %s)""" % (
                    point_size, extra_options))
                dat = R(
                    """dat <- data.frame(x = matrix[,1], y = matrix[,2])""")
                R(
                    """new <- data.frame(x = seq( min(matrix[,1]), max(matrix[,1]), (max(matrix[,1]) - min(matrix[,1])) / 100))""")
                mod = R("""mod <- lm( y ~ x, dat)""")
                R("""predict(mod, new, se.fit = TRUE)""")
                R("""pred.w.plim <- predict(mod, new, interval="prediction")""")
                R("""pred.w.clim <- predict(mod, new, interval="confidence")""")
                R(
                    """matpoints(new$x,cbind(pred.w.clim, pred.w.plim[,-1]), lty=c(1,2,2,3,3), type="l")""")
                R.mtext(
                    "y = %f * x + %f, r=%6.4f, n=%i" % (mod["coefficients"]["x"],
                                                        mod["coefficients"][
                                                            "(Intercept)"],
                                                        R("""cor( dat )[2]"""),
                                                        ndata),
                    3,
                    cex=1.0)

            elif method == "pairs":
                if options.add_diagonal:
                    R(
                        """panel.hist <- function( x,y,...  ) { points(x,y,...); abline(0,1); }""")
                else:
                    R(
                        """panel.hist <- function( x,y,...  ) { points(x,y,...); }""")

                # There used to be a argument na_action="na.omit", but
                # removed this as there appeared error messages saying
                # "na.action is not a graphical parameter" and the
                # plots showed occasionally the wrong scale.
                # cex=point_size also caused trouble (error message:
                # "X11 used font size 8 when 2 was requested" or
                # similar)
                if options.colours:
                    R.pairs(matrix,
                            pch=pch,
                            col=colours,
                            main=options.title,
                            panel="panel.hist",
                            labels=headers,
                            cex_labels=2.0)
                else:
                    R.pairs(matrix,
                            pch=pch,
                            panel="panel.hist",
                            main=options.title,
                            labels=headers,
                            cex_labels=2.0)

            elif method == "boxplot":
                extra_options += ",main='%s'" % options.title

                # set vertical orientation
                if max([len(x) for x in headers]) > 40 / len(headers):
                    # remove xlabel:
                    extra_options = re.sub(", xlab='[^']+'", "", extra_options)
                    extra_options += ", names.arg=headers, las=2"
                    R(
                        """op <- par(mar=c(11,4,4,2))""")  # the 10 allows the names.arg below the barplot

                R("""boxplot( matrix %s)""" % extra_options)

            elif method == "bar" or method == "bar-stacked":
                if not options.colours:
                    extra_options += ", col=rainbow(5)"

                # set vertical orientation
                if max([len(x) for x in headers]) > 40 / len(headers):
                    # remove xlabel:
                    extra_options = re.sub(", xlab='[^']+'", "", extra_options)
                    extra_options += ", names.arg=headers, las=2"
                    R(
                        """op <- par(mar=c(11,4,4,2))""")  # the 10 allows the names.arg below the barplot

                R("""barplot(as.matrix(matrix), %s)""" % extra_options)

            elif method == "bar-besides":
                if not options.colours:
                    extra_options += ", col=rainbow(%i)" % ndata

                # set vertical orientation
                if max([len(x) for x in headers]) > 40 / len(headers):
                    # remove xlabel:
                    extra_options = re.sub(", xlab='[^']+'", "", extra_options)
                    extra_options += ", names.arg=headers, las=2"
                    R(
                        """op <- par(mar=c(11,4,4,2))""")  # the 10 allows the names.arg below the barplot

                R("""barplot(as.matrix(matrix), beside=TRUE %s)""" %
                  extra_options)

            elif method == "scatter+marginal":

                if options.title:
                    # set the size of the outer margins - the title needs to be added at the end
                    # after plots have been created
                    R.par(oma=R.c(0, 0, 4, 0))

                R("""matrix""")
                R("""
x <- matrix[,1];
y <- matrix[,2];
xhist <- hist(x, breaks=20, plot=FALSE);
yhist <- hist(y, breaks=20, plot=FALSE);
top <- max(c(xhist$counts, yhist$counts));
nf <- layout(matrix(c(2,0,1,3),2,2,byrow=TRUE), c(3,1), c(1,3), respect=TRUE );
par(mar=c(3,3,1,1)) ;
plot(x, y, cex=%s, pch="o" %s) ;
par(mar=c(0,3,1,1)) ;
barplot(xhist$counts, axes=FALSE, ylim=c(0, top), space=0 ) ;
par(mar=c(3,0,1,1)) ;
title(main='%s');
barplot(yhist$counts, axes=FALSE, xlim=c(0, top), space=0, horiz=TRUE ) ;
title(main='%s');
""" % (point_size, extra_options, xlabel, ylabel))

                if options.title:
                    R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

            elif method in ("panel", "1_vs_x", "matched"):

                if method == "panel":
                    pairs = []
                    for x in range(len(headers) - 1):
                        for y in range(x + 1, len(headers)):
                            pairs.append((x, y))

                elif method == "1_vs_x":
                    pairs = []
                    for x in range(1, len(headers)):
                        pairs.append((0, x))

                # print matching columns
                elif method == "matched":
                    pairs = []
                    for x in range(len(headers) - 1):
                        for y in range(x + 1, len(headers)):
                            if headers[x] == headers[y]:
                                pairs.append((x, y))
                                break

                w = int(math.ceil(math.sqrt(len(pairs))))
                h = int(math.ceil(float(len(pairs)) / w))

                xlabel, ylabel = options.labels.split(",")

                R("""layout(matrix(seq(1,%i), %i, %i, byrow = TRUE))""" %
                  (w * h, w, h))
                for a, b in pairs:
                    # keep only pairs in which both values are finite; note
                    # that NaN never compares equal, so a membership test
                    # against float("nan") would not filter anything
                    new_matrix = [x for x in zip(
                        list(matrix[a].values())[0],
                        list(matrix[b].values())[0])
                        if math.isfinite(x[0]) and math.isfinite(x[1])]
                    try:
                        R("""plot(matrix[,%i], matrix[,%i], main='%s versus %s', cex=0.5, pch=".", xlab='%s', ylab='%s' )""" % (
                            a + 1, b + 1, headers[b], headers[a], xlabel, ylabel))
                    except rpy.RException as msg:
                        print("could not plot %s versus %s: %s" % (headers[b], headers[a], msg))

        if options.hardcopy:
            R['dev.off']()

    E.info("matrix added as >matrix< in R.")

    if not options.hardcopy:
        if options.input_filename:
            interpreter = code.InteractiveConsole(globals())
            interpreter.interact()
        else:
            E.info(
                "cannot start an interactive session as input came from stdin.")

    E.Stop()
def analysePolyphen(infile, outfile):
    '''compute the enrichment of SNPs within genes and of
    deleterious SNPs among the SNPs within genes.

    del: enrichment of deleterious SNPs among the SNPs of each gene
    len: enrichment of SNPs within each gene, relative to gene length
    com: enrichment of deleterious SNPs within each gene, relative to gene length
    '''

    table = P.toTable(infile)
    tablename_map = "polyphen_map"

    dbhandle = connect()
    cc = dbhandle.cursor()

    statement = '''
        SELECT i.gene_id,
               COUNT(DISTINCT map.locus_id) as nsnps,
               COUNT(DISTINCT CASE t.prediction
                              WHEN 'possiblydamaging' THEN map.locus_id
                              WHEN 'probablydamaging' THEN map.locus_id
                              ELSE NULL END) AS ndeleterious,
               MAX(s.length)
               FROM %(table)s as t,
                    %(tablename_map)s as map,
                    annotations.protein_stats as s,
                    annotations.transcript_info as i
        WHERE map.snp_id = t.snp_id AND
              i.transcript_id = map.transcript_id AND
              s.protein_id = map.protein_id
        GROUP BY i.gene_id
     ''' % locals()

    data = cc.execute(statement).fetchall()

    statement = '''SELECT DISTINCT i.gene_id, MAX(s.length)
                   FROM annotations.transcript_info AS i, annotations.protein_stats AS s
                   WHERE s.protein_id = i.protein_id
                   GROUP BY i.gene_id'''
    gene_ids = cc.execute(statement).fetchall()

    total_nsnps = sum([x[1] for x in data])
    total_ndel = sum([x[2] for x in data])
    total_length = sum([x[1] for x in gene_ids])
    del_p = float(total_ndel) / total_nsnps
    len_p = float(total_nsnps) / total_length
    com_p = float(total_ndel) / total_length

    E.info("del: background probability: %i/%i = %f" %
           (total_ndel, total_nsnps, del_p))
    E.info("len: background probability: %i/%i = %f" %
           (total_nsnps, total_length, len_p))
    E.info("com: background probability: %i/%i = %f" %
           (total_ndel, total_length, com_p))

    outf = open(outfile, "w")
    outf.write("\t".join(("gene_id", "code",
                          "length", "nsnps", "ndel",
                          "del_p", "del_pvalue", "del_qvalue",
                          "len_p", "len_pvalue", "len_qvalue",
                          "com_p", "com_pvalue", "com_qvalue", )) + "\n")

    del_pvalues, len_pvalues, com_pvalues = [], [], []
    for gene_id, nsnps, ndel, length in data:

        # use -1, because I need P( x >= X)
        # sf = 1 - cdf and cdf = P( x <= X ), thus sf = 1 - P( x <= X ) = P (x
        # > X ).
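        # For example, scipy.stats.binom.sf(2, 10, 0.1) equals
        # 1 - scipy.stats.binom.cdf(2, 10, 0.1), i.e. P(X >= 3)
        # for X ~ Binomial(n=10, p=0.1).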
        del_pvalues.append(scipy.stats.binom.sf(ndel - 1, nsnps, del_p))
        len_pvalues.append(
            scipy.stats.binom.sf(nsnps - 1, int(round(length)), len_p))
        com_pvalues.append(
            scipy.stats.binom.sf(ndel - 1, int(round(length)), com_p))

    if len(del_pvalues) > 10:
        del_qvalues = Stats.doFDR(del_pvalues).mQValues
    else:
        E.warn("no FDR computed for del")
        del_qvalues = del_pvalues

    if len(len_pvalues) > 10:
        len_qvalues = Stats.doFDR(len_pvalues).mQValues
    else:
        E.warn("no FDR computed for len")
        len_qvalues = len_pvalues

    if len(com_pvalues) > 10:
        com_qvalues = Stats.doFDR(com_pvalues).mQValues
    else:
        E.warn("no FDR computed for com")
        com_qvalues = com_pvalues

    fdr = PARAMS["polyphen_fdr"]

    found = set()

    for a, del_pvalue, del_qvalue, len_pvalue, len_qvalue, com_pvalue, com_qvalue in \
            zip(data,
                del_pvalues, del_qvalues,
                len_pvalues, len_qvalues,
                com_pvalues, com_qvalues,
                ):
        gene_id, nsnps, ndel, length = a
        found.add(gene_id)

        del_p = float(ndel) / nsnps
        len_p = float(nsnps) / length

        code = "".join([str(int(x < fdr))
                        for x in (del_qvalue, len_qvalue, com_qvalue)])

        outf.write("\t".join((gene_id,
                              code,
                              "%i" % int(round(length)),
                              "%i" % int(nsnps),
                              "%i" % int(ndel),
                              "%6.4f" % del_p,
                              "%6.4g" % del_pvalue,
                              "%6.4g" % del_qvalue,
                              "%6.4f" % len_p,
                              "%6.4g" % len_pvalue,
                              "%6.4g" % len_qvalue,
                              "%6.4f" % com_p,
                              "%6.4g" % com_pvalue,
                              "%6.4g" % com_qvalue,
                              )) + "\n")

    # add missing genes:
    code = "---"
    for gene_id, length in gene_ids:
        if gene_id in found:
            continue
        outf.write("\t".join((gene_id,
                              code,
                              "%i" % int(round(length)),
                              "%i" % 0,
                              "%i" % 0,
                              "%6.4f" % 0,
                              "%6.4g" % 1,
                              "%6.4g" % 1,
                              "%6.4f" % 0,
                              "%6.4g" % 1,
                              "%6.4g" % 1,
                              "%6.4f" % 0,
                              "%6.4g" % 1,
                              "%6.4g" % 1,
                              )) + "\n")

    outf.close()
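
The three per-gene codes above come from one-sided binomial tests against genome-wide background rates. A minimal, self-contained sketch of the same computation for a single hypothetical gene; all counts and rates below are invented for illustration:

import scipy.stats

# hypothetical per-gene counts and genome-wide background rates
nsnps, ndel, length = 12, 5, 1500
del_p, len_p, com_p = 0.2, 0.005, 0.001

# P(X >= k) under Binomial(n, p); sf(k - 1) is used because the
# survival function sf(k) returns P(X > k)
del_pvalue = scipy.stats.binom.sf(ndel - 1, nsnps, del_p)
len_pvalue = scipy.stats.binom.sf(nsnps - 1, length, len_p)
com_pvalue = scipy.stats.binom.sf(ndel - 1, length, com_p)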
Exemple #46
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: matrix2stats.py 2795 2009-09-16 15:29:23Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("chi-squared", "pearson-chi-squared"),
                      help="statistical methods to apply.")

    parser.add_option("-t", "--header-names", dest="headers", action="store_true",
                      help="matrix has row/column headers.")

    parser.add_option("--no-headers", dest="headers", action="store_false",
                      help="matrix has no row/column headers.")

    parser.add_option("-i", "--input-format", dest="input_format", type="choice",
                      choices=("full", "sparse", "phylip"),
                      help="""input format for matrix."""  )

    parser.add_option("-o", "--output-format", dest="output_format", type="choice",
                      choices=("full", "sparse", "phylip"),
                      help="""output format for matrix."""  )

    parser.add_option("-p", "--parameters", dest="parameters", action="append", type="string",
                      help="parameters for various functions.")

    parser.add_option("-a", "--iteration", dest="iteration", type="choice",
                      choices=("pairwise", "all-vs-all"),
                      help="""how to compute stats [%default]."""  )

    parser.set_defaults(
        method="chi-squared",
        headers=True,
        value_format="%6.4f",
        pvalue_format="%6.4e",
        input_format="full",
        write_separators=True,
        parameters=[],
        iteration=None,
    )

    (options, args) = E.Start(parser)

    lines = [x for x in sys.stdin.readlines() if x[0] != "#"]

    chunks = [x for x in range(len(lines)) if lines[x][0] == ">"]

    if not chunks:
        options.write_separators = False
        chunks = [-1]

    chunks.append(len(lines))
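    # each matrix spans lines[chunks[i] + 1:chunks[i + 1]], i.e. the lines
    # between consecutive ">" title lines; the -1 sentinel covers input
    # consisting of a single matrix without any title line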

    ninput, noutput, nskipped = 0, 0, 0

    if options.write_separators:
        options.stdout.write("test\t")

    header_prefix = ""

    if options.method == "chi-squared":
        header_prefix = "observed\texpected"
        options.stdout.write("\t".join(
            (header_prefix, "n", "min", "max", "chi", "df", "P", "passed", "phi")) + "\n")

    elif options.method in ("pearson-chi-squared",):
        options.stdout.write("column\t")
        options.stdout.write("\t".join(
            ("n", "prob", "obs", "exp", "chi", "df", "P", "passed", "phi")) + "\n")

        if len(options.parameters) == 0:
            raise "out of parameters - please supply probability or filename with probabilities."

        param = options.parameters[0]
        del options.parameters[0]

        if options.write_separators:
            probabilities = IOTools.ReadMap(
               IOTools.openFile(param, "r"), map_functions=(str, float))
        else:
            probability = float(param)

    for x in range(len(chunks) - 1):
        ninput += 1
        matrix, row_headers, col_headers = MatlabTools.readMatrix(
            StringIO("".join(lines[chunks[x] + 1:chunks[x + 1]])),
            format=options.input_format,
            headers=options.headers)
        nrows, ncols = matrix.shape

        if options.loglevel >= 2:
            options.stdlog.write("# read matrix: %i x %i, %i row titles, %i colum titles.\n" %
                                 (nrows, ncols, len(row_headers), len(col_headers)))

        if options.write_separators:
            options.stdout.write(lines[chunks[x]][1:-1] + "\t")

        pairs = []
        if options.iteration == "pairwise":
            pairs = []
            for row1 in range(0, len(row_headers)):
                for row2 in range(row1 + 1, len(row_headers)):
                    pairs.append((row1, row2))
        elif options.iteration == "all-vs-all":
            pairs = []
            for row1 in range(0, len(row_headers)):
                for row2 in range(0, len(row_headers)):
                    if row1 == row2:
                        continue
                    pairs.append((row1, row2))

        if options.method == "chi-squared":

            for row1, row2 in pairs:
                row_header1 = row_headers[row1]
                row_header2 = row_headers[row2]
                try:
                    result = Stats.doChiSquaredTest(
                        numpy.vstack((matrix[row1], matrix[row2])))
                except ValueError:
                    nskipped += 1
                    continue

                noutput += 1
                options.stdout.write("\t".join((
                    "%s" % row_header1,
                    "%s" % row_header2,
                    "%i" % result.mSampleSize,
                    "%i" % min(matrix.flat),
                    "%i" % max(matrix.flat),
                    options.value_format % result.mChiSquaredValue,
                    "%i" % result.mDegreesFreedom,
                    options.pvalue_format % result.mProbability,
                    "%s" % result.mSignificance,
                    options.value_format % result.mPhi)) + "\n")

        elif options.method == "pearson-chi-squared":

            if nrows != 2:
                raise ValueError("only implemented for 2xn table")

            if options.write_separators:
                id = re.match(r"(\S+)", lines[chunks[x]][1:-1]).groups()[0]
                probability = probabilities[id]

            for col in range(ncols):
                options.stdout.write("%s\t" % col_headers[col])
                result = Stats.doPearsonChiSquaredTest(
                    probability, sum(matrix[:, col]), matrix[0, col])
                options.stdout.write("\t".join((
                    "%i" % result.mSampleSize,
                    "%f" % probability,
                    "%i" % result.mObserved,
                    "%f" % result.mExpected,
                    options.value_format % result.mChiSquaredValue,
                    "%i" % result.mDegreesFreedom,
                    options.pvalue_format % result.mProbability,
                    "%s" % result.mSignificance,
                    options.value_format % result.mPhi)))
                if col < ncols - 1:
                    options.stdout.write("\n")
                    if options.write_separators:
                        options.stdout.write(lines[chunks[x]][1:-1] + "\t")

            options.stdout.write("\n")

    E.info("# ninput=%i, noutput=%i, nskipped=%i\n" %
           (ninput, noutput, nskipped))

    E.Stop()
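
For reference, the chunking logic above expects one or more matrices on stdin, each preceded by a ">" title line (or a single untitled matrix). A minimal sketch of such an input; the file name, titles and values are purely illustrative:

with open("matrices.txt", "w") as outf:
    outf.write(">set1\n")
    outf.write("\tA\tB\nrow1\t10\t20\nrow2\t30\t40\n")
    outf.write(">set2\n")
    outf.write("\tA\tB\nrow1\t5\t5\nrow2\t15\t25\n")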
Exemple #47
0
def decorator_stddev_score(values, start, end, contig):
    """compute stddev of values."""
    d = Stats.DistributionalParameters(values)
    return d['stddev'], str(d)
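
A hypothetical call, assuming the values were collected for a window on chr1; the start, end and contig arguments only satisfy the decorator interface here:

score, summary = decorator_stddev_score([1.0, 2.0, 4.0], 0, 3, "chr1")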
Exemple #48
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--guess-format",
        dest="guess_format",
        type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The default behaviour of the script is to guess the quality "
        "format of the input fastq file. The user can specify the "
        "quality format of the input file using the --guess-format option. "
        "The script will use this format if the "
        "sequence qualities are ambiguous.[default=%default].")

    parser.add_option(
        "--target-format",
        dest="target_format",
        type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The script will convert quality scores to the destination "
        "format unless [default=%default].")

    parser.set_defaults(
        target_format=None,
        guess_format=None,
        min_quality=10,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    if options.target_format:
        iterator = Fastq.iterate_convert(options.stdin,
                                         format=options.target_format,
                                         guess=options.guess_format)
    else:
        iterator = Fastq.iterate_guess(options.stdin,
                                       guess=options.guess_format)

    options.stdout.write("read\tnfailed\tnN\t%s\n" %
                         ("\t".join(Stats.Summary().getHeaders())))

    min_quality = options.min_quality

    for record in iterator:
        c.input += 1
        quals = record.toPhred()
        nfailed = len([x for x in quals if x < min_quality])
        nns = record.seq.count("N") + record.seq.count(".")
        options.stdout.write(
            "%s\t%i\t%i\t%s\n" %
            (record.identifier, nfailed, nns, str(Stats.Summary(quals))))
        c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
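
record.toPhred() above yields one integer quality per base. For sanger-encoded fastq this corresponds to the usual offset-33 decoding; a minimal sketch of that conversion, independent of the Fastq module (the quality string is invented):

quals = [ord(c) - 33 for c in "IIIHH##!"]
nfailed = len([q for q in quals if q < 10])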
Exemple #49
0
def decorator_percent_coverage(intervals, start, end, contig, fasta):
    """compute length of intervals."""
    d = Stats.DistributionalParameters([x[1] - x[0] for x in intervals])
    return 100.0 * float(d['sum']) / (end - start), str(d)
def loadGOs(infiles, outfile, tablename):
    '''import GO results into a single table.

    This method also computes a global QValue over all
    tracks, genesets and annotation sets.

    Arguments
    ---------
    infiles : string
       Output files of several runGO analyses
    outfile : string
       Output filename, contains log information
    tablename : string
       Table name for storing results.
    '''

    header = False

    tempf1 = P.getTempFile()

    pvalues = []

    for infile in infiles:
        indir = infile + ".dir"

        if not os.path.exists(indir):
            continue

        track, geneset, annotationset = re.search(
            "^(\S+)_vs_(\S+)\.(\S+)", infile).groups()

        for filename in glob.glob(os.path.join(indir, "*.overall")):
            for line in open(filename, "r"):
                if line.startswith("#"):
                    continue
                data = line[:-1].split("\t")
                if line.startswith("code"):
                    if header:
                        continue
                    tempf1.write("track\tgeneset\tannotationset\t%s" % line)
                    header = True
                    assert data[10] == "pover" and data[
                        11] == "punder", "format error, expected pover-punder, got %s-%s" % (data[10], data[11])
                    continue
                tempf1.write("%s\t%s\t%s\t%s" %
                             (track, geneset, annotationset, line))
                pvalues.append(min(float(data[10]), float(data[11])))

    tempf1.close()

    E.info("analysing %i pvalues" % len(pvalues))
    fdr = Stats.doFDR(pvalues)
    E.info("got %i qvalues" % len(fdr.mQValues))
    qvalues = ["global_qvalue"] + fdr.mQValues

    tempf2 = P.getTempFile()

    for line, qvalue in zip(open(tempf1.name, "r"), qvalues):
        tempf2.write("%s\t%s\n" % (line[:-1], str(qvalue)))

    tempf2.close()

    P.load(tempf2.name, outfile,
           tablename=tablename,
           options="--allow-empty-file "
           "--add-index=category "
           "--add-index=track,geneset,annotationset "
           "--add-index=geneset "
           "--add-index=annotationset "
           "--add-index=goid ")

    os.unlink(tempf1.name)
    os.unlink(tempf2.name)
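
The global q-value is obtained by pooling the per-line p-values across all input files and running a single FDR computation over the pooled list. A minimal sketch of the same idea, assuming a plain Benjamini-Hochberg correction via statsmodels is an acceptable stand-in for Stats.doFDR:

from statsmodels.stats.multitest import multipletests

pvalues = [0.001, 0.04, 0.2, 0.5]  # pooled across all files
_, qvalues, _, _ = multipletests(pvalues, method="fdr_bh")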
Exemple #51
0
def decorator_median_score(values, start, end, contig):
    """compute median of values."""
    d = Stats.DistributionalParameters(values)
    return d['median'], str(d)
def loadGOs( infiles, outfile, tablename ):
    '''import GO results into a single table.

    This method also computes a global QValue over all
    tracks, genesets and annotation sets.
    '''

    header = False

    tempf1 = P.getTempFile()

    pvalues = []

    for infile in infiles:
        indir = infile + ".dir"

        if not os.path.exists( indir ):
            continue

        track, geneset, annotationset = re.search("^(\S+)_vs_(\S+)\.(\S+)", infile ).groups()

        for filename in glob.glob( os.path.join(indir, "*.overall") ):
            for line in open(filename, "r" ):
                if line.startswith("#"): continue
                data = line[:-1].split("\t")
                if line.startswith("code"):
                    if header: continue
                    tempf1.write( "track\tgeneset\tannotationset\t%s" % line )
                    header = True
                    assert data[10] == "pover" and data[11] == "punder", "format error, expected pover-punder, got %s-%s" % (data[10], data[11])
                    continue
                tempf1.write( "%s\t%s\t%s\t%s" % (track, geneset, annotationset, line) )
                pvalues.append( min( float(data[10]), float(data[11]) ) )

    tempf1.close()

    E.info( "analysing %i pvalues" % len(pvalues ))
    fdr = Stats.doFDR( pvalues )
    E.info( "got %i qvalues" % len(fdr.mQValues ))
    qvalues = ["global_qvalue" ] + fdr.mQValues

    tempf2 = P.getTempFile()

    for line, qvalue in zip( open(tempf1.name,"r"), qvalues ):
        tempf2.write( "%s\t%s\n" % (line[:-1], str(qvalue)) )

    tempf2.close()
    tempfilename = tempf2.name
    print(tempf1.name)
    print(tempf2.name)

    statement = '''
   python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
              --allow-empty 
              --index=category 
              --index=track,geneset,annotationset
              --index=geneset
              --index=annotationset
              --index=goid 
              --table=%(tablename)s 
    < %(tempfilename)s
    > %(outfile)s
    '''
    P.run()
Exemple #53
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: codemls2tsv.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("--methods", dest="methods", type="choice", action="append",
                      choices=("summary-numbers", "jalview",
                               "positive-site-table", "positive-site-list",
                               "count-positive-sites"),
                      help="methods for analysis.")

    parser.add_option("--selection-mode", dest="selection_mode", type="choice",
                      choices=("all", "consistent", "emes"),
                      help="how to select positive sites.")

    parser.add_option("--prefix", dest="prefix", type="string",
                      help="prefix for rows.")

    parser.add_option("--pattern-input-filenames", dest="pattern_input_filenames", type="string",
                      help="input pattern.")

    parser.add_option("--filter-probability", dest="filter_probability", type="float",
                      help="threshold for probability above which to include positive sites [default=%default].")

    parser.add_option("--filter-omega", dest="filter_omega", type="float",
                      help="threshold for omega above which to include positive sites [default=%default].")

    parser.add_option("--models", dest="models", type="string",
                      help="restrict output to set of site specific models.")

    parser.add_option("--analysis", dest="analysis", type="string",
                      help="restrict output to set of analysis [beb|neb].")

    parser.add_option("--significance-threshold", dest="significance_threshold", type="float",
                      help="significance threshold for log-likelihood test.")

    parser.add_option("--filter-mali", dest="filter_mali", type="choice",
                      choices=("none", "gaps"),
                      help="filter by mali to remove gapped positions.")

    parser.add_option("--filename-mali", dest="filename_mali", type="string",
                      help="filename with multiple alignment used for calculating sites - used for filtering")

    parser.add_option("--filename-map-mali", dest="filename_map_mali", type="string",
                      help="filename with multiple alignment to map sites onto.")

    parser.add_option("--jalview-titles", dest="jalview_titles", type="string",
                      help="comma separated list of jalview annotation titles.")

    parser.add_option("--jalview-symbol", dest="jalview_symbol", type="string",
                      help="symbol to use in jalview.")

    parser.set_defaults(
        methods=[],
        prefix=None,
        filter_probability=0,
        filter_omega=0,
        models="",
        analysis="",
        significance_threshold=0.05,
        selection_mode="consistent",
        filename_mali=None,
        filename_map_mali=None,
        jalview_symbol="*",
        jalview_titles="",
        filter_mali=None,
    )

    (options, args) = E.Start(parser)

    if options.jalview_titles:
        options.jalview_titles = options.jalview_titles.split(",")
    else:
        options.jalview_titles = args

    options.models = options.models.split(",")
    options.analysis = options.analysis.split(",")

    for a in options.analysis:
        if a not in ("beb", "neb"):
            raise "unknown analysis section: '%s', possible values are 'beb' and/or 'neb'" % a

    for a in options.models:
        if a not in ("8", "2", "3"):
            raise "unknown model: '%s', possible values are 2, 3, 8" % a

    codeml = WrapperCodeML.CodeMLSites()

    # filter and extract functions
    filter_f = lambda x: x.mProbability >= options.filter_probability and x.mOmega >= options.filter_omega
    extract_f = lambda x: x.mResidue

    # read multiple results
    results = []
    ninput, noutput, nskipped = 0, 0, 0

    headers = []
    for f in args:
        ninput += 1
        try:
            results.append(codeml.parseOutput(open(f, "r").readlines()))
        except WrapperCodeML.UsageError:
            if options.loglevel >= 1:
                options.stdlog.write("# no input from %s\n" % f)
            nskipped += 1
            continue
        noutput += 1
        headers.append(f)

    # map each full (alternative) site model (key) to its nested null model
    map_nested_models = {'8': '7',
                         '2': '1',
                         '3': '0'}
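    # each full model is compared against its null via a log-likelihood
    # ratio test: 8 vs 7, 2 vs 1 and 3 vs 0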

    if options.filename_mali:
        mali = Mali.Mali()
        mali.readFromFile(open(options.filename_mali, "r"))
    else:
        mali = None

    ###############################################################
    ###############################################################
    ###############################################################
    # use multiple alignment to map residues to a reference mali
    # or a sequence.
    ###############################################################
    if options.filename_map_mali:

        if not mali:
            raise "please supply the input multiple alignment, if residues are to be mapped."

        # translate the alignments
        def translate(s):
            sequence = s.mString
            seq = []
            for codon in [sequence[x:x + 3] for x in range(0, len(sequence), 3)]:
                aa = Genomics.MapCodon2AA(codon)
                seq.append(aa)

            s.mString = "".join(seq)

        tmali = Mali.Mali()
        tmali.readFromFile(open(options.filename_mali, "r"))
        tmali.apply(translate)

        tmap_mali = Mali.Mali()
        tmap_mali.readFromFile(open(options.filename_map_mali, "r"))

        if tmap_mali.getAlphabet() == "na":
            tmap_mali.apply(translate)

        map_old2new = alignlib_lite.py_makeAlignmentVector()

        mali1 = alignlib_lite.py_makeProfileFromMali(convertMali2Mali(tmali))

        if tmap_mali.getLength() == 1:

            s = tmap_mali.values()[0].mString
            mali2 = alignlib_lite.py_makeSequence(s)
            # see if you can find an identical subsequence and then align
            # to this
            for x in tmali.values():
                if s in re.sub("[- .]+", "", x.mString):
                    mali1 = alignlib_lite.py_makeSequence(x.mString)
                    break
        else:
            mali2 = alignlib_lite.py_makeProfileFromMali(
                convertMali2Mali(tmap_mali))

        alignator = alignlib_lite.py_makeAlignatorDPFull(
            alignlib_lite.py_ALIGNMENT_LOCAL, -10.0, -2.0)
        alignator.align(map_old2new, mali1, mali2)

        consensus = tmap_mali.getConsensus()

        if options.loglevel >= 4:
            options.stdlog.write("# alphabet: %s\n" % tmap_mali.getAlphabet())
            options.stdlog.write("# orig  : %s\n" % tmali.getConsensus())
            options.stdlog.write("# mapped: %s\n" % consensus)
            options.stdlog.write("# alignment: %s\n" % map_old2new.Write())
    else:
        map_old2new = None

    for method in options.methods:

        if method == "summary-numbers":

            options.stdlog.write(
                """# Numbers of positive sites.
#
# The consistent row/column contains positive sites that are significant
# (above thresholds for probability and omega) for all models/analysis
# that have been selected (label: cons).
#
# The log-likelihood ratio test is performed for model pairs, depending
# on the output chosen.
# Significance threshold: %6.4f
# The pairs are 8 versus 7 and 2 versus 1 and 3 versus 0.
#
""" % options.significance_threshold )

            # write header
            if options.prefix:
                options.stdout.write("prefix\t")

            options.stdout.write("method\tnseq\t")
            h = []
            for model in options.models:
                for analysis in options.analysis:
                    h.append("%s%s" % (analysis, model))
                h.append("p%s" % (model))
                h.append("df%s" % (model))
                h.append("chi%s" % (model))
                h.append("lrt%s" % (model))

            options.stdout.write("\t".join(h))
            options.stdout.write("\tcons\tpassed\tfilename\n")

            nmethod = 0

            consistent_cols = [None for x in range(len(options.analysis))]
            passed_tests = {}
            for m in options.models:
                passed_tests[m] = 0

            for result in results:

                row_consistent = None

                if options.prefix:
                    options.stdout.write("%s" % (options.prefix))

                options.stdout.write("%i" % nmethod)
                options.stdout.write("\t%i" % (result.mNumSequences))

                npassed = 0

                for model in options.models:

                    sites = result.mSites[model]

                    # do significance test
                    full_model, null_model = model, map_nested_models[model]

                    lrt = Stats.doLogLikelihoodTest(
                        result.mSites[full_model].mLogLikelihood,
                        result.mSites[full_model].mNumParameters,
                        result.mSites[null_model].mLogLikelihood,
                        result.mSites[null_model].mNumParameters,
                        options.significance_threshold)

                    x = 0
                    for analysis in options.analysis:

                        if analysis == "neb":
                            s = set(
                                map(extract_f, filter(filter_f, sites.mNEB.mPositiveSites)))

                        elif analysis == "beb":
                            s = set(
                                map(extract_f, filter(filter_f, sites.mBEB.mPositiveSites)))

                        options.stdout.write("\t%i" % (len(s)))

                        if not lrt.mPassed:
                            s = set()

                        if row_consistent is None:
                            row_consistent = s
                        else:
                            row_consistent = row_consistent.intersection(s)

                        if consistent_cols[x] is None:
                            consistent_cols[x] = s
                        else:
                            consistent_cols[x] = consistent_cols[
                                x].intersection(s)

                        x += 1

                    if lrt.mPassed:
                        c = "passed"
                        passed_tests[model] += 1
                        npassed += 1
                    else:
                        c = "failed"

                    options.stdout.write("\t%5.2e\t%i\t%5.2f\t%s" %
                                         (lrt.mProbability,
                                          lrt.mDegreesFreedom,
                                          lrt.mChiSquaredValue,
                                          c))

                options.stdout.write(
                    "\t%i\t%i\t%s\n" % (len(row_consistent), npassed, headers[nmethod]))

                nmethod += 1

            if options.prefix:
                options.stdout.write("%s\t" % options.prefix)

            options.stdout.write("cons")

            row_consistent = None
            total_passed = 0
            for model in options.models:

                x = 0

                for analysis in options.analysis:

                    s = consistent_cols[x]
                    if s is None:
                        s = set()

                    options.stdout.write("\t%i" % (len(s)))

                    if row_consistent is None:
                        row_consistent = s
                    else:
                        row_consistent = row_consistent.intersection(s)

                    x += 1

                options.stdout.write("\tna\t%i" % passed_tests[model])
                total_passed += passed_tests[model]

            options.stdout.write(
                "\t%i\t%i\n" % (len(row_consistent), total_passed))

        elif method == "jalview":

            options.stdout.write("JALVIEW_ANNOTATION\n")
            options.stdout.write("# Created: %s\n\n" %
                                 (time.asctime(time.localtime(time.time()))))

            l = 1
            x = 0
            for result in results:

                sites, significance = selectPositiveSites(
                    [result], options.selection_mode, options, mali)

                codes = [""] * result.mLength

                if len(sites) == 0:
                    continue

                for site in sites:
                    codes[site - 1] = options.jalview_symbol

                options.stdout.write(
                    "NO_GRAPH\t%s\t%s\n" % (options.jalview_titles[x], "|".join(codes)))
                x += 1

        elif method == "count-positive-sites":

            sites, significance = selectPositiveSites(
                results, options.selection_mode, options, mali)

            options.stdout.write("%i\n" % (len(sites)))

        elif method in ("positive-site-table", ):

            sites, significance = selectPositiveSites(
                results, options.selection_mode, options, mali)

            headers = ["site", "P"]
            if map_old2new:
                headers.append("mapped")
                headers.append("Pm")

            options.stdout.write("\t".join(headers) + "\n")

            sites = list(sites)
            sites.sort()
            nmapped, nunmapped = 0, 0
            for site in sites:
                values = [site, "%6.4f" % significance[site]]

                if map_old2new:
                    r = map_old2new.mapRowToCol(site)
                    if r == 0:
                        values.append("na")
                        values.append("")
                        nunmapped += 1
                        if options.loglevel >= 2:
                            options.stdlog.write(
                                "# unmapped residue: %i\n" % site)
                    else:
                        values.append(r)
                        values.append(consensus[r - 1])
                        nmapped += 1

                options.stdout.write("\t".join(map(str, (values))) + "\n")

            if options.loglevel >= 1:
                options.stdlog.write("# sites: ninput=%i, noutput=%i, nskipped=%i\n" % (
                    len(sites), nmapped, nunmapped))

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
def main( argv = None ):
    
    if argv is None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: r_table2scatter.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-c", "--columns", dest="columns", type="string",
                      help="columns to take from table. Choices are 'all', 'all-but-first' or a ','-separated list of columns." )
    
    parser.add_option( "--logscale", dest="logscale", type="string",
                      help="log-transform one or both axes [default=%Default]."  )

    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file [default=%default].", 
                      metavar = "FILE" )

    parser.add_option("-f", "--file", dest="input_filename", type="string",
                      help="filename with table data [default=%default].",
                      metavar = "FILE")

    parser.add_option("-2", "--file2", dest="input_filename2", type="string",
                      help="additional data file [default=%default].",
                      metavar = "FILE")

    parser.add_option("-s", "--stats", dest="statistics", type="choice",
                      choices=("correlation", "spearman", "pearson", "count"),
                      help="statistical quantities to compute [default=%default]",
                      action = "append")
    
    parser.add_option("-p", "--plot", dest="plot", type="choice",
                      choices=("scatter", "pairs", "panel", "bar", "bar-stacked", 
                               "bar-besides", "1_vs_x", "matched", "boxplot", "scatter+marginal",
                               "scatter-regression" ),
                      help="plots to plot [default=%default]",
                      action = "append")

    parser.add_option("-t", "--threshold", dest="threshold", type="float",
                      help="min threshold to use for counting method [default=%default].")

    parser.add_option("-o", "--colours", dest="colours", type="int",
                      help="column with colour information [default=%default].")

    parser.add_option("-l", "--labels", dest="labels", type="string",
                      help="column labels for x and y in matched plots [default=%default].")

    parser.add_option("-d", "--add-diagonal", dest="add_diagonal", action="store_true",
                      help="add diagonal to plot [default=%default].")
    
    parser.add_option("-e", "--legend", dest="legend", type="int",
                      help="column with legend [default=%default].")

    parser.add_option("-r", "--options", dest="r_options", type="string",
                      help="R plotting options [default=%default].")
    
    parser.add_option("--format", dest="format", type="choice",
                      choices=("full", "sparse"),
                      help="output format [default=%default]." )

    parser.add_option( "--title", dest="title", type="string",
                       help="""plot title [default=%default].""")

    parser.add_option("", "--xrange", dest="xrange", type="string",
                      help="x viewing range of plot [default=%default]."  )

    parser.add_option("", "--yrange", dest="yrange", type="string",
                      help="y viewing range of plot[default=%default]."  )

    parser.add_option( "--allow-empty", dest="fail_on_empty", action="store_false",
                      help="do not fail on empty input [default=%default].")

    parser.add_option( "--fail-on-empty", dest="fail_on_empty", action="store_true",
                      help="fail on empty input [default=%default].")

    parser.set_defaults(
        hardcopy = None,
        input_filename = "",
        input_filename2 = None,
        columns = "all",
        logscale = None,
        statistics = [],
        plot=[],
        threshold=0.0,
        labels = "x,y",
        colours= None,
        add_diagonal = False,
        legend = None,
        title = None,
        xrange = None,
        yrange = None,        
        r_options = "",
        fail_on_empty = True,
        format = "full")

    (options, args) = E.Start( parser )

    if len(args) == 1 and not options.input_filename:
        options.input_filename = args[0]

    if options.columns not in ("all", "all-but-first"):
        options.columns = [int(x) - 1 for x in options.columns.split(",")]
        
    if options.colours: options.colours -= 1
    if options.legend: options.legend -= 1
    
    table ={}
    headers = []

    ## read data matrix
    if options.input_filename:
        lines = open(options.input_filename, "r").readlines()
    else:
        ## note: this will not work for interactive viewing, but
        ## creating hardcopy plots works.
        lines = sys.stdin.readlines()

    lines = [x for x in lines if x[0] != "#"]
    
    if len(lines) == 0:
        if options.fail_on_empty:
            raise IOError ( "no input" )
        E.warn( "empty input" )
        E.Stop()
        return

    matrix, headers, colours, legend = readTable( lines,
                                                  "matrix",
                                                  take_columns = options.columns,
                                                  headers=True,
                                                  colours=options.colours,
                                                  row_names = options.legend )

    if options.input_filename2:
        ## read a second matrix (expected to be in the same format);
        ## note: read from input_filename2, not the first file's lines
        lines2 = [x for x in open(options.input_filename2, "r").readlines()
                  if x[0] != "#"]
        matrix2, headers2, colours2, legend2 = readTable( lines2,
                                                 "matrix2",
                                                 take_columns = options.columns,
                                                 headers=True,
                                                 colours=options.colours,
                                                 row_names = options.legend )
    R.assign("headers", headers)

    ndata = R( """length( matrix[,1] )""" )[0]

    if options.loglevel >=1:
        options.stdlog.write("# read matrix: %ix%i\n" % (len(headers),ndata) )

    if colours:
        R.assign("colours", colours)

    for method in options.statistics:

        if method == "correlation":
            cor = R.cor(matrix, use="pairwise.complete.obs" )
            writeMatrix( sys.stdout, cor, headers=headers, format = "%5.2f" )

        elif method == "pearson":
            options.stdout.write( "\t".join( ("var1", 
                                              "var2",
                                              "coeff",
                                              "passed",
                                              "pvalue",
                                              "n",
                                              "method",
                                              "alternative" )) + "\n" )
            for x in range(len(headers)-1):
                for y in range( x+1, len(headers)):
                    try:
                        result = R("""cor.test( matrix[,%i], matrix[,%i] )""" % (x + 1, y + 1))
                    except rpy.RPyException as msg:
                        E.warn( "correlation not computed for columns %i(%s) and %i(%s): %s" % (x, headers[x], y, headers[y], msg) )
                        options.stdout.write( "%s\t%s\t%s\t%s\t%s\t%i\t%s\t%s\n" % \
                                                  (headers[x], headers[y],
                                                   "na",
                                                   "na",
                                                   "na",
                                                   0,
                                                   "na",
                                                   "na" ))

                    else:
                        options.stdout.write( "%s\t%s\t%6.4f\t%s\t%e\t%i\t%s\t%s\n" % \
                                                  (headers[x], headers[y],
                                                   result.rx2('estimate').rx2('cor')[0], 
                                                   Stats.getSignificance( float(result.rx2('p.value')[0]) ),
                                                   result.rx2('p.value')[0],
                                                   result.rx2('parameter').rx2('df')[0],
                                                   result.rx2('method')[0], 
                                                   result.rx2('alternative')[0]) )

        elif method == "spearman":
            options.stdout.write( "\t".join( ("var1", "var2",
                                              "coeff",
                                              "passed",
                                              "pvalue",
                                              "method",
                                              "alternative" )) + "\n" )
            for x in range(len(headers)-1):
                for y in range( x+1, len(headers)):
                    result = R("""cor.test( matrix[,%i], matrix[,%i], method='spearman' )""" % (x + 1, y + 1))
                    options.stdout.write( "%s\t%s\t%6.4f\t%s\t%e\t%i\t%s\t%s\n" % \
                                              (headers[x], headers[y],
                                               result['estimate']['rho'], 
                                               Stats.getSignificance( float(result['p.value']) ),
                                               result['p.value'],
                                               result['parameter']['df'],
                                               result['method'], 
                                               result['alternative']))