Example #1
def runRegexMotifSearch(infiles, outfile):
    '''run a regular expression search on sequences.
    compute counts.
    '''

    motif = "[AG]G[GT]T[CG]A"
    reverse_motif = "T[GC]A[CA]C[TC]"
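    # the second pattern is the reverse complement of the first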

    controlfile, dbfile = infiles
    if not os.path.exists(controlfile):
        raise P.PipelineError("control file %s for %s does not exist" %
                              (controlfile, dbfile))

    motifs = []
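    # DRx: two copies of the motif separated by x arbitrary bases;
    # ERx: the motif followed, after x bases, by its reverse complement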
    for x in range(0, 15):
        motifs.append(
            ("DR%i" % x, re.compile(motif + "." * x + motif, re.IGNORECASE)))
    for x in range(0, 15):
        motifs.append(("ER%i" % x,
                       re.compile(motif + "." * x + reverse_motif,
                                  re.IGNORECASE)))

    db_positions = Motifs.countMotifs(IOTools.openFile(dbfile, "r"), motifs)
    control_positions = Motifs.countMotifs(IOTools.openFile(controlfile, "r"),
                                           motifs)

    db_counts, control_counts = Motifs.getCounts(
        db_positions), Motifs.getCounts(control_positions)
    db_seqcounts, control_seqcounts = Motifs.getOccurances(
        db_positions), Motifs.getOccurances(control_positions)

    ndb, ncontrol = len(db_positions), len(control_positions)
    outf = IOTools.openFile(outfile, "w")
    outf.write(
        "motif\tmotifs_db\tmotifs_control\tseq_db\tseq_db_percent\tseq_control\tseq_control_percent\tfold\n"
    )
    for motif, pattern in motifs:
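        # fold enrichment: (fraction of db sequences with the motif) /
        #                  (fraction of control sequences with the motif)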
        try:
            fold = float(db_seqcounts[motif]) * ncontrol / (
                ndb * control_seqcounts[motif])
        except ZeroDivisionError:
            fold = 0

        outf.write( "%s\t%i\t%i\t%i\t%s\t%i\t%s\t%5.2f\n" % \
                    (motif,
                     db_counts[motif],
                     control_counts[motif],
                     db_seqcounts[motif],
                     IOTools.prettyPercent( db_seqcounts[motif], ndb),
                     control_seqcounts[motif],
                     IOTools.prettyPercent( control_seqcounts[motif], ncontrol),
                     fold) )
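Every snippet on this page feeds a numerator/denominator pair (sometimes with an explicit format string) to IOTools.prettyPercent and writes the result into a tab-separated report. The cgat implementation itself is not reproduced here; a minimal stand-in with the behaviour these call sites rely on, a formatted percentage that degrades to a placeholder instead of raising ZeroDivisionError, might look like the sketch below. The default format and the "na" placeholder are assumptions, not taken from the cgat source.

def prettyPercent(numerator, denominator, format="%5.2f", na="na"):
    """Return numerator/denominator as a formatted percentage string.

    Assumed behaviour: a zero denominator yields the placeholder instead of
    raising ZeroDivisionError, so report rows are always written.
    """
    try:
        return format % (100.0 * numerator / denominator)
    except ZeroDivisionError:
        return na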
Example #2
def runRegexMotifSearch(infiles, outfile):
    '''run a regular expression search on sequences.
    compute counts.
    '''

    motif = "[AG]G[GT]T[CG]A"
    reverse_motif = "T[GC]A[CA]C[TC]"

    controlfile, dbfile = infiles
    if not os.path.exists(controlfile):
        raise P.PipelineError(
            "control file %s for %s does not exist" % (controlfile, dbfile))

    motifs = []
    for x in range(0, 15):
        motifs.append(
            ("DR%i" % x, re.compile(motif + "." * x + motif, re.IGNORECASE)))
    for x in range(0, 15):
        motifs.append(
            ("ER%i" % x, re.compile(motif + "." * x + reverse_motif, re.IGNORECASE)))

    db_positions = Motifs.countMotifs(IOTools.openFile(dbfile, "r"), motifs)
    control_positions = Motifs.countMotifs(
        IOTools.openFile(controlfile, "r"), motifs)

    db_counts, control_counts = Motifs.getCounts(
        db_positions), Motifs.getCounts(control_positions)
    db_seqcounts, control_seqcounts = Motifs.getOccurances(
        db_positions), Motifs.getOccurances(control_positions)

    ndb, ncontrol = len(db_positions), len(control_positions)
    outf = IOTools.openFile(outfile, "w")
    outf.write(
        "motif\tmotifs_db\tmotifs_control\tseq_db\tseq_db_percent\tseq_control\tseq_control_percent\tfold\n")
    for motif, pattern in motifs:
        try:
            fold = float(db_seqcounts[motif]) * \
                ncontrol / (ndb * control_seqcounts[motif])
        except ZeroDivisionError:
            fold = 0

        outf.write("%s\t%i\t%i\t%i\t%s\t%i\t%s\t%5.2f\n" %
                   (motif,
                    db_counts[motif],
                    control_counts[motif],
                    db_seqcounts[motif],
                    IOTools.prettyPercent(db_seqcounts[motif], ndb),
                    control_seqcounts[motif],
                    IOTools.prettyPercent(control_seqcounts[motif], ncontrol),
                    fold))
Example #3
def summary(infile, outfile):
    """compute mapping stats."""

    def _getfiles(filename):

        track = filename[: -len(".mapped.summary")]
        if track.endswith(".merged"):
            xtrack = track[: -len(".merged")]
            finput = "%s.psl.gz" % xtrack
            fmerged = "%s.transcripts.transcripts.psl" % xtrack
            fmapped = "%s.mapped.psl" % track
        else:
            finput = "%s.psl.gz" % track
            fmerged = finput
            fmapped = "%s.mapped.psl" % track
        return track, finput, fmerged, fmapped

    outf = open(outfile, "w")
    outf.write("track\tinput\tmerged\tpmerged\tmapped\tpmapped\tpoutput\n")

    def countPSL(filename):
        if filename.endswith(".gz"):
            i = gzip.open(filename)
        else:
            i = open(filename)
        ll = [x[:10] for x in i.readlines() if not x.startswith("#")]
        if ll[0].startswith("psLayout"):
            return len(ll) - 5
        else:
            return len(ll)

    track, finput, fmerged, fmapped = _getfiles(outfile)
    ninput = countPSL(finput)
    # subtract header
    nmerged = countPSL(fmerged) - 5
    nmapped = countPSL(fmapped)

    outf.write(
        "%s\t%i\t%i\t%s\t%i\t%s\t%s\n"
        % (
            track,
            ninput,
            nmerged,
            IOTools.prettyPercent(nmerged, ninput),
            nmapped,
            IOTools.prettyPercent(nmapped, nmerged),
            IOTools.prettyPercent(nmapped, ninput),
        )
    )
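countPSL above never closes its file handle and raises an IndexError on an empty file; under Python 3 the gzip branch also yields bytes rather than text. A more defensive variant, a sketch only and not taken from the project, could be:

import gzip


def count_psl_records(filename):
    """Count alignment records in a PSL file, skipping the 5-line psLayout header."""
    opener = gzip.open if filename.endswith(".gz") else open
    # "rt" gives text mode for both plain and gzipped input
    with opener(filename, "rt") as handle:
        lines = [line for line in handle if not line.startswith("#")]
    if lines and lines[0].startswith("psLayout"):
        return len(lines) - 5
    return len(lines)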
Example #4
    def __str__(self):
        """return string representation."""
        return "%i\t%i\t%s\t%i\t%i\t%s\t%s\t%6.4e\t%6.4e\t%6.4e" % \
            (self.mSampleCountsCategory,
             self.mSampleCountsTotal,
             IOTools.prettyPercent(
                 self.mSampleCountsCategory, self.mSampleCountsTotal),
             self.mBackgroundCountsCategory,
             self.mBackgroundCountsTotal,
             IOTools.prettyPercent(
                 self.mBackgroundCountsCategory, self.mBackgroundCountsTotal),
             IOTools.val2str(self.mRatio),
             self.mPValue,
             self.mProbabilityOverRepresentation,
             self.mProbabilityUnderRepresentation)
Example #6
File: diff_gtf.py Project: SCV/cgat
    def __str__(self):
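        # 18 raw counts (genes, exons, bases for both sets), followed by
        # 12 overlap/unique percentages, all tab-separated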

        return "\t".join(map(str, (
            self.mGenes1, self.mGenes2,
            self.mGenesOverlapping1, self.mGenesOverlapping2,
            self.mGenesUnique1, self.mGenesUnique2,
            self.mExons1, self.mExons2,
            self.mExonsOverlapping1, self.mExonsOverlapping2,
            self.mExonsUnique1, self.mExonsUnique2,
            self.mBases1, self.mBases2,
            self.mBasesOverlapping1, self.mBasesOverlapping2,
            self.mBasesUnique1, self.mBasesUnique2 ) ) ) + "\t" +\
            "\t".join(map(lambda x: IOTools.prettyPercent(*x), (
                (self.mGenesOverlapping1, self.mGenes1),
                (self.mGenesOverlapping2, self.mGenes2),
                (self.mGenesUnique1, self.mGenes1),
                (self.mGenesUnique2, self.mGenes2),
                (self.mExonsOverlapping1, self.mExons1),
                (self.mExonsOverlapping2, self.mExons2),
                (self.mExonsUnique1, self.mExons1),
                (self.mExonsUnique2, self.mExons2),
                (self.mBasesOverlapping1, self.mBases1),
                (self.mBasesOverlapping2, self.mBases2),
                (self.mBasesUnique1, self.mBases1),
                (self.mBasesUnique2, self.mBases2))))
Example #7
    def __str__(self):

        return "\t".join( map( str, (
                        self.mGenes1, self.mGenes2,
                        self.mGenesOverlapping1, self.mGenesOverlapping2,
                        self.mGenesUnique1, self.mGenesUnique2,
                        self.mExons1, self.mExons2,
                        self.mExonsOverlapping1, self.mExonsOverlapping2,
                        self.mExonsUnique1, self.mExonsUnique2,
                        self.mBases1, self.mBases2,
                        self.mBasesOverlapping1, self.mBasesOverlapping2,
                        self.mBasesUnique1, self.mBasesUnique2 ) ) ) + "\t" +\
              "\t".join( map( lambda x: IOTools.prettyPercent( *x), (
                    (self.mGenesOverlapping1, self.mGenes1),
                    (self.mGenesOverlapping2, self.mGenes2),
                    (self.mGenesUnique1, self.mGenes1),
                    (self.mGenesUnique2, self.mGenes2),
                    (self.mExonsOverlapping1, self.mExons1),
                    (self.mExonsOverlapping2, self.mExons2),
                    (self.mExonsUnique1, self.mExons1),
                    (self.mExonsUnique2, self.mExons2),
                    (self.mBasesOverlapping1, self.mBases1),
                    (self.mBasesOverlapping2, self.mBases2),
                    (self.mBasesUnique1, self.mBases1),
                    (self.mBasesUnique2, self.mBases2)  ) ) )
Example #8
def summary(infile, outfile):
    '''compute mapping stats.'''

    def _getfiles(filename):

        track = filename[:-len(".mapped.summary")]
        if track.endswith(".merged"):
            xtrack = track[:-len(".merged")]
            finput = "%s.psl.gz" % xtrack
            fmerged = "%s.transcripts.transcripts.psl" % xtrack
            fmapped = "%s.mapped.psl" % track
        else:
            finput = "%s.psl.gz" % track
            fmerged = finput
            fmapped = "%s.mapped.psl" % track
        return track, finput, fmerged, fmapped

    outf = open(outfile, "w")
    outf.write("track\tinput\tmerged\tpmerged\tmapped\tpmapped\tpoutput\n")

    def countPSL(filename):
        if filename.endswith(".gz"):
            i = gzip.open(filename)
        else:
            i = open(filename)
        ll = [x[:10] for x in i.readlines() if not x.startswith("#")]
        if ll[0].startswith("psLayout"):
            return len(ll) - 5
        else:
            return len(ll)

    track, finput, fmerged, fmapped = _getfiles(outfile)
    ninput = countPSL(finput)
    # subtract header
    nmerged = countPSL(fmerged) - 5
    nmapped = countPSL(fmapped)

    outf.write("%s\t%i\t%i\t%s\t%i\t%s\t%s\n" %
               (track,
                ninput,
                nmerged,
                IOTools.prettyPercent(nmerged, ninput),
                nmapped,
                IOTools.prettyPercent(nmapped, nmerged),
                IOTools.prettyPercent(nmapped, ninput)))
Example #9
def _write(outs, text, numerator, denominator, base):
    percent = IOTools.prettyPercent(numerator, denominator)
    outs.write('%s\t%i\t%s\t%s\n' % (text, numerator, percent, base))
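A hypothetical call of this helper, with made-up counts, emits one tab-separated row; the exact percentage text depends on IOTools.prettyPercent's format:

import sys

_write(sys.stdout, "reads_mapped", 4500, 5000, "of total input reads")
# writes: reads_mapped  4500  90.00  of total input reads (tab-separated)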
Example #10
File: runGO.py Project: yangjl/cgat
def main():

    parser = E.OptionParser( version = "%prog version: $Id: GO.py 2883 2010-04-07 08:46:22Z andreas $", usage = globals()["__doc__"])

    dbhandle = Database.Database()
    
    parser.add_option("-s", "--species", dest="species", type="string",
                      help="species to use [default=%default]." )

    parser.add_option("-i", "--slims", dest="filename_slims", type="string",
                      help="filename with GO SLIM categories [default=%default].")

    parser.add_option( "-g", "--genes", dest="filename_genes", type="string",
                       help="filename with genes to analyse [default=%default]." )

    parser.add_option( "-b", "--background", dest="filename_background", type="string",
                       help="filename with background genes to analyse [default=%default]." )

    parser.add_option( "-m", "--minimum-counts", dest="minimum_counts", type="int",
                       help="minimum count - ignore all categories that have fewer than # number of genes"
                            " [default=%default]." )

    parser.add_option( "-o", "--sort-order", dest="sort_order", type="choice",
                       choices=("fdr", "pvalue", "ratio" ),
                       help="output sort order [default=%default]." )

    parser.add_option( "--ontology", dest="ontology", type="string", action="append",
                       help="go ontologies to analyze. Ontologies are tested separately."
                       " [default=%default]." )

    parser.add_option( "-t", "--threshold", dest="threshold", type="float",
                       help="significance threshold [>1.0 = all ]. If --fdr is set, this refers to the fdr, otherwise it is a cutoff for p-values." )

    parser.add_option ("--filename-dump", dest="filename_dump", type="string",
                       help="dump GO category assignments into a flatfile [default=%default]." )

    parser.add_option ("--filename-gene2name", dest="filename_gene2name", type="string",
                       help="optional filename mapping gene identifiers to gene names [default=%default]." )

    parser.add_option ("--filename-ontology", dest="filename_ontology", type="string",
                       help="filename with ontology in OBO format [default=%default]." )

    parser.add_option ( "--filename-input", dest="filename_input", type="string",
                       help="read GO category assignments from a flatfile [default=%default]." )

    parser.add_option ( "--sample-size", dest="sample", type="int",
                        help="do sampling (with # samples) [default=%default]." )

    parser.add_option ( "--filename-output-pattern", "--output-filename-pattern", 
                        dest = "output_filename_pattern", type="string",
                        help="pattern with output filename pattern (should contain: %(go)s and %(section)s ) [default=%default]")

    parser.add_option ( "--fdr", dest="fdr", action="store_true",
                        help="calculate and filter by FDR [ReadGene2GOFromFiledefault=%default]." )
    
    parser.add_option ( "--go2goslim", dest="go2goslim", action="store_true",
                        help="convert go assignments in STDIN to goslim assignments and write to STDOUT [default=%default]." )

    parser.add_option ( "--gene-pattern", dest = "gene_pattern", type="string",
                        help="pattern to transform identifiers to GO gene names [default=%default].")
    
    parser.add_option( "--filename-map-slims", dest="filename_map_slims", type="string",
                       help="write mapping between GO categories and GOSlims [default=%default].")

    parser.add_option( "--get-genes", dest="get_genes", type="string",
                       help="list all genes in the with a certain GOID [default=%default]." )

    parser.add_option( "--strict", dest="strict", action="store_true",
                       help="require all genes in foreground to be part of background. "
                       "If not set, genes in foreground will be added to the background [default=%default]." )

    parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice",
                      choices = ( "empirical", "storey", "BH" ),
                      help="method to perform multiple testing correction by controlling the fdr [default=%default]."  )


    parser.add_option( "--pairwise", dest="compute_pairwise", action="store_true",
                       help="compute pairwise enrichment for multiple gene lists. "
                       "[default=%default]." )

    # parser.add_option( "--qvalue-lambda", dest="qvalue_lambda", type="float",
    #                   help="fdr computation: lambda [default=%default]."  )

    # parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
    #                    choices = ("smoother", "bootstrap" ),
    #                    help="fdr computation: method for estimating pi0 [default=%default]."  )
    
    parser.set_defaults( species = None,
                         filename_genes = "-",
                         filename_background = None,
                         filename_slims = None,
                         minimum_counts = 0,
                         ontology = [],
                         filename_dump = None,
                         sample = 0,
                         fdr = False,
                         output_filename_pattern = None,
                         threshold = 0.05,
                         filename_map_slims = None,
                         gene_pattern = None,
                         sort_order = "ratio",
                         get_genes = None,
                         strict = False,
                         qvalue_method = "empirical",
                         pairs_min_observed_counts = 3,
                         compute_pairwise = False,
                         filename_gene2name = None
                         )

    (options, args) = E.Start( parser, add_mysql_options = True )

    if options.go2goslim:
        convertGo2Goslim( options )
        E.Stop()
        sys.exit(0)

    if options.fdr and options.sample == 0:
        E.warn( "fdr will be computed without sampling" )
        
    #############################################################
    ## dump GO
    if options.filename_dump:
        # default to the three GO ontologies
        if not options.ontology:
            options.ontology = ["biol_process", "mol_function", "cell_location"] 

        E.info( "dumping GO categories to %s" % (options.filename_dump) )

        dbhandle.Connect( options )
            
        outfile = IOTools.openFile( options.filename_dump, "w", create_dir = True )
        DumpGOFromDatabase( outfile,
                            dbhandle,
                            options )
        outfile.close()
        E.Stop()
        sys.exit(0)

    #############################################################
    ## read GO categories from file
    if options.filename_input:
        E.info( "reading association of categories and genes from %s" % (options.filename_input) )
        infile = IOTools.openFile(options.filename_input)
        gene2gos, go2infos = ReadGene2GOFromFile( infile )
        infile.close()

    if options.filename_gene2name:
        E.info("reading gene identifier to gene name mapping from %s" % options.filename_gene2name)
        infile = IOTools.openFile( options.filename_gene2name)
        gene2name = IOTools.readMap( infile, has_header = True )
        infile.close()
        E.info("read %i gene names for %i gene identifiers" % (len(set(gene2name.values())),
                                                               len(gene2name)))
    else:
        gene2name = None

    #############################################################
    ## read GO ontology from file
    if options.filename_ontology:
        E.info( "reading ontology from %s" % (options.filename_ontology) )
        
        infile = IOTools.openFile(options.filename_ontology)
        ontology = readOntology( infile )
        infile.close()
        
        def _g():
            return collections.defaultdict( GOInfo )
        go2infos = collections.defaultdict( _g );

        ## substitute go2infos
        for go in ontology.values():
            go2infos[go.mNameSpace][go.mId] = GOInfo( go.mId,
                                                      go_type = go.mNameSpace,
                                                      description = go.mName )

    #############################################################
    ## get foreground gene list
    input_foreground, genelists = ReadGeneLists( options.filename_genes, 
                                                 gene_pattern = options.gene_pattern )

    E.info( "read %i genes for forground in %i gene lists" % (len(input_foreground), len(genelists)) )

    #############################################################
    ## get background
    if options.filename_background:
        
        # nick - bug fix: background is the first tuple element from ReadGeneLists
        input_background = ReadGeneLists( options.filename_background, 
                                          gene_pattern = options.gene_pattern )[0]
        E.info( "read %i genes for background" % len(input_background) )
    else:
        input_background = None

    #############################################################
    ## sort out which ontologies to test
    if not options.ontology: 
        if options.filename_input:
            options.ontology = gene2gos.keys()

    E.info( "found %i ontologies: %s" % (len(options.ontology), options.ontology))

    summary = []
    summary.append( "\t".join( (
                "genelist",
                "ontology",
                "significant",
                "threshold",
                "ngenes",
                "ncategories",
                "nmaps",
                "nforegound",
                "nforeground_mapped",
                "nbackground",
                "nbackground_mapped",
                "nsample_counts",
                "nbackground_counts",
                "psample_assignments",
                "pbackground_assignments") ) + "\n" )


    #############################################################
    ## get go categories for genes
    for test_ontology in options.ontology:

        # store results for aggregate output of multiple gene lists
        all_results = []
        all_significant_results = []
        all_genelists_with_results = []

        E.info( "working on ontology %s" % test_ontology )
        #############################################################
        ## get/read association of GO categories to genes
        if options.filename_input:
            gene2go, go2info = gene2gos[test_ontology], go2infos[test_ontology]
        else:
            E.info( "reading data from database ..." )

            dbhandle.Connect( options )
            gene2go, go2info = ReadGene2GOFromDatabase( dbhandle,
                                                        test_ontology,
                                                        options.database, options.species )

            E.info( "finished" )

        if len(go2info) == 0:
            E.warn( "could not find information for terms - could be mismatch between ontologies")

        ngenes, ncategories, nmaps, counts_per_category = CountGO( gene2go )        
        E.info( "assignments found: %i genes mapped to %i categories (%i maps)" % (ngenes, ncategories, nmaps) )

        if options.minimum_counts > 0:
            to_remove = set([ x for x,y in counts_per_category.iteritems() if y < options.minimum_counts ])
            E.info("removing %i categories with less than %i genes" % (len(to_remove), options.minimum_counts ) )
            removeCategories( gene2go, to_remove )

            ngenes, ncategories, nmaps, counts_per_category = CountGO( gene2go )        
            E.info( "assignments after filtering: %i genes mapped to %i categories (%i maps)" % (ngenes, ncategories, nmaps) )

        for genelist_name, foreground in genelists.iteritems():

            msgs = []
            E.info("processing %s with %i genes" % (genelist_name, len(foreground)))
            ##################################################################
            ##################################################################
            ##################################################################
            ## build background - reconcile with foreground
            ##################################################################
            if input_background == None:
                background = list(gene2go.keys())
            else:
                background = list(input_background)

            # nick - bug fix: background included the foreground in a tuple.
            # background is the first tuple element
            missing = foreground.difference(set(background))

            if options.strict:
                assert len(missing) == 0, \
                    "%i genes in foreground but not in background: %s" % (len(missing), str(missing))
            else:
                if len(missing) != 0:
                    E.warn( "%i genes in foreground that are not in background - added to background of %i" %\
                                (len(missing), len(background)) )

                background.extend(missing)

            E.info( "(unfiltered) foreground=%i, background=%i" % (len(foreground), len(background)))

            #############################################################
            ## sanity checks:            
            ## are all of the foreground genes in the dataset
            ## missing = set(genes).difference( set(gene2go.keys()) )
            ## assert len(missing) == 0, "%i genes in foreground set without GO annotation: %s" % (len(missing), str(missing))

            #############################################################            
            ## read GO slims and map GO categories to GO slim categories
            if options.filename_slims:
                go_slims = GetGOSlims( IOTools.openFile(options.filename_slims, "r") )

                if options.loglevel >=1:
                    v = set()
                    for x in go_slims.values():
                        for xx in x: v.add(xx)
                    options.stdlog.write( "# read go slims from %s: go=%i, slim=%i\n" %\
                                              ( options.filename_slims,
                                                len(go_slims), 
                                                len( v ) ))



                if options.filename_map_slims:
                    if options.filename_map_slims == "-":
                        outfile = options.stdout
                    else:
                        outfile=IOTools.openFile(options.filename_map_slims, "w" )

                    outfile.write( "GO\tGOSlim\n" )
                    for go, go_slim in go_slims.items():
                        outfile.write("%s\t%s\n" % (go, go_slim))

                    if outfile != options.stdout:
                        outfile.close()

                gene2go = MapGO2Slims( gene2go, go_slims, ontology = ontology )

                if options.loglevel >=1:
                    ngenes, ncategories, nmaps, counts_per_category = CountGO( gene2go )
                    options.stdlog.write( "# after go slim filtering: %i genes mapped to %i categories (%i maps)\n" % (ngenes, ncategories, nmaps) )

            #############################################################
            ## Just dump out the gene list
            if options.get_genes:
                fg, bg, ng = [], [], []

                for gene, vv in gene2go.items():
                    for v in vv:
                        if v.mGOId == options.get_genes:
                            if gene in foreground:
                                fg.append( gene )
                            elif gene in background:
                                bg.append( gene )
                            else:
                                ng.append( gene )

                ## skip to next GO class
                if not (bg or ng): continue

                options.stdout.write( "# genes in GO category %s\n" % options.get_genes )
                options.stdout.write( "gene\tset\n" )
                for x in fg: options.stdout.write("%s\t%s\n" % ("fg", x))
                for x in bg: options.stdout.write("%s\t%s\n" % ("bg", x))           
                for x in ng: options.stdout.write("%s\t%s\n" % ("ng", x))                       

                E.info( "nfg=%i, nbg=%i, nng=%i" % (len(fg), len(bg), len(ng) ))

                E.Stop()
                sys.exit(0)

            #############################################################
            outfile = getFileName( options, 
                                   go = test_ontology,
                                   section = 'foreground',
                                   set = genelist_name )

            outfile.write ("gene_id\n%s\n" % ("\n".join( sorted( foreground) ) ) )
            if options.output_filename_pattern:
                outfile.close()

            outfile = getFileName( options, 
                                   go = test_ontology,
                                   section = 'background',
                                   set = genelist_name )

            outfile.write ("gene_id\n%s\n" % ("\n".join( sorted( background[0]) ) ) )
            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            ## do the analysis
            go_results = AnalyseGO( gene2go, foreground, background )

            if len(go_results.mSampleGenes) == 0:
                E.warn( "%s: no genes with GO categories - analysis aborted" % genelist_name)
                continue

            pairs = go_results.mResults.items()

            #############################################################
            ## calculate fdr for each hypothesis
            if options.fdr:
                fdrs, samples, method  = computeFDRs( go_results, 
                                                      foreground,
                                                      background,
                                                      options, 
                                                      test_ontology,
                                                      gene2go,
                                                      go2info)
                for x,v in enumerate(pairs):
                    v[1].mQValue = fdrs[v[0]][0]
            else:
                fdrs, samples, method = {}, {}, None
                
            msgs.append( "fdr=%s" % method)

            if options.sort_order == "fdr":
                pairs.sort( lambda x, y: cmp(x[1].mQValue, y[1].mQValue))           
            elif options.sort_order == "ratio":
                pairs.sort( lambda x, y: cmp(x[1].mRatio, y[1].mRatio))
            elif options.sort_order == "pvalue":
                pairs.sort( lambda x, y: cmp(x[1].mPValue, y[1].mPValue))

            #############################################################
            #############################################################
            #############################################################
            ## output the full result
            outfile = getFileName( options, 
                                   go = test_ontology,
                                   section = 'overall',
                                   set = genelist_name )

            outputResults( outfile, pairs, go2info, options, fdrs = fdrs, samples = samples )

            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            #############################################################
            #############################################################
            # filter significant results and output
            filtered_pairs = selectSignificantResults( pairs, fdrs, options )

            nselected = len(filtered_pairs)
            nselected_up = len( [x for x in filtered_pairs if x[1].mRatio > 1 ] )
            nselected_down = len( [x for x in filtered_pairs if x[1].mRatio < 1 ] )
            
            assert nselected_up + nselected_down == nselected

            outfile = getFileName( options, 
                                   go = test_ontology,
                                   section = 'results',
                                   set = genelist_name )

            outputResults( outfile, 
                           filtered_pairs, 
                           go2info, 
                           options,
                           fdrs = fdrs, 
                           samples = samples )
            
            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            #############################################################
            #############################################################
            # save results for multi-gene-list analysis
            all_results.append( pairs )
            all_significant_results.append( filtered_pairs )
            all_genelists_with_results.append( genelist_name )

            #############################################################
            #############################################################
            #############################################################
            ## output parameters
            ngenes, ncategories, nmaps, counts_per_category = CountGO( gene2go )

            outfile = getFileName( options, 
                                   go = test_ontology,
                                   section = 'parameters',
                                   set = genelist_name )

            nbackground = len(background)
            if nbackground == 0:
                nbackground = len(go_results.mBackgroundGenes)

            outfile.write( "# input go mappings for gene list '%s' and category '%s'\n" % (genelist_name, test_ontology ))
            outfile.write( "parameter\tvalue\tdescription\n" )
            outfile.write( "mapped_genes\t%i\tmapped genes\n" % ngenes )
            outfile.write( "mapped_categories\t%i\tmapped categories\n" % ncategories )
            outfile.write( "mappings\t%i\tmappings\n" % nmaps )
            outfile.write( "genes_in_fg\t%i\tgenes in foreground\n" % len(foreground) )
            outfile.write( "genes_in_fg_with_assignment\t%i\tgenes in foreground with GO assignments\n" % (len(go_results.mSampleGenes)) )
            outfile.write( "genes_in_bg\t%i\tinput background\n" % nbackground )
            outfile.write( "genes_in_bg_with_assignment\t%i\tgenes in background with GO assignments\n" % (len(go_results.mBackgroundGenes)) )
            outfile.write( "associations_in_fg\t%i\tassociations in sample\n"     % go_results.mSampleCountsTotal )
            outfile.write( "associations_in_bg\t%i\tassociations in background\n" % go_results.mBackgroundCountsTotal )
            outfile.write( "percent_genes_in_fg_with_association\t%s\tpercent genes in sample with GO assignments\n" % (IOTools.prettyPercent( len(go_results.mSampleGenes) , len(foreground), "%5.2f" )))
            outfile.write( "percent_genes_in_bg_with_associations\t%s\tpercent genes background with GO assignments\n" % (IOTools.prettyPercent( len(go_results.mBackgroundGenes), nbackground, "%5.2f" )))
            outfile.write( "significant\t%i\tsignificant results reported\n" % nselected )
            outfile.write( "significant_up\t%i\tsignificant up-regulated results reported\n" % nselected_up )
            outfile.write( "significant_down\t%i\tsignificant up-regulated results reported\n" % nselected_down )
            outfile.write( "threshold\t%6.4f\tsignificance threshold\n" % options.threshold )        

            if options.output_filename_pattern:
                outfile.close()

            summary.append( "\t".join( map(str, ( \
                                                genelist_name,
                                                  test_ontology,
                                                  nselected,
                                                  options.threshold,
                                                  ngenes,
                                                  ncategories,
                                                  nmaps,
                                                  len(foreground),
                                                  len(go_results.mSampleGenes),
                                                  nbackground,
                                                  len(go_results.mBackgroundGenes),
                                                  go_results.mSampleCountsTotal,
                                                  go_results.mBackgroundCountsTotal,
                                                  IOTools.prettyPercent( len(go_results.mSampleGenes) , len(foreground), "%5.2f" ),
                                                  IOTools.prettyPercent( len(go_results.mBackgroundGenes), nbackground, "%5.2f" ),
                                                  ",".join( msgs) ) ) ) + "\n" )

            #############################################################
            #############################################################
            #############################################################
            ## output the fg patterns
            outfile = getFileName( options, 
                                   go = test_ontology,
                                   section = 'withgenes',
                                   set = genelist_name )

            outputResults( outfile, pairs, go2info, options, 
                           fdrs = fdrs, 
                           samples = samples,
                           gene2go = gene2go,
                           foreground = foreground,
                           gene2name = gene2name )
            
            if options.output_filename_pattern:
                outfile.close()

        if len(genelists) > 1:

            ######################################################################
            ######################################################################
            ######################################################################
            ## output various summary files
            ## significant results
            outputMultipleGeneListResults( all_significant_results, 
                                           all_genelists_with_results, 
                                           test_ontology, 
                                           go2info,
                                           options,
                                           section = 'significant')

            ## all results
            outputMultipleGeneListResults( all_results, 
                                           all_genelists_with_results, 
                                           test_ontology, 
                                           go2info,
                                           options,
                                           section = 'all')

            
            if options.compute_pairwise:
                pairwiseGOEnrichment( all_results,
                                      all_genelists_with_results,
                                      test_ontology,
                                      go2info,
                                      options )

    outfile_summary = options.stdout
    outfile_summary.write( "".join( summary) )



    E.Stop()
Example #11
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser( version = "%prog version: $Id: chain2psl.py 2899 2010-04-13 14:37:37Z andreas $", 
                                    usage = globals()["__doc__"] )

    parser.add_option( "-m", "--output-mismatches", dest="output_mismatches", action = "store_true",
                       help = "output mismatches [%default]" )

    parser.add_option( "-a", "--output-matches", dest="output_matches", action = "store_true",
                       help = "output matches [%default]" )

    parser.add_option( "-u", "--output-unique", dest="output_unique", action = "store_true",
                       help = "output unique positions [%default]" )

    parser.add_option( "-r", "--restrict", dest="restrict", type = "string",
                       help = "restrict analysis to a chromosome pair (chr1:chr1:+) [%default]" )

    parser.set_defaults(
        output_mismatches = False,
        output_matches = False,
        output_unique = False,
        restrict = None
        )


    ## add common options (-h/--help, ...) and parse command line 
    (options, args) = E.Start( parser, argv = argv )

    if len(args) != 2:
        raise ValueError( "expected two chain files" )

    filename_chain1, filename_chain2 = args

    E.info( "validating chain 1")
    if not validateChain( IOTools.openFile( filename_chain1 ) ):
        E.warn( "validation failed - exiting" )
        return 1
        
    E.info( "validating chain 2")
    if not validateChain( IOTools.openFile( filename_chain2 ) ):
        E.warn( "validation failed - exiting" )
        return 1

    E.info( "building pairs for %s" % filename_chain1 )
    pairs1 = buildPairs( IOTools.openFile( filename_chain1 ) )
    E.info( "read %i pairs" % len(pairs1) )

    E.info( "building pairs for %s" % filename_chain2 )
    pairs2 = buildPairs( IOTools.openFile( filename_chain2 ) )
    E.info( "read %i pairs" % len(pairs2) )

    if options.restrict: 
        restrict = tuple(options.restrict.split(":"))
        pairs1 = { restrict: pairs1[restrict] }
        pairs2 = { restrict: pairs2[restrict] }

    E.info( "comparing 1 -> 2")
    comparison1 = compareChains( pairs1, pairs2 )
    E.info( "comparing 2 -> 1")
    comparison2 = compareChains( pairs2, pairs1 )

    all_keys = sorted(list( set(comparison1.keys() + comparison2.keys())))
    
    outfile = options.stdout
    headers = ("mapped", "identical", "different", "unique")
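    # per contig/strand pair: four counts and four percentages for chain1 -> chain2,
    # then the same eight columns for chain2 -> chain1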
    outfile.write( "contig1\tcontig2\tstrand\t%s\t%s\t%s\t%s\n" %\
                       ( 
            "\t".join( ["%s1" % x for x in headers ] ),
            "\t".join( ["p%s1" % x for x in headers ] ),
            "\t".join( ["%s2" % x for x in headers ] ),
            "\t".join( ["p%s2" % x for x in headers ] )))
                         
    totals = E.Counter()

    for key in all_keys:
        outfile.write( "%s\t%s\t%s" % key )
        
        if key in comparison1:
            c = comparison1[key]
            outfile.write( "\t%i\t%i\t%i\t%i\t" % (c.total, c.same, c.different, c.unique ) )
            outfile.write( "\t".join( [ IOTools.prettyPercent( x, c.total ) for x in c ] ) )

            totals.total1 += c.total
            totals.same1 += c.same
            totals.different1 += c.different
            totals.unique1 += c.unique
        else:
            outfile.write( "\t%i\t%i\t%i\t%i\t" % (0,0,0,0) )
            outfile.write( "\t%i\t%i\t%i\t%i" % (0,0,0,0) )

        if key in comparison2:
            c = comparison2[key] 
            outfile.write( "\t%i\t%i\t%i\t%i\t" % (c.total, c.same, c.different, c.unique ) )
            outfile.write( "\t".join( [ IOTools.prettyPercent( x, c.total ) for x in c ] ) )

            totals.same2 += c.same
            totals.total2 += c.total
            totals.different2 += c.different
            totals.unique2 += c.unique 
        else:
            outfile.write( "\t%i\t%i\t%i\t%i\t" % (0,0,0,0) )
            outfile.write( "\t%i\t%i\t%i\t%i" % (0,0,0,0) )

        outfile.write("\n")

    outfile.write( "total\ttotal\t.\t" )
    outfile.write( "\t".join( map(str, ( totals.total1,
                                         totals.same1, 
                                         totals.different1,
                                         totals.unique1,
                                         IOTools.prettyPercent( totals.total1, totals.total1 ),
                                         IOTools.prettyPercent( totals.same1, totals.total1 ),
                                         IOTools.prettyPercent( totals.different1, totals.total1 ),
                                         IOTools.prettyPercent( totals.unique1, totals.total1 ),
                                         totals.total2,
                                         totals.same2, 
                                         totals.different2,
                                         totals.unique2,
                                         IOTools.prettyPercent( totals.total2, totals.total2 ),
                                         IOTools.prettyPercent( totals.same2, totals.total2 ),
                                         IOTools.prettyPercent( totals.different2, totals.total2 ),
                                         IOTools.prettyPercent( totals.unique2, totals.total2 ),
                                         ) ) ) + "\n" )
                                        
    
    # output mismapped residues
    if options.output_mismatches or options.output_unique:
        outputMismatches( pairs1, pairs2,
                          output_mismatches = options.output_mismatches,
                          output_unique = options.output_unique,
                          output_matches = options.output_matches,
                          )

    ## write footer and output benchmark information.
    E.Stop()
Example #12
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--species",
                      dest="species",
                      type="string",
                      help="species to use [default=%default].")

    parser.add_option("-i",
                      "--slims",
                      dest="filename_slims",
                      type="string",
                      help="filename with GO SLIM categories "
                      "[default=%default].")

    parser.add_option("-g",
                      "--genes-tsv-file",
                      dest="filename_genes",
                      type="string",
                      help="filename with genes to analyse "
                      "[default=%default].")

    parser.add_option("-b",
                      "--background-tsv-file",
                      dest="filename_background",
                      type="string",
                      help="filename with background genes to analyse "
                      "[default=%default].")

    parser.add_option("-m",
                      "--min-counts",
                      dest="minimum_counts",
                      type="int",
                      help="minimum count - ignore all categories that have "
                      "fewer than # number of genes"
                      " [default=%default].")

    parser.add_option("-o",
                      "--sort-order",
                      dest="sort_order",
                      type="choice",
                      choices=("fdr", "pvalue", "ratio"),
                      help="output sort order [default=%default].")

    parser.add_option("--ontology",
                      dest="ontology",
                      type="string",
                      action="append",
                      help="go ontologies to analyze. Ontologies are tested "
                      "separately [default=%default].")

    parser.add_option(
        "-t",
        "--threshold",
        dest="threshold",
        type="float",
        help="significance threshold [>1.0 = all ]. If --fdr is set, this "
        "refers to the fdr, otherwise it is a cutoff for p-values.")

    parser.add_option("--filename-dump",
                      dest="filename_dump",
                      type="string",
                      help="dump GO category assignments into a flatfile "
                      "[default=%default].")

    parser.add_option(
        "--gene2name-map-tsv-file",
        dest="filename_gene2name",
        type="string",
        help="optional filename mapping gene identifiers to gene names "
        "[default=%default].")

    parser.add_option(
        "--filename-ontology",
        dest="filename_ontology",
        type="string",
        help="filename with ontology in OBO format [default=%default].")

    parser.add_option("--filename-input",
                      dest="filename_input",
                      type="string",
                      help="read GO category assignments from a flatfile "
                      "[default=%default].")

    parser.add_option("--sample-size",
                      dest="sample",
                      type="int",
                      help="do sampling (with # samples) [default=%default].")

    parser.add_option(
        "--filename-output-pattern",
        "--output-filename-pattern",
        dest="output_filename_pattern",
        type="string",
        help="pattern with output filename pattern "
        "(should contain: %(go)s and %(section)s ) [default=%default]")

    parser.add_option("--fdr",
                      dest="fdr",
                      action="store_true",
                      help="calculate and filter by FDR default=%default].")

    parser.add_option(
        "--go2goslim",
        dest="go2goslim",
        action="store_true",
        help="convert go assignments in STDIN to goslim assignments and "
        "write to STDOUT [default=%default].")

    parser.add_option("--gene-pattern",
                      dest="gene_pattern",
                      type="string",
                      help="pattern to transform identifiers to GO gene names "
                      "[default=%default].")

    parser.add_option("--filename-map-slims",
                      dest="filename_map_slims",
                      type="string",
                      help="write mapping between GO categories and GOSlims "
                      "[default=%default].")

    parser.add_option(
        "--get-genes",
        dest="get_genes",
        type="string",
        help="list all genes in the with a certain GOID [default=%default].")

    parser.add_option(
        "--strict",
        dest="strict",
        action="store_true",
        help="require all genes in foreground to be part of background. "
        "If not set, genes in foreground will be added to the background "
        "[default=%default].")

    parser.add_option(
        "-q",
        "--fdr-method",
        dest="qvalue_method",
        type="choice",
        choices=("empirical", "storey", "BH"),
        help="method to perform multiple testing correction by controlling "
        "the fdr [default=%default].")

    parser.add_option(
        "--pairwise",
        dest="compute_pairwise",
        action="store_true",
        help="compute pairwise enrichment for multiple gene lists. "
        "[default=%default].")

    # parser.add_option( "--fdr-lambda", dest="qvalue_lambda", type="float",
    #                   help="fdr computation: lambda [default=%default]."  )

    # parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
    #                    choices = ("smoother", "bootstrap" ),
    # help="fdr computation: method for estimating pi0 [default=%default]."  )

    parser.set_defaults(species=None,
                        filename_genes="-",
                        filename_background=None,
                        filename_slims=None,
                        minimum_counts=0,
                        ontology=[],
                        filename_dump=None,
                        sample=0,
                        fdr=False,
                        output_filename_pattern=None,
                        threshold=0.05,
                        filename_map_slims=None,
                        gene_pattern=None,
                        sort_order="ratio",
                        get_genes=None,
                        strict=False,
                        qvalue_method="empirical",
                        pairs_min_observed_counts=3,
                        compute_pairwise=False,
                        filename_gene2name=None)

    (options, args) = E.Start(parser, add_database_options=True)

    if options.go2goslim:
        GO.convertGo2Goslim(options)
        E.Stop()
        sys.exit(0)

    if options.fdr and options.sample == 0:
        E.warn("fdr will be computed without sampling")

    #############################################################
    # dump GO
    if options.filename_dump:
        # default to the three GO ontologies
        if not options.ontology:
            options.ontology = [
                "biol_process", "mol_function", "cell_location"
            ]

        E.info("dumping GO categories to %s" % (options.filename_dump))

        dbhandle = connectToEnsembl(options)

        outfile = IOTools.openFile(options.filename_dump, "w", create_dir=True)
        GO.DumpGOFromDatabase(outfile, dbhandle, options)
        outfile.close()
        E.Stop()
        sys.exit(0)

    #############################################################
    # read GO categories from file
    if options.filename_input:
        E.info("reading association of categories and genes from %s" %
               (options.filename_input))
        infile = IOTools.openFile(options.filename_input)
        gene2gos, go2infos = GO.ReadGene2GOFromFile(infile)
        infile.close()

    if options.filename_gene2name:
        E.info("reading gene identifier to gene name mapping from %s" %
               options.filename_gene2name)
        infile = IOTools.openFile(options.filename_gene2name)
        gene2name = IOTools.readMap(infile, has_header=True)
        infile.close()
        E.info("read %i gene names for %i gene identifiers" %
               (len(set(gene2name.values())), len(gene2name)))
    else:
        # use identity mapping
        gene2name = dict([(x, x) for x in list(gene2gos.keys())])

    #############################################################
    # read GO ontology from file
    if options.filename_ontology:
        E.info("reading ontology from %s" % (options.filename_ontology))

        infile = IOTools.openFile(options.filename_ontology)
        ontology = GO.readOntology(infile)
        infile.close()

        def _g():
            return collections.defaultdict(GO.GOInfo)

        go2infos = collections.defaultdict(_g)

        # substitute go2infos
        for go in list(ontology.values()):
            go2infos[go.mNameSpace][go.mId] = GO.GOInfo(go.mId,
                                                        go_type=go.mNameSpace,
                                                        description=go.mName)

    #############################################################
    # get foreground gene list
    input_foreground, genelists = GO.ReadGeneLists(
        options.filename_genes, gene_pattern=options.gene_pattern)

    E.info("read %i genes for forground in %i gene lists" %
           (len(input_foreground), len(genelists)))

    #############################################################
    # get background
    if options.filename_background:

        # nick - bug fix: background is the first tuple element from
        # ReadGeneLists
        input_background = GO.ReadGeneLists(
            options.filename_background, gene_pattern=options.gene_pattern)[0]
        E.info("read %i genes for background" % len(input_background))
    else:
        input_background = None

    #############################################################
    # sort out which ontologies to test
    if not options.ontology:
        if options.filename_input:
            options.ontology = list(gene2gos.keys())

    E.info("found %i ontologies: %s" %
           (len(options.ontology), options.ontology))

    summary = []
    summary.append("\t".join(
        ("genelist", "ontology", "significant", "threshold", "ngenes",
         "ncategories", "nmaps", "nforegound", "nforeground_mapped",
         "nbackground", "nbackground_mapped", "nsample_counts",
         "nbackground_counts", "psample_assignments",
         "pbackground_assignments", "messages")) + "\n")

    #############################################################
    # get go categories for genes
    for test_ontology in sorted(options.ontology):

        # store results for aggregate output of multiple gene lists
        all_results = []
        all_significant_results = []
        all_genelists_with_results = []

        E.info("working on ontology %s" % test_ontology)
        #############################################################
        # get/read association of GO categories to genes
        if options.filename_input:
            gene2go, go2info = gene2gos[test_ontology], go2infos[test_ontology]
        else:
            E.info("reading data from database ...")

            dbhandle.Connect(options)
            gene2go, go2info = GO.ReadGene2GOFromDatabase(
                dbhandle, test_ontology, options.database, options.species)

            E.info("finished")

        if len(go2info) == 0:
            E.warn("could not find information for terms - "
                   "could be mismatch between ontologies")

        ngenes, ncategories, nmaps, counts_per_category = GO.CountGO(gene2go)
        E.info("assignments found: %i genes mapped to %i categories "
               "(%i maps)" % (ngenes, ncategories, nmaps))

        if options.minimum_counts > 0:
            to_remove = set([
                x for x, y in counts_per_category.items()
                if y < options.minimum_counts
            ])
            E.info("removing %i categories with less than %i genes" %
                   (len(to_remove), options.minimum_counts))
            GO.removeCategories(gene2go, to_remove)

            ngenes, ncategories, nmaps, counts_per_category = \
                GO.CountGO(gene2go)
            E.info("assignments after filtering: %i genes mapped "
                   "to %i categories (%i maps)" % (ngenes, ncategories, nmaps))

        for genelist_name, foreground in sorted(genelists.items()):

            msgs = []
            E.info("processing %s with %i genes" %
                   (genelist_name, len(foreground)))
            ##################################################################
            ##################################################################
            ##################################################################
            # build background - reconcile with foreground
            ##################################################################
            if input_background is None:
                background = list(gene2go.keys())
            else:
                background = list(input_background)

            # nick - bug fix: background included the foreground in a tuple.
            # background is the first tuple element
            missing = foreground.difference(set(background))

            if options.strict:
                assert len(missing) == 0, \
                    "%i genes in foreground but not in background: %s" % (
                        len(missing), str(missing))
            else:
                if len(missing) != 0:
                    E.warn("%i genes in foreground that are not in "
                           "background - added to background of %i" %
                           (len(missing), len(background)))

                background.extend(missing)

            E.info("(unfiltered) foreground=%i, background=%i" %
                   (len(foreground), len(background)))

            # sort foreground and background, important for reproducibility
            # under random seed
            foreground = sorted(foreground)
            background = sorted(background)

            #############################################################
            # sanity checks:
            # are all of the foreground genes in the dataset
            # missing = set(genes).difference( set(gene2go.keys()) )
            # assert len(missing) == 0, "%i genes in foreground set without GO annotation: %s" % (len(missing), str(missing))

            #############################################################
            # read GO slims and map GO categories to GO slim categories
            if options.filename_slims:
                go_slims = GO.GetGOSlims(
                    IOTools.openFile(options.filename_slims, "r"))

                if options.loglevel >= 1:
                    v = set()
                    for x in list(go_slims.values()):
                        for xx in x:
                            v.add(xx)
                    options.stdlog.write(
                        "# read go slims from %s: go=%i, slim=%i\n" %
                        (options.filename_slims, len(go_slims), len(v)))

                if options.filename_map_slims:
                    if options.filename_map_slims == "-":
                        outfile = options.stdout
                    else:
                        outfile = IOTools.openFile(options.filename_map_slims,
                                                   "w")

                    outfile.write("GO\tGOSlim\n")
                    for go, go_slim in sorted(list(go_slims.items())):
                        outfile.write("%s\t%s\n" % (go, go_slim))

                    if outfile != options.stdout:
                        outfile.close()

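                # note: mapping to GO slims assumes the full ontology was
                # read via options.filename_ontology above; otherwise the
                # `ontology` variable passed below is undefined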
                gene2go = GO.MapGO2Slims(gene2go, go_slims, ontology=ontology)

                if options.loglevel >= 1:
                    ngenes, ncategories, nmaps, counts_per_category = \
                        GO.CountGO(gene2go)
                    options.stdlog.write(
                        "# after go slim filtering: %i genes mapped to "
                        "%i categories (%i maps)\n" %
                        (ngenes, ncategories, nmaps))

            #############################################################
            # Just dump out the gene list
            if options.get_genes:
                fg, bg, ng = [], [], []

                for gene, vv in list(gene2go.items()):
                    for v in vv:
                        if v.mGOId == options.get_genes:
                            if gene in foreground:
                                fg.append(gene)
                            elif gene in background:
                                bg.append(gene)
                            else:
                                ng.append(gene)

                # skip to the next gene list
                if not (bg or ng):
                    continue

                options.stdout.write("# genes in GO category %s\n" %
                                     options.get_genes)
                options.stdout.write("gene\tset\n")
                for x in sorted(fg):
                    options.stdout.write("%s\t%s\n" % (x, "fg"))
                for x in sorted(bg):
                    options.stdout.write("%s\t%s\n" % (x, "bg"))
                for x in sorted(ng):
                    options.stdout.write("%s\t%s\n" % (x, "ng"))

                E.info("nfg=%i, nbg=%i, nng=%i" % (len(fg), len(bg), len(ng)))

                E.Stop()
                sys.exit(0)

            #############################################################
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='foreground',
                                     set=genelist_name)

            outfile.write("gene_id\n%s\n" % ("\n".join(sorted(foreground))))
            if options.output_filename_pattern:
                outfile.close()

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='background',
                                     set=genelist_name)

            # Jethro bug fix - see section 'build background' for assignment
            outfile.write("gene_id\n%s\n" % ("\n".join(sorted(background))))
            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            # do the analysis
            go_results = GO.AnalyseGO(gene2go, foreground, background)

            if len(go_results.mSampleGenes) == 0:
                E.warn("%s: no genes with GO categories - analysis aborted" %
                       genelist_name)
                continue

            pairs = list(go_results.mResults.items())

            #############################################################
            # calculate fdr for each hypothesis
            if options.fdr:
                fdrs, samples, method = GO.computeFDRs(go_results, foreground,
                                                       background, options,
                                                       test_ontology, gene2go,
                                                       go2info)
                for go_id, result in pairs:
                    result.mQValue = fdrs[go_id][0]
            else:
                fdrs, samples, method = {}, {}, None
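                # note: when options.fdr is false the result objects keep
                # their existing q-values; the empty fdrs/samples dicts are
                # still handed to the output routines below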

            msgs.append("fdr=%s" % method)

            if options.sort_order == "fdr":
                pairs.sort(key=lambda x: x[1].mQValue)
            elif options.sort_order == "ratio":
                pairs.sort(key=lambda x: x[1].mRatio)
            elif options.sort_order == "pvalue":
                pairs.sort(key=lambda x: x[1].mPValue)

            #############################################################
            #############################################################
            #############################################################
            # output the full result
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='overall',
                                     set=genelist_name)

            GO.outputResults(outfile,
                             pairs,
                             go2info,
                             options,
                             fdrs=fdrs,
                             samples=samples)

            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            #############################################################
            #############################################################
            # filter significant results and output
            filtered_pairs = GO.selectSignificantResults(pairs, fdrs, options)

            nselected = len(filtered_pairs)
            nselected_up = len([x for x in filtered_pairs if x[1].mRatio > 1])
            nselected_down = len(
                [x for x in filtered_pairs if x[1].mRatio < 1])

            assert nselected_up + nselected_down == nselected
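            # note: the assertion assumes no significant category has a ratio
            # of exactly 1, which would fall into neither count above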

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='results',
                                     set=genelist_name)

            GO.outputResults(outfile,
                             filtered_pairs,
                             go2info,
                             options,
                             fdrs=fdrs,
                             samples=samples)

            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            #############################################################
            #############################################################
            # save results for multi-gene-list analysis
            all_results.append(pairs)
            all_significant_results.append(filtered_pairs)
            all_genelists_with_results.append(genelist_name)

            #############################################################
            #############################################################
            #############################################################
            # output parameters
            ngenes, ncategories, nmaps, counts_per_category = \
                GO.CountGO(gene2go)

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='parameters',
                                     set=genelist_name)

            nbackground = len(background)
            if nbackground == 0:
                nbackground = len(go_results.mBackgroundGenes)

            outfile.write(
                "# input go mappings for gene list '%s' and category '%s'\n" %
                (genelist_name, test_ontology))
            outfile.write("parameter\tvalue\tdescription\n")
            outfile.write("mapped_genes\t%i\tmapped genes\n" % ngenes)
            outfile.write("mapped_categories\t%i\tmapped categories\n" %
                          ncategories)
            outfile.write("mappings\t%i\tmappings\n" % nmaps)
            outfile.write("genes_in_fg\t%i\tgenes in foreground\n" %
                          len(foreground))
            outfile.write(
                "genes_in_fg_with_assignment\t%i\tgenes in foreground with GO assignments\n"
                % (len(go_results.mSampleGenes)))
            outfile.write("genes_in_bg\t%i\tinput background\n" % nbackground)
            outfile.write(
                "genes_in_bg_with_assignment\t%i\tgenes in background with GO assignments\n"
                % (len(go_results.mBackgroundGenes)))
            outfile.write("associations_in_fg\t%i\tassociations in sample\n" %
                          go_results.mSampleCountsTotal)
            outfile.write(
                "associations_in_bg\t%i\tassociations in background\n" %
                go_results.mBackgroundCountsTotal)
            outfile.write(
                "percent_genes_in_fg_with_association\t%s\tpercent genes in sample with GO assignments\n"
                % (IOTools.prettyPercent(len(go_results.mSampleGenes),
                                         len(foreground), "%5.2f")))
            outfile.write(
                "percent_genes_in_bg_with_associations\t%s\tpercent genes background with GO assignments\n"
                % (IOTools.prettyPercent(len(go_results.mBackgroundGenes),
                                         nbackground, "%5.2f")))
            outfile.write("significant\t%i\tsignificant results reported\n" %
                          nselected)
            outfile.write(
                "significant_up\t%i\tsignificant up-regulated results reported\n"
                % nselected_up)
            outfile.write(
                "significant_down\t%i\tsignificant up-regulated results reported\n"
                % nselected_down)
            outfile.write("threshold\t%6.4f\tsignificance threshold\n" %
                          options.threshold)

            if options.output_filename_pattern:
                outfile.close()

            summary.append("\t".join(
                map(str, (genelist_name, test_ontology, nselected,
                          options.threshold, ngenes, ncategories, nmaps,
                          len(foreground), len(go_results.mSampleGenes),
                          nbackground, len(go_results.mBackgroundGenes),
                          go_results.mSampleCountsTotal,
                          go_results.mBackgroundCountsTotal,
                          IOTools.prettyPercent(len(go_results.mSampleGenes),
                                                len(foreground), "%5.2f"),
                          IOTools.prettyPercent(
                              len(go_results.mBackgroundGenes), nbackground,
                              "%5.2f"), ",".join(msgs)))) + "\n")

            #############################################################
            #############################################################
            #############################################################
            # output the fg patterns
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='withgenes',
                                     set=genelist_name)

            GO.outputResults(outfile,
                             pairs,
                             go2info,
                             options,
                             fdrs=fdrs,
                             samples=samples,
                             gene2go=gene2go,
                             foreground=foreground,
                             gene2name=gene2name)

            if options.output_filename_pattern:
                outfile.close()

        if len(genelists) > 1:

            ###################################################################
            # output various summary files
            # significant results
            GO.outputMultipleGeneListResults(all_significant_results,
                                             all_genelists_with_results,
                                             test_ontology,
                                             go2info,
                                             options,
                                             section='significant')

            # all results
            GO.outputMultipleGeneListResults(all_results,
                                             all_genelists_with_results,
                                             test_ontology,
                                             go2info,
                                             options,
                                             section='all')

            if options.compute_pairwise:
                GO.pairwiseGOEnrichment(all_results,
                                        all_genelists_with_results,
                                        test_ontology, go2info, options)

    outfile_summary = options.stdout
    outfile_summary.write("".join(summary))

    E.Stop()
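
The FDR step above delegates to GO.computeFDRs, which also returns per-category samples and so presumably uses an empirical, sampling-based estimate. As a rough, self-contained illustration of how q-values can be attached to a ranked list of p-values (an assumption about the general idea, not the CGAT implementation), a minimal Benjamini-Hochberg sketch looks like this:

def bh_qvalues(pvalues):
    """Return Benjamini-Hochberg adjusted q-values in the input order."""
    n = len(pvalues)
    order = sorted(range(n), key=lambda i: pvalues[i])
    qvalues = [0.0] * n
    running_minimum = 1.0
    # walk from the largest p-value down, enforcing monotone q-values
    for rank_index in range(n - 1, -1, -1):
        i = order[rank_index]
        running_minimum = min(running_minimum,
                              pvalues[i] * n / (rank_index + 1))
        qvalues[i] = running_minimum
    return qvalues


if __name__ == "__main__":
    # [0.01, 0.04, 0.03, 0.20] -> [0.04, 0.0533..., 0.0533..., 0.2]
    print(bh_qvalues([0.01, 0.04, 0.03, 0.20]))
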
Example #13
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: chain2psl.py 2899 2010-04-13 14:37:37Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--output-mismatches",
                      dest="output_mismatches",
                      action="store_true",
                      help="output mismatches [%default]")

    parser.add_option("-a",
                      "--output-matches",
                      dest="output_matches",
                      action="store_true",
                      help="output matches [%default]")

    parser.add_option("-u",
                      "--output-unique",
                      dest="output_unique",
                      action="store_true",
                      help="output unique positions [%default]")

    parser.add_option(
        "-r",
        "--restrict",
        dest="restrict",
        type="string",
        help="restrict analysis to a chromosome pair (chr1:chr1:+) [%default]")

    parser.set_defaults(output_mismatches=False,
                        output_matches=False,
                        output_unique=False,
                        restrict=None)

    ## add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError("expected two chain files")

    filename_chain1, filename_chain2 = args

    E.info("validating chain 1")
    if not validateChain(IOTools.openFile(filename_chain1)):
        E.warn("validation failed - exiting")
        return 1

    E.info("validating chain 2")
    if not validateChain(IOTools.openFile(filename_chain2)):
        E.warn("validation failed - exiting")
        return 1

    E.info("building pairs for %s" % filename_chain1)
    pairs1 = buildPairs(IOTools.openFile(filename_chain1))
    E.info("read %i pairs" % len(pairs1))

    E.info("building pairs for %s" % filename_chain2)
    pairs2 = buildPairs(IOTools.openFile(filename_chain2))
    E.info("read %i pairs" % len(pairs2))

    if options.restrict:
        restrict = tuple(options.restrict.split(":"))
        pairs1 = {restrict: pairs1[restrict]}
        pairs2 = {restrict: pairs2[restrict]}

    E.info("comparing 1 -> 2")
    comparison1 = compareChains(pairs1, pairs2)
    E.info("comparing 2 -> 1")
    comparison2 = compareChains(pairs2, pairs1)

    all_keys = sorted(set(comparison1.keys()) | set(comparison2.keys()))

    outfile = options.stdout
    headers = ("mapped", "identical", "different", "unique")
    outfile.write( "contig1\tcontig2\tstrand\t%s\t%s\t%s\t%s\n" %\
                       (
            "\t".join( ["%s1" % x for x in headers ] ),
            "\t".join( ["p%s1" % x for x in headers ] ),
            "\t".join( ["%s2" % x for x in headers ] ),
            "\t".join( ["p%s2" % x for x in headers ] )))

    totals = E.Counter()

    for key in all_keys:
        outfile.write("%s\t%s\t%s" % key)

        if key in comparison1:
            c = comparison1[key]
            outfile.write("\t%i\t%i\t%i\t%i\t" %
                          (c.total, c.same, c.different, c.unique))
            outfile.write("\t".join(
                [IOTools.prettyPercent(x, c.total) for x in c]))

            totals.total1 += c.total
            totals.same1 += c.same
            totals.different1 += c.different
            totals.unique1 += c.unique
        else:
            outfile.write("\t%i\t%i\t%i\t%i\t" % (0, 0, 0, 0))
            outfile.write("\t%i\t%i\t%i\t%i" % (0, 0, 0, 0))

        if key in comparison2:
            c = comparison2[key]
            outfile.write("\t%i\t%i\t%i\t%i\t" %
                          (c.total, c.same, c.different, c.unique))
            outfile.write("\t".join(
                [IOTools.prettyPercent(x, c.total) for x in c]))

            totals.same2 += c.same
            totals.total2 += c.total
            totals.different2 += c.different
            totals.unique2 += c.unique
        else:
            outfile.write("\t%i\t%i\t%i\t%i\t" % (0, 0, 0, 0))
            outfile.write("\t%i\t%i\t%i\t%i" % (0, 0, 0, 0))

        outfile.write("\n")

    outfile.write("total\ttotal\t.\t")
    outfile.write("\t".join(
        map(str, (
            totals.total1,
            totals.same1,
            totals.different1,
            totals.unique1,
            IOTools.prettyPercent(totals.total1, totals.total1),
            IOTools.prettyPercent(totals.same1, totals.total1),
            IOTools.prettyPercent(totals.different1, totals.total1),
            IOTools.prettyPercent(totals.unique1, totals.total1),
            totals.total2,
            totals.same2,
            totals.different2,
            totals.unique2,
            IOTools.prettyPercent(totals.total2, totals.total2),
            IOTools.prettyPercent(totals.same2, totals.total2),
            IOTools.prettyPercent(totals.different2, totals.total2),
            IOTools.prettyPercent(totals.unique2, totals.total2),
        ))) + "\n")

    # output mismapped residues
    if options.output_mismatches or options.output_unique:
        outputMismatches(
            pairs1,
            pairs2,
            output_mismatches=options.output_mismatches,
            output_unique=options.output_unique,
            output_matches=options.output_matches,
        )

    ## write footer and output benchmark information.
    E.Stop()
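
Both the per-key rows and the totals row above lean on IOTools.prettyPercent to turn count/total pairs into percentage strings. The real CGAT helper may differ in formatting and in how it flags an empty denominator, but as an assumption a minimal stand-in could look like this:

def pretty_percent(numerator, denominator, fmt="%5.2f", na="na"):
    """Format numerator/denominator as a percentage; return `na` if undefined."""
    if denominator == 0:
        return na
    return fmt % (100.0 * numerator / denominator)


# e.g. pretty_percent(3, 12) -> '25.00', pretty_percent(5, 0) -> 'na'
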
Example #14
0
def _write(outs, text, numerator, denominator, base):
    percent = IOTools.prettyPercent(numerator, denominator)
    outs.write('%s\t%i\t%s\t%s\n' % (text, numerator, percent, base))
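
A possible call site for the helper above, assuming the CGAT-style IOTools module is importable and using made-up numbers purely for illustration:

import sys

_write(sys.stdout, "reads_mapped", 7512, 10000, "total_reads")
# writes one tab-separated line: label, count, percent, base, e.g.
# "reads_mapped<tab>7512<tab>75.12<tab>total_reads"; the exact percent
# formatting depends on IOTools.prettyPercent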