Example #1
0
def main(argv=None):
    """Print the patterns computed from a reference tree.

    Parses command line options in sys.argv, unless *argv* is given.

    The tree given with ``--reference-tree`` is parsed with
    ``TreeTools.Newick2Nexus`` and its first tree is used. The OTU order
    is taken from ``--sort-order`` (comma-separated) or, if that option
    is absent, from the taxa of the terminal nodes of the tree. Every
    pattern returned by ``TreeTools.calculatePatternsFromTree`` is
    printed on its own line.

    Raises:
        ValueError: if no reference tree was supplied.
    """

    if argv is None:
        argv = sys.argv
    # NOTE(review): argv is not forwarded to E.Start() below, so a custom
    # *argv* has no visible effect here -- confirm against E.Start.

    parser = E.OptionParser(
        version="%prog version: $Id: tree2patterns.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--reference-tree",
                      dest="reference_tree",
                      type="string",
                      help="reference tree to read.")

    parser.add_option("-s",
                      "--sort-order",
                      dest="sort_order",
                      type="string",
                      help="output order of OTU.")

    # sort_order defaults to None rather than a shared list, so the
    # parser defaults are never mutated in place.
    parser.set_defaults(
        reference_tree=None,
        sort_order=None,
    )

    (options, args) = E.Start(parser)

    # Validate first: everything below needs the tree.
    if not options.reference_tree:
        raise ValueError("no reference tree defined.")

    nexus = TreeTools.Newick2Nexus(options.reference_tree)
    reference_tree = nexus.trees[0]

    if options.loglevel >= 3:
        print("# reference tree:")
        print(reference_tree.display())

    # The tree must be parsed before the sort order can be derived from it.
    if options.sort_order:
        options.sort_order = options.sort_order.split(",")
    else:
        options.sort_order = [
            reference_tree.node(nx).get_data().taxon
            for nx in reference_tree.get_terminals()
        ]

    patterns = TreeTools.calculatePatternsFromTree(reference_tree,
                                                   options.sort_order)

    for p in patterns:
        print(p)

    E.Stop()
Example #2
0
def main(argv=None):
    """Print the patterns computed from a reference tree.

    Parses command line options in sys.argv, unless *argv* is given.

    The tree given with ``--reference-tree`` is parsed with
    ``TreeTools.Newick2Nexus`` and its first tree is used. The OTU order
    is taken from ``--sort-order`` (comma-separated) or, if that option
    is absent, from the taxa of the terminal nodes of the tree. Every
    pattern returned by ``TreeTools.calculatePatternsFromTree`` is
    printed on its own line.

    Raises:
        ValueError: if no reference tree was supplied.
    """

    if argv is None:
        argv = sys.argv
    # NOTE(review): argv is not forwarded to E.Start() below, so a custom
    # *argv* has no visible effect here -- confirm against E.Start.

    parser = E.OptionParser(version="%prog version: $Id: tree2patterns.py 2781 2009-09-10 11:33:14Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--reference-tree", dest="reference_tree", type="string",
                      help="reference tree to read.")

    parser.add_option("-s", "--sort-order", dest="sort_order", type="string",
                      help="output order of OTU.")

    # sort_order defaults to None rather than a shared list, so the
    # parser defaults are never mutated in place.
    parser.set_defaults(
        reference_tree=None,
        sort_order=None,
    )

    (options, args) = E.Start(parser)

    # Validate first: everything below needs the tree.
    if not options.reference_tree:
        raise ValueError("no reference tree defined.")

    nexus = TreeTools.Newick2Nexus(options.reference_tree)
    reference_tree = nexus.trees[0]

    if options.loglevel >= 3:
        print("# reference tree:")
        print(reference_tree.display())

    # The tree must be parsed before the sort order can be derived from it.
    if options.sort_order:
        options.sort_order = options.sort_order.split(",")
    else:
        options.sort_order = [reference_tree.node(nx).get_data().taxon
                              for nx in reference_tree.get_terminals()]

    patterns = TreeTools.calculatePatternsFromTree(reference_tree, options.sort_order)

    for p in patterns:
        print(p)

    E.Stop()
Example #3
0
def main(argv=None):
    """Count genes and transcripts per organism for clusters read from stdin.

    Parses command line options in sys.argv, unless *argv* is given.

    Clusters are read from stdin in the layout selected by ``--format``:

    * ``map``: tab-separated ``member<TAB>cluster`` lines.
    * ``trees``: input parsed by ``TreeTools.Newick2Nexus``; the taxa of
      each tree form one cluster keyed by the tree name.
    * ``links``: a line starting with ``>`` opens a new cluster; the
      first two tab-separated columns of each following line are added
      as members.

    For ``map`` and ``links``, lines starting with ``#`` and blank lines
    are skipped.

    Member identifiers are split on ``options.separator`` into either
    ``species, transcript, gene, <unused>`` or ``species, gene``.

    One row per cluster is written to ``options.stdout``. A pattern
    summary and a per-species summary are written to
    ``--filename-patterns`` / ``--filename-summary`` or, when those are
    not given, to ``sys.stdout``.

    Raises:
        ValueError: if no clusters were read, a species cannot be
            extracted from a member, a member identifier has an
            unexpected number of fields, or a member belongs to a
            species that is not among the output columns.
    """

    if argv is None:
        argv = sys.argv
    # NOTE(review): argv is not forwarded to E.Start() below, so a custom
    # *argv* has no visible effect here -- confirm against E.Start.

    parser = E.OptionParser(
        version="%prog version: $Id: optic/count_orgs.py 1706 2007-12-11 16:46:11Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--reference-tree",
                      dest="reference_tree",
                      type="string",
                      help="reference tree to read.")

    parser.add_option("-p",
                      "--filename-patterns",
                      dest="filename_patterns",
                      type="string",
                      help="filename with patterns to output.")

    parser.add_option("-u",
                      "--filename-summary",
                      dest="filename_summary",
                      type="string",
                      help="filename with summary to output.")

    # The option selects how stdin is parsed, so the help text now says
    # "input" (it previously read "output format.").
    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="choice",
                      choices=("map", "links", "trees"),
                      help="input format.")

    parser.add_option("-o",
                      "--organisms",
                      dest="column2org",
                      type="string",
                      help="sorted list of organisms.")

    parser.add_option(
        "-s",
        "--species-regex",
        dest="species_regex",
        type="string",
        help="regular expression to extract species from identifier.")

    parser.add_option(
        "-g",
        "--gene-regex",
        dest="gene_regex",
        type="string",
        help="regular expression to extract gene from identifier.")

    # Raw strings keep the regex escapes intact; the pattern values are
    # unchanged.
    parser.set_defaults(
        reference_tree=None,
        format="map",
        filename_patterns=None,
        column2org=None,
        species_regex=r"^([^|]+)\|",
        gene_regex=r"^[^|]+\|[^|]+\|([^|]+)\|",
        separator="|",
        filename_summary=None,
    )

    (options, args) = E.Start(parser)

    ########################################################################
    ## read the optional reference tree
    ########################################################################
    reference_tree = None
    if options.reference_tree:
        if options.reference_tree[0] == "(":
            # the option value itself is the tree
            nexus = TreeTools.Newick2Nexus(options.reference_tree)
        else:
            # the option value is a filename; assumes Newick2Nexus has
            # consumed the file by the time it returns -- TODO confirm
            infile = open(options.reference_tree, "r")
            try:
                nexus = TreeTools.Newick2Nexus(infile)
            finally:
                infile.close()
        reference_tree = nexus.trees[0]

        if options.loglevel >= 3:
            print("# reference tree:")
            print(reference_tree.display())

    ########################################################################
    ## read clusters from stdin
    ########################################################################
    clusters = {}
    if options.format == "map":

        for line in sys.stdin:
            if line.startswith("#"):
                continue
            # strip only the newline so that a final line without one
            # does not lose its last character
            line = line.rstrip("\n")
            if not line:
                continue
            member, cluster_id = line.split("\t")
            clusters.setdefault(cluster_id, []).append(member)

    elif options.format == "trees":

        nexus = TreeTools.Newick2Nexus(sys.stdin)

        for tree in nexus.trees:
            clusters[tree.name] = tree.get_taxa()

    elif options.format == "links":
        members = set()
        cluster_id = None
        for line in sys.stdin:
            if line.startswith("#"):
                continue
            line = line.rstrip("\n")
            if not line:
                continue

            if line.startswith(">"):
                # a header closes the previous cluster and opens a new one
                if cluster_id:
                    clusters[cluster_id] = members
                match = re.match(r">cluster #(\d+)", line)
                if match:
                    cluster_id = match.groups()[0]
                else:
                    cluster_id = line[1:]
                members = set()
                continue

            data = line.split("\t")[:2]
            members.add(data[0])
            members.add(data[1])

        # store the last cluster
        if cluster_id:
            clusters[cluster_id] = members

    if len(clusters) == 0:
        raise ValueError("empty input.")

    ########################################################################
    ## sort out reference tree
    ########################################################################
    rs = re.compile(options.species_regex)
    # The gene regex is not used further in this function; compiling it
    # still rejects an invalid pattern early, as before.
    re.compile(options.gene_regex)

    def extract_species(identifier):
        match = rs.search(identifier)
        if match is None:
            raise ValueError("could not extract species from %s" % identifier)
        return match.groups()[0]

    ## prune tree to species present
    species_set = set()
    for members in clusters.values():
        species_set.update(extract_species(m) for m in members)

    if reference_tree:

        TreeTools.PruneTree(reference_tree, species_set)

        if options.loglevel >= 1:
            options.stdlog.write("# Tree after pruning: %i taxa.\n" %
                                 len(reference_tree.get_taxa()))

    # Column order: explicit option, else terminals of the reference
    # tree, else the species found in the input.
    if options.column2org:
        options.column2org = options.column2org.split(",")
    elif reference_tree:
        options.column2org = [
            reference_tree.node(nx).get_data().taxon
            for nx in reference_tree.get_terminals()
        ]
    else:
        options.column2org = list(species_set)

    options.org2column = dict(
        (org, col) for col, org in enumerate(options.column2org))

    if reference_tree:
        reference_patterns = TreeTools.calculatePatternsFromTree(
            reference_tree, options.column2org)

        if options.loglevel >= 3:
            print("# reference patterns:")
            print(reference_patterns)

    ##############################################################################
    notus = len(options.column2org)
    ncolumns = len(options.org2column)
    patterns = {}
    species_counts = [SpeciesCounts() for x in options.column2org]

    ## first genes, then transcripts
    options.stdout.write(
        "mali\tpattern\tpresent\tngenes\t%s\tntranscripts\t%s\n" %
        ("\t".join(options.column2org), "\t".join(options.column2org)))

    for cluster in sorted(clusters):
        members = clusters[cluster]

        count_genes = [{} for x in range(ncolumns)]
        count_transcripts = [0] * ncolumns

        for m in members:
            data = m.split(options.separator)

            if len(data) == 4:
                # the fourth field is not used
                species, transcript, gene = data[:3]
            elif len(data) == 2:
                species, gene = data
                transcript = gene
            else:
                # previously fell through and reused the values of the
                # preceding member
                raise ValueError(
                    "can not parse identifier %s: expected 2 or 4 fields" % m)

            if species not in options.org2column:
                raise ValueError("unknown species %s" % species)

            col = options.org2column[species]

            count_transcripts[col] += 1
            count_genes[col][gene] = count_genes[col].get(gene, 0) + 1

            species_counts[col].mGenes.add(gene)
            species_counts[col].mTranscripts.add(transcript)
            species_counts[col].mTrees.add(cluster)

        genes_per_column = [len(genes) for genes in count_genes]
        ntotal_transcripts = sum(count_transcripts)
        ntotal_genes = sum(genes_per_column)
        npresent_genes = sum(1 for n in genes_per_column if n > 0)

        pattern = GetPattern(count_transcripts, notus)
        patterns[pattern] = patterns.get(pattern, 0) + 1

        options.stdout.write("\t".join(
            (cluster, pattern, str(npresent_genes), str(ntotal_genes),
             "\t".join(map(str, genes_per_column)),
             str(ntotal_transcripts),
             "\t".join(map(str, count_transcripts)))) + "\n")

    #######################################################################################
    ## write pattern summary
    #######################################################################################
    if options.filename_patterns:
        outfile = open(options.filename_patterns, "w")
    else:
        outfile = sys.stdout

    try:
        for col, org in enumerate(options.column2org):
            outfile.write("# %i = %s\n" % (col, org))

        if reference_tree:
            outfile.write("pattern\tcounts\tisok\n")
        else:
            outfile.write("pattern\tcounts\n")

        for pattern in sorted(patterns):
            if reference_tree:
                if pattern in reference_patterns:
                    is_ok = "1"
                else:
                    is_ok = "0"
                outfile.write("%s\t%s\t%s\n" %
                              (pattern, patterns[pattern], is_ok))
            else:
                outfile.write("%s\t%s\n" % (pattern, patterns[pattern]))
    finally:
        # never close sys.stdout; close a real file even on error
        if outfile is not sys.stdout:
            outfile.close()

    #######################################################################################
    ## write summary counts per species
    #######################################################################################
    if options.filename_summary:
        outfile = open(options.filename_summary, "w")
    else:
        outfile = sys.stdout

    try:
        outfile.write("species\tntranscripts\tngenes\tntrees\n")

        for species, col in options.org2column.items():
            counts = species_counts[col]
            outfile.write("%s\t%i\t%i\t%i\n" %
                          (species, len(counts.mTranscripts),
                           len(counts.mGenes), len(counts.mTrees)))
    finally:
        if outfile is not sys.stdout:
            outfile.close()

    E.Stop()
Example #4
0
def main(argv=None):
    """Count genes and transcripts per organism for clusters read from stdin.

    Parses command line options in sys.argv, unless *argv* is given.

    Clusters are read from stdin in the layout selected by ``--format``:

    * ``map``: tab-separated ``member<TAB>cluster`` lines.
    * ``trees``: input parsed by ``TreeTools.Newick2Nexus``; the taxa of
      each tree form one cluster keyed by the tree name.
    * ``links``: a line starting with ``>`` opens a new cluster; the
      first two tab-separated columns of each following line are added
      as members.

    For ``map`` and ``links``, lines starting with ``#`` and blank lines
    are skipped.

    Member identifiers are split on ``options.separator`` into either
    ``species, transcript, gene, <unused>`` or ``species, gene``.

    One row per cluster is written to ``options.stdout``. A pattern
    summary and a per-species summary are written to
    ``--filename-patterns`` / ``--filename-summary`` or, when those are
    not given, to ``sys.stdout``.

    Raises:
        ValueError: if no clusters were read, a species cannot be
            extracted from a member, a member identifier has an
            unexpected number of fields, or a member belongs to a
            species that is not among the output columns.
    """

    if argv is None:
        argv = sys.argv
    # NOTE(review): argv is not forwarded to E.Start() below, so a custom
    # *argv* has no visible effect here -- confirm against E.Start.

    parser = E.OptionParser(version="%prog version: $Id: optic/count_orgs.py 1706 2007-12-11 16:46:11Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--reference-tree", dest="reference_tree", type="string",
                      help="reference tree to read.")

    parser.add_option("-p", "--filename-patterns", dest="filename_patterns", type="string",
                      help="filename with patterns to output.")

    parser.add_option("-u", "--filename-summary", dest="filename_summary", type="string",
                      help="filename with summary to output.")

    # The option selects how stdin is parsed, so the help text now says
    # "input" (it previously read "output format.").
    parser.add_option("-f", "--format", dest="format", type="choice",
                      choices=("map", "links", "trees"),
                      help="input format.")

    parser.add_option("-o", "--organisms", dest="column2org", type="string",
                      help="sorted list of organisms.")

    parser.add_option("-s", "--species-regex", dest="species_regex", type="string",
                      help="regular expression to extract species from identifier.")

    parser.add_option("-g", "--gene-regex", dest="gene_regex", type="string",
                      help="regular expression to extract gene from identifier.")

    # Raw strings keep the regex escapes intact; the pattern values are
    # unchanged.
    parser.set_defaults(
        reference_tree=None,
        format="map",
        filename_patterns=None,
        column2org=None,
        species_regex=r"^([^|]+)\|",
        gene_regex=r"^[^|]+\|[^|]+\|([^|]+)\|",
        separator="|",
        filename_summary=None,
    )

    (options, args) = E.Start(parser)

    ########################################################################
    ## read the optional reference tree
    ########################################################################
    reference_tree = None
    if options.reference_tree:
        if options.reference_tree[0] == "(":
            # the option value itself is the tree
            nexus = TreeTools.Newick2Nexus(options.reference_tree)
        else:
            # the option value is a filename; assumes Newick2Nexus has
            # consumed the file by the time it returns -- TODO confirm
            infile = open(options.reference_tree, "r")
            try:
                nexus = TreeTools.Newick2Nexus(infile)
            finally:
                infile.close()
        reference_tree = nexus.trees[0]

        if options.loglevel >= 3:
            print("# reference tree:")
            print(reference_tree.display())

    ########################################################################
    ## read clusters from stdin
    ########################################################################
    clusters = {}
    if options.format == "map":

        for line in sys.stdin:
            if line.startswith("#"):
                continue
            # strip only the newline so that a final line without one
            # does not lose its last character
            line = line.rstrip("\n")
            if not line:
                continue
            member, cluster_id = line.split("\t")
            clusters.setdefault(cluster_id, []).append(member)

    elif options.format == "trees":

        nexus = TreeTools.Newick2Nexus(sys.stdin)

        for tree in nexus.trees:
            clusters[tree.name] = tree.get_taxa()

    elif options.format == "links":
        members = set()
        cluster_id = None
        for line in sys.stdin:
            if line.startswith("#"):
                continue
            line = line.rstrip("\n")
            if not line:
                continue

            if line.startswith(">"):
                # a header closes the previous cluster and opens a new one
                if cluster_id:
                    clusters[cluster_id] = members
                match = re.match(r">cluster #(\d+)", line)
                if match:
                    cluster_id = match.groups()[0]
                else:
                    cluster_id = line[1:]
                members = set()
                continue

            data = line.split("\t")[:2]
            members.add(data[0])
            members.add(data[1])

        # store the last cluster
        if cluster_id:
            clusters[cluster_id] = members

    if len(clusters) == 0:
        raise ValueError("empty input.")

    ########################################################################
    ## sort out reference tree
    ########################################################################
    rs = re.compile(options.species_regex)
    # The gene regex is not used further in this function; compiling it
    # still rejects an invalid pattern early, as before.
    re.compile(options.gene_regex)

    def extract_species(identifier):
        match = rs.search(identifier)
        if match is None:
            raise ValueError("could not extract species from %s" % identifier)
        return match.groups()[0]

    ## prune tree to species present
    species_set = set()
    for members in clusters.values():
        species_set.update(extract_species(m) for m in members)

    if reference_tree:

        TreeTools.PruneTree(reference_tree, species_set)

        if options.loglevel >= 1:
            options.stdlog.write("# Tree after pruning: %i taxa.\n" % len(reference_tree.get_taxa()))

    # Column order: explicit option, else terminals of the reference
    # tree, else the species found in the input.
    if options.column2org:
        options.column2org = options.column2org.split(",")
    elif reference_tree:
        options.column2org = [reference_tree.node(nx).get_data().taxon
                              for nx in reference_tree.get_terminals()]
    else:
        options.column2org = list(species_set)

    options.org2column = dict((org, col) for col, org in enumerate(options.column2org))

    if reference_tree:
        reference_patterns = TreeTools.calculatePatternsFromTree(reference_tree, options.column2org)

        if options.loglevel >= 3:
            print("# reference patterns:")
            print(reference_patterns)

    ##############################################################################
    notus = len(options.column2org)
    ncolumns = len(options.org2column)
    patterns = {}
    species_counts = [SpeciesCounts() for x in options.column2org]

    ## first genes, then transcripts
    options.stdout.write("mali\tpattern\tpresent\tngenes\t%s\tntranscripts\t%s\n" % ("\t".join(options.column2org), "\t".join(options.column2org)))

    for cluster in sorted(clusters):
        members = clusters[cluster]

        count_genes = [{} for x in range(ncolumns)]
        count_transcripts = [0] * ncolumns

        for m in members:
            data = m.split(options.separator)

            if len(data) == 4:
                # the fourth field is not used
                species, transcript, gene = data[:3]
            elif len(data) == 2:
                species, gene = data
                transcript = gene
            else:
                # previously fell through and reused the values of the
                # preceding member
                raise ValueError("can not parse identifier %s: expected 2 or 4 fields" % m)

            if species not in options.org2column:
                raise ValueError("unknown species %s" % species)

            col = options.org2column[species]

            count_transcripts[col] += 1
            count_genes[col][gene] = count_genes[col].get(gene, 0) + 1

            species_counts[col].mGenes.add(gene)
            species_counts[col].mTranscripts.add(transcript)
            species_counts[col].mTrees.add(cluster)

        genes_per_column = [len(genes) for genes in count_genes]
        ntotal_transcripts = sum(count_transcripts)
        ntotal_genes = sum(genes_per_column)
        npresent_genes = sum(1 for n in genes_per_column if n > 0)

        pattern = GetPattern(count_transcripts, notus)
        patterns[pattern] = patterns.get(pattern, 0) + 1

        options.stdout.write("\t".join((cluster, pattern, str(npresent_genes),
                                        str(ntotal_genes),
                                        "\t".join(map(str, genes_per_column)),
                                        str(ntotal_transcripts),
                                        "\t".join(map(str, count_transcripts)))) + "\n")

    #######################################################################################
    ## write pattern summary
    #######################################################################################
    if options.filename_patterns:
        outfile = open(options.filename_patterns, "w")
    else:
        outfile = sys.stdout

    try:
        for col, org in enumerate(options.column2org):
            outfile.write("# %i = %s\n" % (col, org))

        if reference_tree:
            outfile.write("pattern\tcounts\tisok\n")
        else:
            outfile.write("pattern\tcounts\n")

        for pattern in sorted(patterns):
            if reference_tree:
                if pattern in reference_patterns:
                    is_ok = "1"
                else:
                    is_ok = "0"
                outfile.write("%s\t%s\t%s\n" % (pattern, patterns[pattern], is_ok))
            else:
                outfile.write("%s\t%s\n" % (pattern, patterns[pattern]))
    finally:
        # never close sys.stdout; close a real file even on error
        if outfile is not sys.stdout:
            outfile.close()

    #######################################################################################
    ## write summary counts per species
    #######################################################################################
    if options.filename_summary:
        outfile = open(options.filename_summary, "w")
    else:
        outfile = sys.stdout

    try:
        outfile.write("species\tntranscripts\tngenes\tntrees\n")

        for species, col in options.org2column.items():
            counts = species_counts[col]
            outfile.write("%s\t%i\t%i\t%i\n" % (species,
                                                len(counts.mTranscripts),
                                                len(counts.mGenes),
                                                len(counts.mTrees)))
    finally:
        if outfile is not sys.stdout:
            outfile.close()

    E.Stop()
Example #5
0
        sort_order = [],
        )

    (options, args) = E.Start( parser )

    if not options.sort_order:
        for nx in reference_tree.get_terminals():
            options.sort_order.append( reference_tree.node(nx).get_data().taxon )
    else:
        options.sort_order = options.sort_order.split(",")

    if not options.reference_tree:
        raise "no reference tree defined."

    nexus = TreeTools.Newick2Nexus( options.reference_tree )
    reference_tree = nexus.trees[0]
    
    if options.loglevel >= 3:
        print "# reference tree:"
        print reference_tree.display()

    patterns = TreeTools.calculatePatternsFromTree( tree, options.sort_order )

    for p in patterns:
        print p

    E.Stop()