Exemple #1
0
def GetSubtree(tree, node_id):
    """return a copy of tree from node_id downwards."""

    subtree = Tree.Tree(weight=tree.weight, rooted=tree.rooted, name=tree.name)

    # automatically adds a root, so substitute it
    n = Bio.Nexus.Nodes.Node(tree.node(node_id).data)
    n.id = subtree.root
    subtree.chain[subtree.root] = n

    add_children(tree, subtree, node_id, subtree.root)
    return subtree
Exemple #2
0
def Graph2Tree(links, label_ancestral_nodes=False):
    """build tree from list of nodes.

    Assumption is that links always point from parent to child.
    """
    tree = Tree.Tree()

    # map of names to nodes in tree
    map_node2id = {links[0][0]: 0}
    map_id2node = [links[0][0]]

    for parent, child, branchlength in links:
        if parent not in map_node2id:
            p = len(tree.chain)
            tree.chain[p] = Bio.Nexus.Nodes.Node(Bio.Nexus.Trees.NodeData())
            map_node2id[parent] = p
            map_id2node.append(parent)
        else:
            p = map_node2id[parent]

        if child not in map_node2id:
            c = len(tree.chain)
            tree.chain[c] = Bio.Nexus.Nodes.Node(Bio.Nexus.Trees.NodeData())
            map_node2id[child] = c
            map_id2node.append(child)
        else:
            c = map_node2id[parent]

        tree.chain[p].succ.append(c)
        tree.chain[c].prev = p
        tree.chain[c].data.branchlength = branchlength

    # set taxon names for children and find root
    for i, n in list(tree.chain.items()):
        if n.prev == []:
            tree.root = i

        if n.succ == [] or label_ancestral_nodes:
            n.data.taxon = map_id2node[i]

    # set pointer to last id
    tree.id = len(list(tree.chain.items())) - 1

    return tree
Exemple #3
0
        tree=None,
        branch_scale=0,
        height_scale=0,
    )

    (options, args) = Experiment.Start(parser, add_pipe_options=True)

    if options.filename_tree:
        tree_lines = open(options.filename_tree, "r").readlines()
    elif options.tree:
        tree_lines = options.tree
    else:
        raise "please supply a species tree."

    nexus = TreeTools.Newick2Nexus(tree_lines)
    Tree.updateNexus(nexus)
    tree = nexus.trees[0]

    if options.loglevel >= 2:
        tree.display()

    plot = SVGTree(tree)

    plot.setBranchScale(options.branch_scale)
    plot.setHeightScale(options.height_scale)

    if options.colour_by_species:
        rx = re.compile(options.species_regex)
        extract_species = lambda x: rx.search(x).groups()[0]
        plot.setDecoratorExternalNodes(
            NodeDecoratorBySpecies(tree, extract_species=extract_species))
Exemple #4
0
        tree=None,
        branch_scale=0,
        height_scale=0,
    )

    (options, args) = Experiment.Start(parser, add_pipe_options=True)

    if options.filename_tree:
        tree_lines = open(options.filename_tree, "r").readlines()
    elif options.tree:
        tree_lines = options.tree
    else:
        raise "please supply a species tree."

    nexus = TreeTools.Newick2Nexus(tree_lines)
    Tree.updateNexus(nexus)
    tree = nexus.trees[0]

    if options.loglevel >= 2:
        tree.display()

    plot = SVGTree(tree)

    plot.setBranchScale(options.branch_scale)
    plot.setHeightScale(options.height_scale)

    if options.colour_by_species:
        rx = re.compile(options.species_regex)
        extract_species = lambda x: rx.search(x).groups()[0]
        plot.setDecoratorExternalNodes(
            NodeDecoratorBySpecies(tree, extract_species=extract_species))
Exemple #5
0
def Newick2Nexus(infile):
    """convert newick formatted tree(s) into a nexus object.

    Multiple trees are separated by a semicolon. Tree names can
    be given by fasta-style separators, i.e., lines starting with
    '>'.

    If the token [&&NHX is found in the tree, it is assumed to be
    output from njtree and support values are added. Support values are
    added in the format taxon:support:branchlength
    """
    lines = ["#NEXUS\nBegin trees;\n"]

    # build one line per tree
    if type(infile) == FileType:
        tlines = infile.readlines()
    elif type(infile) in (TupleType, ListType):
        tlines = infile
    else:
        tlines = [infile, ]

    f = []
    id = None
    ntrees = 0

    def __addLine(id, f, lines):

        if len(f) == 0:
            return
        if not id:
            id = "tree%i" % ntrees
        id = re.sub("=", "_", id)

        s = "".join(f)[:-1]
        if s.find("[&&NHX") >= 0:
            # process njtree trees with bootstrap values
            fragments = []
            l = 0
            for x in re.finditer("(:[-0-9.]+\[&&NHX[^\]]*\])", s):
                fragments.append(s[l:x.start()])
                frag = s[x.start():x.end()]
                bl = ":%s" % re.search(":([-0-9.]+)", frag).groups()[0]

                rx = re.search("B=([-0-9.]+)", frag)
                if rx:
                    support = ":%s" % rx.groups()[0]
                else:
                    support = ""

                fragments.append("%s%s" % (support, bl))

                l = x.end()

            fragments.append(s[l:len(s)])

            s = "".join(fragments)
            s = re.sub("\[&&NHX[^\]]*\]", "", s)

        lines.append("tree '%s' = %s;\n" % (id, s))

    for line in tlines:
        line = line.strip()
        if not line:
            continue
        if line[0] == "#":
            continue
        if line[0] == ">":
            id = line[1:]
            continue

        line = re.sub("\s", "", line).strip()
        f.append(line)
        if line[-1] == ";":
            __addLine(id, f, lines)
            f, id = [], None
            ntrees += 1

    # treat special case of trees without trailing semicolon
    __addLine(id, f, lines)
    lines.append("End;")

    # previoulsy, a string was ok, now a string
    # is interpreted as a filename.
    nexus = Bio.Nexus.Nexus.Nexus(StringIO.StringIO("".join(lines)))

    if len(nexus.trees) == 0:
        raise ValueError("no tree found in file %s" % str(infile))

    # remove starting/ending ' from name
    for tree in nexus.trees:
        tree.name = tree.name[1:-1]

    Tree.updateNexus(nexus)

    return nexus
Exemple #6
0
def Newick2Nexus(infile):
    """convert newick formatted tree(s) into a nexus object.

    Multiple trees are separated by a semicolon. Tree names can
    be given by fasta-style separators, i.e., lines starting with
    '>'.

    If the token [&&NHX is found in the tree, it is assumed to be
    output from njtree and support values are added. Support values are
    added in the format taxon:support:branchlength

    Arguments
    ---------
    infile : object
        Input data. Can be a file, a list of lines or a single line.

    Returns
    -------
    nexus : Bio.Nexus.Nexus

    """
    lines = ["#NEXUS\nBegin trees;\n"]

    if isinstance(infile, str):
        tlines = [infile]
    else:
        tlines = [x for x in infile]

    f = []
    id = None
    ntrees = 0

    def __addLine(id, f, lines):

        if len(f) == 0:
            return
        if not id:
            id = "tree%i" % ntrees
        id = re.sub("=", "_", id)

        s = "".join(f)[:-1]
        if s.find("[&&NHX") >= 0:
            # process njtree trees with bootstrap values
            fragments = []
            l = 0
            for x in re.finditer("(:[-0-9.]+\[&&NHX[^\]]*\])", s):
                fragments.append(s[l:x.start()])
                frag = s[x.start():x.end()]
                bl = ":%s" % re.search(":([-0-9.]+)", frag).groups()[0]

                rx = re.search("B=([-0-9.]+)", frag)
                if rx:
                    support = ":%s" % rx.groups()[0]
                else:
                    support = ""

                fragments.append("%s%s" % (support, bl))

                l = x.end()

            fragments.append(s[l:len(s)])

            s = "".join(fragments)
            s = re.sub("\[&&NHX[^\]]*\]", "", s)

        lines.append("tree '%s' = %s;\n" % (id, s))

    for line in tlines:
        line = line.strip()
        if not line:
            continue
        if line[0] == "#":
            continue
        if line[0] == ">":
            id = line[1:]
            continue

        line = re.sub("\s", "", line).strip()
        f.append(line)
        if line[-1] == ";":
            __addLine(id, f, lines)
            f, id = [], None
            ntrees += 1

    # treat special case of trees without trailing semicolon
    __addLine(id, f, lines)
    lines.append("End;")

    # previoulsy, a string was ok, now a string
    # is interpreted as a filename.
    nexus = Bio.Nexus.Nexus.Nexus(StringIO("".join(lines)))

    if len(nexus.trees) == 0:
        raise ValueError("no tree found in file %s" % str(infile))

    # remove starting/ending ' from name
    for tree in nexus.trees:
        tree.name = tree.name[1:-1]

    Tree.updateNexus(nexus)

    return nexus
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/analyze_genetrees.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("-r", "--species-regex", dest="species_regex", type="string",
                      help="regular expression to extractspecies from identifier.")

    parser.add_option("--gene-regex", dest="gene_regex", type="string",
                      help="regular expression to extract gene from identifier.")

    parser.add_option("--filename-filter-positives", dest="filename_filter_positives", type="string",
                      help="filename with positive list of trees to analyze.")

    parser.add_option("-s", "--filename-species-tree", dest="filename_species_tree", type="string",
                      help="filename with species tree.")

    parser.add_option("--filename-species2colour", dest="filename_species2colour", type="string",
                      help="filename with map of species to colours. If not given, random colours are assigned to species.")

    parser.add_option("-t", "--species-tree", dest="species_tree", type="string",
                      help="species tree.")

    parser.add_option("-e", "--locations-tsv-file", dest="filename_locations", type="string",
                      help="filename with map of transcript information to location information.")

    parser.add_option("--no-create", dest="create", action="store_false",
                      help="do not create files, but append to them.")

    parser.add_option("--max-separation", dest="max_separation", type="int",
                      help="maximum allowable separation between syntenic segments for border plot (set to 0, if syntey is enough).")

    parser.add_option("--filename-species2url", dest="filename_species2url", type="string",
                      help="filename with mapping information of species to URL.")

    parser.add_option("--column-prefix", dest="prefix", type="string",
                      help="prefix to add as first column.")

    parser.add_option("--outgroup-species", dest="outgroup_species", type="string",
                      help="species to used as outgroups. Separate multiple species by ','.")

    parser.add_option("--subtrees-trees", dest="subtrees_trees", action="store_true",
                      help="write trees for subtrees.")

    parser.add_option("--subtrees-identifiers", dest="subtrees_identifiers", action="store_true",
                      help="write identifiers of subtrees.")

    parser.add_option("--svg-add-ids", dest="svg_add_ids", action="store_true",
                      help="add node ids to svg plot.")

    parser.add_option("--svg-otus", dest="svg_otus", type="string",
                      help="otus to output in svg species tree.")

    parser.add_option("--svg-branch-lenghts", dest="svg_branch_lengths", type="choice",
                      choices=("contemporary", "uniform", "median"),
                      help="branch lengths in species tree.")

    parser.add_option("--print-totals", dest="print_totals", action="store_true",
                      help="output totals sections.")

    parser.add_option("--print-subtotals", dest="print_subtotals", action="store_true",
                      help="output subtotals sections.")

    parser.add_option("--print-best", dest="print_best", action="store_true",
                      help="output best node assignment for each node in gene tree.")

    parser.add_option("--print-svg", dest="print_svg", action="store_true",
                      help="output svg files.")

    parser.add_option("--print-species-svg", dest="print_species_svg", action="store_true",
                      help="output species svg files.")

    parser.add_option("--output-filename-pattern", dest="output_pattern", type="string",
                      help="""output pattern for separate output of sections [default: %default].
                      Set to None, if output to stdout. Can contain one %s to be substituted with section.""")

    parser.add_option("--output-pattern-svg", dest="output_pattern_svg", type="string",
                      help="filename for svg output. If it contains %s, this is replaced by gene_tree name.")

    parser.add_option("--filename-node-types", dest="filename_node_types", type="string",
                      help="filename with node type information from a previous run.")

    parser.add_option("--analyze-resolution-data", dest="analyze_resolution_data", type="choice", action="append",
                      choices=("stats", "histograms"),
                      help="stdin is resolution data.")

    parser.add_option("--filter-quality", dest="filter_quality", type="choice",
                      choices=("all", "genes", "pseudogenes"),
                      help="filter predictions by gene type.")

    parser.add_option("--filter-location", dest="filter_location", type="choice",
                      choices=("all", "local", "non-local", "cis", "unplaced"),
                      help="filter predictions by location.")

    parser.add_option("--remove-unplaced", dest="remove_unplaced", action="store_true",
                      help="remove predictions on unplaced contigs.")

    parser.add_option("--skip-without-outgroups", dest="skip_without_outgroups", action="store_true",
                      help="skip clusters without outgroups.")

    parser.set_defaults(
        filter_quality="all",
        filter_location="all",
        remove_unplaced=False,
        species_regex="^([^|]+)\|",
        gene_regex="^[^|]+\|[^|]+\|([^|]+)\|",
        filename_species_tree=None,
        priority={"Speciation": 0,
                   "SpeciationDeletion": 1,
                  "Transcripts": 2,
                   "DuplicationLineage": 3,
                  "Duplication": 4,
                  "DuplicationDeletion": 5,
                  "DuplicationInconsistency": 6,
                  "Outparalogs": 7,
                  "InconsistentTranscripts": 8,
                  "Inconsistency": 9,
                  "Masked": 10},
        species_tree=None,
        filename_species2colour=None,
        filename_locations=None,
        max_separation=0,
        filename_species2url=None,
        separator="|",
        prefix=None,
        output_pattern=None,
        output_pattern_svg=None,
        outgroup_species=None,
        svg_add_ids=False,
        svg_branch_lengths="median",
        svg_otus=None,
        subtrees=False,
        print_svg=False,
        print_subtotals=False,
        print_totals=False,
        print_best=False,
        subtrees_identifiers=False,
        create=True,
        min_branch_length=0.00,
        filename_node_types=None,
        format_branch_length="%6.4f",
        nodetypes_inconsistency=("InconsistentTranscripts", "Inconsistency"),
        analyze_resolution_data=None,
        warning_small_branch_length=0.01,
        filename_filter_positives=None,
        skip_without_outgroups=False,
    )

    (options, args) = E.Start(
        parser, add_database_options=True, add_csv_options=True)

    if options.outgroup_species:
        options.outgroup_species = set(options.outgroup_species.split(","))

    if options.svg_otus:
        options.svg_otus = set(options.svg_otus.split(","))

    rx_species = re.compile(options.species_regex)
    extract_species = lambda x: rx_species.match(x).groups()[0]
    if options.gene_regex:
        rx_gene = re.compile(options.gene_regex)
        extract_gene = lambda x: rx_gene.match(x).groups()[0]
    else:
        extract_gene = None

    extract_quality = lambda x: x.split(options.separator)[3]

    #########################################################################
    #########################################################################
    #########################################################################
    # read positive list of malis
    #########################################################################
    if options.filename_filter_positives:
        filter_positives, nerrors = IOTools.ReadList(
            open(options.filename_filter_positives, "r"))
        filter_positives = set(filter_positives)
    else:
        filter_positives = None

    #########################################################################
    #########################################################################
    #########################################################################
    # read location info
    #########################################################################
    if options.filename_locations:
        map_id2location = TreeReconciliation.readLocations(open(options.filename_locations, "r"),
                                                           extract_species)
    else:
        map_id2location = {}

    if (options.remove_unplaced or options.filter_location != "all") and not options.filename_locations:
        raise "please supply a file with location information."

    #########################################################################
    #########################################################################
    #########################################################################
    # delete output files
    #########################################################################
    if options.create and options.output_pattern:
        for section in ("details", "subtrees", "subids", "details", "trees", "nodes", "categories"):
            fn = options.output_pattern % section
            if os.path.exists(fn):
                if options.loglevel >= 1:
                    options.stdlog.write("# deleting file %s.\n" % fn)
                os.remove(fn)

    if options.loglevel >= 1:
        options.stdlog.write("# reading gene trees.\n")
        options.stdlog.flush()

    gene_nexus = TreeTools.Newick2Nexus(sys.stdin)

    Tree.updateNexus(gene_nexus)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# read %i gene trees from stdin.\n" % len(gene_nexus.trees))
        options.stdlog.flush()

    #########################################################################
    #########################################################################
    #########################################################################
    # main loop over gene trees
    #########################################################################
    ninput, nfiltered, nskipped, noutput = 0, 0, 0, 0
    nskipped_filter, nskipped_outgroups = 0, 0

    # total counts
    total_heights_per_species = {}
    total_relheights_per_species = {}
    total_heights_per_tree = []
    total_relheights_per_tree = []

    for gene_tree in gene_nexus.trees:

        ninput += 1

        xname = re.sub("_tree.*", "", gene_tree.name)
        xname = re.sub("subtree_", "", xname)

        if filter_positives and xname not in filter_positives:
            nskipped_filter += 1
            continue

        if options.loglevel >= 6:
            gene_tree.display()

        #######################################################################
        #######################################################################
        #######################################################################
        # get identifier for this tree and update prefixes accordingly
        #######################################################################
        if options.prefix:
            if len(gene_nexus.trees) > 0:
                prefix_header = "prefix1\tprefix2\t"
                prefix_row = options.prefix + "\t" + gene_tree.name + "\t"
                prefix_prefix = options.prefix + "_" + gene_tree.name + "_"
                prefix_name = options.prefix + "_" + gene_tree.name
            else:
                prefix_header = "prefix\t"
                prefix_row = options.prefix + "\t"
                prefix_prefix = options.prefix + "_"
                prefix_name = options.prefix
        else:
            if len(gene_nexus.trees) > 0:
                prefix_header = "prefix\t"
                prefix_row = gene_tree.name + "\t"
                prefix_prefix = gene_tree.name + "\t"
                prefix_name = gene_tree.name
            else:
                prefix_header, prefix_row, prefix_prefix, prefix_name = "", "", "", ""

        #######################################################################
        #######################################################################
        #######################################################################
        # apply filters to gene tree
        #######################################################################
        TreeReconciliation.filterTree(gene_tree, options, map_id2location)

        otus = TreeTools.GetTaxa(gene_tree)

        if len(otus) <= 1:
            nfiltered += 1
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: empty after filtering - skipped.\n" % gene_tree.name)
            continue

        this_species_list = map(extract_species, otus)
        # check, if only outgroups
        if options.outgroup_species:
            if not set(this_species_list).difference(options.outgroup_species):
                nfiltered += 1
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# tree %s: only outgroups after filtering - skipped.\n" % gene_tree.name)
                continue

            if options.skip_without_outgroups and not set(this_species_list).intersection(options.outgroup_species):
                nskipped_outgroups += 1
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# tree %s: no outgroups - skipped.\n" % gene_tree.name)
                continue

        #######################################################################
        #######################################################################
        #######################################################################
        # reroot gene tree, if outgroups have been given.
        #######################################################################
        if options.outgroup_species:
            TreeReconciliation.rerootTree(gene_tree, extract_species, options)

        #######################################################################
        #######################################################################
        #######################################################################
        # compute distance to root for each node
        #######################################################################
        distance_to_root = TreeTools.GetDistanceToRoot(gene_tree)

        #######################################################################
        #######################################################################
        #######################################################################
        # compute counts
        #######################################################################
        # heights per tree
        heights_per_tree = []
        # relative heights per tree
        relheights_per_tree = []
        # distance to root
        heights_per_species = {}
        # distance to root (relative to maximum distance to root)
        relheights_per_species = {}

        analysis_set, gene_set, pseudogene_set, other_set = TreeReconciliation.getAnalysisSets(
            gene_tree, extract_quality, options)

        if len(analysis_set) == 0:
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: empty analysis set - skipped.\n" % gene_tree.name)
            nskipped += 1
            continue

        reference_height = TreeReconciliation.getReferenceHeight(distance_to_root,
                                                                 gene_tree,
                                                                 gene_set,
                                                                 options,
                                                                 extract_species,
                                                                 method="median")

        if reference_height is None:
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: reference height not computable or 0 - skipped.\n" % gene_tree.name)
            nskipped += 1
            continue

        for node_id in analysis_set:

            node = gene_tree.node(node_id)
            species = extract_species(node.data.taxon)
            height = distance_to_root[node_id]

            if height < options.warning_small_branch_length:
                options.stdlog.write("# tree %s: small distance %s to root at node %i: %s\n" %
                                     (gene_tree.name, options.format_branch_length % height,
                                      node_id, node.data.taxon))

            relheight = height / reference_height
            try:
                heights_per_species[species].append(height)
            except KeyError:
                heights_per_species[species] = [height]
                relheights_per_species[species] = []

            relheights_per_species[species].append(relheight)

            # do not use outgroup species
            if options.outgroup_species and species in options.outgroup_species:
                continue

            heights_per_tree.append(height)
            relheights_per_tree.append(relheight)

        if options.loglevel >= 1:
            options.stdlog.write("# tree %s: reference_height=%s\n" % (
                gene_tree.name, options.format_branch_length % reference_height))
            options.stdlog.flush()

        if options.print_subtotals:
            printCounts(heights_per_species, relheights_per_species,
                        heights_per_tree, relheights_per_tree,
                        options, prefix_header, prefix_row)

        #######################################################################
        #######################################################################
        #######################################################################
        # update total counts
        #######################################################################
        TreeReconciliation.appendCounts(
            total_heights_per_species, heights_per_species)
        TreeReconciliation.appendCounts(
            total_relheights_per_species, relheights_per_species)

        TreeReconciliation.appendCounts(
            total_heights_per_tree, heights_per_tree)
        TreeReconciliation.appendCounts(
            total_relheights_per_tree, relheights_per_tree)

        noutput += 1

    if options.print_totals:

        if options.prefix:
            prefix_header = "prefix1\tprefix2\t"
            prefix_row = options.prefix + "\t" + "total" + "\t"
            prefix_prefix = options.prefix + "_" + "total" + "_"
            prefix_name = options.prefix + "_" + "total"
        else:
            prefix_header = "prefix\t"
            prefix_row = "total" + "\t"
            prefix_prefix = "total" + "_"
            prefix_name = "total"

        printCounts(total_heights_per_species, total_relheights_per_species,
                    total_heights_per_tree, total_relheights_per_tree,
                    options, prefix_header, prefix_row)

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, nfiltered=%i, nskipped=%i, nskipped_filter=%i, nskipped_outgroups=%i, noutput=%i\n" % (
            ninput, nfiltered, nskipped, nskipped_filter, nskipped_outgroups, noutput))

    E.Stop()
Exemple #8
0
    #########################################################################                    
    if options.create and options.output_pattern:
        for section in ("details", "subtrees", "subids", "details", "trees", "nodes", "categories"):
            fn = options.output_pattern % section
            if os.path.exists(fn):
                if options.loglevel >= 1:
                    options.stdlog.write( "# deleting file %s.\n" % fn )
                os.remove(fn)

    if options.loglevel >= 1:
        options.stdlog.write("# reading gene trees.\n")
        options.stdlog.flush()

    gene_nexus = TreeTools.Newick2Nexus( sys.stdin )

    Tree.updateNexus( gene_nexus )

    if options.loglevel >= 1:
        options.stdlog.write( "# read %i gene trees from stdin.\n" % len(gene_nexus.trees))
        options.stdlog.flush()

    #########################################################################
    #########################################################################
    #########################################################################            
    ## main loop over gene trees
    #########################################################################                    
    ninput, nfiltered, nskipped, noutput = 0, 0, 0, 0
    nskipped_filter, nskipped_outgroups = 0, 0
    
    ## total counts
    total_heights_per_species = {}
Exemple #9
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/analyze_genetrees.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option(
        "-r",
        "--species-regex",
        dest="species_regex",
        type="string",
        help="regular expression to extractspecies from identifier.")

    parser.add_option(
        "--gene-regex",
        dest="gene_regex",
        type="string",
        help="regular expression to extract gene from identifier.")

    parser.add_option("--filename-filter-positives",
                      dest="filename_filter_positives",
                      type="string",
                      help="filename with positive list of trees to analyze.")

    parser.add_option("-s",
                      "--filename-species-tree",
                      dest="filename_species_tree",
                      type="string",
                      help="filename with species tree.")

    parser.add_option(
        "--filename-species2colour",
        dest="filename_species2colour",
        type="string",
        help=
        "filename with map of species to colours. If not given, random colours are assigned to species."
    )

    parser.add_option("-t",
                      "--species-tree",
                      dest="species_tree",
                      type="string",
                      help="species tree.")

    parser.add_option(
        "-e",
        "--filename-locations",
        dest="filename_locations",
        type="string",
        help=
        "filename with map of transcript information to location information.")

    parser.add_option("--no-create",
                      dest="create",
                      action="store_false",
                      help="do not create files, but append to them.")

    parser.add_option(
        "--max-separation",
        dest="max_separation",
        type="int",
        help=
        "maximum allowable separation between syntenic segments for border plot (set to 0, if syntey is enough)."
    )

    parser.add_option(
        "--filename-species2url",
        dest="filename_species2url",
        type="string",
        help="filename with mapping information of species to URL.")

    parser.add_option("--prefix",
                      dest="prefix",
                      type="string",
                      help="prefix to add as first column.")

    parser.add_option(
        "--outgroup-species",
        dest="outgroup_species",
        type="string",
        help="species to used as outgroups. Separate multiple species by ','.")

    parser.add_option("--subtrees-trees",
                      dest="subtrees_trees",
                      action="store_true",
                      help="write trees for subtrees.")

    parser.add_option("--subtrees-identifiers",
                      dest="subtrees_identifiers",
                      action="store_true",
                      help="write identifiers of subtrees.")

    parser.add_option("--svg-add-ids",
                      dest="svg_add_ids",
                      action="store_true",
                      help="add node ids to svg plot.")

    parser.add_option("--svg-otus",
                      dest="svg_otus",
                      type="string",
                      help="otus to output in svg species tree.")

    parser.add_option("--svg-branch-lenghts",
                      dest="svg_branch_lengths",
                      type="choice",
                      choices=("contemporary", "uniform", "median"),
                      help="branch lengths in species tree.")

    parser.add_option("--print-totals",
                      dest="print_totals",
                      action="store_true",
                      help="output totals sections.")

    parser.add_option("--print-subtotals",
                      dest="print_subtotals",
                      action="store_true",
                      help="output subtotals sections.")

    parser.add_option(
        "--print-best",
        dest="print_best",
        action="store_true",
        help="output best node assignment for each node in gene tree.")

    parser.add_option("--print-svg",
                      dest="print_svg",
                      action="store_true",
                      help="output svg files.")

    parser.add_option("--print-species-svg",
                      dest="print_species_svg",
                      action="store_true",
                      help="output species svg files.")

    parser.add_option(
        "--output-pattern",
        dest="output_pattern",
        type="string",
        help=
        """output pattern for separate output of sections [default: %default].
                       Set to None, if output to stdout. Can contain one %s to be substituted with section."""
    )

    parser.add_option(
        "--output-pattern-svg",
        dest="output_pattern_svg",
        type="string",
        help=
        "filename for svg output. If it contains %s, this is replaced by gene_tree name."
    )

    parser.add_option(
        "--filename-node-types",
        dest="filename_node_types",
        type="string",
        help="filename with node type information from a previous run.")

    parser.add_option("--analyze-resolution-data",
                      dest="analyze_resolution_data",
                      type="choice",
                      action="append",
                      choices=("stats", "histograms"),
                      help="stdin is resolution data.")

    parser.add_option("--filter-quality",
                      dest="filter_quality",
                      type="choice",
                      choices=("all", "genes", "pseudogenes"),
                      help="filter predictions by gene type.")

    parser.add_option("--filter-location",
                      dest="filter_location",
                      type="choice",
                      choices=("all", "local", "non-local", "cis", "unplaced"),
                      help="filter predictions by location.")

    parser.add_option("--remove-unplaced",
                      dest="remove_unplaced",
                      action="store_true",
                      help="remove predictions on unplaced contigs.")

    parser.add_option("--skip-without-outgroups",
                      dest="skip_without_outgroups",
                      action="store_true",
                      help="skip clusters without outgroups.")

    parser.set_defaults(
        filter_quality="all",
        filter_location="all",
        remove_unplaced=False,
        species_regex="^([^|]+)\|",
        gene_regex="^[^|]+\|[^|]+\|([^|]+)\|",
        filename_species_tree=None,
        priority={
            "Speciation": 0,
            "SpeciationDeletion": 1,
            "Transcripts": 2,
            "DuplicationLineage": 3,
            "Duplication": 4,
            "DuplicationDeletion": 5,
            "DuplicationInconsistency": 6,
            "Outparalogs": 7,
            "InconsistentTranscripts": 8,
            "Inconsistency": 9,
            "Masked": 10
        },
        species_tree=None,
        filename_species2colour=None,
        filename_locations=None,
        max_separation=0,
        filename_species2url=None,
        separator="|",
        prefix=None,
        output_pattern=None,
        output_pattern_svg=None,
        outgroup_species=None,
        svg_add_ids=False,
        svg_branch_lengths="median",
        svg_otus=None,
        subtrees=False,
        print_svg=False,
        print_subtotals=False,
        print_totals=False,
        print_best=False,
        subtrees_identifiers=False,
        create=True,
        min_branch_length=0.00,
        filename_node_types=None,
        format_branch_length="%6.4f",
        nodetypes_inconsistency=("InconsistentTranscripts", "Inconsistency"),
        analyze_resolution_data=None,
        warning_small_branch_length=0.01,
        filename_filter_positives=None,
        skip_without_outgroups=False,
    )

    (options, args) = E.Start(parser,
                              add_psql_options=True,
                              add_csv_options=True)

    if options.outgroup_species:
        options.outgroup_species = set(options.outgroup_species.split(","))

    if options.svg_otus:
        options.svg_otus = set(options.svg_otus.split(","))

    rx_species = re.compile(options.species_regex)
    extract_species = lambda x: rx_species.match(x).groups()[0]
    if options.gene_regex:
        rx_gene = re.compile(options.gene_regex)
        extract_gene = lambda x: rx_gene.match(x).groups()[0]
    else:
        extract_gene = None

    extract_quality = lambda x: x.split(options.separator)[3]

    #########################################################################
    #########################################################################
    #########################################################################
    # read positive list of malis
    #########################################################################
    if options.filename_filter_positives:
        filter_positives, nerrors = IOTools.ReadList(
            open(options.filename_filter_positives, "r"))
        filter_positives = set(filter_positives)
    else:
        filter_positives = None

    #########################################################################
    #########################################################################
    #########################################################################
    # read location info
    #########################################################################
    if options.filename_locations:
        map_id2location = TreeReconciliation.readLocations(
            open(options.filename_locations, "r"), extract_species)
    else:
        map_id2location = {}

    if (options.remove_unplaced or options.filter_location != "all"
        ) and not options.filename_locations:
        raise "please supply a file with location information."

    #########################################################################
    #########################################################################
    #########################################################################
    # delete output files
    #########################################################################
    if options.create and options.output_pattern:
        for section in ("details", "subtrees", "subids", "details", "trees",
                        "nodes", "categories"):
            fn = options.output_pattern % section
            if os.path.exists(fn):
                if options.loglevel >= 1:
                    options.stdlog.write("# deleting file %s.\n" % fn)
                os.remove(fn)

    if options.loglevel >= 1:
        options.stdlog.write("# reading gene trees.\n")
        options.stdlog.flush()

    gene_nexus = TreeTools.Newick2Nexus(sys.stdin)

    Tree.updateNexus(gene_nexus)

    if options.loglevel >= 1:
        options.stdlog.write("# read %i gene trees from stdin.\n" %
                             len(gene_nexus.trees))
        options.stdlog.flush()

    #########################################################################
    #########################################################################
    #########################################################################
    # main loop over gene trees
    #########################################################################
    ninput, nfiltered, nskipped, noutput = 0, 0, 0, 0
    nskipped_filter, nskipped_outgroups = 0, 0

    # total counts
    total_heights_per_species = {}
    total_relheights_per_species = {}
    total_heights_per_tree = []
    total_relheights_per_tree = []

    for gene_tree in gene_nexus.trees:

        ninput += 1

        xname = re.sub("_tree.*", "", gene_tree.name)
        xname = re.sub("subtree_", "", xname)

        if filter_positives and xname not in filter_positives:
            nskipped_filter += 1
            continue

        if options.loglevel >= 6:
            gene_tree.display()

        #######################################################################
        #######################################################################
        #######################################################################
        # get identifier for this tree and update prefixes accordingly
        #######################################################################
        if options.prefix:
            if len(gene_nexus.trees) > 0:
                prefix_header = "prefix1\tprefix2\t"
                prefix_row = options.prefix + "\t" + gene_tree.name + "\t"
                prefix_prefix = options.prefix + "_" + gene_tree.name + "_"
                prefix_name = options.prefix + "_" + gene_tree.name
            else:
                prefix_header = "prefix\t"
                prefix_row = options.prefix + "\t"
                prefix_prefix = options.prefix + "_"
                prefix_name = options.prefix
        else:
            if len(gene_nexus.trees) > 0:
                prefix_header = "prefix\t"
                prefix_row = gene_tree.name + "\t"
                prefix_prefix = gene_tree.name + "\t"
                prefix_name = gene_tree.name
            else:
                prefix_header, prefix_row, prefix_prefix, prefix_name = "", "", "", ""

        #######################################################################
        #######################################################################
        #######################################################################
        # apply filters to gene tree
        #######################################################################
        TreeReconciliation.filterTree(gene_tree, options, map_id2location)

        otus = TreeTools.GetTaxa(gene_tree)

        if len(otus) <= 1:
            nfiltered += 1
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: empty after filtering - skipped.\n" %
                    gene_tree.name)
            continue

        this_species_list = map(extract_species, otus)
        # check, if only outgroups
        if options.outgroup_species:
            if not set(this_species_list).difference(options.outgroup_species):
                nfiltered += 1
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# tree %s: only outgroups after filtering - skipped.\n"
                        % gene_tree.name)
                continue

            if options.skip_without_outgroups and not set(
                    this_species_list).intersection(options.outgroup_species):
                nskipped_outgroups += 1
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# tree %s: no outgroups - skipped.\n" %
                        gene_tree.name)
                continue

        #######################################################################
        #######################################################################
        #######################################################################
        # reroot gene tree, if outgroups have been given.
        #######################################################################
        if options.outgroup_species:
            TreeReconciliation.rerootTree(gene_tree, extract_species, options)

        #######################################################################
        #######################################################################
        #######################################################################
        # compute distance to root for each node
        #######################################################################
        distance_to_root = TreeTools.GetDistanceToRoot(gene_tree)

        #######################################################################
        #######################################################################
        #######################################################################
        # compute counts
        #######################################################################
        # heights per tree
        heights_per_tree = []
        # relative heights per tree
        relheights_per_tree = []
        # distance to root
        heights_per_species = {}
        # distance to root (relative to maximum distance to root)
        relheights_per_species = {}

        analysis_set, gene_set, pseudogene_set, other_set = TreeReconciliation.getAnalysisSets(
            gene_tree, extract_quality, options)

        if len(analysis_set) == 0:
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: empty analysis set - skipped.\n" %
                    gene_tree.name)
            nskipped += 1
            continue

        reference_height = TreeReconciliation.getReferenceHeight(
            distance_to_root,
            gene_tree,
            gene_set,
            options,
            extract_species,
            method="median")

        if reference_height is None:
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: reference height not computable or 0 - skipped.\n"
                    % gene_tree.name)
            nskipped += 1
            continue

        for node_id in analysis_set:

            node = gene_tree.node(node_id)
            species = extract_species(node.data.taxon)
            height = distance_to_root[node_id]

            if height < options.warning_small_branch_length:
                options.stdlog.write(
                    "# tree %s: small distance %s to root at node %i: %s\n" %
                    (gene_tree.name, options.format_branch_length % height,
                     node_id, node.data.taxon))

            relheight = height / reference_height
            try:
                heights_per_species[species].append(height)
            except KeyError:
                heights_per_species[species] = [height]
                relheights_per_species[species] = []

            relheights_per_species[species].append(relheight)

            # do not use outgroup species
            if options.outgroup_species and species in options.outgroup_species:
                continue

            heights_per_tree.append(height)
            relheights_per_tree.append(relheight)

        if options.loglevel >= 1:
            options.stdlog.write(
                "# tree %s: reference_height=%s\n" %
                (gene_tree.name,
                 options.format_branch_length % reference_height))
            options.stdlog.flush()

        if options.print_subtotals:
            printCounts(heights_per_species, relheights_per_species,
                        heights_per_tree, relheights_per_tree, options,
                        prefix_header, prefix_row)

        #######################################################################
        #######################################################################
        #######################################################################
        # update total counts
        #######################################################################
        TreeReconciliation.appendCounts(total_heights_per_species,
                                        heights_per_species)
        TreeReconciliation.appendCounts(total_relheights_per_species,
                                        relheights_per_species)

        TreeReconciliation.appendCounts(total_heights_per_tree,
                                        heights_per_tree)
        TreeReconciliation.appendCounts(total_relheights_per_tree,
                                        relheights_per_tree)

        noutput += 1

    if options.print_totals:

        if options.prefix:
            prefix_header = "prefix1\tprefix2\t"
            prefix_row = options.prefix + "\t" + "total" + "\t"
            prefix_prefix = options.prefix + "_" + "total" + "_"
            prefix_name = options.prefix + "_" + "total"
        else:
            prefix_header = "prefix\t"
            prefix_row = "total" + "\t"
            prefix_prefix = "total" + "_"
            prefix_name = "total"

        printCounts(total_heights_per_species, total_relheights_per_species,
                    total_heights_per_tree, total_relheights_per_tree, options,
                    prefix_header, prefix_row)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, nfiltered=%i, nskipped=%i, nskipped_filter=%i, nskipped_outgroups=%i, noutput=%i\n"
            % (ninput, nfiltered, nskipped, nskipped_filter,
               nskipped_outgroups, noutput))

    E.Stop()