Ejemplo n.º 1
0
def rerootTree(gene_tree, extract_species, options):
    """Reroot *gene_tree* in place on its outgroup taxa.

    The outgroup is made up of every taxon of the tree whose species, as
    returned by ``extract_species(taxon)``, is contained in
    ``options.outgroup_species``. If these taxa pass the tree's
    ``is_monophyletic`` test the tree is rerooted on all of them,
    otherwise on the first one only. A tree without any outgroup taxon
    is left untouched.

    Arguments
    ---------
    gene_tree :
        tree object; ``name``, ``is_monophyletic``, ``root_with_outgroup``
        and ``display`` are used.
    extract_species : callable
        maps a taxon name to its species.
    options :
        option object; ``outgroup_species``, ``loglevel`` and ``stdlog``
        are read.

    Raises
    ------
    ValueError
        if an AttributeError is raised while determining the outgroup
        taxa (for example, when *extract_species* fails on a taxon).
    """

    otus = TreeTools.GetTaxa(gene_tree)

    # Collect the taxa belonging to the outgroup species. A real list is
    # built inside the ``try`` so that failures of extract_species are
    # caught here and the result can be indexed below.
    try:
        outgroup_taxa = [
            x for x in otus
            if extract_species(x) in options.outgroup_species]
    except AttributeError:
        raise ValueError(
            "error while rerooting tree in tree %s with %s" % (
                gene_tree.name, str(otus)))

    if not outgroup_taxa:
        # nothing to root with: do not query or modify the tree
        r = []
    # NOTE(review): the result is only truth-tested. If gene_tree is a
    # Bio.Nexus.Trees.Tree, is_monophyletic returns a node id (or -1),
    # which would make this test unreliable -- confirm the tree type.
    elif gene_tree.is_monophyletic(outgroup_taxa):
        r = outgroup_taxa
    else:
        r = [outgroup_taxa[0], ]

    if r:
        if options.loglevel >= 1:
            options.stdlog.write("# tree %s: rerooting with %i outgroups:  %s.\n" % (
                gene_tree.name, len(r), ",".join(r)))
            options.stdlog.flush()
        gene_tree.root_with_outgroup(r)
    else:
        if options.loglevel >= 1:
            options.stdlog.write(
                "# tree %s: no outgroup found, tree will not be rerooted.\n" % gene_tree.name)
            options.stdlog.flush()

    if options.loglevel >= 5:
        gene_tree.display()
Ejemplo n.º 2
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Trees in newick format are read from stdin and their taxa are
    written to ``options.stdout``, one taxon per line. If the input
    holds exactly one tree only the taxon is written. Otherwise the
    1-based number of the tree is added and, unless ``--skip-trees``
    is given, the name of the tree as a third column.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: tree2taxa.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--skip-trees", dest="skip_trees", action="store_true",
        help="do not output tree names in third field [default=%default].")

    parser.set_defaults(skip_trees=False)

    (options, args) = E.Start(parser, add_pipe_options=True)

    nexus = TreeTools.Newick2Nexus(sys.stdin)
    ntotal = len(nexus.trees)
    single_tree = ntotal == 1

    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees from stdin.\n" % ntotal)

    emit = options.stdout.write

    # the header mirrors the row layout chosen in the loop below
    if single_tree:
        header = "taxon\n"
    elif options.skip_trees:
        header = "taxon\ttree\n"
    else:
        header = "taxon\ttree\tname\n"
    emit(header)

    for index, tree in enumerate(nexus.trees, 1):
        taxa = TreeTools.GetTaxa(tree)

        if single_tree:
            rows = ("%s\n" % taxon for taxon in taxa)
        elif options.skip_trees:
            rows = ("%s\t%i\n" % (taxon, index) for taxon in taxa)
        else:
            rows = ("%s\t%i\t%s\n" % (taxon, index, tree.name)
                    for taxon in taxa)

        for row in rows:
            emit(row)

    if options.loglevel >= 1:
        options.stdlog.write("# ntotal=%i\n" % ntotal)

    E.Stop()
Ejemplo n.º 3
0
def filterTree(tree, options, map_id2location=None):
    """Apply the location filter to *tree* in place.

    If ``options.remove_unplaced`` is set, every taxon whose location
    (``map_id2location[taxon].mShortName``, lower-cased) is listed in
    ``MAP_CONTIG2JUNK`` is pruned from the tree. Taxa without an entry
    in *map_id2location* are kept and reported as a warning.

    NOTE(review): the previous docstring stated that outgroups are not
    removed if they are defined; nothing in this function exempts
    outgroup taxa -- confirm against the callers.

    Arguments
    ---------
    tree :
        tree to filter; it is modified by ``TreeTools.PruneTree``.
    options :
        option object; ``remove_unplaced``, ``loglevel`` and ``stdlog``
        are read.
    map_id2location : dict, optional
        maps a taxon to an object with an ``mShortName`` attribute.
        ``None`` is treated as an empty map, i.e. no location is known.
    """

    otus = TreeTools.GetTaxa(tree)

    to_remove = set()
    if options.remove_unplaced:
        # guard against the default: membership tests on None raise
        if map_id2location is None:
            map_id2location = {}

        for otu in otus:
            if otu not in map_id2location:
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# WARNING: unknown location for id %s.\n" % otu)
                continue

            if map_id2location[otu].mShortName.lower() in MAP_CONTIG2JUNK:
                to_remove.add(otu)

        if options.loglevel >= 3:
            # sorted to keep the log output deterministic
            options.stdlog.write("# tree %s: removing %i entries because of location: %s\n" %
                                 (tree.name, len(to_remove), ";".join(sorted(to_remove))))

    new_otus = list(set(otus).difference(to_remove))

    # only touch the tree if there is something to remove
    if len(new_otus) != len(otus):
        TreeTools.PruneTree(tree, new_otus, keep_distance_to_root=True)

    if options.loglevel >= 1:
        options.stdlog.write("# tree %s: filtering: before=%i, remove=%i, after=%i, final=%i\n" %
                             (tree.name, len(otus), len(to_remove), len(new_otus), len(TreeTools.GetTaxa(tree))))
        options.stdlog.flush()
Ejemplo n.º 4
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    A tab-separated table of duplications is read from stdin. Lines
    starting with ``#`` are ignored and the first remaining line is
    dropped as header. Rows are restricted per location/function with
    ``GetSubset`` and processed according to ``--method``:

    counts, medians
        species x species matrix with the number or the median of the
        heights for each species pair.
    lists, lists-union
        taxa of the trees of each species pair (``lists``) or of each
        first species (``lists-union``) together with the cluster id
        and the number of taxa in the tree.
    hists, fit-decay
        histograms of heights per species, or the result of ``fit``
        applied to each histogram.
    pairs
        cluster id, height, locations and tree for each species pair.
    links
        first row per cluster id for each pair of distinct species,
        after sorting rows by cluster id and decreasing length of the
        locations field.

    If ``--filename-output`` is given it is used as a ``%``-pattern
    filled from a dictionary with the keys ``f`` (function), ``l``
    (location) and, depending on the method, ``s`` and ``o`` (first
    and second species); otherwise output goes to stdout.

    Raises
    ------
    ValueError
        if no species has been supplied.
    """

    # NOTE(review): argv is not forwarded to E.Start below.
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/analyze_duplications.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option("-s",
                      "--species",
                      dest="species",
                      type="string",
                      help="species to use.")

    parser.add_option("-p",
                      "--prefix",
                      dest="prefix",
                      type="string",
                      help="prefix to use for temporary files.")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="string",
                      help="method to use [counts|medians|lists|lists-union|"
                      "hists|fit-decay|pairs|links].")

    parser.add_option("-o",
                      "--filename-output",
                      dest="filename_output",
                      type="string",
                      help="output filename.")

    parser.add_option("-f",
                      "--functions",
                      dest="functions",
                      type="string",
                      help="functions to grep [functional|pseudo|all].")

    parser.add_option("-l",
                      "--locations",
                      dest="locations",
                      type="string",
                      help="locations to grep [local|nojunk|all|...].")

    # parsed as float: the value is used as histogram increment and the
    # default is the float 1.0
    parser.add_option("-b",
                      "--bin-size",
                      dest="bin_size",
                      type="float",
                      help="bin size.")

    parser.add_option("-i",
                      "--fit",
                      dest="fit",
                      type="string",
                      help="fitting method [decay|power]")

    parser.add_option("--min-value",
                      dest="min_value",
                      type="float",
                      help="minimum value for histogram.")

    parser.add_option("--max-value",
                      dest="max_value",
                      type="float",
                      help="maximum value for histogram.")

    parser.add_option("--use-relative-height",
                      dest="use_relative_height",
                      action="store_true",
                      help="use relative height values.")

    parser.add_option(
        "--reverse",
        dest="reverse",
        action="store_true",
        help="""reverse species. Histograms will show the age of duplications for
                      duplicates in other genomes.""")

    parser.set_defaults(species="",
                        functions="functional,pseudo,all",
                        locations="local,nojunk,all",
                        filename_output=None,
                        bin_size=1.0,
                        min_value=None,
                        max_value=None,
                        nonnull=None,
                        use_relative_height=False,
                        header=True,
                        fit=None,
                        reverse=False,
                        method="counts")

    (options, args) = E.Start(parser, add_psql_options=True)

    # "".split(",") yields [""], so empty entries are dropped in order
    # for the check below to detect a missing species list.
    options.species = [x for x in options.species.split(",") if x]
    options.locations = options.locations.split(",")
    options.functions = options.functions.split(",")

    if not options.species:
        raise ValueError("please supply list of species.")

    # NOTE(review): the handle is not used in this function; the call is
    # kept because connecting may be relied upon as a check -- confirm.
    dbhandle = pgdb.connect(options.psql_connection)

    # Only the line terminator is stripped, so that a last line without
    # a newline does not lose its final character.
    input_data = [line.rstrip("\n").split("\t")
                  for line in sys.stdin.readlines()
                  if not line.startswith("#")]

    ## remove header
    if options.header and input_data:
        del input_data[0]

    ## decide which columns to take
    ## 1st column: species1: this is the species in which duplications have occured.
    ## 2nd column: species2: this is the species with respect to which duplications occured.
    ## 3rd column: clusterid
    ## 4th column: chromosomes
    ## 5th column: function
    ## 6th column: height
    ## 7th column: relative height
    ## 8th column: locations
    ## 9th column: tree
    if options.use_relative_height:
        take = (0, 1, 2, 3, 4, 6, 7, 8)
    else:
        take = (0, 1, 2, 3, 4, 5, 7, 8)

    input_data = [tuple([row[y] for y in take]) for row in input_data]

    nspecies = len(options.species)
    map_pos2species = list(options.species)
    map_species2pos = dict(
        (species, pos) for pos, species in enumerate(options.species))

    outfile = None

    if options.method in ("counts", "medians"):

        if options.method == "counts":
            func, fmt = len, "%i"
        else:
            func, fmt = numpy.median, "%6.4f"

        for location in options.locations:

            for function in options.functions:
                # numpy.Float is a Numeric-era name; the builtin float
                # selects the same double precision dtype.
                matrix = numpy.zeros((nspecies, nspecies), float)

                data = GetSubset(input_data, location, function)

                ## sort by species1 and species2
                data.sort()

                last_species1, last_species2 = None, None
                values = []
                for species1, species2, cluster_id, _chroms, _function, height, locations, tree in data:

                    if last_species1 != species1 or last_species2 != species2:

                        # species pair changed: store the previous pair
                        if len(values) > 0:
                            matrix[map_species2pos[last_species1],
                                   map_species2pos[last_species2]] = func(
                                       values)

                        values = []
                        last_species1 = species1
                        last_species2 = species2

                    values.append(float(height))

                if len(values) > 0:
                    matrix[map_species2pos[last_species1],
                           map_species2pos[last_species2]] = func(values)

                if options.filename_output:
                    params = {"f": function, "l": location}
                    outfile = open(options.filename_output % params, "w")
                else:
                    outfile = sys.stdout
                    outfile.write(
                        "matrix for method %s: location: %s, function: %s\n" %
                        (options.method, location, function))

                try:
                    MatlabTools.WriteMatrix(matrix,
                                            outfile=outfile,
                                            format=fmt,
                                            row_headers=options.species,
                                            col_headers=options.species)
                finally:
                    if options.filename_output:
                        outfile.close()

    elif options.method in ("lists", "lists-union"):
        ## write lists of duplicated genes in species1 as compared to species2
        ##      according to location/function
        ## First field : gene name
        ## Second field: cluster id
        ## Third field : number of other genes in cluster
        written = set()
        for location in options.locations:

            for function in options.functions:

                data = GetSubset(input_data, location, function)

                ## sort by species1 and species2
                data.sort()

                last_species1, last_species2 = None, None

                for species1, species2, cluster_id, _chroms, _function, height, locations, tree in data:

                    if last_species1 != species1 or last_species2 != species2:

                        ## start a new output section/file
                        if options.filename_output:
                            if options.method == "lists":
                                if outfile:
                                    outfile.close()
                                params = {
                                    "f": function,
                                    "l": location,
                                    "s": species1,
                                    "o": species2
                                }
                                written = set()
                                outfile = open(
                                    options.filename_output % params, "w")
                            elif options.method == "lists-union":
                                if last_species1 != species1:
                                    if outfile:
                                        outfile.close()
                                    params = {
                                        "f": function,
                                        "l": location,
                                        "s": species1
                                    }
                                    written = set()
                                    outfile = open(
                                        options.filename_output % params, "w")
                        else:
                            outfile = sys.stdout
                            if options.method == "lists":
                                outfile.write(
                                    "location: %s, function: %s, species1: %s, species2: %s\n"
                                    % (location, function, species1, species2))
                                written = set()
                            elif options.method == "lists-union":
                                if last_species1 != species1:
                                    outfile.write(
                                        "location: %s, function: %s, species1: %s\n"
                                        % (location, function, species1))
                                    written = set()

                        last_species1 = species1
                        last_species2 = species2

                    # get tree
                    tt = TreeTools.Newick2Tree(tree)
                    taxa = TreeTools.GetTaxa(tt)
                    for t in taxa:
                        # each taxon is written once per section
                        if t in written:
                            continue
                        outfile.write("%s\t%s\t%i\n" %
                                      (t, cluster_id, len(taxa)))
                        written.add(t)

                # close the last file of this location/function; the
                # next one is opened with the first row of its data.
                if options.filename_output and outfile:
                    outfile.close()
                    outfile = None

    elif options.method in ("hists", "fit-decay"):

        for location in options.locations:

            for function in options.functions:

                ## values[x][y] contains heights of duplications in
                ## species x with reference to y
                values = [[[] for y in range(nspecies)]
                          for x in range(nspecies)]

                data = GetSubset(input_data, location, function)

                data.sort()

                for species1, species2, cluster_id, _chroms, _function, height, locations, tree in data:
                    try:
                        values[map_species2pos[species1]][
                            map_species2pos[species2]].append(float(height))
                    except KeyError:
                        # rows of species that were not requested
                        continue

                ################################################################
                # calculate histograms per species
                ################################################################
                for s in options.species:
                    histograms = []
                    headers = []

                    if options.filename_output:
                        params = {"f": function, "l": location, "s": s}
                        outfile = open(options.filename_output % params, "w")
                    else:
                        outfile = sys.stdout
                        outfile.write("location: %s, function: %s\n" %
                                      (location, function))

                    for x in range(nspecies):

                        if options.reverse:
                            ## duplications in species x
                            vv = values[x][map_species2pos[s]]
                        else:
                            ## duplications in species s
                            vv = values[map_species2pos[s]][x]

                        if len(vv) == 0:
                            continue

                        headers.append(options.species[x])
                        h = Histogram.Calculate(
                            vv,
                            increment=options.bin_size,
                            min_value=options.min_value,
                            max_value=options.max_value,
                            no_empty_bins=True)

                        if options.method == "fit-decay":
                            result = fit(h, [2.0, -1.0])
                            if result:
                                outfile.write(
                                    "%s\t%s\t%s\t%i\t%f\t%f\ty = %f * exp ( %f * x )\n"
                                    % (
                                        "function",
                                        s,
                                        options.species[x],
                                        h[0][1],
                                        result[0],
                                        result[1],
                                        result[0],
                                        result[1],
                                    ))
                        elif options.method == "hists":
                            histograms.append(h)

                    if options.method == "hists":
                        combined_histogram = Histogram.Combine(
                            histograms, missing_value="-")

                        outfile.write("bin\t" + "\t".join(headers) + "\n")
                        Histogram.Write(outfile, combined_histogram)

                    if options.filename_output:
                        outfile.close()
                    else:
                        outfile.flush()

    elif options.method == "pairs":

        for location in options.locations:

            if options.loglevel >= 2:
                options.stdlog.write("# processing location %s\n" % location)

            for function in options.functions:

                if options.loglevel >= 2:
                    options.stdlog.write("#   processing function %s " %
                                         function)
                    options.stdlog.flush()

                data = GetSubset(input_data, location, function)

                if options.loglevel >= 2:
                    options.stdlog.write("with %i data points\n" % len(data))
                    options.stdlog.flush()

                data.sort()
                last_species1, last_species2 = None, None

                for species1, species2, cluster_id, _chroms, _function, height, locations, tree in data:

                    if last_species1 != species1 or last_species2 != species2:

                        ## start a new output section/file
                        if options.filename_output:
                            if outfile:
                                outfile.close()
                            params = {
                                "f": function,
                                "l": location,
                                "s": species1,
                                "o": species2
                            }
                            outfile = open(
                                options.filename_output % params, "w")
                        else:
                            outfile = sys.stdout
                            outfile.write(
                                "location: %s, function: %s, species1: %s, species2: %s\n"
                                % (location, function, species1, species2))

                        last_species1 = species1
                        last_species2 = species2

                    outfile.write("%s\t%s\t%s\t%s\n" %
                                  (cluster_id, height, locations, tree))

                # close the last file of this location/function
                if options.filename_output and outfile:
                    outfile.close()
                    outfile = None

    elif options.method == "links":

        ## write a tree for each species pair:
        ## each node is a gene+location, the weight of the vertex is the height
        ## further info added: cluster_id for the duplication

        for location in options.locations:

            if options.loglevel >= 2:
                options.stdlog.write("# processing location %s\n" % location)

            for function in options.functions:

                if options.loglevel >= 2:
                    options.stdlog.write("#   processing function %s " %
                                         function)
                    options.stdlog.flush()

                data = GetSubset(input_data, location, function)

                if options.loglevel >= 2:
                    options.stdlog.write("with %i data points\n" % len(data))
                    options.stdlog.flush()

                ## stores duplications within first species as compared to second species
                values = [[[] for y in range(nspecies)]
                          for x in range(nspecies)]

                for species1, species2, cluster_id, _chroms, _function, height, locations, tree in data:
                    values[map_species2pos[species1]][
                        map_species2pos[species2]].append(
                            (cluster_id, -len(locations), locations, tree))

                # get links per species
                for s in options.species:
                    if options.loglevel >= 2:
                        options.stdlog.write("#     processing species %s\n" %
                                             s)

                    for x in range(nspecies):

                        if map_pos2species[x] == s:
                            continue

                        # sorted by cluster id, then by decreasing
                        # length of the locations field
                        vv = values[map_species2pos[s]][x]
                        vv.sort()

                        if options.filename_output:
                            params = {
                                "f": function,
                                "l": location,
                                "s": s,
                                "o": map_pos2species[x]
                            }
                            outfile = open(
                                options.filename_output % params, "w")
                        else:
                            outfile = sys.stdout
                            outfile.write(
                                "location: %s, function: %s, species1: %s, species2: %s\n"
                                % (location, function, s, map_pos2species[x]))

                        ## only print out the first entry of each cluster
                        last_cluster_id = None
                        for cluster_id, n, locations, tree in vv:
                            if cluster_id != last_cluster_id:
                                outfile.write("%s\t%s\t%s\n" %
                                              (cluster_id, locations, tree))
                                last_cluster_id = cluster_id

                        if options.filename_output:
                            outfile.close()

    E.Stop()
Ejemplo n.º 5
0
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: trees2trees.py 2782 2009-09-10 11:40:29Z andreas $", usage = globals()["__doc__"])

    parser.add_option("-c", "--output-filename-map", dest="output_filename_map", type="string",
                      help="filename of map to output."  )

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("filter", "split"),
                      help="method to use: filter removed trees, while split writes them to individual files. DEFAULT=%default"  )

    parser.add_option("-d", "--output-pattern", dest="output_pattern", type="string",
                      help="filename pattern for output multiple alignment files."  )

    parser.add_option("--filter-terminal-max-length", dest="filter_max_length", type="float",
                      help="remove terminal branches with a branch length larger than this."  )

    parser.add_option("--filter-terminal-min-length", dest="filter_min_length", type="float",
                      help="remove any branches with a branch length smaller than this."  )

    parser.add_option("--filter-min-length", dest="filter_min_length", type="float",
                      help="remove terminal branches with a branch length smaller than this."  )

    parser.add_option("--filter-max-length", dest="filter_min_length", type="float",
                      help="remove any branches with a branch length smaller than this."  )

    parser.add_option("--filter-by-trees", dest="filter_by_trees", type="string", action="append",
                      help="mask branches according to trees. Give filenames with mask trees. These trees need to have the same names and structure as the input trees, but can be in any order."  )

    parser.add_option("--filter-by-monophyly", dest="filter_by_monophyly", type="string",
                      help="only retain trees where the given taxa are monphyletic. Supply taxa as a comma-separated list."  )

    parser.add_option("--min-support", dest="min_support", type="float",
                      help="for monophyly filtering, only accept trees with minimum support."  )

    parser.add_option("--filter-ntaxa", dest="filter_ntaxa", type="int", 
                      help="filter by number of taxa."  )

    parser.add_option("--filter-simple-orthologs", dest="filter_simple_orthologs", action="store_true", 
                      help="filter for trees for simple orhtologs. This works by counting the number of taxa."  )

    parser.add_option("--filter", dest="filter", type="choice",
                      choices=("taxa", "trees"),
                      help="filter removes taxa or whole trees." )

    parser.set_defaults(
        output_pattern="%s.tree",
        output_filename_map = None,
        filter_terminal_max_length = None,
        filter_terminal_min_length = None,
        filter_max_length = None,
        filter_min_length = None,
        method ="split",
        filter = "taxa",
        filtered_branch_length = -999,
        filter_by_trees = [],
        filter_by_monophyly = None,
        filter_ntaxa = None,
        filter_simple_orthologs = None,
        min_support = 0.0,
        regex_species = ("^([^|]+)" ),
        )

    (options, args) = E.Start( parser )

    nexus = TreeTools.Newick2Nexus( sys.stdin )
    
    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees from stdin.\n" % len(nexus.trees))

    ninput, noutput, nskipped = 0, 0, 0
    ndiscarded = 0
    ndiscarded_taxa = 0
    ndiscarded_branches = 0

    extract_species = lambda x: re.search( options.regex_species, x).groups()[0]
    
    if options.filter_by_trees:
        nexus_filter = []
        nexus_maps = []
        for filename in options.filter_by_trees:
            nexus_filter.append( TreeTools.Newick2Nexus( open( filename, "r") ) )
            trees = nexus_filter[-1].trees
            if options.loglevel >=1 :
                options.stdlog.write("# read %i trees for filtering from %s\n" % (len(trees), filename))

            nexus_map = {}
            for x in range( len(trees)):
                nexus_map[trees[x].name] = x
            nexus_maps.append( nexus_map )

    if options.filter_by_monophyly:
        monophyly_taxa = options.filter_by_monophyly.split(",")
        if len(monophyly_taxa) == 0:
            raise "please supply at least two taxa for the monophyly test."
            
    if options.output_filename_map:
        outfile_map = open(options.output_filename_map, "a" )
    else:
        outfile_map = None

    for tree in nexus.trees:

        ninput += 1
        id = tree.name
        has_discarded = False

        if options.filter_ntaxa != None:

            ntaxa = len(tree.get_terminals())
            if ntaxa != options.filter_ntaxa:
                if options.loglevel >= 2:
                    options.stdlog.write("# tree %s: removed because number of taxa (%i) different\n" % \
                                         (id, ntaxa ) )
                has_discarded = True
                
        if options.filter_simple_orthologs:
            ntaxa = len(tree.get_terminals())
            nspecies = len(set(map( lambda x: extract_species(tree.node(x).data.taxon), tree.get_terminals() )))
            if nspecies != ntaxa:
                if options.loglevel >= 2:
                    options.stdlog.write("# tree %s: removed because not a simple ortholog cluster: ntaxa!=nspecies (%i!=%i)\n" % \
                                             (id, ntaxa, nspecies ) )

                has_discarded = True

        if options.filter_terminal_max_length != None:
            for x in tree.get_terminals():
                node = tree.node(x)
                if node.data.branchlength >= options.filter_terminal_max_length:
                    has_discarded = True
                    ndiscarded_taxa += 1                    
                    tree.prune( node.data.taxon )
                    if options.loglevel >= 2:
                        options.stdlog.write("# tree %s: removed taxon %s because terminal branchlength to large: %s\n" % \
                                             (id, node.data.taxon, str(node.data.branchlength)) )

        if options.filter_terminal_min_length != None:
            for x in tree.get_terminals():
                node = tree.node(x)
                if node.data.branchlength <= options.filter_terminal_min_length:
                    has_discarded = True
                    ndiscarded_taxa += 1                    
                    tree.prune( node.data.taxon )
                    if options.loglevel >= 2:
                        options.stdlog.write("# tree %s: removed taxon %s because terminal branchlength to small: %s\n" % \
                                             (id, node.data.taxon, str(node.data.branchlength)) )
                    
        if options.filter_max_length != None:
            for x in tree.get_nodes(tree.root):
                if x == tree.root: continue                
                node = tree.node(x)
                if node.data.branchlength >= options.filter_max_length:
                    has_discarded = True
                    ndiscarded_branches += 1                    
                    if options.loglevel >= 2:
                        options.stdlog.write("# tree %s: removed branch %i because branchlength to large: %s\n" % \
                                             (id, x, tree.name, str(node.data.branchlength)) )
                    node.data.branchlength = options.filtered_branch_length
                    
        if options.filter_min_length != None:
            for x in tree.get_nodes(tree.root):
                if x == tree.root: continue
                node = tree.node(x)
                if node.data.branchlength <= options.filter_min_length:
                    has_discarded = True
                    ndiscarded_branches += 1
                    if options.loglevel >= 2:
                        options.stdlog.write("# tree %s: removed branch %i because internal branchlength too small: %s\n" % \
                                             (id, x, str(node.data.branchlength)) )
                    node.data.branchlength = options.filtered_branch_length
                    
        if options.filter_by_trees:
            found = []
            for y in range(len(nexus_maps)):
                if id in nexus_maps[y]:
                    found.append( (y, nexus_filter[y].trees[nexus_maps[y][id]]) )

            if not found:
                ndiscarded += 1
                continue

            for x in tree.get_nodes(tree.root):
                if x == tree.root: continue
                for y, other_tree in found:
                    other_node = other_tree.node( x )
                    if other_node.data.branchlength == options.filtered_branch_length:
                        node = tree.node(x)
                        if options.loglevel >= 2:
                            options.stdlog.write("# tree %s: removed branch %i because internal branchlength masked by tree %i:%s.\n" % \
                                                 (id, x, y, other_tree.name) )
                        
                        node.data.branchlength = options.filtered_branch_length
                        has_discarded = True
                        ndiscarded_branches += 1
                        break

        if options.filter_by_monophyly:

            terminals = set(map( lambda x: tree.node(x).data.taxon, tree.get_terminals()))
            
            for t in monophyly_taxa:
                if t not in terminals:
                    if options.loglevel >= 2:
                        options.stdlog.write( "taxon %s not in tree %s\n" % (t, tree.name))
                    nskipped += 1
            succ = tree.node(tree.root).succ
            ## use minimum support at root, if it is not the same (if trees
            ## are rooted)
            if len(succ) == 2:
                m = min( map( lambda x: tree.node(x).data.support, succ) )
                for x in succ:
                    tree.node(x).data.support = m
                
            if not TreeTools.IsMonophyleticForTaxa( tree, monophyly_taxa, support=options.min_support ):
                ndiscarded += 1
                continue
            
        if has_discarded:
            ndiscarded += 1
            if options.filter=="trees" or options.filter_ntaxa:
                continue

        if options.method == "split":

            output_filename = re.sub( "%s", id, options.output_pattern )

            dirname = os.path.dirname(output_filename)

            if dirname and not os.path.exists( dirname ):
                os.makedirs( dirname )

            if not os.path.exists( output_filename ):
                outfile = open(output_filename, "w" )
                outfile.write( TreeTools.Tree2Newick( tree ) + "\n" )
                noutput += 1
            else:
                if options.loglevel >= 1:
                    options.stdlog.write("# skipping because output for tree %s already exists: %s\n" % (id, output_filename))                        
                nskipped += 1
                continue

        elif options.method == "filter":
            options.stdout.write( ">%s\n%s\n" % (tree.name, TreeTools.Tree2Newick( tree )) )
            noutput += 1
            
        if outfile_map:
            for t in TreeTools.GetTaxa( tree ):
                outfile_map.write( "%s\t%s\n" % (t, id) )

    if outfile_map:
        outfile_map.close()

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i, with_discarded=%i, discarded_taxa=%i, discarded_branches=%i.\n" %\
                             (ninput, noutput, nskipped,
                              ndiscarded, ndiscarded_taxa, ndiscarded_branches))
        
    E.Stop()
Ejemplo n.º 6
0
def main(argv=None):
    """Collect node heights of gene trees per species and per tree.

    Gene trees are read in Newick format from stdin. Each tree is
    optionally restricted to a positive list, filtered via
    ``TreeReconciliation.filterTree``, rerooted with the outgroup species
    (if any were given) and then the distance to the root is collected
    for every node in the analysis set, both as absolute value and
    relative to a reference height (median method). Sub-totals per tree
    and totals over all trees are printed with ``printCounts`` when
    requested on the command line.

    Arguments
    ---------
    argv : list, optional
        Command line arguments; defaults to ``sys.argv``.
        NOTE(review): *argv* is not forwarded to ``E.Start`` in this
        function - confirm that ``E.Start`` reads ``sys.argv`` itself.

    Raises
    ------
    ValueError
        If location based filtering was requested without supplying a
        file with location information.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/analyze_genetrees.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option(
        "-r", "--species-regex",
        dest="species_regex",
        type="string",
        help="regular expression to extractspecies from identifier.")

    parser.add_option(
        "--gene-regex",
        dest="gene_regex",
        type="string",
        help="regular expression to extract gene from identifier.")

    parser.add_option(
        "--filename-filter-positives",
        dest="filename_filter_positives",
        type="string",
        help="filename with positive list of trees to analyze.")

    parser.add_option(
        "-s", "--filename-species-tree",
        dest="filename_species_tree",
        type="string",
        help="filename with species tree.")

    parser.add_option(
        "--filename-species2colour",
        dest="filename_species2colour",
        type="string",
        help="filename with map of species to colours. If not given, random colours are assigned to species.")

    parser.add_option(
        "-t", "--species-tree",
        dest="species_tree",
        type="string",
        help="species tree.")

    parser.add_option(
        "-e", "--filename-locations",
        dest="filename_locations",
        type="string",
        help="filename with map of transcript information to location information.")

    parser.add_option(
        "--no-create",
        dest="create",
        action="store_false",
        help="do not create files, but append to them.")

    parser.add_option(
        "--max-separation",
        dest="max_separation",
        type="int",
        help="maximum allowable separation between syntenic segments for border plot (set to 0, if syntey is enough).")

    parser.add_option(
        "--filename-species2url",
        dest="filename_species2url",
        type="string",
        help="filename with mapping information of species to URL.")

    parser.add_option(
        "--prefix",
        dest="prefix",
        type="string",
        help="prefix to add as first column.")

    parser.add_option(
        "--outgroup-species",
        dest="outgroup_species",
        type="string",
        help="species to used as outgroups. Separate multiple species by ','.")

    parser.add_option(
        "--subtrees-trees",
        dest="subtrees_trees",
        action="store_true",
        help="write trees for subtrees.")

    parser.add_option(
        "--subtrees-identifiers",
        dest="subtrees_identifiers",
        action="store_true",
        help="write identifiers of subtrees.")

    parser.add_option(
        "--svg-add-ids",
        dest="svg_add_ids",
        action="store_true",
        help="add node ids to svg plot.")

    parser.add_option(
        "--svg-otus",
        dest="svg_otus",
        type="string",
        help="otus to output in svg species tree.")

    # NOTE(review): the option name is misspelled ("lenghts"); it is kept
    # unchanged because renaming it would break existing command lines.
    parser.add_option(
        "--svg-branch-lenghts",
        dest="svg_branch_lengths",
        type="choice",
        choices=("contemporary", "uniform", "median"),
        help="branch lengths in species tree.")

    parser.add_option(
        "--print-totals",
        dest="print_totals",
        action="store_true",
        help="output totals sections.")

    parser.add_option(
        "--print-subtotals",
        dest="print_subtotals",
        action="store_true",
        help="output subtotals sections.")

    parser.add_option(
        "--print-best",
        dest="print_best",
        action="store_true",
        help="output best node assignment for each node in gene tree.")

    parser.add_option(
        "--print-svg",
        dest="print_svg",
        action="store_true",
        help="output svg files.")

    parser.add_option(
        "--print-species-svg",
        dest="print_species_svg",
        action="store_true",
        help="output species svg files.")

    parser.add_option(
        "--output-pattern",
        dest="output_pattern",
        type="string",
        help="""output pattern for separate output of sections [default: %default].
                       Set to None, if output to stdout. Can contain one %s to be substituted with section.""")

    parser.add_option(
        "--output-pattern-svg",
        dest="output_pattern_svg",
        type="string",
        help="filename for svg output. If it contains %s, this is replaced by gene_tree name.")

    parser.add_option(
        "--filename-node-types",
        dest="filename_node_types",
        type="string",
        help="filename with node type information from a previous run.")

    parser.add_option(
        "--analyze-resolution-data",
        dest="analyze_resolution_data",
        type="choice",
        action="append",
        choices=("stats", "histograms"),
        help="stdin is resolution data.")

    parser.add_option(
        "--filter-quality",
        dest="filter_quality",
        type="choice",
        choices=("all", "genes", "pseudogenes"),
        help="filter predictions by gene type.")

    parser.add_option(
        "--filter-location",
        dest="filter_location",
        type="choice",
        choices=("all", "local", "non-local", "cis", "unplaced"),
        help="filter predictions by location.")

    parser.add_option(
        "--remove-unplaced",
        dest="remove_unplaced",
        action="store_true",
        help="remove predictions on unplaced contigs.")

    parser.add_option(
        "--skip-without-outgroups",
        dest="skip_without_outgroups",
        action="store_true",
        help="skip clusters without outgroups.")

    parser.set_defaults(
        filter_quality="all",
        filter_location="all",
        remove_unplaced=False,
        # raw strings: the backslashes belong to the regular expression
        species_regex=r"^([^|]+)\|",
        gene_regex=r"^[^|]+\|[^|]+\|([^|]+)\|",
        filename_species_tree=None,
        priority={
            "Speciation": 0,
            "SpeciationDeletion": 1,
            "Transcripts": 2,
            "DuplicationLineage": 3,
            "Duplication": 4,
            "DuplicationDeletion": 5,
            "DuplicationInconsistency": 6,
            "Outparalogs": 7,
            "InconsistentTranscripts": 8,
            "Inconsistency": 9,
            "Masked": 10
        },
        species_tree=None,
        filename_species2colour=None,
        filename_locations=None,
        max_separation=0,
        filename_species2url=None,
        separator="|",
        prefix=None,
        output_pattern=None,
        output_pattern_svg=None,
        outgroup_species=None,
        svg_add_ids=False,
        svg_branch_lengths="median",
        svg_otus=None,
        subtrees=False,
        print_svg=False,
        print_subtotals=False,
        print_totals=False,
        print_best=False,
        subtrees_identifiers=False,
        create=True,
        min_branch_length=0.00,
        filename_node_types=None,
        format_branch_length="%6.4f",
        nodetypes_inconsistency=("InconsistentTranscripts", "Inconsistency"),
        analyze_resolution_data=None,
        warning_small_branch_length=0.01,
        filename_filter_positives=None,
        skip_without_outgroups=False,
    )

    (options, args) = E.Start(parser,
                              add_psql_options=True,
                              add_csv_options=True)

    # comma separated lists are converted to sets for fast membership tests
    if options.outgroup_species:
        options.outgroup_species = set(options.outgroup_species.split(","))

    if options.svg_otus:
        options.svg_otus = set(options.svg_otus.split(","))

    # The extractors raise AttributeError for identifiers that do not
    # match the pattern (match() returns None).
    rx_species = re.compile(options.species_regex)
    extract_species = lambda x: rx_species.match(x).groups()[0]
    if options.gene_regex:
        rx_gene = re.compile(options.gene_regex)
        extract_gene = lambda x: rx_gene.match(x).groups()[0]
    else:
        extract_gene = None

    # the quality code is the fourth field of an identifier
    extract_quality = lambda x: x.split(options.separator)[3]

    #########################################################################
    # read positive list of malis
    #########################################################################
    if options.filename_filter_positives:
        with open(options.filename_filter_positives, "r") as infile:
            filter_positives, nerrors = IOTools.ReadList(infile)
        filter_positives = set(filter_positives)
    else:
        filter_positives = None

    #########################################################################
    # read location info
    #########################################################################
    if options.filename_locations:
        with open(options.filename_locations, "r") as infile:
            map_id2location = TreeReconciliation.readLocations(
                infile, extract_species)
    else:
        map_id2location = {}

    if (options.remove_unplaced or options.filter_location != "all") \
            and not options.filename_locations:
        # a proper exception instance; raising a plain string is an error
        raise ValueError("please supply a file with location information.")

    #########################################################################
    # delete output files
    #########################################################################
    if options.create and options.output_pattern:
        for section in ("details", "subtrees", "subids", "trees",
                        "nodes", "categories"):
            fn = options.output_pattern % section
            if os.path.exists(fn):
                if options.loglevel >= 1:
                    options.stdlog.write("# deleting file %s.\n" % fn)
                os.remove(fn)

    if options.loglevel >= 1:
        options.stdlog.write("# reading gene trees.\n")
        options.stdlog.flush()

    gene_nexus = TreeTools.Newick2Nexus(sys.stdin)

    Tree.updateNexus(gene_nexus)

    if options.loglevel >= 1:
        options.stdlog.write("# read %i gene trees from stdin.\n" %
                             len(gene_nexus.trees))
        options.stdlog.flush()

    #########################################################################
    # main loop over gene trees
    #########################################################################
    ninput, nfiltered, nskipped, noutput = 0, 0, 0, 0
    nskipped_filter, nskipped_outgroups = 0, 0

    # total counts, accumulated over all trees that are output
    total_heights_per_species = {}
    total_relheights_per_species = {}
    total_heights_per_tree = []
    total_relheights_per_tree = []

    for gene_tree in gene_nexus.trees:

        ninput += 1

        # strip the tree/subtree decoration from the name before looking
        # it up in the positive list
        xname = re.sub("_tree.*", "", gene_tree.name)
        xname = re.sub("subtree_", "", xname)

        if filter_positives and xname not in filter_positives:
            nskipped_filter += 1
            continue

        if options.loglevel >= 6:
            gene_tree.display()

        #######################################################################
        # get identifier for this tree and update prefixes accordingly
        #######################################################################
        # NOTE(review): inside this loop len(gene_nexus.trees) is always
        # > 0, so the else-branches below are never taken; "> 1" may have
        # been intended. Behaviour is left unchanged here.
        if options.prefix:
            if len(gene_nexus.trees) > 0:
                prefix_header = "prefix1\tprefix2\t"
                prefix_row = options.prefix + "\t" + gene_tree.name + "\t"
            else:
                prefix_header = "prefix\t"
                prefix_row = options.prefix + "\t"
        else:
            if len(gene_nexus.trees) > 0:
                prefix_header = "prefix\t"
                prefix_row = gene_tree.name + "\t"
            else:
                prefix_header, prefix_row = "", ""

        #######################################################################
        # apply filters to gene tree
        #######################################################################
        TreeReconciliation.filterTree(gene_tree, options, map_id2location)

        otus = TreeTools.GetTaxa(gene_tree)

        if len(otus) <= 1:
            nfiltered += 1
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: empty after filtering - skipped.\n" %
                    gene_tree.name)
            continue

        # Build the set once: it is needed for two tests below and a bare
        # map() result can only be consumed once where map() is lazy.
        this_species = set(map(extract_species, otus))

        # check, if only outgroups
        if options.outgroup_species:
            if not this_species.difference(options.outgroup_species):
                nfiltered += 1
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# tree %s: only outgroups after filtering - skipped.\n"
                        % gene_tree.name)
                continue

            if options.skip_without_outgroups and \
                    not this_species.intersection(options.outgroup_species):
                nskipped_outgroups += 1
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# tree %s: no outgroups - skipped.\n" %
                        gene_tree.name)
                continue

        #######################################################################
        # reroot gene tree, if outgroups have been given.
        #######################################################################
        if options.outgroup_species:
            TreeReconciliation.rerootTree(gene_tree, extract_species, options)

        #######################################################################
        # compute distance to root for each node
        #######################################################################
        distance_to_root = TreeTools.GetDistanceToRoot(gene_tree)

        #######################################################################
        # compute counts
        #######################################################################
        # heights per tree
        heights_per_tree = []
        # relative heights per tree
        relheights_per_tree = []
        # distance to root
        heights_per_species = {}
        # distance to root (relative to the reference height)
        relheights_per_species = {}

        analysis_set, gene_set, pseudogene_set, other_set = \
            TreeReconciliation.getAnalysisSets(
                gene_tree, extract_quality, options)

        if len(analysis_set) == 0:
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: empty analysis set - skipped.\n" %
                    gene_tree.name)
            nskipped += 1
            continue

        reference_height = TreeReconciliation.getReferenceHeight(
            distance_to_root,
            gene_tree,
            gene_set,
            options,
            extract_species,
            method="median")

        if reference_height is None:
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: reference height not computable or 0 - skipped.\n"
                    % gene_tree.name)
            nskipped += 1
            continue

        for node_id in analysis_set:

            node = gene_tree.node(node_id)
            species = extract_species(node.data.taxon)
            height = distance_to_root[node_id]

            # this warning is written irrespective of the log level
            if height < options.warning_small_branch_length:
                options.stdlog.write(
                    "# tree %s: small distance %s to root at node %i: %s\n" %
                    (gene_tree.name, options.format_branch_length % height,
                     node_id, node.data.taxon))

            relheight = height / reference_height

            heights_per_species.setdefault(species, []).append(height)
            relheights_per_species.setdefault(species, []).append(relheight)

            # do not use outgroup species for the per-tree counts
            if options.outgroup_species and species in options.outgroup_species:
                continue

            heights_per_tree.append(height)
            relheights_per_tree.append(relheight)

        if options.loglevel >= 1:
            options.stdlog.write(
                "# tree %s: reference_height=%s\n" %
                (gene_tree.name,
                 options.format_branch_length % reference_height))
            options.stdlog.flush()

        if options.print_subtotals:
            printCounts(heights_per_species, relheights_per_species,
                        heights_per_tree, relheights_per_tree, options,
                        prefix_header, prefix_row)

        #######################################################################
        # update total counts
        #######################################################################
        TreeReconciliation.appendCounts(total_heights_per_species,
                                        heights_per_species)
        TreeReconciliation.appendCounts(total_relheights_per_species,
                                        relheights_per_species)

        TreeReconciliation.appendCounts(total_heights_per_tree,
                                        heights_per_tree)
        TreeReconciliation.appendCounts(total_relheights_per_tree,
                                        relheights_per_tree)

        noutput += 1

    if options.print_totals:

        if options.prefix:
            prefix_header = "prefix1\tprefix2\t"
            prefix_row = options.prefix + "\t" + "total" + "\t"
        else:
            prefix_header = "prefix\t"
            prefix_row = "total" + "\t"

        printCounts(total_heights_per_species, total_relheights_per_species,
                    total_heights_per_tree, total_relheights_per_tree, options,
                    prefix_header, prefix_row)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, nfiltered=%i, nskipped=%i, nskipped_filter=%i, nskipped_outgroups=%i, noutput=%i\n"
            % (ninput, nfiltered, nskipped, nskipped_filter,
               nskipped_outgroups, noutput))

    E.Stop()
Ejemplo n.º 7
0
    def run(self):
        """Run the configured program and collect the files it produced.

        The input files are written by ``prepareRun``; the program named
        in ``self.mProgram`` is then started through the shell inside the
        temporary directory and the entries of ``self.mOptions`` are fed
        to its stdin, one per line.

        If an ``outtree`` file was created, its trees are read, mapped
        back with ``self.mMapPhylip2Input`` and stored in the result.
        Otherwise an ``outfile`` is parsed as a distance matrix
        (``dnadist``/``protdist``) or as contrasts (``contrast``).

        The temporary directory is removed only at log level 0; at higher
        log levels it is kept.

        Returns
        -------
        PhylipResult
            Container with the parsed output.

        Raises
        ------
        UsageError
            If no program was specified or the program finished with a
            non-zero return code.
        NotImplementedError
            If neither an ``outtree`` nor an ``outfile`` was created.
        """

        self.prepareRun()

        if not self.mProgram:
            raise UsageError("no program specified.")

        s = subprocess.Popen("%s" % (self.mProgram),
                             shell=True,
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             cwd=self.mTempdir,
                             close_fds=True)

        (out, err) = s.communicate("\n".join(self.mOptions) + "\n")

        if s.returncode != 0:
            # the temporary directory is reported (and kept) for debugging
            raise UsageError(
                "Error in running phylip.\n%s\n%s\nTemporary directory was %s" % (
                    out, err, self.mTempdir))

        # Parse output files that might have been created:
        result = PhylipResult()

        filename_tree = "%s/outtree" % self.mTempdir
        filename_out = "%s/outfile" % self.mTempdir

        # parse tree file
        if os.path.exists(filename_tree):

            with open(filename_tree, "r") as infile:
                nexus = TreeTools.Newick2Nexus(infile)
            for tree in nexus.trees:
                TreeTools.MapTaxa(tree, self.mMapPhylip2Input)
            result.mNexus = nexus
            if self.mLogLevel >= 1:
                # a single parenthesised expression prints identically
                # with the print statement and the print function
                print("# received tree with %i taxa" % (len(
                    TreeTools.GetTaxa(nexus.trees[0]))))

        elif os.path.exists(filename_out):

            if self.mProgram in ("dnadist", "protdist"):
                with open(filename_out, "r") as infile:
                    result.mMatrix, row_headers, col_headers = MatlabTools.readMatrix(
                        infile, format="phylip")
                # translate the shortened identifiers back to the input ones
                result.mRowHeaders = []
                for x in row_headers:
                    result.mRowHeaders.append(self.mMapPhylip2Input[x])
                result.mColHeaders = result.mRowHeaders
            elif self.mProgram == "contrast":

                with open(filename_out, "r") as infile:
                    result.parseContrasts(infile)

        else:
            # an exception instance; raising a plain string is an error
            raise NotImplementedError("other return types not implemented")

        if self.mLogLevel >= 2:
            print(out)

        if self.mLogLevel == 0:
            shutil.rmtree(self.mTempdir)

        return result
Ejemplo n.º 8
0
    def prepareRun(self):

        self.__reset()

        self.mTempdir = tempfile.mkdtemp()
        # self.mTempdir = "tmp"
        if not os.path.exists(self.mTempdir):
            os.mkdir(self.mTempdir)

        if self.mInputMatrix and self.mInputData:
            raise ValueError(
                "please specify either input matrix or input data, but not both."
            )

        # prepare input matrix. Should already be in phylip like
        # format, but long identifiers are shortened and tabs are
        # replaced by spaces.
        if self.mInputMatrix:

            outfile = open(self.mTempdir + "/infile", "w")

            identifiers = map(lambda x: re.split("\s+", x[:-1])[0],
                              self.mInputMatrix[1:])
            self.updateMaps(identifiers)

            outfile.write(self.mInputMatrix[0])
            for line in self.mInputMatrix[1:]:
                data = re.split("\s+", line[:-1])
                new_line = self.mMapInput2Phylip[
                    data[0]] + "       " + "  ".join(data[1:])
                outfile.write(new_line + "\n")

            outfile.close()

            if self.mLogLevel >= 1:
                print "# written input matrix with %i taxa to %s" % (
                    len(identifiers), self.mTempdir + "/infile")
                os.system("cat %s" % self.mTempdir + "/infile")

        elif self.mInputData:

            outfile = open(self.mTempdir + "/infile", "w")
            outfile.write("%i %i\n" %
                          (len(self.mInputData), len(self.mInputData[0]) - 1))
            identifiers = map(lambda x: x[0], self.mInputData)
            self.updateMaps(identifiers)

            for x in range(len(identifiers)):
                outfile.write("%-10s %s\n" %
                              (self.mMapInput2Phylip[identifiers[x]], " ".join(
                                  self.mInputData[x][1:])))

            outfile.close()

            if self.mLogLevel >= 1:
                print "# written input matrix with %i taxa to %s" % (
                    len(identifiers), self.mTempdir + "/infile")
                os.system("cat %s" % self.mTempdir + "/infile")

        # prepare input tree or trees
        self.mNInputTrees = 0
        if self.mInputTree or self.mInputTrees:

            outfile = open(self.mTempdir + "/intree", "w")

            if self.mInputTree and self.mInputTrees:
                raise UsageError(
                    "please supply either one or mupltiple trees, but not both."
                )

            if self.mInputTree:
                trees = [self.mInputTree]
            else:
                trees = self.mInputTrees

            for tree in trees:
                if self.mPruneTree:
                    taxa = self.mMapInput2Phylip.keys()
                    TreeTools.PruneTree(tree, taxa)

                taxa = TreeTools.GetTaxa(tree)
                self.updateMaps(taxa)
                TreeTools.MapTaxa(tree, self.mMapInput2Phylip)

                # check if taxa are unique
                taxa = tree.get_taxa()
                staxa = set()

                skip = False
                for t in taxa:
                    if t in staxa:
                        if self.mLogLevel >= 1:
                            print "# skipping tree %s because of duplicate taxa." % (
                                tree.name)
                        skip = True
                    staxa.add(t)

                if skip:
                    continue

                outfile.write(TreeTools.Tree2Newick(tree) + "\n")
                self.mNInputTrees += 1

                if self.mLogLevel >= 1:
                    print "# written input tree with %i taxa to %s" % (len(
                        TreeTools.GetTaxa(tree)), self.mTempdir + "/intree")
                    print "#", TreeTools.Tree2Newick(tree)

            outfile.close()

        # prepare input multiple alignment
        if self.mInputMali:

            if self.mInputMatrix:
                raise "both mali and matrix supplied - infile conflict."

            outfile = open(self.mTempdir + "/infile", "w")

            identifiers = self.mInputMali.getIdentifiers()
            self.updateMaps(identifiers)
            self.mInputMali.mapIdentifiers(self.mMapInput2Phylip)
            self.mInputMali.writeToFile(outfile, format="phylip")

            outfile.close()

            if self.mLogLevel >= 1:
                print "# written input multiple alignments with %i taxa and with %i to %s" %\
                      (self.mInputMali.getLength(),
                       self.mInputMali.getWidth(), self.mTempdir + "/intree")