Ejemplos de TreeTools.GetTaxa en Python

Lenguaje de programación: Python

Namespace/Package Name: CGAT

Clase / Tipo: TreeTools

Método / Función: GetTaxa

Ejemplos en hotexamples.com: 8

Python TreeTools.GetTaxa - 8 ejemplos encontrados. Estos son los ejemplos en Python del mundo real mejor valorados de CGAT.TreeTools.GetTaxa extraídos de proyectos de código abierto. Puedes valorar ejemplos para ayudarnos a mejorar la calidad de los ejemplos.

Métodos usados con frecuencia

Mostrar Ocultar

Newick2Nexus(30)

Tree2Newick(9)

GetTaxa(8)

PruneTree(5)

GetSize(5)

Nexus2Newick(5)

MapTaxa(4)

TreeDFS(3)

GetSubsets(3)

Newick2Tree(3)

calculatePatternsFromTree(2)

GetLeaves(2)

IsCompatible(2)

GetDistanceToRoot(1)

IsMonophyleticForTaxa(1)

GetNodeMap(1)

Tree2Graph(1)

GetMaxIndex(1)

Unroot(1)

GetAllNodes(1)

Ejemplo n.º 1

Mostrar archivo

Archivo: TreeReconciliation.py Proyecto: santayana/cgat

def rerootTree(gene_tree, extract_species, options):
    """Reroot *gene_tree* in place using its outgroup taxa.

    A taxon is an outgroup candidate when ``extract_species(taxon)`` is
    contained in ``options.outgroup_species``.  If the candidates pass the
    tree's monophyly test all of them are used as the outgroup, otherwise
    only the first candidate is used.  Progress is logged to
    ``options.stdlog`` when ``options.loglevel >= 1`` and the tree is
    displayed when ``options.loglevel >= 5``.

    Args:
        gene_tree: tree object; this function uses its ``name``,
            ``is_monophyletic``, ``root_with_outgroup`` and ``display``
            members.
        extract_species: callable mapping a taxon name to a species.
        options: object providing ``outgroup_species``, ``loglevel`` and
            ``stdlog``.

    Raises:
        ValueError: if selecting the outgroup taxa fails with an
            AttributeError (for example inside ``extract_species``).
    """

    otus = TreeTools.GetTaxa(gene_tree)

    # Select the outgroup taxa.  A list is built eagerly so that errors
    # raised by extract_species surface inside this try block and so that
    # the result can be indexed, tested for emptiness and measured below.
    try:
        outgroup_taxa = [
            x for x in otus
            if extract_species(x) in options.outgroup_species]
    except AttributeError:
        # The original raised a bare string, which is not a valid exception.
        raise ValueError(
            "error while rerooting tree in tree %s with %s" % (
                gene_tree.name, str(otus)))

    if not outgroup_taxa:
        # Nothing to root with; avoids indexing an empty list below.
        r = []
    elif gene_tree.is_monophyletic(outgroup_taxa):
        # NOTE(review): this relies on is_monophyletic() returning a falsy
        # value on failure.  If the tree class reports failure with a truthy
        # sentinel (e.g. -1), the fallback branch is never taken - confirm
        # against the class of gene_tree.
        r = outgroup_taxa
    else:
        r = [outgroup_taxa[0], ]

    if r:
        if options.loglevel >= 1:
            options.stdlog.write("# tree %s: rerooting with %i outgroups:  %s.\n" % (
                gene_tree.name, len(r), ",".join(r)))
            options.stdlog.flush()
    else:
        if options.loglevel >= 1:
            options.stdlog.write(
                "# tree %s: no outgroup found, tree will not be rerooted.\n" % gene_tree.name)
            options.stdlog.flush()

    # Called unconditionally, exactly as before; with an empty outgroup the
    # outcome depends on the tree implementation.
    gene_tree.root_with_outgroup(r)

    if options.loglevel >= 5:
        gene_tree.display()

Ejemplo n.º 2

Mostrar archivo

Archivo: tree2taxa.py Proyecto: santayana/cgat

def main(argv=None):
    """Entry point: list the taxa of every tree read from stdin.

    Trees are read from ``sys.stdin`` with ``TreeTools.Newick2Nexus`` and
    one line per taxon is written to ``options.stdout``:

    * a single input tree yields the column ``taxon`` only;
    * several trees yield ``taxon`` and the 1-based tree number, plus the
      tree name unless ``--skip-trees`` is given.

    *argv* defaults to ``sys.argv``.
    NOTE(review): *argv* is not forwarded to ``E.Start`` in this function.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: tree2taxa.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])
    parser.add_option(
        "--skip-trees", dest="skip_trees", action="store_true",
        help="do not output tree names in third field [default=%default].")
    parser.set_defaults(skip_trees=False)

    options, args = E.Start(parser, add_pipe_options=True)

    nexus = TreeTools.Newick2Nexus(sys.stdin)
    ntotal = len(nexus.trees)

    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees from stdin.\n" % ntotal)

    # The header depends on the number of trees and on --skip-trees.
    if ntotal == 1:
        header = "taxon\n"
    elif options.skip_trees:
        header = "taxon\ttree\n"
    else:
        header = "taxon\ttree\tname\n"
    options.stdout.write(header)

    # Tree numbers in the output start at 1.
    for tree_number, tree in enumerate(nexus.trees, 1):
        taxa = TreeTools.GetTaxa(tree)

        if ntotal == 1:
            for taxon in taxa:
                options.stdout.write("%s\n" % (taxon))
            continue

        if options.skip_trees:
            for taxon in taxa:
                options.stdout.write("%s\t%i\n" % (taxon, tree_number))
            continue

        for taxon in taxa:
            options.stdout.write(
                "%s\t%i\t%s\n" % (taxon, tree_number, tree.name))

    if options.loglevel >= 1:
        options.stdlog.write("# ntotal=%i\n" % (ntotal))

    E.Stop()

Ejemplo n.º 3

Mostrar archivo

Archivo: TreeReconciliation.py Proyecto: santayana/cgat

def filterTree(tree, options, map_id2location=None):
    """Prune taxa located on junk contigs from *tree*, in place.

    When ``options.remove_unplaced`` is set, every taxon whose location
    short name (lower-cased ``mShortName``) is found in ``MAP_CONTIG2JUNK``
    is removed.  Taxa without an entry in *map_id2location* are kept and
    reported with a warning when ``options.loglevel >= 1``.  The tree is
    pruned with ``TreeTools.PruneTree`` only if the set of retained taxa
    differs in size from the original taxa list.

    NOTE(review): the previous docstring stated that outgroups are never
    removed; nothing in this function implements such an exemption.

    Args:
        tree: tree to filter; its ``name`` is used for logging.
        options: object providing ``remove_unplaced``, ``loglevel`` and
            ``stdlog``.
        map_id2location: mapping of taxon id to a location object with an
            ``mShortName`` attribute.  ``None`` is treated as an empty
            mapping, so every taxon counts as having an unknown location.
    """

    otus = TreeTools.GetTaxa(tree)

    # The default of None used to fail with a TypeError on the membership
    # test below as soon as remove_unplaced was enabled.
    if map_id2location is None:
        map_id2location = {}

    to_remove = set()
    if options.remove_unplaced:
        for otu in otus:
            if otu not in map_id2location:
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# WARNING: unknown location for id %s.\n" % otu)
                continue

            if map_id2location[otu].mShortName.lower() in MAP_CONTIG2JUNK:
                to_remove.add(otu)

        if options.loglevel >= 3:
            # Location is the only removal criterion here, so to_remove is
            # exactly the set removed "because of location"; sorted to keep
            # the log output deterministic.
            options.stdlog.write("# tree %s: removing %i entries because of location: %s\n" %
                                 (tree.name, len(to_remove), ";".join(sorted(to_remove))))

    new_otus = list(set(otus).difference(to_remove))

    if len(new_otus) != len(otus):

        TreeTools.PruneTree(tree, new_otus, keep_distance_to_root=True)

    if options.loglevel >= 1:
        options.stdlog.write("# tree %s: filtering: before=%i, remove=%i, after=%i, final=%i\n" %
                             (tree.name, len(otus), len(to_remove), len(new_otus), len(TreeTools.GetTaxa(tree))))
        options.stdlog.flush()

Ejemplo n.º 4

Mostrar archivo

def main(argv=None):
    """Entry point: summarise duplication records read from stdin.

    Input is a tab-separated table on ``sys.stdin``; lines starting with
    ``#`` are skipped and the first remaining row is dropped as a header.
    Eight of the nine columns are kept (absolute or relative height,
    depending on ``--use-relative-height``) and rows are selected per
    location/function combination with ``GetSubset``.  The output depends
    on ``--method``:

    * ``counts`` / ``medians``: species x species matrix holding the number
      or the median of heights, written with ``MatlabTools.WriteMatrix``.
    * ``lists`` / ``lists-union``: taxon, cluster id and number of taxa in
      the tree, each taxon written once per output section.
    * ``hists`` / ``fit-decay``: height histograms per species, or the
      result of ``fit`` on each histogram.
    * ``pairs``: cluster id, height, locations and tree per species pair.
    * ``links``: first entry per cluster id for each species pair.

    Output goes to ``sys.stdout`` or, if ``--filename-output`` is given, to
    files named by ``%``-substituting a dict with the keys ``f`` (function),
    ``l`` (location), ``s`` (species1) and ``o`` (species2) into the
    pattern.

    *argv* defaults to ``sys.argv``.
    NOTE(review): *argv* is not forwarded to ``E.Start`` in this function.

    Raises:
        ValueError: if the species list is empty (see the note at the
            check; with the current parsing it cannot trigger).
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/analyze_duplications.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option("-s",
                      "--species",
                      dest="species",
                      type="string",
                      help="species to use.")

    parser.add_option("-p",
                      "--prefix",
                      dest="prefix",
                      type="string",
                      help="prefix to use for temporary files.")

    # the help text lists every method handled further down
    parser.add_option(
        "-m",
        "--method",
        dest="method",
        type="string",
        help="method to use "
        "[counts|medians|lists|lists-union|hists|fit-decay|pairs|links].")

    parser.add_option("-o",
                      "--filename-output",
                      dest="filename_output",
                      type="string",
                      help="output filename.")

    parser.add_option("-f",
                      "--functions",
                      dest="functions",
                      type="string",
                      help="functions to grep [functional|pseudo|all].")

    parser.add_option("-l",
                      "--locations",
                      dest="locations",
                      type="string",
                      help="locations to grep [local|nojunk|all|...].")

    # parsed as float so that a user-supplied value has the same type as
    # the default (1.0) when it is passed on as the histogram increment
    parser.add_option("-b",
                      "--bin-size",
                      dest="bin_size",
                      type="float",
                      help="bin size.")

    parser.add_option("-i",
                      "--fit",
                      dest="fit",
                      type="string",
                      help="fitting method [decay|power]")

    parser.add_option("--min-value",
                      dest="min_value",
                      type="float",
                      help="minimum value for histogram.")

    parser.add_option("--max-value",
                      dest="max_value",
                      type="float",
                      help="maximum value for histogram.")

    parser.add_option("--use-relative-height",
                      dest="use_relative_height",
                      action="store_true",
                      help="use relative height values.")

    parser.add_option(
        "--reverse",
        dest="reverse",
        action="store_true",
        help="""reverse species. Histograms will show the age of duplications for
                      duplicates in other genomes.""")

    parser.set_defaults(species="",
                        functions="functional,pseudo,all",
                        locations="local,nojunk,all",
                        filename_output=None,
                        bin_size=1.0,
                        min_value=None,
                        max_value=None,
                        nonnull=None,
                        use_relative_height=False,
                        header=True,
                        fit=None,
                        reverse=False,
                        method="counts")

    (options, args) = E.Start(parser, add_psql_options=True)

    options.species = options.species.split(",")
    options.locations = options.locations.split(",")
    options.functions = options.functions.split(",")

    # NOTE(review): "".split(",") yields [""], so this check never fires for
    # the default species="".  The condition is left unchanged because some
    # methods (lists, pairs) never look at the species list; only the raise
    # itself is fixed (a bare string is not a valid exception).
    if len(options.species) == 0:
        raise ValueError("please supply list of species.")

    dbhandle = pgdb.connect(options.psql_connection)

    # Built as a real list: the header row is deleted and the rows are
    # re-assigned below.  rstrip("\n") does not eat the last character of a
    # final line that lacks a trailing newline.
    input_data = [line.rstrip("\n").split("\t")
                  for line in sys.stdin.readlines()
                  if not line.startswith("#")]

    ## remove header
    if options.header and input_data:
        del input_data[0]

    ## decide which columns to take
    ## 1st column: species1: this is the species in which duplications have occured.
    ## 2nd column: species2: this is the species with respect to which duplications occured.
    ## 3rd column: clusterid
    ## 4th column: chromosomes
    ## 5th column: function
    ## 6th column: height
    ## 7th column: relative height
    ## 8th column: locations
    ## 9th column: tree
    if options.use_relative_height:
        take = (0, 1, 2, 3, 4, 6, 7, 8)
    else:
        take = (0, 1, 2, 3, 4, 5, 7, 8)

    input_data = [tuple([row[y] for y in take]) for row in input_data]

    # species <-> matrix index; for a repeated species the last index wins
    map_pos2species = list(options.species)
    map_species2pos = {}
    for pos, species in enumerate(options.species):
        map_species2pos[species] = pos

    outfile = None

    if options.method in ("counts", "medians"):

        if options.method == "counts":
            func = len
        elif options.method == "medians":
            func = numpy.median

        for location in options.locations:

            for function in options.functions:
                # the builtin float replaces the obsolete numpy.Float alias
                matrix = numpy.zeros(
                    (len(options.species), len(options.species)), float)

                data = GetSubset(input_data, location, function)

                ## sort by species1 and species2
                data.sort()

                last_species1, last_species2 = None, None
                values = []
                for species1, species2, cluster_id, l, f, height, locations, tree in data:

                    if last_species1 != species1 or last_species2 != species2:

                        # species pair changed: store the previous group
                        if len(values) > 0:
                            matrix[map_species2pos[last_species1],
                                   map_species2pos[last_species2]] = func(
                                       values)

                        values = []
                        last_species1 = species1
                        last_species2 = species2

                    values.append(float(height))

                # store the final group
                if len(values) > 0:
                    matrix[map_species2pos[last_species1],
                           map_species2pos[last_species2]] = func(values)

                if options.filename_output:
                    substitutions = {"f": function, "l": location}
                    outfile = open(options.filename_output % substitutions,
                                   "w")
                else:
                    outfile = sys.stdout
                    outfile.write(
                        "matrix for method %s: location: %s, function: %s\n" %
                        (options.method, location, function))

                if options.method == "medians":
                    fmt = "%6.4f"
                elif options.method == "counts":
                    fmt = "%i"
                MatlabTools.WriteMatrix(matrix,
                                        outfile=outfile,
                                        format=fmt,
                                        row_headers=options.species,
                                        col_headers=options.species)

                if options.filename_output:
                    outfile.close()

    elif options.method in ("lists", "lists-union"):
        ## write lists of duplicated genes in species1 as compared to species2
        ##      according to location/function
        ## First field : gene name
        ## Second field: cluster id
        ## Third field : number of other genes in cluster
        ## Fourth field: location of gene
        # taxa already written to the current output section
        written = {}
        for location in options.locations:

            for function in options.functions:

                data = GetSubset(input_data, location, function)

                ## sort by species1 and species2
                data.sort()

                last_species1, last_species2 = None, None

                for species1, species2, cluster_id, l, f, height, locations, tree in data:

                    if last_species1 != species1 or last_species2 != species2:

                        ## write trees per cluster
                        if options.filename_output:
                            if options.method == "lists":
                                if outfile:
                                    outfile.close()
                                substitutions = {
                                    "f": function,
                                    "l": location,
                                    "s": species1,
                                    "o": species2
                                }
                                written = {}
                                outfile = open(
                                    options.filename_output % substitutions,
                                    "w")
                            elif options.method == "lists-union":
                                # one file per species1, merging all species2
                                if last_species1 != species1:
                                    if outfile:
                                        outfile.close()
                                    substitutions = {
                                        "f": function,
                                        "l": location,
                                        "s": species1
                                    }
                                    written = {}
                                    outfile = open(
                                        options.filename_output %
                                        substitutions, "w")
                        else:
                            outfile = sys.stdout
                            if options.method == "lists":
                                outfile.write(
                                    "location: %s, function: %s, species1: %s, species2: %s\n"
                                    % (location, function, species1, species2))
                                written = {}
                            elif options.method == "lists-union":
                                if last_species1 != species1:
                                    outfile.write(
                                        "location: %s, function: %s, species1: %s\n"
                                        % (location, function, species1))
                                    written = {}

                        last_species1 = species1
                        last_species2 = species2

                    # get tree
                    tt = TreeTools.Newick2Tree(tree)
                    taxa = TreeTools.GetTaxa(tt)
                    for t in taxa:
                        if t in written:
                            continue
                        outfile.write("%s\t%s\t%i\n" %
                                      (t, cluster_id, len(taxa)))
                        written[t] = 1

    elif options.method in ("hists", "fit-decay"):

        for location in options.locations:

            for function in options.functions:

                values = [[[] for y in range(len(options.species))]
                          for x in range(len(options.species))]

                data = GetSubset(input_data, location, function)

                data.sort()

                ################################################################
                ## convert to matrix of list
                ## values[x][y] contains heights of duplications in species x with reference to y

                for species1, species2, cluster_id, l, f, height, locations, tree in data:
                    try:
                        values[map_species2pos[species1]][
                            map_species2pos[species2]].append(float(height))
                    except KeyError:
                        # rows for species outside options.species are ignored
                        continue

                ################################################################
                ################################################################
                ################################################################
                # calculate histograms per species
                ################################################################
                for s in options.species:
                    histograms = []
                    headers = []

                    if options.filename_output:
                        substitutions = {"f": function, "l": location, "s": s}
                        outfile = open(
                            options.filename_output % substitutions, "w")
                    else:
                        outfile = sys.stdout
                        outfile.write("location: %s, function: %s\n" %
                                      (location, function))

                    for x in range(len(options.species)):

                        if options.reverse:
                            ## duplications in species x
                            vv = values[x][map_species2pos[s]]
                        else:
                            ## duplications in species s
                            vv = values[map_species2pos[s]][x]

                        if len(vv) == 0:
                            pass
                        else:
                            headers.append(options.species[x])
                            h = Histogram.Calculate(
                                vv,
                                increment=options.bin_size,
                                min_value=options.min_value,
                                max_value=options.max_value,
                                no_empty_bins=True)

                            if options.method == "fit-decay":
                                result = fit(h, [2.0, -1.0])
                                if result:
                                    outfile.write(
                                        "%s\t%s\t%s\t%i\t%f\t%f\ty = %f * exp ( %f * x )\n"
                                        % (
                                            "function",
                                            s,
                                            options.species[x],
                                            h[0][1],
                                            result[0],
                                            result[1],
                                            result[0],
                                            result[1],
                                        ))
                            elif options.method == "hists":
                                histograms.append(h)

                    if options.method == "hists":
                        combined_histogram = Histogram.Combine(
                            histograms, missing_value="-")

                        outfile.write("bin\t" + "\t".join(headers) + "\n")
                        Histogram.Write(outfile, combined_histogram)

                    if options.filename_output:
                        outfile.close()
                    else:
                        outfile.flush()

    elif options.method == "pairs":

        ## get branches with 0 branchlength

        for location in options.locations:

            if options.loglevel >= 2:
                options.stdlog.write("# processing location %s\n" % location)

            for function in options.functions:

                if options.loglevel >= 2:
                    options.stdlog.write("#   processing function %s " %
                                         function)
                    options.stdlog.flush()

                data = GetSubset(input_data, location, function)

                if options.loglevel >= 2:
                    options.stdlog.write("with %i data points\n" % len(data))
                    options.stdlog.flush()

                data.sort()
                last_species1, last_species2 = None, None

                # The original also tracked the last cluster id here, but
                # never acted on it; that dead bookkeeping was dropped.
                for species1, species2, cluster_id, l, f, height, locations, tree in data:

                    if last_species1 != species1 or last_species2 != species2:

                        ## write trees per cluster
                        if options.filename_output:
                            if outfile:
                                outfile.close()
                            substitutions = {
                                "f": function,
                                "l": location,
                                "s": species1,
                                "o": species2
                            }
                            outfile = open(
                                options.filename_output % substitutions, "w")
                        else:
                            outfile = sys.stdout
                            outfile.write(
                                "location: %s, function: %s, species1: %s, species2: %s\n"
                                % (location, function, species1, species2))

                        last_species1 = species1
                        last_species2 = species2

                    outfile.write("%s\t%s\t%s\t%s\n" %
                                  (cluster_id, height, locations, tree))

    elif options.method == "links":

        ## write a tree for each species pair:
        ## each node is a gene+location, the weight of the vertex is the height
        ## further info added: cluster_id for the duplication

        for location in options.locations:

            if options.loglevel >= 2:
                options.stdlog.write("# processing location %s\n" % location)

            for function in options.functions:

                if options.loglevel >= 2:
                    options.stdlog.write("#   processing function %s " %
                                         function)
                    options.stdlog.flush()

                data = GetSubset(input_data, location, function)

                if options.loglevel >= 2:
                    options.stdlog.write("with %i data points\n" % len(data))
                    options.stdlog.flush()

                ## stores duplications within first species as compared to second species
                values = [[[] for y in range(len(options.species))]
                          for x in range(len(options.species))]

                # -len(locations) makes the entry with the longest locations
                # field sort first within each cluster id
                for species1, species2, cluster_id, l, f, height, locations, tree in data:
                    values[map_species2pos[species1]][
                        map_species2pos[species2]].append(
                            (cluster_id, -len(locations), locations, tree))

                # get links per species
                for s in options.species:
                    if options.loglevel >= 2:
                        options.stdlog.write("#     processing species %s\n" %
                                             s)

                    for x in range(len(options.species)):

                        if map_pos2species[x] == s:
                            continue

                        vv = values[map_species2pos[s]][x]
                        vv.sort()

                        ## write trees per cluster
                        if options.filename_output:
                            substitutions = {
                                "f": function,
                                "l": location,
                                "s": s,
                                "o": map_pos2species[x]
                            }
                            outfile = open(
                                options.filename_output % substitutions, "w")
                        else:
                            outfile = sys.stdout
                            outfile.write(
                                "location: %s, function: %s, species1: %s, species2: %s\n"
                                % (location, function, s, map_pos2species[x]))

                        ## only print out largest tree
                        last_cluster_id = None
                        for cluster_id, n, locations, tree in vv:
                            if cluster_id != last_cluster_id:
                                outfile.write("%s\t%s\t%s\n" %
                                              (cluster_id, locations, tree))
                                last_cluster_id = cluster_id

                        if options.filename_output:
                            outfile.close()

    # The lists and pairs methods only close a file when they open the next
    # one, so their last file is still open here.  Closing a file that was
    # already closed by another method is harmless.
    if options.filename_output and outfile is not None:
        outfile.close()

    E.Stop()

Ejemplo n.º 5

Mostrar archivo

Archivo: trees2trees.py Proyecto: yangjl/cgat

def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: trees2trees.py 2782 2009-09-10 11:40:29Z andreas $", usage = globals()["__doc__"])

    parser.add_option("-c", "--output-filename-map", dest="output_filename_map", type="string",
                      help="filename of map to output."  )

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("filter", "split"),
                      help="method to use: filter removed trees, while split writes them to individual files. DEFAULT=%default"  )

    parser.add_option("-d", "--output-pattern", dest="output_pattern", type="string",
                      help="filename pattern for output multiple alignment files."  )

    parser.add_option("--filter-terminal-max-length", dest="filter_max_length", type="float",
                      help="remove terminal branches with a branch length larger than this."  )

    parser.add_option("--filter-terminal-min-length", dest="filter_min_length", type="float",
                      help="remove any branches with a branch length smaller than this."  )

    parser.add_option("--filter-min-length", dest="filter_min_length", type="float",
                      help="remove terminal branches with a branch length smaller than this."  )

    parser.add_option("--filter-max-length", dest="filter_min_length", type="float",
                      help="remove any branches with a branch length smaller than this."  )

    parser.add_option("--filter-by-trees", dest="filter_by_trees", type="string", action="append",
                      help="mask branches according to trees. Give filenames with mask trees. These trees need to have the same names and structure as the input trees, but can be in any order."  )

    parser.add_option("--filter-by-monophyly", dest="filter_by_monophyly", type="string",
                      help="only retain trees where the given taxa are monphyletic. Supply taxa as a comma-separated list."  )

    parser.add_option("--min-support", dest="min_support", type="float",
                      help="for monophyly filtering, only accept trees with minimum support."  )

    parser.add_option("--filter-ntaxa", dest="filter_ntaxa", type="int", 
                      help="filter by number of taxa."  )

    parser.add_option("--filter-simple-orthologs", dest="filter_simple_orthologs", action="store_true", 
                      help="filter for trees for simple orhtologs. This works by counting the number of taxa."  )

    parser.add_option("--filter", dest="filter", type="choice",
                      choices=("taxa", "trees"),
                      help="filter removes taxa or whole trees." )

    parser.set_defaults(
        output_pattern="%s.tree",
        output_filename_map = None,
        filter_terminal_max_length = None,
        filter_terminal_min_length = None,
        filter_max_length = None,
        filter_min_length = None,
        method ="split",
        filter = "taxa",
        filtered_branch_length = -999,
        filter_by_trees = [],
        filter_by_monophyly = None,
        filter_ntaxa = None,
        filter_simple_orthologs = None,
        min_support = 0.0,
        regex_species = ("^([^|]+)" ),
        )

    (options, args) = E.Start( parser )

    nexus = TreeTools.Newick2Nexus( sys.stdin )
    
    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees from stdin.\n" % len(nexus.trees))

    ninput, noutput, nskipped = 0, 0, 0
    ndiscarded = 0
    ndiscarded_taxa = 0
    ndiscarded_branches = 0

    extract_species = lambda x: re.search( options.regex_species, x).groups()[0]
    
    if options.filter_by_trees:
        nexus_filter = []
        nexus_maps = []
        for filename in options.filter_by_trees:
            nexus_filter.append( TreeTools.Newick2Nexus( open( filename, "r") ) )
            trees = nexus_filter[-1].trees
            if options.loglevel >=1 :
                options.stdlog.write("# read %i trees for filtering from %s\n" % (len(trees), filename))

            nexus_map = {}
            for x in range( len(trees)):
                nexus_map[trees[x].name] = x
            nexus_maps.append( nexus_map )

    if options.filter_by_monophyly:
        monophyly_taxa = options.filter_by_monophyly.split(",")
        if len(monophyly_taxa) == 0:
            raise "please supply at least two taxa for the monophyly test."
            
    if options.output_filename_map:
        outfile_map = open(options.output_filename_map, "a" )
    else:
        outfile_map = None

    for tree in nexus.trees:

        ninput += 1
        id = tree.name
        has_discarded = False

        if options.filter_ntaxa != None:

            ntaxa = len(tree.get_terminals())
            if ntaxa != options.filter_ntaxa:
                if options.loglevel >= 2:
                    options.stdlog.write("# tree %s: removed because number of taxa (%i) different\n" % \
                                         (id, ntaxa ) )
                has_discarded = True
                
        if options.filter_simple_orthologs:
            ntaxa = len(tree.get_terminals())
            nspecies = len(set(map( lambda x: extract_species(tree.node(x).data.taxon), tree.get_terminals() )))
            if nspecies != ntaxa:
                if options.loglevel >= 2:
                    options.stdlog.write("# tree %s: removed because not a simple ortholog cluster: ntaxa!=nspecies (%i!=%i)\n" % \
                                             (id, ntaxa, nspecies ) )

                has_discarded = True

        if options.filter_terminal_max_length != None:
            for x in tree.get_terminals():
                node = tree.node(x)
                if node.data.branchlength >= options.filter_terminal_max_length:
                    has_discarded = True
                    ndiscarded_taxa += 1                    
                    tree.prune( node.data.taxon )
                    if options.loglevel >= 2:
                        options.stdlog.write("# tree %s: removed taxon %s because terminal branchlength to large: %s\n" % \
                                             (id, node.data.taxon, str(node.data.branchlength)) )

        if options.filter_terminal_min_length != None:
            for x in tree.get_terminals():
                node = tree.node(x)
                if node.data.branchlength <= options.filter_terminal_min_length:
                    has_discarded = True
                    ndiscarded_taxa += 1                    
                    tree.prune( node.data.taxon )
                    if options.loglevel >= 2:
                        options.stdlog.write("# tree %s: removed taxon %s because terminal branchlength to small: %s\n" % \
                                             (id, node.data.taxon, str(node.data.branchlength)) )
                    
        if options.filter_max_length != None:
            for x in tree.get_nodes(tree.root):
                if x == tree.root: continue                
                node = tree.node(x)
                if node.data.branchlength >= options.filter_max_length:
                    has_discarded = True
                    ndiscarded_branches += 1                    
                    if options.loglevel >= 2:
                        options.stdlog.write("# tree %s: removed branch %i because branchlength to large: %s\n" % \
                                             (id, x, tree.name, str(node.data.branchlength)) )
                    node.data.branchlength = options.filtered_branch_length
                    
        if options.filter_min_length != None:
            for x in tree.get_nodes(tree.root):
                if x == tree.root: continue
                node = tree.node(x)
                if node.data.branchlength <= options.filter_min_length:
                    has_discarded = True
                    ndiscarded_branches += 1
                    if options.loglevel >= 2:
                        options.stdlog.write("# tree %s: removed branch %i because internal branchlength too small: %s\n" % \
                                             (id, x, str(node.data.branchlength)) )
                    node.data.branchlength = options.filtered_branch_length
                    
        if options.filter_by_trees:
            found = []
            for y in range(len(nexus_maps)):
                if id in nexus_maps[y]:
                    found.append( (y, nexus_filter[y].trees[nexus_maps[y][id]]) )

            if not found:
                ndiscarded += 1
                continue

            for x in tree.get_nodes(tree.root):
                if x == tree.root: continue
                for y, other_tree in found:
                    other_node = other_tree.node( x )
                    if other_node.data.branchlength == options.filtered_branch_length:
                        node = tree.node(x)
                        if options.loglevel >= 2:
                            options.stdlog.write("# tree %s: removed branch %i because internal branchlength masked by tree %i:%s.\n" % \
                                                 (id, x, y, other_tree.name) )
                        
                        node.data.branchlength = options.filtered_branch_length
                        has_discarded = True
                        ndiscarded_branches += 1
                        break

        if options.filter_by_monophyly:

            terminals = set(map( lambda x: tree.node(x).data.taxon, tree.get_terminals()))
            
            for t in monophyly_taxa:
                if t not in terminals:
                    if options.loglevel >= 2:
                        options.stdlog.write( "taxon %s not in tree %s\n" % (t, tree.name))
                    nskipped += 1
            succ = tree.node(tree.root).succ
            ## use minimum support at root, if it is not the same (if trees
            ## are rooted)
            if len(succ) == 2:
                m = min( map( lambda x: tree.node(x).data.support, succ) )
                for x in succ:
                    tree.node(x).data.support = m
                
            if not TreeTools.IsMonophyleticForTaxa( tree, monophyly_taxa, support=options.min_support ):
                ndiscarded += 1
                continue
            
        if has_discarded:
            ndiscarded += 1
            if options.filter=="trees" or options.filter_ntaxa:
                continue

        if options.method == "split":

            output_filename = re.sub( "%s", id, options.output_pattern )

            dirname = os.path.dirname(output_filename)

            if dirname and not os.path.exists( dirname ):
                os.makedirs( dirname )

            if not os.path.exists( output_filename ):
                outfile = open(output_filename, "w" )
                outfile.write( TreeTools.Tree2Newick( tree ) + "\n" )
                noutput += 1
            else:
                if options.loglevel >= 1:
                    options.stdlog.write("# skipping because output for tree %s already exists: %s\n" % (id, output_filename))                        
                nskipped += 1
                continue

        elif options.method == "filter":
            options.stdout.write( ">%s\n%s\n" % (tree.name, TreeTools.Tree2Newick( tree )) )
            noutput += 1
            
        if outfile_map:
            for t in TreeTools.GetTaxa( tree ):
                outfile_map.write( "%s\t%s\n" % (t, id) )

    if outfile_map:
        outfile_map.close()

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i, with_discarded=%i, discarded_taxa=%i, discarded_branches=%i.\n" %\
                             (ninput, noutput, nskipped,
                              ndiscarded, ndiscarded_taxa, ndiscarded_branches))
        
    E.Stop()

Ejemplo n.º 6

Mostrar archivo

def main(argv=None):
    """Script entry point: collect node heights of gene trees read from stdin.

    Gene trees are read in Newick format from stdin.  Each tree is filtered
    with ``TreeReconciliation.filterTree``, optionally re-rooted with the
    configured outgroup species, and the distance to the root of every node
    in the analysis set is recorded per species and per tree, both as an
    absolute value and relative to a reference height.  Per-tree subtotals
    and overall totals are emitted through ``printCounts`` when requested.

    NOTE(review): *argv* falls back to ``sys.argv`` below but is not
    forwarded to ``E.Start``; confirm whether ``E.Start`` should receive it.

    Raises:
        ValueError: if filtering by location (or removal of unplaced
            predictions) is requested without ``--filename-locations``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/analyze_genetrees.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option(
        "-r",
        "--species-regex",
        dest="species_regex",
        type="string",
        help="regular expression to extractspecies from identifier.")

    parser.add_option(
        "--gene-regex",
        dest="gene_regex",
        type="string",
        help="regular expression to extract gene from identifier.")

    parser.add_option("--filename-filter-positives",
                      dest="filename_filter_positives",
                      type="string",
                      help="filename with positive list of trees to analyze.")

    parser.add_option("-s",
                      "--filename-species-tree",
                      dest="filename_species_tree",
                      type="string",
                      help="filename with species tree.")

    parser.add_option(
        "--filename-species2colour",
        dest="filename_species2colour",
        type="string",
        help=
        "filename with map of species to colours. If not given, random colours are assigned to species."
    )

    parser.add_option("-t",
                      "--species-tree",
                      dest="species_tree",
                      type="string",
                      help="species tree.")

    parser.add_option(
        "-e",
        "--filename-locations",
        dest="filename_locations",
        type="string",
        help=
        "filename with map of transcript information to location information.")

    parser.add_option("--no-create",
                      dest="create",
                      action="store_false",
                      help="do not create files, but append to them.")

    parser.add_option(
        "--max-separation",
        dest="max_separation",
        type="int",
        help=
        "maximum allowable separation between syntenic segments for border plot (set to 0, if syntey is enough)."
    )

    parser.add_option(
        "--filename-species2url",
        dest="filename_species2url",
        type="string",
        help="filename with mapping information of species to URL.")

    parser.add_option("--prefix",
                      dest="prefix",
                      type="string",
                      help="prefix to add as first column.")

    parser.add_option(
        "--outgroup-species",
        dest="outgroup_species",
        type="string",
        help="species to used as outgroups. Separate multiple species by ','.")

    parser.add_option("--subtrees-trees",
                      dest="subtrees_trees",
                      action="store_true",
                      help="write trees for subtrees.")

    parser.add_option("--subtrees-identifiers",
                      dest="subtrees_identifiers",
                      action="store_true",
                      help="write identifiers of subtrees.")

    parser.add_option("--svg-add-ids",
                      dest="svg_add_ids",
                      action="store_true",
                      help="add node ids to svg plot.")

    parser.add_option("--svg-otus",
                      dest="svg_otus",
                      type="string",
                      help="otus to output in svg species tree.")

    # The misspelt flag name is part of the command line interface and is
    # therefore kept as is.
    parser.add_option("--svg-branch-lenghts",
                      dest="svg_branch_lengths",
                      type="choice",
                      choices=("contemporary", "uniform", "median"),
                      help="branch lengths in species tree.")

    parser.add_option("--print-totals",
                      dest="print_totals",
                      action="store_true",
                      help="output totals sections.")

    parser.add_option("--print-subtotals",
                      dest="print_subtotals",
                      action="store_true",
                      help="output subtotals sections.")

    parser.add_option(
        "--print-best",
        dest="print_best",
        action="store_true",
        help="output best node assignment for each node in gene tree.")

    parser.add_option("--print-svg",
                      dest="print_svg",
                      action="store_true",
                      help="output svg files.")

    parser.add_option("--print-species-svg",
                      dest="print_species_svg",
                      action="store_true",
                      help="output species svg files.")

    parser.add_option(
        "--output-pattern",
        dest="output_pattern",
        type="string",
        help=
        """output pattern for separate output of sections [default: %default].
                       Set to None, if output to stdout. Can contain one %s to be substituted with section."""
    )

    parser.add_option(
        "--output-pattern-svg",
        dest="output_pattern_svg",
        type="string",
        help=
        "filename for svg output. If it contains %s, this is replaced by gene_tree name."
    )

    parser.add_option(
        "--filename-node-types",
        dest="filename_node_types",
        type="string",
        help="filename with node type information from a previous run.")

    parser.add_option("--analyze-resolution-data",
                      dest="analyze_resolution_data",
                      type="choice",
                      action="append",
                      choices=("stats", "histograms"),
                      help="stdin is resolution data.")

    parser.add_option("--filter-quality",
                      dest="filter_quality",
                      type="choice",
                      choices=("all", "genes", "pseudogenes"),
                      help="filter predictions by gene type.")

    parser.add_option("--filter-location",
                      dest="filter_location",
                      type="choice",
                      choices=("all", "local", "non-local", "cis", "unplaced"),
                      help="filter predictions by location.")

    parser.add_option("--remove-unplaced",
                      dest="remove_unplaced",
                      action="store_true",
                      help="remove predictions on unplaced contigs.")

    parser.add_option("--skip-without-outgroups",
                      dest="skip_without_outgroups",
                      action="store_true",
                      help="skip clusters without outgroups.")

    parser.set_defaults(
        filter_quality="all",
        filter_location="all",
        remove_unplaced=False,
        # raw strings: same patterns as before, without invalid "\|" escapes
        species_regex=r"^([^|]+)\|",
        gene_regex=r"^[^|]+\|[^|]+\|([^|]+)\|",
        filename_species_tree=None,
        priority={
            "Speciation": 0,
            "SpeciationDeletion": 1,
            "Transcripts": 2,
            "DuplicationLineage": 3,
            "Duplication": 4,
            "DuplicationDeletion": 5,
            "DuplicationInconsistency": 6,
            "Outparalogs": 7,
            "InconsistentTranscripts": 8,
            "Inconsistency": 9,
            "Masked": 10
        },
        species_tree=None,
        filename_species2colour=None,
        filename_locations=None,
        max_separation=0,
        filename_species2url=None,
        separator="|",
        prefix=None,
        output_pattern=None,
        output_pattern_svg=None,
        outgroup_species=None,
        svg_add_ids=False,
        svg_branch_lengths="median",
        svg_otus=None,
        subtrees=False,
        print_svg=False,
        print_subtotals=False,
        print_totals=False,
        print_best=False,
        subtrees_identifiers=False,
        create=True,
        min_branch_length=0.00,
        filename_node_types=None,
        format_branch_length="%6.4f",
        nodetypes_inconsistency=("InconsistentTranscripts", "Inconsistency"),
        analyze_resolution_data=None,
        warning_small_branch_length=0.01,
        filename_filter_positives=None,
        skip_without_outgroups=False,
    )

    (options, args) = E.Start(parser,
                              add_psql_options=True,
                              add_csv_options=True)

    # comma-separated lists become sets for fast membership tests
    if options.outgroup_species:
        options.outgroup_species = set(options.outgroup_species.split(","))

    if options.svg_otus:
        options.svg_otus = set(options.svg_otus.split(","))

    rx_species = re.compile(options.species_regex)
    extract_species = lambda x: rx_species.match(x).groups()[0]
    # extract_gene is not used further down in this function; the regex is
    # still compiled so that an invalid --gene-regex fails early.
    if options.gene_regex:
        rx_gene = re.compile(options.gene_regex)
        extract_gene = lambda x: rx_gene.match(x).groups()[0]
    else:
        extract_gene = None

    extract_quality = lambda x: x.split(options.separator)[3]

    #########################################################################
    # read positive list of malis
    #########################################################################
    if options.filename_filter_positives:
        with open(options.filename_filter_positives, "r") as infile:
            filter_positives, nerrors = IOTools.ReadList(infile)
        filter_positives = set(filter_positives)
    else:
        filter_positives = None

    #########################################################################
    # read location info
    #########################################################################
    if options.filename_locations:
        with open(options.filename_locations, "r") as infile:
            map_id2location = TreeReconciliation.readLocations(
                infile, extract_species)
    else:
        map_id2location = {}

    if (options.remove_unplaced or options.filter_location != "all"
        ) and not options.filename_locations:
        # string exceptions are not supported by Python >= 2.6; raise a real
        # exception so that the message reaches the user.
        raise ValueError("please supply a file with location information.")

    #########################################################################
    # delete output files
    #########################################################################
    if options.create and options.output_pattern:
        for section in ("details", "subtrees", "subids", "trees", "nodes",
                        "categories"):
            fn = options.output_pattern % section
            if os.path.exists(fn):
                if options.loglevel >= 1:
                    options.stdlog.write("# deleting file %s.\n" % fn)
                os.remove(fn)

    if options.loglevel >= 1:
        options.stdlog.write("# reading gene trees.\n")
        options.stdlog.flush()

    gene_nexus = TreeTools.Newick2Nexus(sys.stdin)

    Tree.updateNexus(gene_nexus)

    if options.loglevel >= 1:
        options.stdlog.write("# read %i gene trees from stdin.\n" %
                             len(gene_nexus.trees))
        options.stdlog.flush()

    #########################################################################
    # main loop over gene trees
    #########################################################################
    ninput, nfiltered, nskipped, noutput = 0, 0, 0, 0
    nskipped_filter, nskipped_outgroups = 0, 0

    # total counts
    total_heights_per_species = {}
    total_relheights_per_species = {}
    total_heights_per_tree = []
    total_relheights_per_tree = []

    for gene_tree in gene_nexus.trees:

        ninput += 1

        # strip tree/subtree decorations before checking the positive list
        xname = re.sub("_tree.*", "", gene_tree.name)
        xname = re.sub("subtree_", "", xname)

        if filter_positives and xname not in filter_positives:
            nskipped_filter += 1
            continue

        if options.loglevel >= 6:
            gene_tree.display()

        #######################################################################
        # get identifier for this tree and update prefixes accordingly
        #######################################################################
        # NOTE(review): inside this loop len(gene_nexus.trees) is always > 0,
        # so the else branches below are never taken; "> 1" may have been
        # intended. Left unchanged because it affects the output columns.
        if options.prefix:
            if len(gene_nexus.trees) > 0:
                prefix_header = "prefix1\tprefix2\t"
                prefix_row = options.prefix + "\t" + gene_tree.name + "\t"
            else:
                prefix_header = "prefix\t"
                prefix_row = options.prefix + "\t"
        else:
            if len(gene_nexus.trees) > 0:
                prefix_header = "prefix\t"
                prefix_row = gene_tree.name + "\t"
            else:
                prefix_header, prefix_row = "", ""

        #######################################################################
        # apply filters to gene tree
        #######################################################################
        TreeReconciliation.filterTree(gene_tree, options, map_id2location)

        otus = TreeTools.GetTaxa(gene_tree)

        if len(otus) <= 1:
            nfiltered += 1
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: empty after filtering - skipped.\n" %
                    gene_tree.name)
            continue

        # species present in this tree (built once, used for both checks)
        this_species = set([extract_species(x) for x in otus])

        # check, if only outgroups
        if options.outgroup_species:
            if not this_species.difference(options.outgroup_species):
                nfiltered += 1
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# tree %s: only outgroups after filtering - skipped.\n"
                        % gene_tree.name)
                continue

            if options.skip_without_outgroups and not \
                    this_species.intersection(options.outgroup_species):
                nskipped_outgroups += 1
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# tree %s: no outgroups - skipped.\n" %
                        gene_tree.name)
                continue

        #######################################################################
        # reroot gene tree, if outgroups have been given.
        #######################################################################
        if options.outgroup_species:
            TreeReconciliation.rerootTree(gene_tree, extract_species, options)

        #######################################################################
        # compute distance to root for each node
        #######################################################################
        distance_to_root = TreeTools.GetDistanceToRoot(gene_tree)

        #######################################################################
        # compute counts
        #######################################################################
        # heights per tree
        heights_per_tree = []
        # relative heights per tree
        relheights_per_tree = []
        # distance to root
        heights_per_species = {}
        # distance to root (relative to maximum distance to root)
        relheights_per_species = {}

        analysis_set, gene_set, pseudogene_set, other_set = TreeReconciliation.getAnalysisSets(
            gene_tree, extract_quality, options)

        if len(analysis_set) == 0:
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: empty analysis set - skipped.\n" %
                    gene_tree.name)
            nskipped += 1
            continue

        reference_height = TreeReconciliation.getReferenceHeight(
            distance_to_root,
            gene_tree,
            gene_set,
            options,
            extract_species,
            method="median")

        if reference_height is None:
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: reference height not computable or 0 - skipped.\n"
                    % gene_tree.name)
            nskipped += 1
            continue

        for node_id in analysis_set:

            node = gene_tree.node(node_id)
            species = extract_species(node.data.taxon)
            height = distance_to_root[node_id]

            if height < options.warning_small_branch_length:
                options.stdlog.write(
                    "# tree %s: small distance %s to root at node %i: %s\n" %
                    (gene_tree.name, options.format_branch_length % height,
                     node_id, node.data.taxon))

            relheight = height / reference_height
            try:
                heights_per_species[species].append(height)
            except KeyError:
                # first node seen for this species: create both lists
                heights_per_species[species] = [height]
                relheights_per_species[species] = []

            relheights_per_species[species].append(relheight)

            # do not use outgroup species
            if options.outgroup_species and species in options.outgroup_species:
                continue

            heights_per_tree.append(height)
            relheights_per_tree.append(relheight)

        if options.loglevel >= 1:
            options.stdlog.write(
                "# tree %s: reference_height=%s\n" %
                (gene_tree.name,
                 options.format_branch_length % reference_height))
            options.stdlog.flush()

        if options.print_subtotals:
            printCounts(heights_per_species, relheights_per_species,
                        heights_per_tree, relheights_per_tree, options,
                        prefix_header, prefix_row)

        #######################################################################
        # update total counts
        #######################################################################
        TreeReconciliation.appendCounts(total_heights_per_species,
                                        heights_per_species)
        TreeReconciliation.appendCounts(total_relheights_per_species,
                                        relheights_per_species)

        TreeReconciliation.appendCounts(total_heights_per_tree,
                                        heights_per_tree)
        TreeReconciliation.appendCounts(total_relheights_per_tree,
                                        relheights_per_tree)

        noutput += 1

    if options.print_totals:

        if options.prefix:
            prefix_header = "prefix1\tprefix2\t"
            prefix_row = options.prefix + "\t" + "total" + "\t"
        else:
            prefix_header = "prefix\t"
            prefix_row = "total" + "\t"

        printCounts(total_heights_per_species, total_relheights_per_species,
                    total_heights_per_tree, total_relheights_per_tree, options,
                    prefix_header, prefix_row)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, nfiltered=%i, nskipped=%i, nskipped_filter=%i, nskipped_outgroups=%i, noutput=%i\n"
            % (ninput, nfiltered, nskipped, nskipped_filter,
               nskipped_outgroups, noutput))

    E.Stop()

Ejemplo n.º 7

Mostrar archivo

    def run(self):
        """Execute the configured phylip program and collect its results.

        Calls ``prepareRun`` first, then starts ``self.mProgram`` inside
        ``self.mTempdir`` and feeds it ``self.mOptions`` (one per line) on
        stdin.  Afterwards either ``outtree`` or ``outfile`` in the temporary
        directory is parsed into a ``PhylipResult``.

        Returns:
            The populated ``PhylipResult`` instance.

        Raises:
            UsageError: if no program is set or the program exits with a
                non-zero return code.
            NotImplementedError: if the program produced neither ``outtree``
                nor ``outfile``.
        """

        self.prepareRun()

        if not self.mProgram:
            raise UsageError("no program specified.")

        # NOTE: the command is run through the shell, so self.mProgram must
        # come from a trusted source.
        s = subprocess.Popen("%s" % (self.mProgram),
                             shell=True,
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             cwd=self.mTempdir,
                             close_fds=True)

        (out, err) = s.communicate("\n".join(self.mOptions) + "\n")

        if s.returncode != 0:
            raise UsageError(
                "Error in running phylip.\n%s\n%s\nTemporary directory was %s"
                % (out, err, self.mTempdir))

        # Parse output files that might have been created:
        result = PhylipResult()

        tree_filename = "%s/outtree" % self.mTempdir
        out_filename = "%s/outfile" % self.mTempdir

        # parse tree file
        if os.path.exists(tree_filename):

            with open(tree_filename, "r") as infile:
                nexus = TreeTools.Newick2Nexus(infile)
            # translate the shortened phylip names back to the input names
            for tree in nexus.trees:
                TreeTools.MapTaxa(tree, self.mMapPhylip2Input)
            result.mNexus = nexus
            if self.mLogLevel >= 1:
                print("# received tree with %i taxa" % (len(
                    TreeTools.GetTaxa(nexus.trees[0]))))

        elif os.path.exists(out_filename):

            if self.mProgram in ("dnadist", "protdist"):
                with open(out_filename, "r") as infile:
                    result.mMatrix, row_headers, col_headers = MatlabTools.readMatrix(
                        infile, format="phylip")
                result.mRowHeaders = []
                for x in row_headers:
                    result.mRowHeaders.append(self.mMapPhylip2Input[x])
                # distance matrices are square: columns share the row labels
                result.mColHeaders = result.mRowHeaders
            elif self.mProgram == "contrast":

                with open(out_filename, "r") as infile:
                    result.parseContrasts(infile)

        else:
            # string exceptions are not supported by Python >= 2.6
            raise NotImplementedError("other return types not implemented")

        if self.mLogLevel >= 2:
            print(out)

        # the temporary directory is kept for inspection unless logging is off
        if self.mLogLevel == 0:
            shutil.rmtree(self.mTempdir)

        return result

Ejemplo n.º 8

Mostrar archivo

    def prepareRun(self):

        self.__reset()

        self.mTempdir = tempfile.mkdtemp()
        # self.mTempdir = "tmp"
        if not os.path.exists(self.mTempdir):
            os.mkdir(self.mTempdir)

        if self.mInputMatrix and self.mInputData:
            raise ValueError(
                "please specify either input matrix or input data, but not both."
            )

        # prepare input matrix. Should already be in phylip like
        # format, but long identifiers are shortened and tabs are
        # replaced by spaces.
        if self.mInputMatrix:

            outfile = open(self.mTempdir + "/infile", "w")

            identifiers = map(lambda x: re.split("\s+", x[:-1])[0],
                              self.mInputMatrix[1:])
            self.updateMaps(identifiers)

            outfile.write(self.mInputMatrix[0])
            for line in self.mInputMatrix[1:]:
                data = re.split("\s+", line[:-1])
                new_line = self.mMapInput2Phylip[
                    data[0]] + "       " + "  ".join(data[1:])
                outfile.write(new_line + "\n")

            outfile.close()

            if self.mLogLevel >= 1:
                print "# written input matrix with %i taxa to %s" % (
                    len(identifiers), self.mTempdir + "/infile")
                os.system("cat %s" % self.mTempdir + "/infile")

        elif self.mInputData:

            outfile = open(self.mTempdir + "/infile", "w")
            outfile.write("%i %i\n" %
                          (len(self.mInputData), len(self.mInputData[0]) - 1))
            identifiers = map(lambda x: x[0], self.mInputData)
            self.updateMaps(identifiers)

            for x in range(len(identifiers)):
                outfile.write("%-10s %s\n" %
                              (self.mMapInput2Phylip[identifiers[x]], " ".join(
                                  self.mInputData[x][1:])))

            outfile.close()

            if self.mLogLevel >= 1:
                print "# written input matrix with %i taxa to %s" % (
                    len(identifiers), self.mTempdir + "/infile")
                os.system("cat %s" % self.mTempdir + "/infile")

        # prepare input tree or trees
        self.mNInputTrees = 0
        if self.mInputTree or self.mInputTrees:

            outfile = open(self.mTempdir + "/intree", "w")

            if self.mInputTree and self.mInputTrees:
                raise UsageError(
                    "please supply either one or mupltiple trees, but not both."
                )

            if self.mInputTree:
                trees = [self.mInputTree]
            else:
                trees = self.mInputTrees

            for tree in trees:
                if self.mPruneTree:
                    taxa = self.mMapInput2Phylip.keys()
                    TreeTools.PruneTree(tree, taxa)

                taxa = TreeTools.GetTaxa(tree)
                self.updateMaps(taxa)
                TreeTools.MapTaxa(tree, self.mMapInput2Phylip)

                # check if taxa are unique
                taxa = tree.get_taxa()
                staxa = set()

                skip = False
                for t in taxa:
                    if t in staxa:
                        if self.mLogLevel >= 1:
                            print "# skipping tree %s because of duplicate taxa." % (
                                tree.name)
                        skip = True
                    staxa.add(t)

                if skip:
                    continue

                outfile.write(TreeTools.Tree2Newick(tree) + "\n")
                self.mNInputTrees += 1

                if self.mLogLevel >= 1:
                    print "# written input tree with %i taxa to %s" % (len(
                        TreeTools.GetTaxa(tree)), self.mTempdir + "/intree")
                    print "#", TreeTools.Tree2Newick(tree)

            outfile.close()

        # prepare input multiple alignment
        if self.mInputMali:

            if self.mInputMatrix:
                raise "both mali and matrix supplied - infile conflict."

            outfile = open(self.mTempdir + "/infile", "w")

            identifiers = self.mInputMali.getIdentifiers()
            self.updateMaps(identifiers)
            self.mInputMali.mapIdentifiers(self.mMapInput2Phylip)
            self.mInputMali.writeToFile(outfile, format="phylip")

            outfile.close()

            if self.mLogLevel >= 1:
                print "# written input multiple alignments with %i taxa and with %i to %s" %\
                      (self.mInputMali.getLength(),
                       self.mInputMali.getWidth(), self.mTempdir + "/intree")