def rerootTree(gene_tree, extract_species, options):
    """Reroot *gene_tree* in place on the taxa that belong to outgroup species.

    All taxa of the tree whose species (as returned by *extract_species*)
    is contained in ``options.outgroup_species`` are collected.  If
    ``gene_tree.is_monophyletic`` reports a true value for them, all of
    them are used as outgroup, otherwise only the first one is used.

    If no outgroup taxon is present, the tree is left untouched, which is
    what the corresponding log message announces.

    Arguments
    ---------
    gene_tree :
        tree object providing ``name``, ``is_monophyletic``,
        ``root_with_outgroup`` and ``display``.
    extract_species : callable
        maps a taxon name to its species.
    options :
        object providing ``outgroup_species``, ``loglevel`` and ``stdlog``.

    Raises
    ------
    ValueError
        if *extract_species* fails with an AttributeError on a taxon.
    """
    otus = TreeTools.GetTaxa(gene_tree)

    # Collect outgroup taxa as a real list: the result is indexed, measured
    # and joined below, which a lazy filter object does not support.
    try:
        outgroup_taxa = [
            x for x in otus
            if extract_species(x) in options.outgroup_species]
    except AttributeError:
        # String exceptions are not valid; raise a proper exception that
        # carries the same message.
        raise ValueError(
            "error while rerooting tree in tree %s with %s" % (
                gene_tree.name, str(otus)))

    if not outgroup_taxa:
        # Nothing to root with; avoids indexing into an empty list below.
        r = []
    elif gene_tree.is_monophyletic(outgroup_taxa):
        # NOTE(review): this is a plain truth test. If is_monophyletic
        # signals failure with -1 (as Bio.Nexus.Trees does) instead of a
        # false value, the fallback branch is rarely taken - confirm.
        r = outgroup_taxa
    else:
        r = [outgroup_taxa[0]]

    if r:
        if options.loglevel >= 1:
            options.stdlog.write(
                "# tree %s: rerooting with %i outgroups: %s.\n" % (
                    gene_tree.name, len(r), ",".join(r)))
            options.stdlog.flush()
        gene_tree.root_with_outgroup(r)
    else:
        if options.loglevel >= 1:
            options.stdlog.write(
                "# tree %s: no outgroup found, tree will not be rerooted.\n"
                % gene_tree.name)
            options.stdlog.flush()

    if options.loglevel >= 5:
        gene_tree.display()
def main(argv=None):
    """Write the taxa of all trees read from stdin as a table.

    Command line options are taken from sys.argv, unless *argv* is given.

    With a single input tree only a ``taxon`` column is written.  With
    several trees, the running tree number is added and - unless
    ``--skip-trees`` is set - the tree name as a third column.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: tree2taxa.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--skip-trees", dest="skip_trees", action="store_true",
        help="do not output tree names in third field [default=%default].")

    parser.set_defaults(skip_trees=False)

    (options, args) = E.Start(parser, add_pipe_options=True)

    nexus = TreeTools.Newick2Nexus(sys.stdin)
    ntotal = len(nexus.trees)

    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees from stdin.\n" % ntotal)

    single_tree = ntotal == 1

    # the header depends on the number of columns that will be written
    if single_tree:
        header = "taxon\n"
    elif options.skip_trees:
        header = "taxon\ttree\n"
    else:
        header = "taxon\ttree\tname\n"
    options.stdout.write(header)

    # trees are numbered starting from 1
    for tree_number, current in enumerate(nexus.trees, 1):
        for taxon in TreeTools.GetTaxa(current):
            if single_tree:
                row = "%s\n" % (taxon)
            elif options.skip_trees:
                row = "%s\t%i\n" % (taxon, tree_number)
            else:
                row = "%s\t%i\t%s\n" % (taxon, tree_number, current.name)
            options.stdout.write(row)

    if options.loglevel >= 1:
        options.stdlog.write("# ntotal=%i\n" % (ntotal))

    E.Stop()
def filterTree(tree, options, map_id2location=None):
    """Apply the location filter to *tree*, pruning it in place.

    If ``options.remove_unplaced`` is set, every taxon whose location
    (``map_id2location[taxon].mShortName``, lower-cased) is listed in
    ``MAP_CONTIG2JUNK`` is removed from the tree.  Taxa without a known
    location are reported (loglevel >= 1) and kept.

    NOTE(review): the previous docstring stated that outgroups are never
    removed; nothing in this function checks for outgroups.

    Arguments
    ---------
    tree :
        tree to filter; modified through ``TreeTools.PruneTree``.
    options :
        object providing ``remove_unplaced``, ``loglevel`` and ``stdlog``.
    map_id2location : dict, optional
        maps taxon to a location object.  ``None`` is treated as an empty
        map, so every taxon is reported as having an unknown location.
    """
    if map_id2location is None:
        # The default used to crash on the membership test below.
        map_id2location = {}

    otus = TreeTools.GetTaxa(tree)
    to_remove = set()

    if options.remove_unplaced:
        for otu in otus:
            if otu not in map_id2location:
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# WARNING: unknown location for id %s.\n" % otu)
                continue

            if map_id2location[otu].mShortName.lower() in MAP_CONTIG2JUNK:
                to_remove.add(otu)

        if options.loglevel >= 3:
            options.stdlog.write(
                "# tree %s: removing %i entries because of location: %s\n" %
                (tree.name, len(to_remove), ";".join(to_remove)))

    new_otus = list(set(otus).difference(to_remove))

    # only touch the tree if the set of taxa actually changed
    if len(new_otus) != len(otus):
        TreeTools.PruneTree(tree, new_otus, keep_distance_to_root=True)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# tree %s: filtering: before=%i, remove=%i, after=%i, final=%i\n" %
            (tree.name, len(otus), len(to_remove), len(new_otus),
             len(TreeTools.GetTaxa(tree))))
        options.stdlog.flush()
def main(argv=None):
    """Script main: summarize duplication events read from stdin.

    Parses command line options in sys.argv, unless *argv* is given.

    The input is a tab-separated table.  Lines starting with ``#`` are
    ignored and, if ``options.header`` is set, the first remaining line is
    dropped as header.  Depending on ``--method`` the data is written as:

    counts, medians
        species x species matrix of the number / median height of
        duplications.
    lists, lists-union
        genes found in the trees of duplications per species pair (or per
        first species).
    hists, fit-decay
        histograms of heights per species, or a fit to each histogram.
    pairs
        all duplications per species pair.
    links
        one tree per cluster and species pair.

    If ``--filename-output`` is given it is used as a ``%(key)s`` pattern
    with the keys ``f`` (function), ``l`` (location), ``s`` (species) and
    ``o`` (other species), depending on the method.  Otherwise output goes
    to stdout and each section is preceded by a title line.

    Raises
    ------
    ValueError
        if a method that indexes results by species is selected, but no
        species has been supplied.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/analyze_duplications.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("-s", "--species", dest="species", type="string",
                      help="species to use.")

    parser.add_option("-p", "--prefix", dest="prefix", type="string",
                      help="prefix to use for temporary files.")

    parser.add_option("-m", "--method", dest="method", type="string",
                      help="method to use [counts|medians|lists|lists-union|"
                      "hists|fit-decay|pairs|links].")

    parser.add_option("-o", "--filename-output", dest="filename_output",
                      type="string",
                      help="output filename.")

    parser.add_option("-f", "--functions", dest="functions", type="string",
                      help="functions to grep [functional|pseudo|all].")

    parser.add_option("-l", "--locations", dest="locations", type="string",
                      help="locations to grep [local|nojunk|all|...].")

    # The bin size is used as numeric increment (default 1.0), so it has to
    # be parsed as a number and not kept as a string.
    parser.add_option("-b", "--bin-size", dest="bin_size", type="float",
                      help="bin size.")

    parser.add_option("-i", "--fit", dest="fit", type="string",
                      help="fitting method [decay|power]")

    parser.add_option("--min-value", dest="min_value", type="float",
                      help="minimum value for histogram.")

    parser.add_option("--max-value", dest="max_value", type="float",
                      help="maximum value for histogram.")

    parser.add_option("--use-relative-height", dest="use_relative_height",
                      action="store_true",
                      help="use relative height values.")

    parser.add_option(
        "--reverse", dest="reverse", action="store_true",
        help="reverse species. "
        "Histograms will show the age of duplications for duplicates in "
        "other genomes.")

    parser.set_defaults(species="",
                        functions="functional,pseudo,all",
                        locations="local,nojunk,all",
                        filename_output=None,
                        bin_size=1.0,
                        min_value=None,
                        max_value=None,
                        nonnull=None,
                        use_relative_height=False,
                        header=True,
                        fit=None,
                        reverse=False,
                        method="counts")

    (options, args) = E.Start(parser, add_psql_options=True)

    options.species = options.species.split(",")
    options.locations = options.locations.split(",")
    options.functions = options.functions.split(",")

    # "".split(",") yields [""], so testing the length never detects a
    # missing species list.  Methods that do not index by species (lists,
    # lists-union, pairs) keep working without one.
    species_required = options.method not in ("lists", "lists-union", "pairs")
    if species_required and not any(options.species):
        raise ValueError("please supply list of species.")

    # NOTE(review): the connection is opened but not used below; kept
    # because connecting is an observable side effect - confirm it is needed.
    dbhandle = pgdb.connect(options.psql_connection)

    # A real list is needed, as rows are deleted and replaced below.
    input_data = [line.rstrip("\n").split("\t")
                  for line in sys.stdin.readlines()
                  if not line.startswith("#")]

    # remove header
    if options.header and input_data:
        del input_data[0]

    # decide which columns to take
    # 1st column: species1: this is the species in which duplications have occured.
    # 2nd column: species2: this is the species with respect to which duplications occured.
    # 3rd column: clusterid
    # 4th column: chromosomes
    # 5th column: function
    # 6th column: height
    # 7th column: relative height
    # 8th column: locations
    # 9th column: tree
    if options.use_relative_height:
        take = (0, 1, 2, 3, 4, 6, 7, 8)
    else:
        take = (0, 1, 2, 3, 4, 5, 7, 8)

    input_data = [tuple([row[y] for y in take]) for row in input_data]

    nspecies = len(options.species)
    map_pos2species = list(options.species)
    map_species2pos = {}
    for pos, species in enumerate(options.species):
        map_species2pos[species] = pos

    outfile = None

    if options.method in ("counts", "medians"):

        if options.method == "counts":
            func = len
            number_format = "%i"
        else:
            func = numpy.median
            number_format = "%6.4f"

        for location in options.locations:
            for function in options.functions:

                # numpy.Float is an alias from old numpy releases; the
                # builtin float selects the same dtype.
                matrix = numpy.zeros((nspecies, nspecies), float)

                data = GetSubset(input_data, location, function)
                # sort by species1 and species2
                data.sort()

                last_species1, last_species2 = None, None
                values = []
                for species1, species2, cluster_id, l, f, height, locations, tree in data:
                    if last_species1 != species1 or last_species2 != species2:
                        # species pair changed: store the previous pair
                        if len(values) > 0:
                            matrix[map_species2pos[last_species1],
                                   map_species2pos[last_species2]] = func(values)
                        values = []
                        last_species1 = species1
                        last_species2 = species2

                    values.append(float(height))

                # store the last species pair
                if len(values) > 0:
                    matrix[map_species2pos[last_species1],
                           map_species2pos[last_species2]] = func(values)

                if options.filename_output:
                    keys = {"f": function, "l": location}
                    outfile = open(options.filename_output % keys, "w")
                else:
                    outfile = sys.stdout
                    outfile.write(
                        "matrix for method %s: location: %s, function: %s\n" %
                        (options.method, location, function))

                MatlabTools.WriteMatrix(matrix,
                                        outfile=outfile,
                                        format=number_format,
                                        row_headers=options.species,
                                        col_headers=options.species)

                if options.filename_output:
                    outfile.close()

    elif options.method in ("lists", "lists-union"):
        # write lists of duplicated genes in species1 as compared to species2
        # according to location/function
        # First field : gene name
        # Second field: cluster id
        # Third field : number of other genes in cluster
        # Fourth field: location of gene
        written = {}
        for location in options.locations:
            for function in options.functions:

                data = GetSubset(input_data, location, function)
                # sort by species1 and species2
                data.sort()

                last_species1, last_species2 = None, None

                for species1, species2, cluster_id, l, f, height, locations, tree in data:

                    if last_species1 != species1 or last_species2 != species2:

                        # start a new section/file; genes are only written
                        # once per section, hence written is reset.
                        if options.filename_output:
                            if options.method == "lists":
                                if outfile:
                                    outfile.close()
                                keys = {"f": function, "l": location,
                                        "s": species1, "o": species2}
                                written = {}
                                outfile = open(
                                    options.filename_output % keys, "w")
                            elif options.method == "lists-union":
                                if last_species1 != species1:
                                    if outfile:
                                        outfile.close()
                                    keys = {"f": function, "l": location,
                                            "s": species1}
                                    written = {}
                                    outfile = open(
                                        options.filename_output % keys, "w")
                        else:
                            outfile = sys.stdout
                            if options.method == "lists":
                                outfile.write(
                                    "location: %s, function: %s, species1: %s, species2: %s\n" %
                                    (location, function, species1, species2))
                                written = {}
                            elif options.method == "lists-union":
                                if last_species1 != species1:
                                    outfile.write(
                                        "location: %s, function: %s, species1: %s\n" %
                                        (location, function, species1))
                                    written = {}

                        last_species1 = species1
                        last_species2 = species2

                    # get tree
                    tt = TreeTools.Newick2Tree(tree)
                    taxa = TreeTools.GetTaxa(tt)
                    for t in taxa:
                        if t in written:
                            continue
                        outfile.write("%s\t%s\t%i\n" %
                                      (t, cluster_id, len(taxa)))
                        written[t] = 1

        # the file of the last section is still open
        if options.filename_output and outfile:
            outfile.close()

    elif options.method in ("hists", "fit-decay"):

        for location in options.locations:
            for function in options.functions:

                values = [[[] for y in range(nspecies)]
                          for x in range(nspecies)]

                data = GetSubset(input_data, location, function)
                data.sort()

                # convert to matrix of list
                # values[x][y] contains heights of duplications in species x
                # with reference to y
                for species1, species2, cluster_id, l, f, height, locations, tree in data:
                    try:
                        values[map_species2pos[species1]][
                            map_species2pos[species2]].append(float(height))
                    except KeyError:
                        # species that were not requested are ignored
                        continue

                # calculate histograms per species
                for s in options.species:
                    histograms = []
                    headers = []

                    if options.filename_output:
                        keys = {"f": function, "l": location, "s": s}
                        outfile = open(options.filename_output % keys, "w")
                    else:
                        outfile = sys.stdout
                        outfile.write("location: %s, function: %s\n" %
                                      (location, function))

                    for x in range(nspecies):

                        if options.reverse:
                            # duplications in species x
                            vv = values[x][map_species2pos[s]]
                        else:
                            # duplications in species s
                            vv = values[map_species2pos[s]][x]

                        if len(vv) == 0:
                            continue

                        headers.append(options.species[x])
                        h = Histogram.Calculate(vv,
                                                increment=options.bin_size,
                                                min_value=options.min_value,
                                                max_value=options.max_value,
                                                no_empty_bins=True)

                        if options.method == "fit-decay":
                            result = fit(h, [2.0, -1.0])
                            if result:
                                outfile.write(
                                    "%s\t%s\t%s\t%i\t%f\t%f\ty = %f * exp ( %f * x )\n" % (
                                        "function",
                                        s,
                                        options.species[x],
                                        h[0][1],
                                        result[0],
                                        result[1],
                                        result[0],
                                        result[1],
                                    ))
                        elif options.method == "hists":
                            histograms.append(h)

                    if options.method == "hists":
                        combined_histogram = Histogram.Combine(
                            histograms, missing_value="-")
                        outfile.write("bin\t" + "\t".join(headers) + "\n")
                        Histogram.Write(outfile, combined_histogram)

                    if options.filename_output:
                        outfile.close()
                    else:
                        outfile.flush()

    elif options.method == "pairs":

        # get branches with 0 branchlength
        for location in options.locations:
            if options.loglevel >= 2:
                options.stdlog.write("# processing location %s\n" % location)

            for function in options.functions:
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# processing function %s " % function)
                    options.stdlog.flush()

                data = GetSubset(input_data, location, function)

                if options.loglevel >= 2:
                    options.stdlog.write("with %i data points\n" % len(data))
                    options.stdlog.flush()

                data.sort()
                last_species1, last_species2 = None, None

                for species1, species2, cluster_id, l, f, height, locations, tree in data:

                    if last_species1 != species1 or last_species2 != species2:

                        # start a new section/file per species pair
                        if options.filename_output:
                            if outfile:
                                outfile.close()
                            keys = {"f": function, "l": location,
                                    "s": species1, "o": species2}
                            outfile = open(
                                options.filename_output % keys, "w")
                        else:
                            outfile = sys.stdout
                            outfile.write(
                                "location: %s, function: %s, species1: %s, species2: %s\n" %
                                (location, function, species1, species2))

                        last_species1 = species1
                        last_species2 = species2

                    outfile.write("%s\t%s\t%s\t%s\n" %
                                  (cluster_id, height, locations, tree))

        # the file of the last section is still open
        if options.filename_output and outfile:
            outfile.close()

    elif options.method == "links":

        # write a tree for each species pair:
        # each node is a gene+location, the weight of the vertex is the height
        # further info added: cluster_id for the duplication
        for location in options.locations:
            if options.loglevel >= 2:
                options.stdlog.write("# processing location %s\n" % location)

            for function in options.functions:
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# processing function %s " % function)
                    options.stdlog.flush()

                data = GetSubset(input_data, location, function)

                if options.loglevel >= 2:
                    options.stdlog.write("with %i data points\n" % len(data))
                    options.stdlog.flush()

                # stores duplications within first species as compared to
                # second species
                values = [[[] for y in range(nspecies)]
                          for x in range(nspecies)]

                for species1, species2, cluster_id, l, f, height, locations, tree in data:
                    # -len(locations) makes longer location strings sort
                    # first within a cluster
                    values[map_species2pos[species1]][
                        map_species2pos[species2]].append(
                            (cluster_id, -len(locations), locations, tree))

                # get links per species
                for s in options.species:
                    if options.loglevel >= 2:
                        options.stdlog.write("# processing species %s\n" % s)

                    for x in range(nspecies):
                        if map_pos2species[x] == s:
                            continue

                        vv = values[map_species2pos[s]][x]
                        vv.sort()

                        if options.filename_output:
                            keys = {"f": function, "l": location,
                                    "s": s, "o": map_pos2species[x]}
                            outfile = open(
                                options.filename_output % keys, "w")
                        else:
                            outfile = sys.stdout
                            outfile.write(
                                "location: %s, function: %s, species1: %s, species2: %s\n" %
                                (location, function, s, map_pos2species[x]))

                        # only print out largest tree, i.e. the first
                        # entry of each cluster after sorting
                        last_cluster_id = None
                        for cluster_id, n, locations, tree in vv:
                            if cluster_id != last_cluster_id:
                                outfile.write("%s\t%s\t%s\n" %
                                              (cluster_id, locations, tree))
                                last_cluster_id = cluster_id

                        if options.filename_output:
                            outfile.close()

    E.Stop()
def main(argv=None):
    """Script main: filter or split the trees read from stdin.

    Parses command line options in sys.argv, unless *argv* is given.

    Each tree is passed through the filters that have been switched on
    (number of taxa, simple orthologs, branch lengths, mask trees,
    monophyly).  Terminal branch filters prune taxa, the other branch
    length filters mask branches by setting their length to
    ``options.filtered_branch_length``.  Trees that pass are written to
    individual files (``--method=split``) or to stdout
    (``--method=filter``).  If ``--output-filename-map`` is given, a map
    of taxon to tree name is appended to that file.

    Raises
    ------
    ValueError
        if ``--filter-by-monophyly`` contains no taxon name.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: trees2trees.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-c", "--output-filename-map",
                      dest="output_filename_map", type="string",
                      help="filename of map to output.")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("filter", "split"),
                      help="method to use: filter removed trees, while split writes them to individual files. DEFAULT=%default")

    parser.add_option("-d", "--output-pattern", dest="output_pattern",
                      type="string",
                      help="filename pattern for output multiple alignment files.")

    # Each of the four length options stores into the destination matching
    # its name. They used to share copy-pasted destinations, so the values
    # tested below as filter_terminal_* and filter_max_length could never
    # be set from the command line.
    parser.add_option("--filter-terminal-max-length",
                      dest="filter_terminal_max_length", type="float",
                      help="remove terminal branches with a branch length larger than this.")

    parser.add_option("--filter-terminal-min-length",
                      dest="filter_terminal_min_length", type="float",
                      help="remove terminal branches with a branch length smaller than this.")

    parser.add_option("--filter-min-length",
                      dest="filter_min_length", type="float",
                      help="remove any branches with a branch length smaller than this.")

    parser.add_option("--filter-max-length",
                      dest="filter_max_length", type="float",
                      help="remove any branches with a branch length larger than this.")

    parser.add_option("--filter-by-trees", dest="filter_by_trees",
                      type="string", action="append",
                      help="mask branches according to trees. Give filenames with mask trees. These trees need to have the same names and structure as the input trees, but can be in any order.")

    parser.add_option("--filter-by-monophyly", dest="filter_by_monophyly",
                      type="string",
                      help="only retain trees where the given taxa are monphyletic. Supply taxa as a comma-separated list.")

    parser.add_option("--min-support", dest="min_support", type="float",
                      help="for monophyly filtering, only accept trees with minimum support.")

    parser.add_option("--filter-ntaxa", dest="filter_ntaxa", type="int",
                      help="filter by number of taxa.")

    parser.add_option("--filter-simple-orthologs",
                      dest="filter_simple_orthologs", action="store_true",
                      help="filter for trees for simple orhtologs. This works by counting the number of taxa.")

    parser.add_option("--filter", dest="filter", type="choice",
                      choices=("taxa", "trees"),
                      help="filter removes taxa or whole trees.")

    parser.set_defaults(
        output_pattern="%s.tree",
        output_filename_map=None,
        filter_terminal_max_length=None,
        filter_terminal_min_length=None,
        filter_max_length=None,
        filter_min_length=None,
        method="split",
        filter="taxa",
        filtered_branch_length=-999,
        filter_by_trees=[],
        filter_by_monophyly=None,
        filter_ntaxa=None,
        filter_simple_orthologs=None,
        min_support=0.0,
        regex_species=("^([^|]+)"),
    )

    (options, args) = E.Start(parser)

    nexus = TreeTools.Newick2Nexus(sys.stdin)

    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees from stdin.\n" %
                             len(nexus.trees))

    ninput, noutput, nskipped = 0, 0, 0
    ndiscarded = 0
    ndiscarded_taxa = 0
    ndiscarded_branches = 0

    def extract_species(taxon):
        # species is the first group of regex_species
        return re.search(options.regex_species, taxon).groups()[0]

    if options.filter_by_trees:
        # nexus_filter[i] holds the mask trees of the i-th file and
        # nexus_maps[i] maps tree name to position within that file
        nexus_filter = []
        nexus_maps = []
        for filename in options.filter_by_trees:
            with open(filename, "r") as infile:
                nexus_filter.append(TreeTools.Newick2Nexus(infile))
                trees = nexus_filter[-1].trees
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# read %i trees for filtering from %s\n" %
                        (len(trees), filename))

                nexus_map = {}
                for pos, mask_tree in enumerate(trees):
                    nexus_map[mask_tree.name] = pos
                nexus_maps.append(nexus_map)

    if options.filter_by_monophyly:
        monophyly_taxa = options.filter_by_monophyly.split(",")
        # split() never returns an empty list, so test for actual names.
        # NOTE(review): the message asks for two taxa; a single taxon is
        # still accepted, as it was before - confirm the intended minimum.
        if not any(monophyly_taxa):
            raise ValueError(
                "please supply at least two taxa for the monophyly test.")

    if options.output_filename_map:
        outfile_map = open(options.output_filename_map, "a")
    else:
        outfile_map = None

    try:
        for tree in nexus.trees:

            ninput += 1
            tree_id = tree.name
            has_discarded = False

            if options.filter_ntaxa is not None:
                ntaxa = len(tree.get_terminals())
                if ntaxa != options.filter_ntaxa:
                    if options.loglevel >= 2:
                        options.stdlog.write(
                            "# tree %s: removed because number of taxa (%i) different\n" %
                            (tree_id, ntaxa))
                    has_discarded = True

            if options.filter_simple_orthologs:
                ntaxa = len(tree.get_terminals())
                nspecies = len(set(
                    extract_species(tree.node(x).data.taxon)
                    for x in tree.get_terminals()))
                if nspecies != ntaxa:
                    if options.loglevel >= 2:
                        options.stdlog.write(
                            "# tree %s: removed because not a simple ortholog cluster: ntaxa!=nspecies (%i!=%i)\n" %
                            (tree_id, ntaxa, nspecies))
                    has_discarded = True

            if options.filter_terminal_max_length is not None:
                for x in tree.get_terminals():
                    node = tree.node(x)
                    if node.data.branchlength >= options.filter_terminal_max_length:
                        has_discarded = True
                        ndiscarded_taxa += 1
                        tree.prune(node.data.taxon)
                        if options.loglevel >= 2:
                            options.stdlog.write(
                                "# tree %s: removed taxon %s because terminal branchlength to large: %s\n" %
                                (tree_id, node.data.taxon,
                                 str(node.data.branchlength)))

            if options.filter_terminal_min_length is not None:
                for x in tree.get_terminals():
                    node = tree.node(x)
                    if node.data.branchlength <= options.filter_terminal_min_length:
                        has_discarded = True
                        ndiscarded_taxa += 1
                        tree.prune(node.data.taxon)
                        if options.loglevel >= 2:
                            options.stdlog.write(
                                "# tree %s: removed taxon %s because terminal branchlength to small: %s\n" %
                                (tree_id, node.data.taxon,
                                 str(node.data.branchlength)))

            if options.filter_max_length is not None:
                for x in tree.get_nodes(tree.root):
                    if x == tree.root:
                        continue
                    node = tree.node(x)
                    if node.data.branchlength >= options.filter_max_length:
                        has_discarded = True
                        ndiscarded_branches += 1
                        if options.loglevel >= 2:
                            # three placeholders, three arguments (an
                            # additional tree.name used to break the
                            # formatting)
                            options.stdlog.write(
                                "# tree %s: removed branch %i because branchlength to large: %s\n" %
                                (tree_id, x, str(node.data.branchlength)))
                        # log first, so that the original length is shown
                        node.data.branchlength = options.filtered_branch_length

            if options.filter_min_length is not None:
                for x in tree.get_nodes(tree.root):
                    if x == tree.root:
                        continue
                    node = tree.node(x)
                    if node.data.branchlength <= options.filter_min_length:
                        has_discarded = True
                        ndiscarded_branches += 1
                        if options.loglevel >= 2:
                            options.stdlog.write(
                                "# tree %s: removed branch %i because internal branchlength too small: %s\n" %
                                (tree_id, x, str(node.data.branchlength)))
                        node.data.branchlength = options.filtered_branch_length

            if options.filter_by_trees:
                # collect the mask trees with the same name as this tree
                found = []
                for y, nexus_map in enumerate(nexus_maps):
                    if tree_id in nexus_map:
                        found.append(
                            (y, nexus_filter[y].trees[nexus_map[tree_id]]))

                if not found:
                    ndiscarded += 1
                    continue

                for x in tree.get_nodes(tree.root):
                    if x == tree.root:
                        continue
                    for y, other_tree in found:
                        other_node = other_tree.node(x)
                        if other_node.data.branchlength == options.filtered_branch_length:
                            node = tree.node(x)
                            if options.loglevel >= 2:
                                options.stdlog.write(
                                    "# tree %s: removed branch %i because internal branchlength masked by tree %i:%s.\n" %
                                    (tree_id, x, y, other_tree.name))
                            node.data.branchlength = options.filtered_branch_length
                            has_discarded = True
                            ndiscarded_branches += 1
                            # one masking tree is sufficient
                            break

            if options.filter_by_monophyly:
                terminals = set(tree.node(x).data.taxon
                                for x in tree.get_terminals())

                for t in monophyly_taxa:
                    if t not in terminals:
                        if options.loglevel >= 2:
                            options.stdlog.write(
                                "taxon %s not in tree %s\n" % (t, tree.name))
                        nskipped += 1

                succ = tree.node(tree.root).succ
                # use minimum support at root, if it is not the same (if
                # trees are rooted)
                if len(succ) == 2:
                    m = min(tree.node(x).data.support for x in succ)
                    for x in succ:
                        tree.node(x).data.support = m

                if not TreeTools.IsMonophyleticForTaxa(
                        tree, monophyly_taxa, support=options.min_support):
                    ndiscarded += 1
                    continue

            if has_discarded:
                ndiscarded += 1
                if options.filter == "trees" or options.filter_ntaxa is not None:
                    continue

            if options.method == "split":

                # plain substitution: the tree name must not be interpreted
                # as a regular expression replacement template
                output_filename = options.output_pattern.replace(
                    "%s", tree_id)

                dirname = os.path.dirname(output_filename)
                if dirname and not os.path.exists(dirname):
                    os.makedirs(dirname)

                if os.path.exists(output_filename):
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipping because output for tree %s already exists: %s\n" %
                            (tree_id, output_filename))
                    nskipped += 1
                    continue

                with open(output_filename, "w") as outfile:
                    outfile.write(TreeTools.Tree2Newick(tree) + "\n")
                noutput += 1

            elif options.method == "filter":
                options.stdout.write(
                    ">%s\n%s\n" % (tree.name, TreeTools.Tree2Newick(tree)))
                noutput += 1

            if outfile_map:
                for t in TreeTools.GetTaxa(tree):
                    outfile_map.write("%s\t%s\n" % (t, tree_id))
    finally:
        # close the map even if processing of a tree fails
        if outfile_map:
            outfile_map.close()

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i, with_discarded=%i, discarded_taxa=%i, discarded_branches=%i.\n" %
            (ninput, noutput, nskipped, ndiscarded, ndiscarded_taxa,
             ndiscarded_branches))

    E.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: optic/analyze_genetrees.py 2781 2009-09-10 11:33:14Z andreas $" ) parser.add_option( "-r", "--species-regex", dest="species_regex", type="string", help="regular expression to extractspecies from identifier.") parser.add_option( "--gene-regex", dest="gene_regex", type="string", help="regular expression to extract gene from identifier.") parser.add_option("--filename-filter-positives", dest="filename_filter_positives", type="string", help="filename with positive list of trees to analyze.") parser.add_option("-s", "--filename-species-tree", dest="filename_species_tree", type="string", help="filename with species tree.") parser.add_option( "--filename-species2colour", dest="filename_species2colour", type="string", help= "filename with map of species to colours. If not given, random colours are assigned to species." ) parser.add_option("-t", "--species-tree", dest="species_tree", type="string", help="species tree.") parser.add_option( "-e", "--filename-locations", dest="filename_locations", type="string", help= "filename with map of transcript information to location information.") parser.add_option("--no-create", dest="create", action="store_false", help="do not create files, but append to them.") parser.add_option( "--max-separation", dest="max_separation", type="int", help= "maximum allowable separation between syntenic segments for border plot (set to 0, if syntey is enough)." ) parser.add_option( "--filename-species2url", dest="filename_species2url", type="string", help="filename with mapping information of species to URL.") parser.add_option("--prefix", dest="prefix", type="string", help="prefix to add as first column.") parser.add_option( "--outgroup-species", dest="outgroup_species", type="string", help="species to used as outgroups. 
Separate multiple species by ','.") parser.add_option("--subtrees-trees", dest="subtrees_trees", action="store_true", help="write trees for subtrees.") parser.add_option("--subtrees-identifiers", dest="subtrees_identifiers", action="store_true", help="write identifiers of subtrees.") parser.add_option("--svg-add-ids", dest="svg_add_ids", action="store_true", help="add node ids to svg plot.") parser.add_option("--svg-otus", dest="svg_otus", type="string", help="otus to output in svg species tree.") parser.add_option("--svg-branch-lenghts", dest="svg_branch_lengths", type="choice", choices=("contemporary", "uniform", "median"), help="branch lengths in species tree.") parser.add_option("--print-totals", dest="print_totals", action="store_true", help="output totals sections.") parser.add_option("--print-subtotals", dest="print_subtotals", action="store_true", help="output subtotals sections.") parser.add_option( "--print-best", dest="print_best", action="store_true", help="output best node assignment for each node in gene tree.") parser.add_option("--print-svg", dest="print_svg", action="store_true", help="output svg files.") parser.add_option("--print-species-svg", dest="print_species_svg", action="store_true", help="output species svg files.") parser.add_option( "--output-pattern", dest="output_pattern", type="string", help= """output pattern for separate output of sections [default: %default]. Set to None, if output to stdout. Can contain one %s to be substituted with section.""" ) parser.add_option( "--output-pattern-svg", dest="output_pattern_svg", type="string", help= "filename for svg output. If it contains %s, this is replaced by gene_tree name." 
) parser.add_option( "--filename-node-types", dest="filename_node_types", type="string", help="filename with node type information from a previous run.") parser.add_option("--analyze-resolution-data", dest="analyze_resolution_data", type="choice", action="append", choices=("stats", "histograms"), help="stdin is resolution data.") parser.add_option("--filter-quality", dest="filter_quality", type="choice", choices=("all", "genes", "pseudogenes"), help="filter predictions by gene type.") parser.add_option("--filter-location", dest="filter_location", type="choice", choices=("all", "local", "non-local", "cis", "unplaced"), help="filter predictions by location.") parser.add_option("--remove-unplaced", dest="remove_unplaced", action="store_true", help="remove predictions on unplaced contigs.") parser.add_option("--skip-without-outgroups", dest="skip_without_outgroups", action="store_true", help="skip clusters without outgroups.") parser.set_defaults( filter_quality="all", filter_location="all", remove_unplaced=False, species_regex="^([^|]+)\|", gene_regex="^[^|]+\|[^|]+\|([^|]+)\|", filename_species_tree=None, priority={ "Speciation": 0, "SpeciationDeletion": 1, "Transcripts": 2, "DuplicationLineage": 3, "Duplication": 4, "DuplicationDeletion": 5, "DuplicationInconsistency": 6, "Outparalogs": 7, "InconsistentTranscripts": 8, "Inconsistency": 9, "Masked": 10 }, species_tree=None, filename_species2colour=None, filename_locations=None, max_separation=0, filename_species2url=None, separator="|", prefix=None, output_pattern=None, output_pattern_svg=None, outgroup_species=None, svg_add_ids=False, svg_branch_lengths="median", svg_otus=None, subtrees=False, print_svg=False, print_subtotals=False, print_totals=False, print_best=False, subtrees_identifiers=False, create=True, min_branch_length=0.00, filename_node_types=None, format_branch_length="%6.4f", nodetypes_inconsistency=("InconsistentTranscripts", "Inconsistency"), analyze_resolution_data=None, 
warning_small_branch_length=0.01, filename_filter_positives=None, skip_without_outgroups=False, ) (options, args) = E.Start(parser, add_psql_options=True, add_csv_options=True) if options.outgroup_species: options.outgroup_species = set(options.outgroup_species.split(",")) if options.svg_otus: options.svg_otus = set(options.svg_otus.split(",")) rx_species = re.compile(options.species_regex) extract_species = lambda x: rx_species.match(x).groups()[0] if options.gene_regex: rx_gene = re.compile(options.gene_regex) extract_gene = lambda x: rx_gene.match(x).groups()[0] else: extract_gene = None extract_quality = lambda x: x.split(options.separator)[3] ######################################################################### ######################################################################### ######################################################################### # read positive list of malis ######################################################################### if options.filename_filter_positives: filter_positives, nerrors = IOTools.ReadList( open(options.filename_filter_positives, "r")) filter_positives = set(filter_positives) else: filter_positives = None ######################################################################### ######################################################################### ######################################################################### # read location info ######################################################################### if options.filename_locations: map_id2location = TreeReconciliation.readLocations( open(options.filename_locations, "r"), extract_species) else: map_id2location = {} if (options.remove_unplaced or options.filter_location != "all" ) and not options.filename_locations: raise "please supply a file with location information." 
######################################################################### ######################################################################### ######################################################################### # delete output files ######################################################################### if options.create and options.output_pattern: for section in ("details", "subtrees", "subids", "details", "trees", "nodes", "categories"): fn = options.output_pattern % section if os.path.exists(fn): if options.loglevel >= 1: options.stdlog.write("# deleting file %s.\n" % fn) os.remove(fn) if options.loglevel >= 1: options.stdlog.write("# reading gene trees.\n") options.stdlog.flush() gene_nexus = TreeTools.Newick2Nexus(sys.stdin) Tree.updateNexus(gene_nexus) if options.loglevel >= 1: options.stdlog.write("# read %i gene trees from stdin.\n" % len(gene_nexus.trees)) options.stdlog.flush() ######################################################################### ######################################################################### ######################################################################### # main loop over gene trees ######################################################################### ninput, nfiltered, nskipped, noutput = 0, 0, 0, 0 nskipped_filter, nskipped_outgroups = 0, 0 # total counts total_heights_per_species = {} total_relheights_per_species = {} total_heights_per_tree = [] total_relheights_per_tree = [] for gene_tree in gene_nexus.trees: ninput += 1 xname = re.sub("_tree.*", "", gene_tree.name) xname = re.sub("subtree_", "", xname) if filter_positives and xname not in filter_positives: nskipped_filter += 1 continue if options.loglevel >= 6: gene_tree.display() ####################################################################### ####################################################################### ####################################################################### # get identifier for this tree and update prefixes 
accordingly ####################################################################### if options.prefix: if len(gene_nexus.trees) > 0: prefix_header = "prefix1\tprefix2\t" prefix_row = options.prefix + "\t" + gene_tree.name + "\t" prefix_prefix = options.prefix + "_" + gene_tree.name + "_" prefix_name = options.prefix + "_" + gene_tree.name else: prefix_header = "prefix\t" prefix_row = options.prefix + "\t" prefix_prefix = options.prefix + "_" prefix_name = options.prefix else: if len(gene_nexus.trees) > 0: prefix_header = "prefix\t" prefix_row = gene_tree.name + "\t" prefix_prefix = gene_tree.name + "\t" prefix_name = gene_tree.name else: prefix_header, prefix_row, prefix_prefix, prefix_name = "", "", "", "" ####################################################################### ####################################################################### ####################################################################### # apply filters to gene tree ####################################################################### TreeReconciliation.filterTree(gene_tree, options, map_id2location) otus = TreeTools.GetTaxa(gene_tree) if len(otus) <= 1: nfiltered += 1 if options.loglevel >= 1: options.stdlog.write( "# tree %s: empty after filtering - skipped.\n" % gene_tree.name) continue this_species_list = map(extract_species, otus) # check, if only outgroups if options.outgroup_species: if not set(this_species_list).difference(options.outgroup_species): nfiltered += 1 if options.loglevel >= 1: options.stdlog.write( "# tree %s: only outgroups after filtering - skipped.\n" % gene_tree.name) continue if options.skip_without_outgroups and not set( this_species_list).intersection(options.outgroup_species): nskipped_outgroups += 1 if options.loglevel >= 1: options.stdlog.write( "# tree %s: no outgroups - skipped.\n" % gene_tree.name) continue ####################################################################### ####################################################################### 
####################################################################### # reroot gene tree, if outgroups have been given. ####################################################################### if options.outgroup_species: TreeReconciliation.rerootTree(gene_tree, extract_species, options) ####################################################################### ####################################################################### ####################################################################### # compute distance to root for each node ####################################################################### distance_to_root = TreeTools.GetDistanceToRoot(gene_tree) ####################################################################### ####################################################################### ####################################################################### # compute counts ####################################################################### # heights per tree heights_per_tree = [] # relative heights per tree relheights_per_tree = [] # distance to root heights_per_species = {} # distance to root (relative to maximum distance to root) relheights_per_species = {} analysis_set, gene_set, pseudogene_set, other_set = TreeReconciliation.getAnalysisSets( gene_tree, extract_quality, options) if len(analysis_set) == 0: if options.loglevel >= 1: options.stdlog.write( "# tree %s: empty analysis set - skipped.\n" % gene_tree.name) nskipped += 1 continue reference_height = TreeReconciliation.getReferenceHeight( distance_to_root, gene_tree, gene_set, options, extract_species, method="median") if reference_height is None: if options.loglevel >= 1: options.stdlog.write( "# tree %s: reference height not computable or 0 - skipped.\n" % gene_tree.name) nskipped += 1 continue for node_id in analysis_set: node = gene_tree.node(node_id) species = extract_species(node.data.taxon) height = distance_to_root[node_id] if height < options.warning_small_branch_length: 
options.stdlog.write( "# tree %s: small distance %s to root at node %i: %s\n" % (gene_tree.name, options.format_branch_length % height, node_id, node.data.taxon)) relheight = height / reference_height try: heights_per_species[species].append(height) except KeyError: heights_per_species[species] = [height] relheights_per_species[species] = [] relheights_per_species[species].append(relheight) # do not use outgroup species if options.outgroup_species and species in options.outgroup_species: continue heights_per_tree.append(height) relheights_per_tree.append(relheight) if options.loglevel >= 1: options.stdlog.write( "# tree %s: reference_height=%s\n" % (gene_tree.name, options.format_branch_length % reference_height)) options.stdlog.flush() if options.print_subtotals: printCounts(heights_per_species, relheights_per_species, heights_per_tree, relheights_per_tree, options, prefix_header, prefix_row) ####################################################################### ####################################################################### ####################################################################### # update total counts ####################################################################### TreeReconciliation.appendCounts(total_heights_per_species, heights_per_species) TreeReconciliation.appendCounts(total_relheights_per_species, relheights_per_species) TreeReconciliation.appendCounts(total_heights_per_tree, heights_per_tree) TreeReconciliation.appendCounts(total_relheights_per_tree, relheights_per_tree) noutput += 1 if options.print_totals: if options.prefix: prefix_header = "prefix1\tprefix2\t" prefix_row = options.prefix + "\t" + "total" + "\t" prefix_prefix = options.prefix + "_" + "total" + "_" prefix_name = options.prefix + "_" + "total" else: prefix_header = "prefix\t" prefix_row = "total" + "\t" prefix_prefix = "total" + "_" prefix_name = "total" printCounts(total_heights_per_species, total_relheights_per_species, total_heights_per_tree, 
total_relheights_per_tree, options, prefix_header, prefix_row) if options.loglevel >= 1: options.stdlog.write( "# ninput=%i, nfiltered=%i, nskipped=%i, nskipped_filter=%i, nskipped_outgroups=%i, noutput=%i\n" % (ninput, nfiltered, nskipped, nskipped_filter, nskipped_outgroups, noutput)) E.Stop()
def run(self):
    """Run the configured phylip program and collect its output.

    The input files are staged by ``self.prepareRun()``.  The program
    named in ``self.mProgram`` is then started through the shell with
    ``self.mTempdir`` as its working directory, and the entries of
    ``self.mOptions`` are sent to its stdin, one per line.

    Output is picked up from the temporary directory:

    * if ``outtree`` exists, it is parsed with
      ``TreeTools.Newick2Nexus``, the taxa of every tree are mapped
      with ``self.mMapPhylip2Input`` and the result is stored in
      ``result.mNexus``;
    * otherwise, if ``outfile`` exists, it is read as a phylip matrix
      for ``dnadist``/``protdist`` or passed to
      ``result.parseContrasts`` for ``contrast``; for any other
      program the result is returned unfilled;
    * if neither file exists, ``NotImplementedError`` is raised.

    The temporary directory is removed only when ``self.mLogLevel``
    is 0; at higher log levels it is left in place.

    Returns:
        The ``PhylipResult`` instance that was filled in.

    Raises:
        UsageError: if no program is set or the program exits with a
            non-zero return code.
        NotImplementedError: if the program produced neither
            ``outtree`` nor ``outfile``.
    """
    self.prepareRun()

    if not self.mProgram:
        raise UsageError("no program specified.")

    # NOTE(review): the program string is handed to the shell verbatim
    # (shell=True); it must never be built from untrusted input.
    s = subprocess.Popen("%s" % (self.mProgram),
                         shell=True,
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         cwd=self.mTempdir,
                         close_fds=True)

    (out, err) = s.communicate("\n".join(self.mOptions) + "\n")

    if s.returncode != 0:
        raise UsageError(
            "Error in running phylip.\n%s\n%s\nTemporary directory was %s" % (
                out, err, self.mTempdir))

    # Parse output files that might have been created:
    result = PhylipResult()

    tree_path = "%s/outtree" % self.mTempdir
    outfile_path = "%s/outfile" % self.mTempdir

    if os.path.exists(tree_path):
        # parse tree file; everything that touches the parsed trees stays
        # inside the ``with`` so the handle is open for as long as needed
        # and is closed afterwards.
        with open(tree_path, "r") as infile:
            nexus = TreeTools.Newick2Nexus(infile)
            for tree in nexus.trees:
                TreeTools.MapTaxa(tree, self.mMapPhylip2Input)
            result.mNexus = nexus
            # only report on the first tree if there is one
            if self.mLogLevel >= 1 and nexus.trees:
                print("# received tree with %i taxa" % (len(
                    TreeTools.GetTaxa(nexus.trees[0]))))

    elif os.path.exists(outfile_path):
        if self.mProgram in ("dnadist", "protdist"):
            with open(outfile_path, "r") as infile:
                result.mMatrix, row_headers, col_headers = \
                    MatlabTools.readMatrix(infile, format="phylip")
            # translate the identifiers used in the phylip files back to
            # the caller's identifiers; rows and columns share one list.
            result.mRowHeaders = [
                self.mMapPhylip2Input[x] for x in row_headers]
            result.mColHeaders = result.mRowHeaders
        elif self.mProgram == "contrast":
            with open(outfile_path, "r") as infile:
                result.parseContrasts(infile)

    else:
        # NOTE(review): assumes this fallback pairs with the two
        # output-file checks above (neither outtree nor outfile exists)
        # - confirm against the original layout.
        # A real exception class is required here: raising a plain string
        # is itself a TypeError on Python >= 2.6.
        raise NotImplementedError("other return types not implemented")

    if self.mLogLevel >= 2:
        print(out)

    # keep the working directory for inspection unless running silently
    if self.mLogLevel == 0:
        shutil.rmtree(self.mTempdir)

    return result
def prepareRun(self):
    """Stage the input files for a phylip run in a new temporary directory.

    Resets the wrapper via ``self.__reset()``, creates
    ``self.mTempdir`` and writes, depending on which inputs are set:

    * ``infile`` from ``self.mInputMatrix`` (first entry written
      unchanged as header, remaining rows re-joined with single spaces)
      or from ``self.mInputData`` (a count line followed by one row per
      entry);
    * ``intree`` from ``self.mInputTree`` or ``self.mInputTrees``, one
      newick string per line; trees with duplicate taxa after mapping
      are skipped, and ``self.mNInputTrees`` counts the trees written;
    * ``infile`` from ``self.mInputMali`` in phylip format.

    In every case the identifiers are registered with
    ``self.updateMaps`` and written in their ``self.mMapInput2Phylip``
    form.  Note that the input trees and the alignment object are
    modified in place by the mapping (and by pruning, if
    ``self.mPruneTree`` is set).

    Raises:
        ValueError: if both an input matrix and input data are set, or
            both an alignment and an input matrix are set.
        UsageError: if both a single tree and a list of trees are set.
    """
    self.__reset()

    # mkdtemp already creates the directory; the existence check only
    # matters if a fixed directory name is substituted for debugging.
    self.mTempdir = tempfile.mkdtemp()
    if not os.path.exists(self.mTempdir):
        os.mkdir(self.mTempdir)

    infile_path = self.mTempdir + "/infile"
    intree_path = self.mTempdir + "/intree"

    if self.mInputMatrix and self.mInputData:
        raise ValueError(
            "please specify either input matrix or input data, but not both."
        )

    # prepare input matrix. Should already be in phylip like
    # format, but long identifiers are shortened and tabs are
    # replaced by spaces.
    identifiers = None
    if self.mInputMatrix:
        # strip only the line terminator so that a final row without a
        # trailing newline does not lose its last character
        rows = [re.split(r"\s+", line.rstrip("\n"))
                for line in self.mInputMatrix[1:]]
        identifiers = [row[0] for row in rows]
        self.updateMaps(identifiers)

        with open(infile_path, "w") as outfile:
            outfile.write(self.mInputMatrix[0])
            for row in rows:
                new_line = self.mMapInput2Phylip[
                    row[0]] + " " + " ".join(row[1:])
                outfile.write(new_line + "\n")

    elif self.mInputData:
        # each entry is (identifier, value, value, ...)
        identifiers = [row[0] for row in self.mInputData]
        self.updateMaps(identifiers)

        with open(infile_path, "w") as outfile:
            outfile.write("%i %i\n" % (len(self.mInputData),
                                       len(self.mInputData[0]) - 1))
            for row in self.mInputData:
                outfile.write("%-10s %s\n" % (
                    self.mMapInput2Phylip[row[0]], " ".join(row[1:])))

    if identifiers is not None and self.mLogLevel >= 1:
        print("# written input matrix with %i taxa to %s" % (
            len(identifiers), infile_path))
        # echo the staged file for debugging
        os.system("cat %s" % infile_path)

    # prepare input tree or trees
    self.mNInputTrees = 0

    if self.mInputTree or self.mInputTrees:

        # validate before opening the output file so that a usage error
        # neither leaks a file handle nor leaves an empty intree behind
        if self.mInputTree and self.mInputTrees:
            raise UsageError(
                "please supply either one or mupltiple trees, but not both."
            )

        if self.mInputTree:
            trees = [self.mInputTree]
        else:
            trees = self.mInputTrees

        with open(intree_path, "w") as outfile:
            for tree in trees:

                if self.mPruneTree:
                    taxa = self.mMapInput2Phylip.keys()
                    TreeTools.PruneTree(tree, taxa)

                taxa = TreeTools.GetTaxa(tree)
                self.updateMaps(taxa)
                TreeTools.MapTaxa(tree, self.mMapInput2Phylip)

                # check if taxa are unique after mapping; report the
                # skipped tree once rather than once per duplicate
                taxa = tree.get_taxa()
                if len(set(taxa)) != len(taxa):
                    if self.mLogLevel >= 1:
                        print("# skipping tree %s because of duplicate taxa." % (
                            tree.name))
                    continue

                outfile.write(TreeTools.Tree2Newick(tree) + "\n")
                self.mNInputTrees += 1

                if self.mLogLevel >= 1:
                    print("# written input tree with %i taxa to %s" % (len(
                        TreeTools.GetTaxa(tree)), intree_path))
                    print("# %s" % TreeTools.Tree2Newick(tree))

    # prepare input multiple alignment
    if self.mInputMali:

        if self.mInputMatrix:
            # a real exception class is required: raising a plain string
            # is itself a TypeError on Python >= 2.6
            raise ValueError(
                "both mali and matrix supplied - infile conflict.")

        identifiers = self.mInputMali.getIdentifiers()
        self.updateMaps(identifiers)
        self.mInputMali.mapIdentifiers(self.mMapInput2Phylip)

        with open(infile_path, "w") as outfile:
            self.mInputMali.writeToFile(outfile, format="phylip")

        if self.mLogLevel >= 1:
            # the alignment is written to infile, so report that path
            print("# written input multiple alignments with %i taxa and with %i to %s" %
                  (self.mInputMali.getLength(), self.mInputMali.getWidth(),
                   infile_path))