def run(self, grammar, tree=None, dump=0, test=False, options=None):
    """Run the configured executable on a grammar and an optional tree.

    A fresh temporary directory is created, the grammar (and the tree, if
    given) are written into it, and ``self.mExecutable`` is started there
    with an explicit random seed.

    Arguments
    ---------
    grammar :
        Object providing ``getGrammar()``; its return value is written to
        ``grammar.eg``.
    tree :
        Optional tree. A string that starts with ``(`` and ends with ``)``
        or ``;`` is written verbatim; any other string is treated as a
        filename and its first tree is written.
    dump :
        If true, the captured stdout of the executable is printed.
    test :
        If true, the temporary directory is reported and kept.
    options :
        Unused in this method; kept for interface compatibility.

    Returns
    -------
    Whatever ``self.parseOutput`` returns for the list of stdout lines.

    Raises
    ------
    UsageError
        If the executable exits with a non-zero return code. The temporary
        directory is kept in that case so that it can be inspected.
    """
    self.mTempdir = tempfile.mkdtemp()
    self.mFilenameGrammar = "grammar.eg"
    self.mFilenameTree = "tree.nh"
    self.mFilenameOutput = None
    self.mWarnings = []

    if test:
        print("# temporary directory is %s" % self.mTempdir)

    with open(self.mTempdir + "/" + self.mFilenameGrammar, "w") as outfile:
        outfile.write(grammar.getGrammar())

    if tree:
        with open(self.mTempdir + "/" + self.mFilenameTree, "w") as outfile:
            # check what kind of tree is given.
            # NOTE(review): a non-string tree leaves the tree file empty,
            # exactly as before - confirm whether tree objects should be
            # supported here.
            if isinstance(tree, StringType):
                t = tree.strip()
                if t.startswith("(") and t.endswith((")", ";")):
                    outfile.write("%s\n" % t)
                else:
                    with open(tree, "r") as infile:
                        nexus = TreeTools.Newick2Nexus(infile)
                        t = nexus.trees[0]
                    outfile.write("%s\n" % TreeTools.Tree2Newick(t))

    # Supply an explicit random seed: a time based seed will not do if the
    # executable is called in quick succession.
    statement = "%s -rndseed %i -g %s -t %s" % (
        self.mExecutable,
        random.randint(0, 4294967296),
        self.mFilenameGrammar,
        self.mFilenameTree)

    # universal_newlines makes communicate() return text, so that the
    # split on "\n" below works independent of the interpreter version.
    s = subprocess.Popen(statement,
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         cwd=self.mTempdir,
                         close_fds=True,
                         universal_newlines=True)

    (out, err) = s.communicate()

    if s.returncode != 0:
        raise UsageError(
            "Error in running %s \n%s\n%s\nTemporary directory in %s" % (
                self.mExecutable, err, out, self.mTempdir))

    if dump:
        print("# stdout output of %s:\n%s\n######################################" % (
            self.mExecutable, out))

    if not test:
        shutil.rmtree(self.mTempdir)

    return self.parseOutput(out.split("\n"))
def WriteTree(self, tree):
    """Write *tree* to the tree file in the temporary directory.

    The first tree parsed from *tree* by ``TreeTools.Newick2Nexus`` is
    relabelled with ``self.mMapOld2New`` and written, preceded by a header
    line holding ``self.mNumSequences`` and the constant ``1``.
    """
    nexus = TreeTools.Newick2Nexus(tree)
    t = nexus.trees[0]
    TreeTools.MapTaxa(t, self.mMapOld2New)

    # Build the complete content first so that a failure while rendering
    # does not leave a truncated file (and an open handle) behind.
    header = "%i 1\n" % self.mNumSequences
    newick = "%s\n" % TreeTools.Tree2Newick(t)

    with open(self.mTempdir + "/" + self.mFilenameTree, "w") as outfile:
        outfile.write(header)
        outfile.write(newick)
def getBestTree(trees, method="select-largest"):
    """Select the best tree out of a set of trees.

    Arguments
    ---------
    trees : list
        Trees to choose from. Each tree must provide ``get_taxa()``.
    method : string
        Selection method. Only ``"select-largest"`` is implemented: the
        tree with the most taxa wins; among trees of equal size the one
        that comes last in *trees* is chosen.

    Returns
    -------
    The selected element of *trees*.

    Raises
    ------
    ValueError
        If *trees* is empty or *method* is not known.
    """
    if method != "select-largest":
        raise ValueError("unknown method %s" % method)

    if not trees:
        raise ValueError("no trees to select from")

    # Equivalent to sorting (size, index) pairs and taking the last one.
    best_tree = max(range(len(trees)),
                    key=lambda x: (len(trees[x].get_taxa()), x))

    # The skipped trees are logged through a module level ``options``
    # object. NOTE(review): the main() visible in this file binds options
    # locally only, so the logging is skipped when no such global exists
    # instead of failing with a NameError.
    log_options = globals().get("options")
    if log_options is not None and log_options.loglevel >= 3:
        for x, tree in enumerate(trees):
            if x == best_tree:
                continue
            log_options.stdlog.write(
                "# skipped tree: %s: %s\n" % (tree.name,
                                              TreeTools.Tree2Newick(tree)))

    return trees[best_tree]
def main(argv=None):
    """script main.

    Reads trees from stdin and reduces them to a single tree or to a
    subset of trees, depending on ``--method``:

    * ``non-redundant``: output one representative per group of trees
      that ``TreeTools.IsCompatible`` reports as compatible.
    * ``select-largest``: group trees by name (optionally reduced through
      ``--regex-id``) and output the tree chosen by ``getBestTree``.
    * ``consensus``: build a tree with the phylip program ``consense``.
    * ``min``, ``max``, ``sum``, ``mean``, ``counts``, ``stddev``,
      ``median``: aggregate branch lengths node by node into the first
      tree and output it.

    NOTE(review): *argv* defaults to ``sys.argv`` but is not passed on to
    ``E.Start`` in this function - confirm how E.Start obtains it.

    Raises
    ------
    ValueError
        For an unknown aggregation method, if ``--regex-id`` does not
        yield an identifier, if there are no trees to aggregate, or if a
        node has no values and ``--error-branchlength`` is not set.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: trees2tree.py 2782 2009-09-10 11:40:29Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("counts", "min", "max", "sum", "mean",
                               "median", "stddev", "non-redundant",
                               "consensus", "select-largest"),
                      help="aggregation function.")

    parser.add_option("-r", "--regex-id", dest="regex_id", type="string",
                      help="regex pattern to extract identifier from tree name for the selection functions.")

    parser.add_option("-w", "--write-values", dest="write_values", type="string",
                      help="if processing multiple trees, write values to file.")

    parser.add_option("-e", "--error-branchlength", dest="error_branchlength", type="float",
                      help="set branch length without counts to this value.")

    parser.set_defaults(
        method="mean",
        regex_id=None,
        filtered_branch_lengths=(-999.0, 999.0),
        write_values=None,
        error_branchlength=None,
        separator=":",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.loglevel >= 2:
        options.stdlog.write("# reading trees from stdin.\n")
        options.stdlog.flush()

    nexus = TreeTools.Newick2Nexus(sys.stdin)
    if options.loglevel >= 1:
        options.stdlog.write(
            "# read %i trees from stdin.\n" % len(nexus.trees))

    nskipped = 0
    ninput = len(nexus.trees)
    noutput = 0
    nerrors = 0

    if options.method == "non-redundant":
        # compute non-redundant trees
        template_trees = []
        template_counts = []
        ntree = 0
        for tree in nexus.trees:

            for x in range(0, len(template_trees)):
                is_compatible, reason = TreeTools.IsCompatible(
                    tree, template_trees[x])
                if is_compatible:
                    template_counts[x] += 1
                    break
            else:
                # no compatible template found: the tree opens a new group
                template_counts.append(1)
                template_trees.append(tree)

            if options.loglevel >= 2:
                options.stdlog.write(
                    "# tree=%i, ntemplates=%i\n" % (ntree, len(template_trees)))

            ntree += 1

        for x in range(0, len(template_trees)):
            if options.loglevel >= 1:
                # percentages are relative to the number of input trees
                options.stdlog.write("# tree: %i, counts: %i, percent=%5.2f\n" %
                                     (x, template_counts[x],
                                      template_counts[x] * 100.0 / ninput))
            options.stdout.write(
                TreeTools.Tree2Newick(template_trees[x]) + "\n")

    elif options.method in ("select-largest",):
        # select one of the trees with the same name.
        clusters = {}
        for x in range(0, len(nexus.trees)):
            n = nexus.trees[x].name

            if options.regex_id:
                match = re.search(options.regex_id, n)
                if match is None or not match.groups():
                    raise ValueError(
                        "could not extract identifier from tree name %s with pattern %s" %
                        (n, options.regex_id))
                n = match.groups()[0]

            if n not in clusters:
                clusters[n] = []
            clusters[n].append(x)

        new_trees = []

        for name, cluster in clusters.items():
            new_trees.append(
                getBestTree([nexus.trees[x] for x in cluster], options.method))

        for x in range(0, len(new_trees)):
            options.stdout.write(">%s\n" % new_trees[x].name)
            options.stdout.write(TreeTools.Tree2Newick(new_trees[x],) + "\n")
            noutput += 1

        nskipped = ninput - noutput

    elif options.method == "consensus":

        phylip = WrapperPhylip.Phylip()
        phylip.setLogLevel(options.loglevel - 2)
        phylip.setProgram("consense")
        phylip_options = []
        phylip_options.append("Y")

        phylip.setOptions(phylip_options)
        phylip.setTrees(nexus.trees)

        result = phylip.run()

        options.stdout.write(
            "# consensus tree built from %i trees\n" % (phylip.mNInputTrees))
        options.stdout.write(
            TreeTools.Tree2Newick(result.mNexus.trees[0]) + "\n")
        noutput = 1

    else:
        if not nexus.trees:
            raise ValueError("no trees to aggregate")

        if options.method in ("min", "max", "sum", "mean", "counts"):

            # the first tree accumulates the result
            xtree = nexus.trees[0]
            for n in xtree.chain.keys():
                if xtree.node(n).data.branchlength in options.filtered_branch_lengths:
                    xtree.node(n).data.branchlength = 0

            # NOTE(review): counts start at 1 for every node, including
            # those whose length in the first tree was filtered, so the
            # "no counts" branch below can not be reached - confirm
            # whether filtered nodes should start at 0.
            ntotals = [1] * len(xtree.chain.keys())

            if options.method == "min":
                f = min
            elif options.method == "max":
                f = max
            elif options.method == "sum":
                f = lambda x, y: x + y
            elif options.method == "mean":
                f = lambda x, y: x + y
            elif options.method == "counts":
                f = lambda x, y: x + 1
                for n in xtree.chain.keys():
                    if xtree.node(n).data.branchlength not in options.filtered_branch_lengths:
                        xtree.node(n).data.branchlength = 1
                    else:
                        xtree.node(n).data.branchlength = 0
            else:
                raise ValueError("unknown option %s" % options.method)

            for tree in nexus.trees[1:]:

                for n in tree.chain.keys():
                    if tree.node(n).data.branchlength not in options.filtered_branch_lengths:
                        xtree.node(n).data.branchlength = f(
                            xtree.node(n).data.branchlength,
                            tree.node(n).data.branchlength)
                        ntotals[n] += 1

            if options.method == "mean":
                for n in xtree.chain.keys():
                    if ntotals[n] > 0:
                        xtree.node(n).data.branchlength = float(
                            xtree.node(n).data.branchlength) / ntotals[n]
                    else:
                        if options.error_branchlength is not None:
                            xtree.node(
                                n).data.branchlength = options.error_branchlength
                            if options.loglevel >= 1:
                                options.stdlog.write(
                                    "# no counts for node %i - set to %f\n" %
                                    (n, options.error_branchlength))
                            nerrors += 1
                        else:
                            raise ValueError("no counts for node %i" % n)

        else:
            # scipy.std / scipy.median were aliases of the numpy functions;
            # use numpy directly as the aliases are deprecated in scipy.
            import numpy

            # collect all values for trees
            values = [[] for x in range(TreeTools.GetSize(nexus.trees[0]))]

            for tree in nexus.trees:
                for n, node in tree.chain.items():
                    if node.data.branchlength not in options.filtered_branch_lengths:
                        values[n].append(node.data.branchlength)

            tree = nexus.trees[0]
            for n, node in tree.chain.items():
                if len(values[n]) > 0:
                    if options.method == "stddev":
                        node.data.branchlength = numpy.std(values[n])
                    elif options.method == "median":
                        node.data.branchlength = numpy.median(values[n])
                else:
                    if options.error_branchlength is not None:
                        node.data.branchlength = options.error_branchlength
                        if options.loglevel >= 1:
                            options.stdlog.write(
                                "# no counts for node %i - set to %f\n" %
                                (n, options.error_branchlength))
                        nerrors += 1
                    else:
                        raise ValueError("no counts for node %i" % n)

            if options.write_values:
                with open(options.write_values, "w") as outfile:
                    for n, node in tree.chain.items():
                        values[n].sort()
                        # a node is identified by its sorted leaves
                        node_id = options.separator.join(
                            sorted(TreeTools.GetLeaves(tree, n)))
                        outfile.write("%s\t%s\n" %
                                      (node_id, ";".join(map(str, values[n]))))

        # only the first tree, which carries the aggregate, is output
        del nexus.trees[1:]
        options.stdout.write(TreeTools.Nexus2Newick(nexus) + "\n")
        noutput = 1

    if options.loglevel >= 1:
        options.stdlog.write("# ntotal=%i, nskipped=%i, noutput=%i, nerrors=%i\n" % (
            ninput, nskipped, noutput, nerrors))

    E.Stop()
if options.prefix: prefix_tree = ">%s\n" % options.prefix prefix_header = "prefix\t" prefix_row = "%s\t" % options.prefix else: prefix_tree = "" prefix_header = "" prefix_row = "" for method in options.methods: if method == "write-ks-tree": for result in results: options.stdout.write(prefix_tree + TreeTools.Tree2Newick(result.mTreeKs) + "\n") elif method == "write-ka-tree": for result in results: options.stdout.write(prefix_tree + TreeTools.Tree2Newick(result.mTreeKa) + "\n") elif method == "write-kaks-tree": for result in results: options.stdout.write(prefix_tree + TreeTools.Tree2Newick(result.mTreeKaks) + "\n") elif method == "lrt":
def __str__(self):
    """Return ``self.mTree`` rendered by ``TreeTools.Tree2Newick``."""
    rendered = TreeTools.Tree2Newick(self.mTree)
    return rendered
def main(argv=None):
    """script main.

    Reads trees from stdin, applies the requested filters to each tree
    and either writes every retained tree to its own file
    (``--method=split``) or writes all retained trees to stdout
    (``--method=filter``).

    Filters either remove taxa, mask branches by setting their length to
    ``options.filtered_branch_length``, or discard whole trees. A tree in
    which something was discarded is skipped entirely if
    ``--filter=trees`` or ``--filter-ntaxa`` is set.

    NOTE(review): *argv* defaults to ``sys.argv`` but is not passed on to
    ``E.Start`` in this function - confirm how E.Start obtains it.

    Raises
    ------
    ValueError
        If fewer than two taxa are supplied to ``--filter-by-monophyly``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: trees2trees.py 2782 2009-09-10 11:40:29Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-c", "--output-filename-map", dest="output_filename_map", type="string",
                      help="filename of map to output.")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("filter", "split"),
                      help="method to use: filter removed trees, while split writes them to individual files. DEFAULT=%default")

    parser.add_option("-d", "--output-pattern", dest="output_pattern", type="string",
                      help="filename pattern for output multiple alignment files.")

    # Each length option stores into the destination that the filter code
    # below reads and that set_defaults() initialises.
    parser.add_option("--filter-terminal-max-length", dest="filter_terminal_max_length", type="float",
                      help="remove terminal branches with a branch length larger than this.")

    parser.add_option("--filter-terminal-min-length", dest="filter_terminal_min_length", type="float",
                      help="remove terminal branches with a branch length smaller than this.")

    parser.add_option("--filter-min-length", dest="filter_min_length", type="float",
                      help="remove any branches with a branch length smaller than this.")

    parser.add_option("--filter-max-length", dest="filter_max_length", type="float",
                      help="remove any branches with a branch length larger than this.")

    parser.add_option("--filter-by-trees", dest="filter_by_trees", type="string", action="append",
                      help="mask branches according to trees. Give filenames with mask trees. These trees need to have the same names and structure as the input trees, but can be in any order.")

    parser.add_option("--filter-by-monophyly", dest="filter_by_monophyly", type="string",
                      help="only retain trees where the given taxa are monphyletic. Supply taxa as a comma-separated list.")

    parser.add_option("--min-support", dest="min_support", type="float",
                      help="for monophyly filtering, only accept trees with minimum support.")

    parser.add_option("--filter-ntaxa", dest="filter_ntaxa", type="int",
                      help="filter by number of taxa.")

    parser.add_option("--filter-simple-orthologs", dest="filter_simple_orthologs", action="store_true",
                      help="filter for trees for simple orhtologs. This works by counting the number of taxa.")

    parser.add_option("--filter", dest="filter", type="choice",
                      choices=("taxa", "trees"),
                      help="filter removes taxa or whole trees.")

    parser.set_defaults(
        output_pattern="%s.tree",
        output_filename_map=None,
        filter_terminal_max_length=None,
        filter_terminal_min_length=None,
        filter_max_length=None,
        filter_min_length=None,
        method="split",
        filter="taxa",
        filtered_branch_length=-999,
        filter_by_trees=[],
        filter_by_monophyly=None,
        filter_ntaxa=None,
        filter_simple_orthologs=None,
        min_support=0.0,
        regex_species=("^([^|]+)"),
    )

    (options, args) = E.Start(parser)

    nexus = TreeTools.Newick2Nexus(sys.stdin)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# read %i trees from stdin.\n" % len(nexus.trees))

    ninput, noutput, nskipped = 0, 0, 0
    ndiscarded = 0
    ndiscarded_taxa = 0
    ndiscarded_branches = 0

    def extract_species(taxon):
        # the species is the first group of regex_species
        return re.search(options.regex_species, taxon).groups()[0]

    if options.filter_by_trees:
        nexus_filter = []
        nexus_maps = []
        for filename in options.filter_by_trees:
            with open(filename, "r") as infile:
                nexus_filter.append(TreeTools.Newick2Nexus(infile))
                trees = nexus_filter[-1].trees
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# read %i trees for filtering from %s\n" % (len(trees), filename))

                # map of tree name to position within this filter file
                nexus_map = {}
                for x in range(len(trees)):
                    nexus_map[trees[x].name] = x
                nexus_maps.append(nexus_map)

    if options.filter_by_monophyly:
        monophyly_taxa = options.filter_by_monophyly.split(",")
        if len(monophyly_taxa) < 2:
            raise ValueError(
                "please supply at least two taxa for the monophyly test.")

    if options.output_filename_map:
        outfile_map = open(options.output_filename_map, "a")
    else:
        outfile_map = None

    for tree in nexus.trees:

        ninput += 1
        tree_id = tree.name
        has_discarded = False

        if options.filter_ntaxa is not None:

            ntaxa = len(tree.get_terminals())
            if ntaxa != options.filter_ntaxa:
                if options.loglevel >= 2:
                    options.stdlog.write("# tree %s: removed because number of taxa (%i) different\n" %
                                         (tree_id, ntaxa))
                has_discarded = True

        if options.filter_simple_orthologs:
            ntaxa = len(tree.get_terminals())
            nspecies = len(set(map(lambda x: extract_species(tree.node(x).data.taxon),
                                   tree.get_terminals())))
            if nspecies != ntaxa:
                if options.loglevel >= 2:
                    options.stdlog.write("# tree %s: removed because not a simple ortholog cluster: ntaxa!=nspecies (%i!=%i)\n" %
                                         (tree_id, ntaxa, nspecies))
                has_discarded = True

        if options.filter_terminal_max_length is not None:
            for x in tree.get_terminals():
                node = tree.node(x)
                if node.data.branchlength >= options.filter_terminal_max_length:
                    has_discarded = True
                    ndiscarded_taxa += 1
                    tree.prune(node.data.taxon)
                    if options.loglevel >= 2:
                        options.stdlog.write("# tree %s: removed taxon %s because terminal branchlength to large: %s\n" %
                                             (tree_id, node.data.taxon, str(node.data.branchlength)))

        if options.filter_terminal_min_length is not None:
            for x in tree.get_terminals():
                node = tree.node(x)
                if node.data.branchlength <= options.filter_terminal_min_length:
                    has_discarded = True
                    ndiscarded_taxa += 1
                    tree.prune(node.data.taxon)
                    if options.loglevel >= 2:
                        options.stdlog.write("# tree %s: removed taxon %s because terminal branchlength to small: %s\n" %
                                             (tree_id, node.data.taxon, str(node.data.branchlength)))

        if options.filter_max_length is not None:
            for x in tree.get_nodes(tree.root):
                if x == tree.root:
                    continue
                node = tree.node(x)
                if node.data.branchlength >= options.filter_max_length:
                    has_discarded = True
                    ndiscarded_branches += 1
                    if options.loglevel >= 2:
                        options.stdlog.write("# tree %s: removed branch %i because branchlength to large: %s\n" %
                                             (tree_id, x, str(node.data.branchlength)))
                    # mask the branch instead of removing it
                    node.data.branchlength = options.filtered_branch_length

        if options.filter_min_length is not None:
            for x in tree.get_nodes(tree.root):
                if x == tree.root:
                    continue
                node = tree.node(x)
                if node.data.branchlength <= options.filter_min_length:
                    has_discarded = True
                    ndiscarded_branches += 1
                    if options.loglevel >= 2:
                        options.stdlog.write("# tree %s: removed branch %i because internal branchlength too small: %s\n" %
                                             (tree_id, x, str(node.data.branchlength)))
                    node.data.branchlength = options.filtered_branch_length

        if options.filter_by_trees:
            # collect the mask trees that carry the same name as this tree
            found = []
            for y in range(len(nexus_maps)):
                if tree_id in nexus_maps[y]:
                    found.append(
                        (y, nexus_filter[y].trees[nexus_maps[y][tree_id]]))

            if not found:
                ndiscarded += 1
                continue

            for x in tree.get_nodes(tree.root):
                if x == tree.root:
                    continue
                for y, other_tree in found:
                    other_node = other_tree.node(x)
                    if other_node.data.branchlength == options.filtered_branch_length:
                        node = tree.node(x)
                        if options.loglevel >= 2:
                            options.stdlog.write("# tree %s: removed branch %i because internal branchlength masked by tree %i:%s.\n" %
                                                 (tree_id, x, y, other_tree.name))

                        node.data.branchlength = options.filtered_branch_length
                        has_discarded = True
                        ndiscarded_branches += 1
                        # one masking tree is sufficient
                        break

        if options.filter_by_monophyly:

            terminals = set(map(lambda x: tree.node(x).data.taxon,
                                tree.get_terminals()))

            for t in monophyly_taxa:
                if t not in terminals:
                    if options.loglevel >= 2:
                        options.stdlog.write(
                            "taxon %s not in tree %s\n" % (t, tree.name))
                    nskipped += 1

            succ = tree.node(tree.root).succ
            # use minimum support at root, if it is not the same (if trees
            # are rooted)
            if len(succ) == 2:
                m = min(map(lambda x: tree.node(x).data.support, succ))
                for x in succ:
                    tree.node(x).data.support = m

            if not TreeTools.IsMonophyleticForTaxa(tree, monophyly_taxa,
                                                   support=options.min_support):
                ndiscarded += 1
                continue

        if has_discarded:
            ndiscarded += 1
            if options.filter == "trees" or options.filter_ntaxa:
                continue

        if options.method == "split":

            # plain substitution: the tree name is not a regex template
            output_filename = options.output_pattern.replace("%s", tree_id)

            dirname = os.path.dirname(output_filename)

            if dirname and not os.path.exists(dirname):
                os.makedirs(dirname)

            if not os.path.exists(output_filename):
                with open(output_filename, "w") as outfile:
                    outfile.write(TreeTools.Tree2Newick(tree) + "\n")
                noutput += 1
            else:
                if options.loglevel >= 1:
                    options.stdlog.write("# skipping because output for tree %s already exists: %s\n" %
                                         (tree_id, output_filename))
                nskipped += 1
                continue

        elif options.method == "filter":
            options.stdout.write(">%s\n%s\n" %
                                 (tree.name, TreeTools.Tree2Newick(tree)))
            noutput += 1

        if outfile_map:
            for t in TreeTools.GetTaxa(tree):
                outfile_map.write("%s\t%s\n" % (t, tree_id))

    if outfile_map:
        outfile_map.close()

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i, with_discarded=%i, discarded_taxa=%i, discarded_branches=%i.\n" %
                             (ninput, noutput, nskipped, ndiscarded, ndiscarded_taxa, ndiscarded_branches))

    E.Stop()
continue if options.prefix: prefix_tree = ">%s\n" % options.prefix prefix_header = "prefix\t" prefix_row = "%s\t" % options.prefix else: prefix_tree = "" prefix_header = "" prefix_row = "" for method in options.methods: if method == "write-ks-tree": for result in results: options.stdout.write(prefix_tree + TreeTools.Tree2Newick(result.mTreeKs) + "\n" ) elif method == "write-ka-tree": for result in results: options.stdout.write(prefix_tree + TreeTools.Tree2Newick(result.mTreeKa) + "\n" ) elif method == "write-kaks-tree": for result in results: options.stdout.write(prefix_tree + TreeTools.Tree2Newick(result.mTreeKaks) + "\n" ) elif method == "lrt": ## perform log-likelihood ratio test between successive models ## Assumption is that the models are nested with the previous model ## being the less complex model. first_result = results[0] last_result = results[0]
def prepareRun(self):
    """Create a temporary directory and write the input files into it.

    Depending on which inputs are set, the following files are written
    into ``self.mTempdir``:

    * ``infile`` from ``self.mInputMatrix``, ``self.mInputData`` or
      ``self.mInputMali``,
    * ``intree`` from ``self.mInputTree`` or ``self.mInputTrees``.

    Identifiers are replaced through ``self.mMapInput2Phylip``, which is
    kept up to date by calling ``self.updateMaps``. Trees with duplicate
    taxa are not written. ``self.mNInputTrees`` is set to the number of
    trees written.

    Raises
    ------
    ValueError
        If both an input matrix and input data are given, or if both a
        multiple alignment and an input matrix are given (both would be
        written to ``infile``).
    UsageError
        If both a single tree and multiple trees are given.
    """

    self.__reset()

    self.mTempdir = tempfile.mkdtemp()
    # self.mTempdir = "tmp"
    if not os.path.exists(self.mTempdir):
        os.mkdir(self.mTempdir)

    # Validate all input combinations before any file is written, so that
    # no half-written files or open handles are left behind.
    if self.mInputMatrix and self.mInputData:
        raise ValueError(
            "please specify either input matrix or input data, but not both.")

    if self.mInputTree and self.mInputTrees:
        raise UsageError(
            "please supply either one or mupltiple trees, but not both.")

    if self.mInputMali and self.mInputMatrix:
        raise ValueError("both mali and matrix supplied - infile conflict.")

    filename_infile = self.mTempdir + "/infile"
    filename_intree = self.mTempdir + "/intree"

    # prepare input matrix. Should already be in phylip like
    # format, but long identifiers are shortened and tabs are
    # replaced by spaces.
    if self.mInputMatrix:

        # the last character of each line (the line terminator) is dropped
        rows = [re.split(r"\s+", line[:-1])
                for line in self.mInputMatrix[1:]]
        identifiers = [row[0] for row in rows]
        self.updateMaps(identifiers)

        with open(filename_infile, "w") as outfile:
            outfile.write(self.mInputMatrix[0])
            for row in rows:
                new_line = self.mMapInput2Phylip[
                    row[0]] + " " + " ".join(row[1:])
                outfile.write(new_line + "\n")

        if self.mLogLevel >= 1:
            print("# written input matrix with %i taxa to %s" % (
                len(identifiers), filename_infile))
            os.system("cat %s" % self.mTempdir + "/infile")

    elif self.mInputData:

        identifiers = [row[0] for row in self.mInputData]
        self.updateMaps(identifiers)

        with open(filename_infile, "w") as outfile:
            outfile.write("%i %i\n" % (len(self.mInputData),
                                       len(self.mInputData[0]) - 1))
            for identifier, row in zip(identifiers, self.mInputData):
                outfile.write("%-10s %s\n" %
                              (self.mMapInput2Phylip[identifier],
                               " ".join(row[1:])))

        if self.mLogLevel >= 1:
            print("# written input matrix with %i taxa to %s" % (
                len(identifiers), filename_infile))
            os.system("cat %s" % self.mTempdir + "/infile")

    # prepare input tree or trees
    self.mNInputTrees = 0

    if self.mInputTree or self.mInputTrees:

        if self.mInputTree:
            trees = [self.mInputTree]
        else:
            trees = self.mInputTrees

        with open(filename_intree, "w") as outfile:
            for tree in trees:

                if self.mPruneTree:
                    taxa = self.mMapInput2Phylip.keys()
                    TreeTools.PruneTree(tree, taxa)

                taxa = TreeTools.GetTaxa(tree)
                self.updateMaps(taxa)
                TreeTools.MapTaxa(tree, self.mMapInput2Phylip)

                # check if taxa are unique
                taxa = tree.get_taxa()
                staxa = set()
                skip = False
                for t in taxa:
                    if t in staxa:
                        if self.mLogLevel >= 1:
                            print("# skipping tree %s because of duplicate taxa." % (
                                tree.name))
                        skip = True
                    staxa.add(t)

                if skip:
                    continue

                outfile.write(TreeTools.Tree2Newick(tree) + "\n")
                self.mNInputTrees += 1

                if self.mLogLevel >= 1:
                    print("# written input tree with %i taxa to %s" % (
                        len(TreeTools.GetTaxa(tree)), filename_intree))
                    print("# %s" % TreeTools.Tree2Newick(tree))

    # prepare input multiple alignment
    # NOTE(review): input data is written to the same infile and would be
    # overwritten here - confirm whether that combination should also be
    # rejected.
    if self.mInputMali:

        identifiers = self.mInputMali.getIdentifiers()
        self.updateMaps(identifiers)
        self.mInputMali.mapIdentifiers(self.mMapInput2Phylip)

        with open(filename_infile, "w") as outfile:
            self.mInputMali.writeToFile(outfile, format="phylip")

        if self.mLogLevel >= 1:
            # the alignment is written to infile, report that file
            print("# written input multiple alignments with %i taxa and with %i to %s" %
                  (self.mInputMali.getLength(), self.mInputMali.getWidth(),
                   filename_infile))