def main(argv=None):
    """script main.

    Reads a reference tree from the file given with ``-r``, reads sample
    trees in newick format from stdin and prints how many of them are
    compatible with the reference according to ``TreeTools.IsCompatible``.
    Incompatible trees are additionally counted per reported reason
    ("topology", "taxa" or "leaves").

    *argv* defaults to ``sys.argv``. Note that it is not passed on to
    ``E.Start`` in this function.

    Raises:
        ValueError: if no reference tree file was supplied.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/evaluate_trees.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("-r", "--reference=", dest="filename_reference_tree",
                      help="filename with reference tree.", type="string")

    parser.set_defaults(filename_reference_tree=None)

    (options, args) = E.Start(parser)

    # Without a reference there is nothing to compare against; stop here
    # instead of failing later inside open(None).
    if not options.filename_reference_tree:
        raise ValueError("please supply reference tree.")

    if options.loglevel >= 1:
        print("# reading reference tree.")

    with open(options.filename_reference_tree, "r") as infile:
        nexus = TreeTools.Newick2Nexus(infile)
    reference_tree = nexus.trees[0]

    if options.loglevel >= 1:
        print("# reading sample trees.")

    nexus2 = TreeTools.Newick2Nexus(sys.stdin)

    ntotal, nok, nfailed = 0, 0, 0
    ntopology, ntaxa, nleaves = 0, 0, 0

    for t in nexus2.trees:
        ntotal += 1
        is_ok, reason = TreeTools.IsCompatible(reference_tree, t)
        if is_ok:
            nok += 1
            continue

        nfailed += 1
        # reasons other than the three below are counted as failed only
        if reason == "topology":
            ntopology += 1
        elif reason == "taxa":
            ntaxa += 1
        elif reason == "leaves":
            nleaves += 1

    print("# total=%i, compatible=%i, failed=%i, topology=%i, taxa=%i, leaves=%i" %
          (ntotal, nok, nfailed, ntopology, ntaxa, nleaves))

    E.Stop()
def main(argv=None):
    """script main.

    Reads newick formatted trees from stdin (lines starting with ``#``
    are ignored), passes the first tree to ``TreeGraph.Run`` and prints
    the result.

    *argv* defaults to ``sys.argv``. Note that it is not passed on to
    ``E.Start`` in this function.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: tree2plot.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.set_defaults()

    (options, args) = E.Start(parser, add_pipe_options=True)

    # A real list (not a lazy filter object) is handed to the parser;
    # startswith() also tolerates an empty string.
    lines = [line for line in sys.stdin.readlines()
             if not line.startswith("#")]

    nexus = TreeTools.Newick2Nexus(lines)
    input_tree = nexus.trees[0]

    treegraph = TreeGraph(support=None, loglevel=options.loglevel)

    print(treegraph.Run(input_tree))

    E.Stop()
def run(self, grammar, tree=None, dump=0, test=False, options={}):
    """Run the wrapped executable on *grammar* and return its parsed output.

    A fresh temporary directory is created, the grammar (and, if given,
    the tree) is written into it and ``self.mExecutable`` is started there
    with a random seed.

    Arguments:
        grammar: object providing ``getGrammar()``; the returned text is
            written to ``grammar.eg``.
        tree: optional string. If it starts with ``(`` and ends with ``)``
            or ``;`` it is written verbatim as a newick tree, otherwise it
            is used as the name of a file to read the first tree from.
            Non-string values leave the tree file empty.
        dump: if true, echo the program's stdout.
        test: if true, print the temporary directory and keep it.
        options: not used by this method.

    Returns:
        whatever ``self.parseOutput`` returns for the list of stdout lines.

    Raises:
        UsageError: if the executable exits with a non-zero status. The
            temporary directory is kept in that case so it can be inspected.
    """
    self.mTempdir = tempfile.mkdtemp()
    self.mFilenameGrammar = "grammar.eg"
    self.mFilenameTree = "tree.nh"
    self.mFilenameOutput = None
    self.mWarnings = []

    if test:
        print("# temporary directory is %s" % self.mTempdir)

    with open(self.mTempdir + "/" + self.mFilenameGrammar, "w") as outfile:
        outfile.write(grammar.getGrammar())

    if tree:
        with open(self.mTempdir + "/" + self.mFilenameTree, "w") as outfile:
            # check what kind of tree is given.
            if isinstance(tree, StringType):
                t = tree.strip()
                if t.startswith("(") and t.endswith((")", ";")):
                    # looks like a newick string: write it as is
                    outfile.write("%s\n" % t)
                else:
                    # otherwise treat it as a filename
                    with open(tree, "r") as infile:
                        nexus = TreeTools.Newick2Nexus(infile)
                    t = nexus.trees[0]
                    outfile.write("%s\n" % TreeTools.Tree2Newick(t))

    # use your own random seed. Time won't do, if simgram
    # is called in quick succession.
    # NOTE(review): randint is inclusive, so 4294967296 (2**32) itself can
    # be drawn - confirm the executable accepts that value as a seed.
    # NOTE(review): "-t tree.nh" is passed even when no tree was written.
    statement = "%s -rndseed %i -g %s -t %s" % (
        self.mExecutable,
        random.randint(0, 4294967296),
        self.mFilenameGrammar,
        self.mFilenameTree)

    s = subprocess.Popen(statement,
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         cwd=self.mTempdir,
                         close_fds=True)

    (out, err) = s.communicate()

    if s.returncode != 0:
        raise UsageError(
            "Error in running %s \n%s\n%s\nTemporary directory in %s" % (
                self.mExecutable, err, out, self.mTempdir))

    if dump:
        print("# stdout output of %s:\n%s\n######################################" % (
            self.mExecutable, out))

    if not test:
        shutil.rmtree(self.mTempdir)

    return self.parseOutput(out.split("\n"))
def trainMali(mali, options):
    """train a grammar on a multiple alignment.

    The alignment is optionally cleaned of gap/masked columns, sequence
    identifiers containing ``options.separator`` are shortened to the part
    before the separator, and, if ``options.input_filename_tree`` is set,
    the first tree in that file is relabelled to match the alignment.

    Arguments:
        mali: multiple alignment object; it is modified in place.
        options: option object providing ``clean_mali``, ``separator`` and
            ``input_filename_tree``.

    Raises:
        KeyError: if the tree contains names that cannot be mapped onto
            the alignment identifiers.
    """
    # remove empty columns and masked columns
    if options.clean_mali:
        mali.mGapChars = mali.mGapChars + ("n", "N")
        mali.removeGaps(minimum_gaps=1, frame=1)

    length = mali.getNumColumns()

    input_model = prepareGrammar(options)

    # keep only the part before the separator as identifier
    for identifier in mali.getIdentifiers():
        if options.separator in identifier:
            species = identifier.split(options.separator)[0]
            mali.rename(identifier, species)

    map_new2old = mali.mapIdentifiers()
    map_old2new = IOTools.getInvertedDictionary(map_new2old, make_unique=True)

    ids = mali.getIdentifiers()

    if options.input_filename_tree:
        with open(options.input_filename_tree, "r") as infile:
            nexus = TreeTools.Newick2Nexus(infile)
        tree = nexus.trees[0]
        try:
            tree.relabel(map_old2new, warn=True)
        except KeyError as msg:
            raise KeyError(
                "names in mali and tree are not congruent: %s" % msg)
def processChunk(lines, map_strain2species, options):
    """Merge strains into species for every tree in a chunk of newick lines.

    Updates the module level counters ninput, noutput, nskipped and
    nmerged. Trees left with at most one terminal node are counted as
    skipped and not written; all others go to ``options.stdout``.
    """
    global ninput, noutput, nskipped, nmerged

    for current in TreeTools.Newick2Nexus(lines).trees:
        ninput += 1
        verbose = options.loglevel >= 3

        if verbose:
            current.display()

        mergers = getSpeciesTreeMergers(current, map_strain2species, options)
        num_mergers = len(mergers)

        if verbose:
            options.stdlog.write(
                "# found %i nodes in the tree that will be merged.\n" %
                (num_mergers))

        if num_mergers > 0:
            nmerged += 1
            applySpeciesTreeMergers(
                current, mergers, map_strain2species, options)

        # nothing worth writing if the tree collapsed to a single leaf
        if len(current.get_terminals()) <= 1:
            nskipped += 1
            continue

        current.writeToFile(options.stdout, format=options.output_format)
        noutput += 1
def main(argv=None):
    """script main.

    Reads newick trees from stdin and writes one line per taxon to
    ``options.stdout``. With a single input tree only the taxon is
    written; with several trees the 1-based tree number is added and,
    unless ``--skip-trees`` is given, the tree name as well.

    *argv* defaults to ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: tree2taxa.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--skip-trees", dest="skip_trees", action="store_true",
        help="do not output tree names in third field [default=%default].")

    parser.set_defaults(skip_trees=False)

    (options, args) = E.Start(parser, add_pipe_options=True)

    nexus = TreeTools.Newick2Nexus(sys.stdin)
    ntotal = len(nexus.trees)

    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees from stdin.\n" % ntotal)

    single_tree = ntotal == 1

    # header line depends on the output layout
    if single_tree:
        header = "taxon\n"
    elif options.skip_trees:
        header = "taxon\ttree\n"
    else:
        header = "taxon\ttree\tname\n"
    options.stdout.write(header)

    for ntree, tree in enumerate(nexus.trees, 1):
        for taxon in TreeTools.GetTaxa(tree):
            if single_tree:
                row = "%s\n" % (taxon)
            elif options.skip_trees:
                row = "%s\t%i\n" % (taxon, ntree)
            else:
                row = "%s\t%i\t%s\n" % (taxon, ntree, tree.name)
            options.stdout.write(row)

    if options.loglevel >= 1:
        options.stdlog.write("# ntotal=%i\n" % (ntotal))

    E.Stop()
def main(argv=None):
    """script main.

    Parses the reference tree given with ``-t`` and prints the patterns
    returned by ``TreeTools.calculatePatternsFromTree`` for it, one per
    line. The OTU order is taken from ``-s`` (comma separated) or, if
    that is not given, from the order of terminal nodes in the reference
    tree.

    *argv* defaults to ``sys.argv``. Note that it is not passed on to
    ``E.Start`` in this function.

    Raises:
        ValueError: if no reference tree was given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: tree2patterns.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t", "--reference-tree", dest="reference_tree",
                      type="string", help="reference tree to read.")

    parser.add_option("-s", "--sort-order", dest="sort_order",
                      type="string", help="output order of OTU.")

    parser.set_defaults(
        reference_tree=None,
        sort_order=[],
    )

    (options, args) = E.Start(parser)

    if not options.reference_tree:
        raise ValueError("no reference tree defined.")

    # The tree has to be parsed before the default sort order can be
    # derived from it.
    nexus = TreeTools.Newick2Nexus(options.reference_tree)
    reference_tree = nexus.trees[0]

    if options.sort_order:
        options.sort_order = options.sort_order.split(",")
    else:
        options.sort_order = [
            reference_tree.node(nx).get_data().taxon
            for nx in reference_tree.get_terminals()]

    if options.loglevel >= 3:
        print("# reference tree:")
        # display() is called for its output, as elsewhere in this code
        reference_tree.display()

    patterns = TreeTools.calculatePatternsFromTree(
        reference_tree, options.sort_order)

    for p in patterns:
        print(p)

    E.Stop()
def WriteTree(self, tree):
    """Write *tree* into the tree file of the temporary directory.

    The first tree parsed from *tree* has its taxa mapped through
    ``self.mMapOld2New``. The file starts with a line holding the number
    of sequences followed by ``1``, then the tree in newick format.
    """
    mapped_tree = TreeTools.Newick2Nexus(tree).trees[0]
    TreeTools.MapTaxa(mapped_tree, self.mMapOld2New)

    target = self.mTempdir + "/" + self.mFilenameTree
    handle = open(target, "w")
    handle.write("%i 1\n" % self.mNumSequences)
    newick = TreeTools.Tree2Newick(mapped_tree)
    handle.write("%s\n" % newick)
    handle.close()
def ParseTree(reference_tree, rx_species):
    """Parse the reference tree and number its taxa.

    Returns a tuple of the first parsed tree and a dictionary mapping
    each terminal taxon to a numeric id assigned in order of appearance;
    the extra key "unknown" receives the next free id. *rx_species* is
    not used here. Verbosity is controlled by the module level
    ``param_loglevel``.
    """
    parsed = TreeTools.Newick2Nexus(reference_tree).trees[0]

    if param_loglevel >= 3:
        print("# reference tree:")
        parsed.display()

    map_taxon2id = {}
    for terminal in parsed.get_terminals():
        taxon = parsed.node(terminal).get_data().taxon
        next_id = len(map_taxon2id)
        map_taxon2id[taxon] = next_id
        if param_loglevel >= 2:
            print("# %s\t%i" % (taxon, map_taxon2id[taxon]))

    map_taxon2id["unknown"] = len(map_taxon2id)

    return parsed, map_taxon2id
def testGetMergers(self):
    """Check that every merger found is listed in the reference.

    A merger matches if its pair of (strain, gene) tuples appears in the
    reference in either order.

    TODO: add testing for transcripts
    """
    print("testGetMergers()")

    for lines, reference, map_strain2species, options in self.mTestData:
        first_tree = TreeTools.Newick2Nexus(lines).trees[0]
        found = tree_strain2species.getMergers(
            first_tree, map_strain2species, options)

        for merger in found:
            node_id, species, strain_x, gene_x, strain_y, gene_y = merger
            left = (strain_x, gene_x)
            right = (strain_y, gene_y)
            forward = (left, right)
            backward = (right, left)

            if forward in reference or backward in reference:
                continue

            self.fail("%s not in reference %s" %
                      (str(forward), str(reference)))
def parseOutput(self, lines, out, err):
    """Build a Result from the tree output of the wrapped program.

    Arguments:
        lines: iterable of strings holding the newick tree.
        out: stored unchanged as ``mLog`` of the result.
        err: stored unchanged as ``mErr`` of the result.

    Returns:
        a ``Result`` whose ``mTree`` is the first parsed tree with its
        taxa mapped through ``self.mMapNew2Old``.
    """
    # join everything into one string without any whitespace
    newick = re.sub(r"\s", "", "".join(lines))
    # strip bracketed sections: "[...]"
    newick = re.sub(r"\[[^\]]+\]", "", newick)

    nexus = TreeTools.Newick2Nexus(newick)

    result = Result()

    tree = nexus.trees[0]
    TreeTools.MapTaxa(tree, self.mMapNew2Old)

    result.mTree = tree
    result.mLog = out
    result.mErr = err

    return result
def processChunk(lines, map_strain2species, options):
    """Merge genes of strains of the same species in a chunk of trees.

    For every tree parsed from *lines* the mergers are computed and
    applied. Trees left with at most one terminal node are skipped. For
    the others each merged (strain, gene) pair is written to
    ``output_genes`` together with its new name and the tree is written
    to ``options.stdout``.

    Updates the module level counters ninput, noutput, nskipped, nmerged
    and nwarnings and uses the names ``counters``, ``merged`` and
    ``output_genes`` defined outside of this function.
    """
    nexus = TreeTools.Newick2Nexus(lines)

    # nwarnings is rebound below, so it has to be declared as well
    global ninput, noutput, nskipped, nmerged, nwarnings

    for tree in nexus.trees:
        ninput += 1

        if options.loglevel >= 3:
            tree.display()

        mergers = getMergers(tree, map_strain2species, options)

        if options.loglevel >= 3:
            options.stdlog.write(
                "# found %i pairs of genes that will be merged.\n" %
                (len(mergers)))

        if len(mergers) > 0:
            nmerged += 1

        n = applyMergers(tree, mergers, counters, map_strain2species, options)

        if len(tree.get_terminals()) <= 1:
            nskipped += 1
            continue

        for new_name, values in n.items():
            for strain, gene in values:
                if (strain, gene) in merged:
                    options.stdlog.write(
                        "# warning: strain %s and gene %s already appeared in tree %s\n" %
                        (strain, gene, merged[(strain, gene)]))
                    nwarnings += 1
                # Remember the running number of the tree the pair was seen
                # in, so that the warning above can report it.
                # NOTE(review): previously None was stored - confirm nothing
                # else relies on the values of `merged`.
                merged[(strain, gene)] = ninput
                output_genes.write("%s\t%s\n" % (
                    options.separator.join((strain, gene)), new_name))

        tree.writeToFile(options.stdout, format=options.output_format)
        noutput += 1
def GetPrunedReferenceTree(mask, present_orgs, reference_tree):
    """Return a copy of the reference tree restricted to *present_orgs*.

    The tree is parsed afresh from *reference_tree* on every call, as the
    returned tree is modified later on. Terminal nodes whose taxon is not
    in *present_orgs* are removed with ``Prune``. *mask* is not used here.
    """
    pruned = TreeTools.Newick2Nexus(reference_tree).trees[0]

    # keep only those taxa which are present in the cluster
    for terminal in pruned.get_terminals():
        taxon = pruned.node(terminal).get_data().taxon
        if taxon in present_orgs:
            continue
        Prune(pruned, taxon)

    if param_loglevel >= 3:
        organisms = ",".join(present_orgs.keys())
        print("# pruned reference tree for %s:" % (organisms))
        pruned.display()

    return pruned
def processMali(mali, options):
    """Estimate substitution rates per block of a multiple alignment.

    The alignment is annotated with the states "N" and "C" (if
    ``options.block_size`` is given), identifiers are shortened to the
    part before ``options.separator``, a grammar is trained through
    ``prepareGrammar`` and one tab-separated line per block is written
    to ``options.stdout``. Values that cannot be computed because of a
    division by zero are reported as "na".

    NOTE(review): ``options.shared_frequencies`` is not consulted here.

    Arguments:
        mali: multiple alignment object; it is modified in place.
        options: option object.

    Raises:
        ValueError: if the alignment has no columns.
    """
    ncols = mali.getNumColumns()

    if ncols == 0:
        raise ValueError("refusing to process empty alignment.")

    # add annotation of states
    if options.block_size is not None:
        if options.block_size < 1:
            # fractional block size: portion of the codons
            size = int(float(ncols) / 3.0 * options.block_size) * 3
        else:
            # absolute block size in codons
            size = int(options.block_size) * 3

        size = min(size, ncols)
        mali.addAnnotation("STATE", "N" * size + "C" * (ncols - size))

    # remove gene ids
    for identifier in mali.getIdentifiers():
        if options.separator in identifier:
            species = identifier.split(options.separator)[0]
            mali.rename(identifier, species)

    map_new2old = mali.mapIdentifiers()
    map_old2new = IOTools.getInvertedDictionary(map_new2old, make_unique=True)

    ids = mali.getIdentifiers()

    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    # remove empty columns and masked columns
    if options.clean_mali:
        mali.mGapChars = mali.mGapChars + ("n", "N")
        mali.removeGaps(minimum_gaps=1, frame=3)

    if options.input_filename_tree:
        with open(options.input_filename_tree, "r") as infile:
            nexus = TreeTools.Newick2Nexus(infile)
        tree = nexus.trees[0]
        tree.relabel(map_old2new)
    else:
        tree = None

    annotation = mali.getAnnotation("STATE")
    chars = set(annotation)
    for c in chars:
        assert c in ("N", "C"), \
            "unknown annotation %s: only 'N' and 'C' are recognized" % c

    if len(chars) == 1:
        if options.loglevel >= 1:
            options.stdlog.write("# WARNING: only a single block\n")
        # a set cannot be indexed: take its only element via a list
        blocks = (("B0_", list(chars)[0]), )
    else:
        blocks = (("B0_", "N"), ("B1_", "C"))

    result, mali, ids = prepareGrammar(xgram, mali, tree, map_old2new,
                                       blocks, options)

    trained_model = result.getModel()

    pis, matrices = RateEstimation.getRateMatrix(trained_model)

    annotation = mali.getAnnotation("STATE")

    # For each value of --shared-rates: which of the parameters
    # (Rs, Rn, Ri, Rv) carry the block prefix. Unlisted values prefix all.
    block_specific = {
        "all": (False, False, False, False),
        "kappa": (True, True, False, False),
        "kappa-ds": (False, True, False, False),
        "omega": (False, False, True, True),
        "omega-ds": (False, False, True, False),
        "ds": (False, True, True, True),
    }

    for block, code in blocks:

        terminals = ("%sCOD0" % block, "%sCOD1" % block, "%sCOD2" % block)

        pi = pis[terminals]

        flags = block_specific.get(options.shared_rates,
                                   (True, True, True, True))
        (rate_prefix_rs, rate_prefix_rn,
         rate_prefix_ri, rate_prefix_rv) = [
            block if flag else "" for flag in flags]

        rs = trained_model.mGrammar.getParameter('%sRs' % rate_prefix_rs)
        rn = trained_model.mGrammar.getParameter('%sRn' % rate_prefix_rn)
        ri = trained_model.mGrammar.getParameter('%sRi' % rate_prefix_ri)
        rv = trained_model.mGrammar.getParameter('%sRv' % rate_prefix_rv)

        nchars = annotation.count(code)

        msg = "iter=%i Rs=%6.4f Rn=%6.4f Ri=%6.4f Rv=%6.4f" % (
            result.getNumIterations(), rs, rn, ri, rv)

        try:
            Q, t = RateEstimation.getQMatrix(pi,
                                             Rsi=rs * ri,
                                             Rsv=rs * rv,
                                             Rni=rn * ri,
                                             Rnv=rn * rv)

            # matrix with the Rs/Rn distinction averaged out
            avg_omega = (rs + rn) / 2.0
            Q0, t0 = RateEstimation.getQMatrix(pi,
                                               Rsi=ri * avg_omega,
                                               Rsv=rv * avg_omega,
                                               Rni=ri * avg_omega,
                                               Rnv=rv * avg_omega)

            # matrix with the Ri/Rv distinction averaged out
            avg_kappa = (ri + rv) / 2.0
            Q1, t1 = RateEstimation.getQMatrix(pi,
                                               Rsi=rs * avg_kappa,
                                               Rsv=rs * avg_kappa,
                                               Rni=rn * avg_kappa,
                                               Rnv=rn * avg_kappa)

            rI, rV, rS, rN = RateEstimation.countSubstitutions(pi, Q)
            rI0, rV0, rS0, rN0 = RateEstimation.countSubstitutions(pi, Q0)
            rI1, rV1, rS1, rN1 = RateEstimation.countSubstitutions(pi, Q1)

            dS = rS / (3 * rS0) * t
            dN = rN / (3 * rN0) * t

            o_kappa = options.value_format % (rI / rI0 * rV0 / rV)
            o_omega = options.value_format % (dN / dS)

            o_dn = options.value_format % dN
            o_ds = options.value_format % dS
            o_rn = options.value_format % rN
            o_rs = options.value_format % rS
            o_rn0 = options.value_format % rN0
            o_rs0 = options.value_format % rS0
            o_t = options.value_format % t
            o_t0 = options.value_format % t0

        except ZeroDivisionError:
            # report the whole block as not available
            o_kappa = "na"
            o_omega = "na"
            o_dn = "na"
            o_ds = "na"
            o_rn = "na"
            o_rs = "na"
            o_rn0 = "na"
            o_rs0 = "na"
            o_t = "na"
            o_t0 = "na"
            Q = None
            msg = "insufficient data to estimate rate matrix."

        options.stdout.write("\t".join(map(str, (
            code, block,
            o_dn, o_ds, o_omega,
            "na", "na", "na", "na",
            o_kappa,
            result.getLogLikelihood(),
            "na",
            nchars))))

        if options.with_rho:
            options.stdout.write(
                "\t" + "\t".join(map(str, (o_rn, o_rs, o_t,
                                           o_rn0, o_rs0, o_t0))))

        options.stdout.write("\t%s\n" % msg)
colour_by_species=None, tree=None, branch_scale=0, height_scale=0, ) (options, args) = Experiment.Start(parser, add_pipe_options=True) if options.filename_tree: tree_lines = open(options.filename_tree, "r").readlines() elif options.tree: tree_lines = options.tree else: raise "please supply a species tree." nexus = TreeTools.Newick2Nexus(tree_lines) Tree.updateNexus(nexus) tree = nexus.trees[0] if options.loglevel >= 2: tree.display() plot = SVGTree(tree) plot.setBranchScale(options.branch_scale) plot.setHeightScale(options.height_scale) if options.colour_by_species: rx = re.compile(options.species_regex) extract_species = lambda x: rx.search(x).groups()[0] plot.setDecoratorExternalNodes(
def processMali(mali, options):
    """Run the rate estimation for one multiple alignment.

    Builds the list of sequence pairs to compare according to
    ``options.iteration``, optionally masks stop codons with "NNN",
    checks whether all pairs overlap in at least ``options.min_overlap``
    positions and then calls ``runCodeML`` or ``runXrate`` depending on
    ``options.method``.

    Arguments:
        mali: multiple alignment object; entries are modified in place
            if ``options.remove_stops`` is set.
        options: option object.

    Raises:
        ValueError: for an unknown iteration mode or for an odd number of
            sequences with ``--iteration=pairwise``.
    """
    map_new2old = mali.mapIdentifiers()
    ids = mali.getIdentifiers()

    invalid_chars = options.gap_chars + options.mask_chars

    has_non_overlaps = False

    # pairs are indices into ids
    if options.iteration == "all-vs-all":
        pairs = [(x, y) for x in range(len(ids)) for y in range(0, x)]
    elif options.iteration == "first-vs-all":
        pairs = [(0, y) for y in range(1, len(ids))]
    elif options.iteration == "pairwise":
        if len(ids) % 2 != 0:
            raise ValueError(
                "uneven number of sequences (%i) not compatible with --iteration=pairwise" %
                len(ids))
        pairs = [(x, x + 1) for x in range(0, len(ids), 2)]
    elif options.iteration == "tree":
        pairs = []
    else:
        raise ValueError("unknown iteration mode: %s" % (options.iteration))

    if options.remove_stops:
        for identifier, entry in mali.items():
            s = entry.mString.upper()
            fragments = []
            for x in range(0, len(s), 3):
                codon = s[x:x + 3]
                if Genomics.IsStopCodon(codon):
                    codon = "NNN"
                fragments.append(codon)
            entry.mString = "".join(fragments)

    for x, y in pairs:
        noverlap = 0
        for a, b in zip(mali[ids[x]], mali[ids[y]]):
            if a not in invalid_chars and b not in invalid_chars:
                noverlap += 1
                if noverlap >= options.min_overlap:
                    break
        else:
            # the inner loop ran to its end: this pair overlaps too
            # little; one such pair is enough, so stop looking.
            has_non_overlaps = True
            break

    if options.tree:
        tree = TreeTools.Newick2Nexus(options.tree).trees[0]
        map_old2new = IOTools.getInvertedDictionary(map_new2old,
                                                    make_unique=True)
        tree.relabel(map_old2new)
    else:
        tree = None

    if options.method == "paml":
        runCodeML(mali, tree, has_non_overlaps, pairs, map_new2old, options)
    elif options.method == "xrate":
        runXrate(mali, has_non_overlaps, pairs, map_new2old, options)
def _regressionThroughOriginPValue(x_values, y_values):
    """Return the p-value of a linear regression through the origin.

    It is not possible to use cor.test or lsfit directly, as a regression
    through the origin is required; a linear model is fitted in R instead.
    rpy is imported here so that it is only needed when this is called.
    """
    import rpy
    from rpy import r as R

    rpy.set_default_mode(rpy.NO_CONVERSION)
    linear_model = R.lm(R("y ~ x - 1"),
                        data=R.data_frame(x=x_values, y=y_values))
    rpy.set_default_mode(rpy.BASIC_CONVERSION)

    ss = R.summary(linear_model)

    # extract the p-value
    return ss['coefficients'][-1][-1]


def _significanceCode(p):
    """Return the star code ("***", "**", "*" or "") for p-value *p*."""
    if p < 0.001:
        return "***"
    elif p < 0.01:
        return "**"
    elif p < 0.05:
        return "*"
    return ""


def main(argv=None):
    """script main.

    Reads a tab-separated table from stdin (first line: headers, first
    column: taxon), reads trees from ``--filename-tree`` and runs the
    phylip program "contrast" on each tree. Depending on ``--method``
    the contrasts, their pairwise correlations or contrasts computed
    within this script are written to ``options.stdout``.

    *argv* defaults to ``sys.argv``. Note that it is not passed on to
    ``E.Start`` in this function.

    Raises:
        ValueError: if no trees were provided.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: data2phylocontrasts.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-c", "--columns", dest="columns", type="string",
                      help="columns to take for calculating histograms.")
    parser.add_option("-t", "--filename-tree", dest="filename_tree",
                      type="string", help="filename with tree(s).")
    parser.add_option("--skip-header", dest="add_header",
                      action="store_false",
                      help="do not add header to flat format.")
    parser.add_option("--write-header", dest="write_header",
                      action="store_true", help="write header and exit.")
    parser.add_option("--debug", dest="debug", action="store_true",
                      help="debug mode")
    parser.add_option("--display-tree", dest="display_tree",
                      action="store_true", help="display the tree")
    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("contrasts", "spearman", "pearson", "compute"),
                      help="methods to perform on contrasts.")

    parser.set_defaults(
        columns="all",
        filename_tree=None,
        add_header=True,
        write_header=False,
        debug=False,
        methods=[],
        value_format="%6.4f",
        pvalue_format="%e",
        display_tree=False,
    )

    (options, args) = E.Start(parser, quiet=True)

    if options.columns not in ("all", "all-but-first"):
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    phylip = WrapperPhylip.Phylip()
    if options.debug:
        phylip.setLogLevel(options.loglevel)

    phylip.setProgram("contrast")

    ##########################################################
    # retrieve data and give to phylip
    data = []
    headers = []
    first = True
    for line in sys.stdin:
        if line[0] == "#":
            continue
        d = line[:-1].strip().split("\t")
        if first:
            first = False
            headers = d[1:]
            continue
        data.append(d)

    phylip.setData(data)
    ncolumns = len(headers)
    nrows = len(data)

    ##########################################################
    # read trees
    nexus = None
    if options.filename_tree:
        with open(options.filename_tree, "r") as infile:
            nexus = TreeTools.Newick2Nexus(infile)

    if not nexus:
        raise ValueError("please provide trees with branchlenghts")

    ##########################################################
    # set up phylip: "C" prints out contrasts
    phylip_options = ["C", "Y"]
    phylip.setOptions(phylip_options)

    ##########################################################
    # main loop
    for tree in nexus.trees:

        if options.display_tree:
            tree.display()

        # compute this before giving the tree to the phylip module,
        # as it remaps taxon names.
        map_node2data = {}
        for x in range(nrows):
            taxon = data[x][0]
            map_node2data[tree.search_taxon(taxon)] = x

        phylip.setTree(tree)

        result = phylip.run()

        for method in options.methods:

            if method in ("pearson", "spearman"):

                options.stdout.write("header1\theader2\tr\tp\tcode\n")

                # contrasts column by column
                columns = [[row[c] for row in result.mContrasts]
                           for c in range(ncolumns)]

                for x in range(0, ncolumns - 1):
                    for y in range(x + 1, ncolumns):

                        # phylip value
                        phy_r = result.mCorrelations[x][y]

                        # for significance, use a linear regression
                        # model in R
                        p = _regressionThroughOriginPValue(columns[x],
                                                           columns[y])
                        code = _significanceCode(p)

                        options.stdout.write("\t".join((
                            headers[x], headers[y],
                            options.value_format % phy_r,
                            options.pvalue_format % p,
                            code)) + "\n")

            elif method == "contrasts":

                options.stdout.write("\t".join(headers) + "\n")
                for d in result.mContrasts:
                    # plain newline: a trailing blank would end up at the
                    # start of the next row
                    options.stdout.write(
                        "\t".join([options.value_format % v for v in d]) +
                        "\n")

            elif method == "compute":

                # make room for all internal nodes and one dummy node
                # for unrooted trees.
                max_index = TreeTools.GetMaxIndex(tree) + 2
                variances = [None] * max_index
                # one value per data column for every node; these lists
                # are only ever indexed by column
                values = [[None] * ncolumns for x in range(max_index)]
                contrasts = [[None] * ncolumns for x in range(max_index)]
                branchlengths = [None] * max_index

                def update_data(node_id, bl, c1, c2):
                    """combine the children c1 and c2 into node_id."""
                    b1, b2 = branchlengths[c1], branchlengths[c2]
                    rb1 = 1.0 / b1
                    rb2 = 1.0 / b2
                    # NOTE(review): this is the square root of the summed
                    # branch lengths although it is reported as variance.
                    variance = math.sqrt(b1 + b2)

                    # extend branch length of this node to create correct
                    # variance for parent
                    branchlengths[node_id] = bl + (b1 * b2) / (b1 + b2)
                    variances[node_id] = variance

                    for c in range(ncolumns):
                        v1, v2 = values[c1][c], values[c2][c]
                        # save ancestral value as weighted mean
                        values[node_id][c] = (
                            (rb1 * v1 + rb2 * v2)) / (rb1 + rb2)
                        # compute normalized contrast
                        contrasts[node_id][c] = (v1 - v2) / variance

                def update_contrasts(node_id):
                    """update contrasts for a node."""
                    node = tree.node(node_id)
                    if node.succ:
                        if len(node.succ) == 2:
                            c1, c2 = node.succ
                            update_data(node_id, node.data.branchlength,
                                        c1, c2)
                        else:
                            # trifurcation at the root of an unrooted
                            # tree: resolve it via the dummy node
                            assert (node_id == tree.root)
                            assert (len(node.succ) == 3)
                            update_data(node_id, node.data.branchlength,
                                        node.succ[0], node.succ[1])
                            update_data(max_index - 1,
                                        node.data.branchlength,
                                        node_id, node.succ[2])
                    else:
                        # leaf: take the values from the input table
                        for c in range(ncolumns):
                            values[node_id][c] = float(
                                data[map_node2data[node_id]][c + 1])

                        branchlengths[node_id] = node.data.branchlength

                tree.dfs(tree.root, post_function=update_contrasts)

                options.stdout.write("node_id\tvariance\t%s\n" %
                                     "\t".join(headers))
                for node_id in range(max_index):
                    if variances[node_id] is None:
                        continue
                    options.stdout.write("%s\t%s\t%s\n" % (
                        node_id,
                        options.value_format % variances[node_id],
                        "\t".join([options.value_format % v
                                   for v in contrasts[node_id]]),
                    ))

    E.Stop()
def main(argv=None):
    """script main.

    Reads newick trees from stdin and reduces them according to
    ``--method``:

    * ``non-redundant``: one tree per group of mutually compatible trees,
    * ``select-largest``: one tree per tree name (or per identifier
      extracted with ``--regex-id``), chosen by ``getBestTree``,
    * ``consensus``: consensus tree built with phylip's "consense",
    * all others: a single tree whose branch lengths are aggregated node
      by node over all input trees.

    *argv* defaults to ``sys.argv``. Note that it is not passed on to
    ``E.Start`` in this function.

    Raises:
        ValueError: if a node has no branch length counts and no
            ``--error-branchlength`` was given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: trees2tree.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("counts", "min", "max", "sum", "mean",
                               "median", "stddev", "non-redundant",
                               "consensus", "select-largest"),
                      help="aggregation function.")

    parser.add_option("-r", "--regex-id", dest="regex_id", type="string",
                      help="regex pattern to extract identifier from tree name for the selection functions.")

    parser.add_option("-w", "--write-values", dest="write_values",
                      type="string",
                      help="if processing multiple trees, write values to file.")

    parser.add_option("-e", "--error-branchlength",
                      dest="error_branchlength", type="float",
                      help="set branch length without counts to this value.")

    parser.set_defaults(
        method="mean",
        regex_id=None,
        filtered_branch_lengths=(-999.0, 999.0),
        write_values=None,
        error_branchlength=None,
        separator=":",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.loglevel >= 2:
        options.stdlog.write("# reading trees from stdin.\n")
        options.stdlog.flush()

    nexus = TreeTools.Newick2Nexus(sys.stdin)
    if options.loglevel >= 1:
        options.stdlog.write(
            "# read %i trees from stdin.\n" % len(nexus.trees))

    nskipped = 0
    ninput = len(nexus.trees)
    noutput = 0
    nerrors = 0

    if options.method == "non-redundant":
        # compute non-redudant trees
        template_trees = []
        template_counts = []
        ntree = 0
        for tree in nexus.trees:

            for x in range(0, len(template_trees)):
                is_compatible, reason = TreeTools.IsCompatible(
                    tree, template_trees[x])
                if is_compatible:
                    template_counts[x] += 1
                    break
            else:
                # not compatible with any template: start a new one
                template_counts.append(1)
                template_trees.append(tree)

            if options.loglevel >= 2:
                options.stdlog.write(
                    "# tree=%i, ntemplates=%i\n" %
                    (ntree, len(template_trees)))

            ntree += 1

        for x in range(0, len(template_trees)):
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree: %i, counts: %i, percent=%5.2f\n" %
                    (x, template_counts[x],
                     template_counts[x] * 100.0 / ninput))
            options.stdout.write(
                TreeTools.Tree2Newick(template_trees[x]) + "\n")
            noutput += 1

    elif options.method in ("select-largest",):
        # select one of the trees with the same name.
        clusters = {}
        for x in range(0, len(nexus.trees)):
            n = nexus.trees[x].name

            if options.regex_id:
                n = re.search(options.regex_id, n).groups()[0]

            if n not in clusters:
                clusters[n] = []
            clusters[n].append(x)

        new_trees = []

        for name, cluster in clusters.items():
            new_trees.append(
                getBestTree([nexus.trees[x] for x in cluster],
                            options.method))

        for x in range(0, len(new_trees)):
            options.stdout.write(">%s\n" % new_trees[x].name)
            options.stdout.write(TreeTools.Tree2Newick(new_trees[x],) + "\n")
            noutput += 1

        nskipped = ninput - noutput

    elif options.method == "consensus":

        phylip = WrapperPhylip.Phylip()
        phylip.setLogLevel(options.loglevel - 2)
        phylip.setProgram("consense")
        phylip_options = []
        phylip_options.append("Y")

        phylip.setOptions(phylip_options)
        phylip.setTrees(nexus.trees)

        result = phylip.run()

        options.stdout.write(
            "# consensus tree built from %i trees\n" %
            (phylip.mNInputTrees))
        options.stdout.write(
            TreeTools.Tree2Newick(result.mNexus.trees[0]) + "\n")
        noutput = 1

    else:
        if options.method in ("min", "max", "sum", "mean", "counts"):

            # the first tree accumulates the result
            xtree = nexus.trees[0]
            for n in xtree.chain.keys():
                if xtree.node(n).data.branchlength in \
                        options.filtered_branch_lengths:
                    xtree.node(n).data.branchlength = 0
            ntotals = [1] * len(xtree.chain.keys())

            if options.method == "min":
                f = min
            elif options.method == "max":
                f = max
            elif options.method == "sum":
                f = lambda x, y: x + y
            elif options.method == "mean":
                # summed here, divided by the counts below
                f = lambda x, y: x + y
            elif options.method == "counts":
                f = lambda x, y: x + 1
                for n in xtree.chain.keys():
                    if xtree.node(n).data.branchlength not in \
                            options.filtered_branch_lengths:
                        xtree.node(n).data.branchlength = 1
                    else:
                        xtree.node(n).data.branchlength = 0
            else:
                raise ValueError("unknown option %s" % options.method)

            for tree in nexus.trees[1:]:

                for n in tree.chain.keys():
                    if tree.node(n).data.branchlength not in \
                            options.filtered_branch_lengths:
                        xtree.node(n).data.branchlength = f(
                            xtree.node(n).data.branchlength,
                            tree.node(n).data.branchlength)
                        ntotals[n] += 1

            if options.method == "mean":
                for n in xtree.chain.keys():
                    if ntotals[n] > 0:
                        xtree.node(n).data.branchlength = float(
                            xtree.node(n).data.branchlength) / ntotals[n]
                    else:
                        if options.error_branchlength is not None:
                            xtree.node(n).data.branchlength = \
                                options.error_branchlength
                            if options.loglevel >= 1:
                                options.stdlog.write(
                                    "# no counts for node %i - set to %f\n" %
                                    (n, options.error_branchlength))
                            nerrors += 1
                        else:
                            raise ValueError("no counts for node %i" % n)

        else:
            # collect all values for trees
            values = [[] for x in range(TreeTools.GetSize(nexus.trees[0]))]

            for tree in nexus.trees:
                for n, node in tree.chain.items():
                    if node.data.branchlength not in \
                            options.filtered_branch_lengths:
                        values[n].append(node.data.branchlength)

            tree = nexus.trees[0]
            for n, node in tree.chain.items():
                if len(values[n]) > 0:
                    if options.method == "stddev":
                        node.data.branchlength = scipy.std(values[n])
                    elif options.method == "median":
                        node.data.branchlength = scipy.median(values[n])
                else:
                    if options.error_branchlength is not None:
                        node.data.branchlength = options.error_branchlength
                        if options.loglevel >= 1:
                            options.stdlog.write(
                                "# no counts for node %i - set to %f\n" %
                                (n, options.error_branchlength))
                        nerrors += 1
                    else:
                        raise ValueError("no counts for node %i" % n)

            if options.write_values:
                with open(options.write_values, "w") as outfile:
                    for n, node in tree.chain.items():
                        values[n].sort()
                        node_label = options.separator.join(
                            sorted(TreeTools.GetLeaves(tree, n)))
                        outfile.write("%s\t%s\n" %
                                      (node_label,
                                       ";".join(map(str, values[n]))))

        # only the aggregated first tree is written
        del nexus.trees[1:]
        options.stdout.write(TreeTools.Nexus2Newick(nexus) + "\n")
        noutput = 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ntotal=%i, nskipped=%i, noutput=%i, nerrors=%i\n" % (
                ninput, nskipped, noutput, nerrors))

    E.Stop()
def main(argv=None):
    """script main.

    Reads newick trees from stdin and, for every requested method,
    writes statistics to ``options.stdout``. The only method is
    "branchlengths": one line per tree with the distributional
    parameters of its branch lengths.

    *argv* defaults to ``sys.argv``. Note that it is not passed on to
    ``E.Start`` in this function.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: tree2stats.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("branchlengths", ),
                      help="methods to apply.")

    parser.set_defaults(
        methods=[],
        filtered_branch_length=-999,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    nexus = TreeTools.Newick2Nexus(sys.stdin)
    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees from stdin.\n" %
                             len(nexus.trees))

    ninput = len(nexus.trees)
    nskipped = 0

    for method in options.methods:

        outfile = options.stdout

        if method == "branchlengths":

            outfile.write(
                "tree\t%s\n" %
                "\t".join(Stats.DistributionalParameters().getHeaders()))

            for tree in nexus.trees:
                branchlengths = []
                for node in tree.chain.values():
                    # ignore branch length of root if it is zero. The test
                    # is for None explicitly, as a plain truth test would
                    # also match children of a root with node id 0.
                    if node.prev is None and node.data.branchlength == 0:
                        continue

                    if node.data.branchlength == \
                            options.filtered_branch_length:
                        continue

                    branchlengths.append(node.data.branchlength)

                s = Stats.DistributionalParameters(branchlengths)
                outfile.write("%s\t%s\n" % (tree.name, str(s)))

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, nskipped=%i\n" %
                             (ninput, nskipped))

    E.Stop()
def _matrix2tree_parse_rows(matrix):
    """Split the data rows of a phylip matrix into identifiers and cells.

    The first entry of *matrix* (the dimension line) is skipped.  Every
    other line loses its last character (the line terminator) and is
    split on runs of whitespace: the first field is the identifier, the
    remaining fields are the stripped cell values.

    Returns a tuple ``(ids, rows)`` of two parallel lists.
    """
    ids = []
    rows = []
    for line in matrix[1:]:
        values = re.split(r"\s+", line[:-1])
        ids.append(values[0])
        rows.append([value.strip() for value in values[1:]])
    return ids, rows


def _matrix2tree_format_rows(ids, rows):
    """Rebuild phylip matrix lines (dimension line first) from parsed rows."""
    matrix = ["%i\n" % len(rows)]
    for identifier, row in zip(ids, rows):
        matrix.append(identifier + " " + " ".join(row) + "\n")
    return matrix


def _matrix2tree_fill_zeros(rows):
    """Replace off-diagonal "0" cells by a small random value, symmetrically."""
    d = len(rows)
    for row in range(d - 1):
        for col in range(row + 1, d):
            if rows[row][col] == "0":
                v = str(random.random() / 10000.0)
                rows[row][col] = v
                rows[col][row] = v


def main(argv=None):
    """Build trees from distance matrices read from stdin using phylip.

    Command line options are parsed through ``E.Start``.  Lines starting
    with ``#`` are dropped from the input.  Lines starting with ``>``
    separate several matrices; each matrix is handed to
    ``WrapperPhylip.Phylip`` (program ``neighbor``, ``fitch`` or
    ``kitsch`` depending on ``--method``) and the resulting trees are
    written to ``options.stdout`` in Newick format, each preceded by its
    separator line unless ``--skip-separators`` was given or the input
    had no separators.

    ``--add-random`` and ``--pseudo-replicates`` rewrite zero cells of
    the matrix with small random values before the matrix is passed on.
    ``--root`` midpoint-roots the result, ``--unroot`` re-runs each
    result tree through phylip ``retree``.

    *argv* defaults to ``sys.argv``.  Note that it is not forwarded to
    ``E.Start`` by this function.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: matrix2tree.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option("-i", "--invert-map", dest="invert_map",
                      action="store_true",
                      help="""invert map.""")

    parser.add_option("--input-format", dest="input_format", type="choice",
                      choices=("phylip", "full"),
                      help="""input format.""")

    parser.add_option("-t", "--filename-tree", dest="filename_tree",
                      type="string",
                      help="""filename with tree to fit.""")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("nj", "kitsch", "fitch"),
                      help="""algorithm to run.""")

    parser.add_option("-e", "--replicates", dest="replicates",
                      action="store_true",
                      help="replicates.")

    parser.add_option("-r", "--root", dest="root", action="store_true",
                      help="midpoint root (if it is not rooted).")

    parser.add_option("-u", "--unroot", dest="unroot", action="store_true",
                      help="unroot tree (if it is rooted).")

    parser.add_option("--skip-separators", dest="write_separators",
                      action="store_false",
                      help="do not echo separators (starting with >)")

    parser.add_option("-p", "--power", dest="power", type="float",
                      help="power.")

    parser.add_option(
        "--prune-tree", dest="prune_tree", action="store_true",
        help="prune tree such to include only taxa which are part of the input matrix."
    )

    parser.add_option(
        "--add-random", dest="add_random", action="store_true",
        help="add small random value to off-diagonal zero elements in matrix.")

    parser.add_option(
        "--pseudo-replicates", dest="pseudo_replicates", action="store_true",
        help="add small random value to off-diagonal zero elements in matrix, even if they have no replicates."
    )

    parser.add_option("--debug", dest="debug", action="store_true",
                      help="dump debug information.")

    parser.set_defaults(
        value=0,
        method="nj",
        input_format="phylip",
        filename_tree=None,
        outgroup=None,
        replicates=False,
        root=False,
        unroot=False,
        power=0,
        write_separators=True,
        prune_tree=False,
        add_random=False,
        debug=False,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    phylip = WrapperPhylip.Phylip()

    if options.debug:
        phylip.setLogLevel(options.loglevel)

    phylip.setPruneTree(options.prune_tree)

    # drop comment lines, then locate the ">" separator lines
    lines = [line for line in sys.stdin.readlines() if line[0] != "#"]
    chunks = [index for index, line in enumerate(lines) if line[0] == ">"]

    if not chunks:
        # single matrix without separator: pretend one sits before line 0
        options.write_separators = False
        chunks = [-1]

    chunks.append(len(lines))

    for start, end in zip(chunks[:-1], chunks[1:]):

        matrix = lines[start + 1:end]

        # parse phylip matrix
        if options.add_random:
            ids, mm = _matrix2tree_parse_rows(matrix)
            d = len(mm)
            if options.replicates:
                # cells come in pairs: a value at 2 * col and a companion
                # column at 2 * col + 1
                # NOTE(review): presumably the replicate count - confirm.
                for row in range(d - 1):
                    for col in range(row + 1, d):
                        cc = col * 2
                        rr = row * 2
                        if mm[row][cc] == "0" and mm[row][cc + 1] != "0":
                            mm[row][cc + 1] = "1"
                            mm[col][rr + 1] = "1"
                            v = str(random.random() / 10000.0)
                            mm[row][cc] = v
                            mm[col][rr] = v
            else:
                _matrix2tree_fill_zeros(mm)

            matrix = _matrix2tree_format_rows(ids, mm)

        # parse phylip matrix
        if options.pseudo_replicates:
            ids, mm = _matrix2tree_parse_rows(matrix)
            d = len(mm)
            if options.replicates:
                for row in range(d - 1):
                    for col in range(row + 1, d):
                        cc = col * 2
                        rr = row * 2
                        if mm[row][cc + 1] == "0":
                            mm[row][cc + 1] = "1"
                            mm[col][rr + 1] = "1"
                            v = str(random.random() / 10000.0)
                            mm[row][cc] = v
                            mm[col][rr] = v
                        else:
                            mm[row][cc + 1] = "100"
                            mm[col][rr + 1] = "100"
            else:
                _matrix2tree_fill_zeros(mm)

            matrix = _matrix2tree_format_rows(ids, mm)

        phylip.setMatrix(matrix)

        phylip_options = []

        if options.filename_tree:
            # re-read for every chunk as before, but release the handle
            with open(options.filename_tree, "r") as infile:
                ref_tree = TreeTools.Newick2Nexus(infile).trees[0]
            phylip.setTree(ref_tree)
            phylip_options.append("U")
        else:
            ref_tree = None

        if options.method == "nj":
            phylip.setProgram("neighbor")
        elif options.method == "fitch":
            phylip.setProgram("fitch")
        elif options.method == "kitsch":
            phylip.setProgram("kitsch")

        if options.replicates:
            phylip_options.append("S")

        if options.power > 0:
            phylip_options.append("P")
            phylip_options.append("%f" % options.power)

        phylip_options.append("Y")

        phylip.setOptions(phylip_options)

        result = phylip.run()

        # root with outgroup
        if options.root:
            if options.outgroup:
                pass
            # midpoint root
            else:
                for tree in result.mNexus.trees:
                    tree.root_midpoint()

        # explicitly unroot
        elif options.unroot:
            phylip.setOptions(("Y", "W", "U", "Q"))
            phylip.setProgram("retree")
            # a dedicated index keeps the chunk position (start) intact
            # for the separator echo below
            for tree_index in range(len(result.mNexus.trees)):
                phylip.setTree(result.mNexus.trees[tree_index])
                xresult = phylip.run()
                result.mNexus.trees[tree_index] = xresult.mNexus.trees[0]

        if options.write_separators:
            options.stdout.write(lines[start])

        if result.mNexus:
            options.stdout.write(TreeTools.Nexus2Newick(result.mNexus) + "\n")

        if options.loglevel >= 1:
            if ref_tree:
                nref = len(ref_tree.get_terminals())
            else:
                nref = 0
            for tree in result.mNexus.trees:
                options.stdlog.write(
                    "# ninput=%i, nreference=%i, noutput=%i\n" %
                    (len(matrix) - 1, nref, len(tree.get_terminals())))

    E.Stop()
def main(argv=None):
    """Write distances between pairs of tree nodes for trees read from stdin.

    Command line options are parsed through ``E.Start``.  For every tree
    a list of node pairs is built according to ``--pairs`` and the
    distance ``tree.distance(n1, n2)`` of each pair is reported,
    formatted with ``--format``:

    * default: a ``node1 / node2 / distance`` table per tree,
    * ``--graph``: the links returned by ``TreeTools.Tree2Graph``,
    * ``--table``: one row per node pair with one column per tree,
      written to ``sys.stdout`` after all trees have been processed.

    With several input trees and ``--output-pattern`` each tree gets its
    own output file (the pattern is filled with the running tree number).
    If ``--species`` is given, taxa of other species are pruned first.

    *argv* defaults to ``sys.argv``.  Note that it is not forwarded to
    ``E.Start`` by this function.

    Raises NotImplementedError for ``--pairs=lineage``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: tree2matrix.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option("-f", "--format", dest="format", type="string",
                      help="number format to use.")

    parser.add_option("-g", "--graph", dest="to_graph", action="store_true",
                      help="convert tree(s) to graph(s).")

    parser.add_option("-a", "--table", dest="to_table", action="store_true",
                      help="convert tree(s) to table.")

    parser.add_option("-t", "--translate", dest="do_translate",
                      action="store_true",
                      help="translate internal nodes to clades.")

    parser.add_option(
        "--output-pattern", dest="output_filename_pattern", type="string",
        help="pattern for output file if there are multiple trees in the file."
        "")

    parser.add_option("--pairs", dest="pairs", type="choice",
                      choices=("all", "leaves", "branches", "terminals",
                               "lineage", "between-species"),
                      help="choose pairs of nodes to output."
                      "")

    parser.add_option(
        "--species", dest="species", type="string",
        help="comma separated list of species that are considered. All others are ignored."
    )

    parser.set_defaults(
        format="%6.4f",
        to_graph=False,
        to_table=False,
        # "do_translate" is the dest of --translate; "do_translation" is
        # kept because code outside this function might read it.
        do_translate=False,
        do_translation=False,
        separator=":",
        do_all_on_all=False,
        do_branches=False,
        do_terminals=False,
        output_filename_pattern=None,
        pairs="branches",
        species=None,
        regex_species=("^([^|]+)"),
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.species:
        options.species = options.species.split(",")

    nexus = TreeTools.Newick2Nexus(sys.stdin)

    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees from stdin.\n" %
                             len(nexus.trees))

    ntree = 0
    outfile = None

    # The table is a hash of lists
    table = {}

    extract_species = lambda x: re.search(options.regex_species, x).groups()[0]

    def lower_triangle(nodes):
        """All pairs (nodes[x], nodes[y]) with y < x, ordered by x then y."""
        return [(nodes[x], nodes[y])
                for x in range(len(nodes))
                for y in range(x)]

    def sorted_names(n1, n2, tree, set_terminals):
        """Translate both nodes and return their names in ascending order."""
        node1 = TranslateNode(n1, tree, set_terminals, options)
        node2 = TranslateNode(n2, tree, set_terminals, options)
        if node1 > node2:
            node1, node2 = node2, node1
        return node1, node2

    for tree in nexus.trees:

        if len(nexus.trees) == 1:
            outfile = options.stdout
        elif options.output_filename_pattern:
            ntree += 1
            if outfile is not None:
                outfile.close()
            outfile = open(options.output_filename_pattern % ntree, "w")
        else:
            outfile = options.stdout

        # prune tree, if an explicit species list is given
        if options.species:
            species = set(options.species)
            # iterate over a snapshot of the terminals taken before pruning
            for x in tree.get_terminals():
                taxon = tree.node(x).data.taxon
                if extract_species(taxon) not in species:
                    tree.prune(taxon)

        # define node list
        terminals = tree.get_terminals()
        set_terminals = set(terminals)

        node_list = []
        if options.pairs == "all":
            node_list = lower_triangle(TreeTools.GetAllNodes(tree))
        elif options.pairs == "terminals":
            node_list = [(x, tree.node(x).prev) for x in terminals]
        elif options.pairs == "leaves":
            node_list = lower_triangle(terminals)
        elif options.pairs == "branches":
            # Pair each node id with its parent.  The ids themselves are
            # used (not their position in the list), so trees with gaps
            # in their id numbering are handled as well.
            for x in TreeTools.GetAllNodes(tree):
                if tree.node(x).prev:
                    node_list.append((x, tree.node(x).prev))
        elif options.pairs == "between-species":
            for n1, n2 in lower_triangle(terminals):
                s1 = extract_species(tree.node(n1).data.taxon)
                s2 = extract_species(tree.node(n2).data.taxon)
                if s1 != s2:
                    node_list.append((n1, n2))
        elif options.pairs == "lineage":
            raise NotImplementedError("not implemented.")

        if options.to_graph:
            outfile.write("node1\tnode2\tdistance\n")
            links = TreeTools.Tree2Graph(tree)
            for n1, n2, weight in links:
                node1, node2 = sorted_names(n1, n2, tree, set_terminals)
                outfile.write("%s\t%s\t%s\n" %
                              (node1, node2, options.format % weight))

        elif options.to_table:
            # only collect here; the table is written after the loop
            for n1, n2 in node_list:
                node1, node2 = sorted_names(n1, n2, tree, set_terminals)
                if options.do_terminals:
                    key = "%s" % node2
                else:
                    key = "%s-%s" % (node1, node2)
                table.setdefault(key, []).append(
                    options.format % tree.distance(n1, n2))

        else:
            outfile.write("node1\tnode2\tdistance\n")
            for n1, n2 in node_list:
                node1, node2 = sorted_names(n1, n2, tree, set_terminals)
                outfile.write("%s\t%s\t%s\n" %
                              (node1, node2,
                               options.format % tree.distance(n1, n2)))

    if options.to_table:
        # the table goes to stdout: release a per-tree file left open
        if (outfile is not None and outfile is not sys.stdout
                and outfile is not options.stdout):
            outfile.close()
        outfile = sys.stdout
        outfile.write("branch\t%s\n" %
                      ("\t".join(map(str, range(0, len(nexus.trees))))))
        for key, values in table.items():
            outfile.write("%s\t%s\n" % (key, "\t".join(values)))

    # outfile is still None when no tree was read
    if outfile is not None and outfile is not sys.stdout:
        outfile.close()

    E.Stop()
def main():
    """Plot a tree as SVG on stdout.

    Command line options are parsed through ``E.Start``.  The tree is
    taken from ``--filename-tree``, from the ``--tree`` option or from
    stdin (in that order of precedence) and parsed with
    ``TreeTools.Newick2Nexus``.  The first tree is the master tree that
    is drawn with ``SVGTree.SVGTree``; each entry of ``--annotations``
    consumes one further tree, in order, and decorates the plot with it.

    Raises ValueError if more than two annotations produce a
    branch-length decorator, as no layout exists for that case.
    """
    parser = E.OptionParser(
        version="%prog version: $Id: plot_tree.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-i", "--title", dest="title", type="string",
                      help="page title.")

    parser.add_option("-f", "--footer", dest="footer", type="string",
                      help="page footer.")

    parser.add_option("-s", "--filename-tree", dest="filename_tree",
                      type="string",
                      help="filename with tree.")

    parser.add_option("-t", "--tree", dest="tree", type="string",
                      help="tree.")

    parser.add_option(
        "-r", "--species-regex", dest="species_regex", type="string",
        help="regular expression to extract species from identifier.")

    parser.add_option("--colour-by-species", dest="colour_by_species",
                      action="store_true",
                      help="colour by species.")

    parser.add_option("--support-style", dest="support_style", type="choice",
                      choices=("pie", "number"),
                      help="style for support information.")

    parser.add_option("--error-style", dest="error_style", type="choice",
                      choices=("pie", "number"),
                      help="style for error information.")

    parser.add_option("--branch-scale", dest="branch_scale", type="float",
                      help="branch length scale factor.")

    parser.add_option("--height-scale", dest="height_scale", type="float",
                      help="height scale factor.")

    parser.add_option("-a", "--annotations", dest="annotations",
                      type="choice", action="append",
                      choices=("support", "error", "kaks", "master",
                               "value", "tables"),
                      help="annotations given by further trees.")

    parser.add_option(
        "--filename-tables", dest="filename_tables", type="string",
        help="add tables from file (need also set options -a tables) [%default]"
    )

    parser.add_option("--show-branchlengths", dest="show_branchlengths",
                      action="store_true",
                      help="show branch lengths.")

    parser.add_option("--leaf-symbol", dest="plot_leaf_symbol", type="choice",
                      choices=("square", "circle"),
                      help="Symbol for leaves.")

    parser.add_option("--font-size-branches", dest="font_size_branches",
                      type="int",
                      help="set font size for branches.")

    parser.add_option("--font-size-tips", dest="font_size_tips", type="int",
                      help="set font size for tips.")

    parser.add_option("--font-style-tips", dest="font_style_tips",
                      type="choice",
                      choices=("normal", "italic", ),
                      help="set font style for tips.")

    parser.add_option("--filename-map", dest="filename_map", type="string",
                      help="filename with a name translation table.")

    parser.add_option("--filename-map-species2colour",
                      dest="filename_colour_map", type="string",
                      help="filename with a map of species to colour.")

    parser.add_option("--no-leaf-labels", dest="plot_leaf_labels",
                      action="store_false",
                      help="do not show labels at leafs.")

    parser.add_option("--no-ruler", dest="plot_ruler", action="store_false",
                      help="do not plot ruler.")

    parser.set_defaults(
        titles="",
        title="",
        footer="",
        filename_tree=None,
        species_regex=r"^([^|]+)\|",
        colour_by_species=None,
        tree=None,
        branch_scale=0,
        height_scale=0,
        support_style=None,
        error_style="number",
        kaks_style="number",
        annotations=None,
        show_branchlengths=False,
        branch_length_format="%5.2f",
        font_size_tips=None,
        font_size_branches=None,
        font_style_tips=None,
        filename_map=None,
        filename_colour_map=None,
        plot_leaf_labels=True,
        plot_leaf_symbol=None,
        plot_ruler=True,
        filename_tables=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.filename_tree:
        with open(options.filename_tree, "r") as infile:
            tree_lines = infile.readlines()
    elif options.tree:
        tree_lines = options.tree
    else:
        tree_lines = sys.stdin.readlines()

    nexus = TreeTools.Newick2Nexus(tree_lines)
    master_tree = nexus.trees[0]

    if options.filename_map:
        with open(options.filename_map, "r") as infile:
            map_names = IOTools.ReadMap(infile)

        # rename taxa that have an entry in the translation table
        for node in master_tree.chain.values():
            if node.data.taxon in map_names:
                node.data.taxon = map_names[node.data.taxon]

    if options.loglevel >= 2:
        master_tree.display()

    plot = SVGTree.SVGTree(master_tree)

    if options.branch_scale:
        plot.setBranchScale(options.branch_scale)

    # the default of 0 is "not None" as well, so this is applied unless
    # the option object was changed elsewhere
    if options.height_scale is not None:
        plot.setHeightScale(options.height_scale)

    if options.font_size_tips is not None:
        plot.setFontSize(options.font_size_tips)

    if not options.plot_ruler:
        plot.setRulerElements([])

    if options.show_branchlengths:
        decorator = SVGTree.BranchDecoratorHorizontalBranchLength(master_tree)
        if options.font_size_branches:
            decorator.setFontSize(options.font_size_branches)
        plot.setDecoratorHorizontalBranches(decorator)

    if options.colour_by_species:
        if options.filename_colour_map:
            with open(options.filename_colour_map, "r") as infile:
                map_species2colour = IOTools.ReadMap(infile)
        else:
            map_species2colour = None

        rx = re.compile(options.species_regex)
        extract_species = lambda x: rx.search(x).groups()[0]
        plot.setDecoratorExternalNodes(
            SVGTree.NodeDecoratorBySpecies(
                master_tree,
                plot_symbol=options.plot_leaf_symbol,
                plot_label=options.plot_leaf_labels,
                map_species2colour=map_species2colour,
                extract_species=extract_species))

    if options.font_style_tips:
        plot.getDecoratorExternalNodes().setFontStyle(options.font_style_tips)
    plot.getDecoratorExternalNodes().setPlotLabel(options.plot_leaf_labels)

    # index into nexus.trees of the tree used by the next annotation
    current_tree = 1

    # add annotations by further trees given on the command line
    branch_length_annotations = []

    current_reference_tree = master_tree

    if options.annotations:
        for annotation in options.annotations:

            tree = nexus.trees[current_tree]

            if annotation == "support":

                tree.branchlength2support()
                for node in tree.chain.values():
                    node.data.branchlength = 1.0

                if options.support_style == "pie":
                    plot.setDecoratorInternalNodes(
                        NodeDecoratorSupportPieChart(
                            nexus.trees[current_tree]))

            elif annotation == "error":

                if options.error_style == "number":
                    decorator = SVGTree.BranchDecoratorHorizontalBranchLengthError(
                        current_reference_tree, tree)
                    if options.font_size_branches:
                        decorator.setFontSize(options.font_size_branches)
                    branch_length_annotations.append(decorator)

            elif annotation == "kaks":

                if options.kaks_style == "number":
                    decorator = SVGTree.BranchDecoratorHorizontalBranchLengthWithKaks(
                        current_reference_tree, tree)
                    if options.font_size_branches:
                        decorator.setFontSize(options.font_size_branches)
                    branch_length_annotations.append(decorator)

            elif annotation == "value":

                decorator = SVGTree.BranchDecoratorHorizontalBranchLength(tree)
                if options.font_size_branches:
                    decorator.setFontSize(options.font_size_branches)
                branch_length_annotations.append(decorator)

            elif annotation == "master":
                current_reference_tree = tree

            elif annotation == "tables":
                decorator = BranchDecoratorTable(
                    tree, filename=options.filename_tables)
                plot.setDecoratorHorizontalBranches(decorator)

            current_tree += 1

        if len(branch_length_annotations) > 2:
            raise ValueError(
                "obtained more than two branch length annotations. Layout not implemented")

        # Install a combined decorator only if the annotations produced
        # one; otherwise whatever was installed above stays in place.
        if len(branch_length_annotations) == 1:
            plot.setDecoratorHorizontalBranches(branch_length_annotations[0])
        elif len(branch_length_annotations) == 2:
            above, below = branch_length_annotations
            above.setFontColour(SVGTree.BLUE)
            below.setFontColour(SVGTree.RED)
            plot.setDecoratorHorizontalBranches(
                SVGTree.BranchDecoratorHorizontalAboveBelow(
                    master_tree, above, below))

    plot.initializePlot()

    plot.writeToFile(sys.stdout)

    E.Stop()
# Module-level script fragment: builds the label map ``keys`` from an
# optional map file and from the taxa of the trees read from stdin.
# It relies on param_apply, param_invert, param_create and param_loglevel,
# which are set outside the lines shown here.

# Print the header and parameter listing provided by the E module.
print E.GetHeader()
print E.GetParams()

# keys maps one label to another.
# NOTE(review): presumably original taxon name -> replacement name;
# confirm against the code that consumes ``keys``.
keys = {}
if param_apply:
    # Read an existing map from the file named by param_apply.
    # NOTE(review): infile is never closed in the lines shown here.
    infile = open(param_apply, "r")
    for line in infile:
        # skip comment lines
        if line[0] == "#":
            continue
        # Drop the last character (line terminator) and take the first
        # two tab-separated columns; a line with fewer than two columns
        # makes this unpacking fail.
        a, b = line[:-1].split("\t")[:2]
        # optionally use the second column as key and the first as value
        if param_invert:
            a, b = b, a
        keys[a] = b

nexus = TreeTools.Newick2Nexus(sys.stdin)

# running number used to build new "otu<N>" labels
notu = 0

for tree in nexus.trees:
    if param_loglevel >= 2:
        tree.display()

    for nx in tree.get_terminals():
        t1 = tree.node(nx).get_data().taxon

        if param_create:
            # give every taxon not yet in the map a fresh "otu<N>" label
            if t1 not in keys:
                keys[t1] = "otu%i" % notu
                notu += 1
def main(argv=None):
    """Filter trees read from stdin and write them out again.

    Command line options are parsed through ``E.Start``.  Every input
    tree runs through the filters that were switched on:

    * ``--filter-ntaxa`` / ``--filter-simple-orthologs`` flag the tree,
    * ``--filter-terminal-max-length`` / ``--filter-terminal-min-length``
      prune taxa by the length of their terminal branch,
    * ``--filter-max-length`` / ``--filter-min-length`` overwrite the
      length of non-root branches with ``options.filtered_branch_length``,
    * ``--filter-by-trees`` masks branches that carry
      ``options.filtered_branch_length`` in a tree of the same name in
      one of the given files; trees without such a partner are dropped,
    * ``--filter-by-monophyly`` drops trees in which the given taxa are
      not monophyletic according to ``TreeTools.IsMonophyleticForTaxa``.

    A tree on which a filter acted is dropped entirely if
    ``--filter=trees`` or ``--filter-ntaxa`` is set.  Remaining trees are
    written to one file each (``--method=split``, file name built from
    ``--output-pattern``; existing files are not overwritten) or to
    ``options.stdout`` (``--method=filter``).  If
    ``--output-filename-map`` is given, ``taxon<TAB>tree name`` lines are
    appended to that file.

    *argv* defaults to ``sys.argv``.  Note that it is not forwarded to
    ``E.Start`` by this function.

    Raises ValueError if ``--filter-by-monophyly`` names fewer than two
    taxa.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: trees2trees.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-c", "--output-filename-map",
                      dest="output_filename_map", type="string",
                      help="filename of map to output.")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("filter", "split"),
                      help="method to use: filter removed trees, while split writes them to individual files. DEFAULT=%default")

    parser.add_option("-d", "--output-pattern", dest="output_pattern",
                      type="string",
                      help="filename pattern for output multiple alignment files.")

    # Each of the four length filters stores into the attribute that the
    # filter code below reads.
    parser.add_option("--filter-terminal-max-length",
                      dest="filter_terminal_max_length", type="float",
                      help="remove terminal branches with a branch length larger than this.")

    parser.add_option("--filter-terminal-min-length",
                      dest="filter_terminal_min_length", type="float",
                      help="remove terminal branches with a branch length smaller than this.")

    parser.add_option("--filter-min-length", dest="filter_min_length",
                      type="float",
                      help="remove any branches with a branch length smaller than this.")

    parser.add_option("--filter-max-length", dest="filter_max_length",
                      type="float",
                      help="remove any branches with a branch length larger than this.")

    parser.add_option("--filter-by-trees", dest="filter_by_trees",
                      type="string", action="append",
                      help="mask branches according to trees. Give filenames with mask trees. These trees need to have the same names and structure as the input trees, but can be in any order.")

    parser.add_option("--filter-by-monophyly", dest="filter_by_monophyly",
                      type="string",
                      help="only retain trees where the given taxa are monphyletic. Supply taxa as a comma-separated list.")

    parser.add_option("--min-support", dest="min_support", type="float",
                      help="for monophyly filtering, only accept trees with minimum support.")

    parser.add_option("--filter-ntaxa", dest="filter_ntaxa", type="int",
                      help="filter by number of taxa.")

    parser.add_option("--filter-simple-orthologs",
                      dest="filter_simple_orthologs", action="store_true",
                      help="filter for trees for simple orhtologs. This works by counting the number of taxa.")

    parser.add_option("--filter", dest="filter", type="choice",
                      choices=("taxa", "trees"),
                      help="filter removes taxa or whole trees.")

    parser.set_defaults(
        output_pattern="%s.tree",
        output_filename_map=None,
        filter_terminal_max_length=None,
        filter_terminal_min_length=None,
        filter_max_length=None,
        filter_min_length=None,
        method="split",
        filter="taxa",
        filtered_branch_length=-999,
        filter_by_trees=[],
        filter_by_monophyly=None,
        filter_ntaxa=None,
        filter_simple_orthologs=None,
        min_support=0.0,
        regex_species=("^([^|]+)"),
    )

    (options, args) = E.Start(parser)

    nexus = TreeTools.Newick2Nexus(sys.stdin)

    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees from stdin.\n" %
                             len(nexus.trees))

    ninput, noutput, nskipped = 0, 0, 0
    ndiscarded = 0
    ndiscarded_taxa = 0
    ndiscarded_branches = 0

    extract_species = lambda x: re.search(options.regex_species, x).groups()[0]

    # one dictionary (tree name -> tree) per file given with
    # --filter-by-trees, in command line order
    filter_maps = []
    for filename in options.filter_by_trees:
        with open(filename, "r") as infile:
            trees = TreeTools.Newick2Nexus(infile).trees
        if options.loglevel >= 1:
            options.stdlog.write("# read %i trees for filtering from %s\n" %
                                 (len(trees), filename))
        # for duplicated names the last tree in the file wins
        filter_maps.append(dict((t.name, t) for t in trees))

    if options.filter_by_monophyly:
        monophyly_taxa = options.filter_by_monophyly.split(",")
        # split() never returns an empty list, so test the actual minimum
        if len(monophyly_taxa) < 2:
            raise ValueError(
                "please supply at least two taxa for the monophyly test.")

    if options.output_filename_map:
        outfile_map = open(options.output_filename_map, "a")
    else:
        outfile_map = None

    for tree in nexus.trees:

        ninput += 1
        tree_id = tree.name
        has_discarded = False

        if options.filter_ntaxa is not None:

            ntaxa = len(tree.get_terminals())
            if ntaxa != options.filter_ntaxa:
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# tree %s: removed because number of taxa (%i) different\n" %
                        (tree_id, ntaxa))
                has_discarded = True

        if options.filter_simple_orthologs:
            ntaxa = len(tree.get_terminals())
            nspecies = len(set([extract_species(tree.node(x).data.taxon)
                                for x in tree.get_terminals()]))
            if nspecies != ntaxa:
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# tree %s: removed because not a simple ortholog cluster: ntaxa!=nspecies (%i!=%i)\n" %
                        (tree_id, ntaxa, nspecies))
                has_discarded = True

        if options.filter_terminal_max_length is not None:
            for x in tree.get_terminals():
                node = tree.node(x)
                if node.data.branchlength >= options.filter_terminal_max_length:
                    has_discarded = True
                    ndiscarded_taxa += 1
                    tree.prune(node.data.taxon)
                    if options.loglevel >= 2:
                        options.stdlog.write(
                            "# tree %s: removed taxon %s because terminal branchlength to large: %s\n" %
                            (tree_id, node.data.taxon,
                             str(node.data.branchlength)))

        if options.filter_terminal_min_length is not None:
            for x in tree.get_terminals():
                node = tree.node(x)
                if node.data.branchlength <= options.filter_terminal_min_length:
                    has_discarded = True
                    ndiscarded_taxa += 1
                    tree.prune(node.data.taxon)
                    if options.loglevel >= 2:
                        options.stdlog.write(
                            "# tree %s: removed taxon %s because terminal branchlength to small: %s\n" %
                            (tree_id, node.data.taxon,
                             str(node.data.branchlength)))

        if options.filter_max_length is not None:
            for x in tree.get_nodes(tree.root):
                if x == tree.root:
                    continue
                node = tree.node(x)
                if node.data.branchlength >= options.filter_max_length:
                    has_discarded = True
                    ndiscarded_branches += 1
                    if options.loglevel >= 2:
                        options.stdlog.write(
                            "# tree %s: removed branch %i because branchlength to large: %s\n" %
                            (tree_id, x, str(node.data.branchlength)))
                    # mark the branch instead of removing the node
                    node.data.branchlength = options.filtered_branch_length

        if options.filter_min_length is not None:
            for x in tree.get_nodes(tree.root):
                if x == tree.root:
                    continue
                node = tree.node(x)
                if node.data.branchlength <= options.filter_min_length:
                    has_discarded = True
                    ndiscarded_branches += 1
                    if options.loglevel >= 2:
                        options.stdlog.write(
                            "# tree %s: removed branch %i because internal branchlength too small: %s\n" %
                            (tree_id, x, str(node.data.branchlength)))
                    node.data.branchlength = options.filtered_branch_length

        if options.filter_by_trees:
            # collect the mask trees carrying the same name as this tree
            found = [(y, filter_map[tree_id])
                     for y, filter_map in enumerate(filter_maps)
                     if tree_id in filter_map]

            if not found:
                ndiscarded += 1
                continue

            for x in tree.get_nodes(tree.root):
                if x == tree.root:
                    continue
                for y, other_tree in found:
                    other_node = other_tree.node(x)
                    if other_node.data.branchlength == options.filtered_branch_length:
                        node = tree.node(x)
                        if options.loglevel >= 2:
                            options.stdlog.write(
                                "# tree %s: removed branch %i because internal branchlength masked by tree %i:%s.\n" %
                                (tree_id, x, y, other_tree.name))
                        node.data.branchlength = options.filtered_branch_length
                        has_discarded = True
                        ndiscarded_branches += 1
                        # one masking tree is enough for this branch
                        break

        if options.filter_by_monophyly:

            terminals = set([tree.node(x).data.taxon
                             for x in tree.get_terminals()])

            for t in monophyly_taxa:
                if t not in terminals:
                    if options.loglevel >= 2:
                        options.stdlog.write(
                            "taxon %s not in tree %s\n" % (t, tree.name))
                    nskipped += 1

            succ = tree.node(tree.root).succ
            # use minimum support at root, if it is not the same (if trees
            # are rooted)
            if len(succ) == 2:
                m = min([tree.node(x).data.support for x in succ])
                for x in succ:
                    tree.node(x).data.support = m

            if not TreeTools.IsMonophyleticForTaxa(
                    tree, monophyly_taxa, support=options.min_support):
                ndiscarded += 1
                continue

        if has_discarded:
            ndiscarded += 1
            if options.filter == "trees" or options.filter_ntaxa:
                continue

        if options.method == "split":

            # plain text substitution: the tree name must not be
            # interpreted as a regular expression replacement template
            output_filename = options.output_pattern.replace("%s", tree_id)

            dirname = os.path.dirname(output_filename)

            if dirname and not os.path.exists(dirname):
                os.makedirs(dirname)

            if not os.path.exists(output_filename):
                with open(output_filename, "w") as outfile:
                    outfile.write(TreeTools.Tree2Newick(tree) + "\n")
                noutput += 1
            else:
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# skipping because output for tree %s already exists: %s\n" %
                        (tree_id, output_filename))
                nskipped += 1
                continue

        elif options.method == "filter":
            options.stdout.write(">%s\n%s\n" %
                                 (tree.name, TreeTools.Tree2Newick(tree)))
            noutput += 1

        if outfile_map:
            for t in TreeTools.GetTaxa(tree):
                outfile_map.write("%s\t%s\n" % (t, tree_id))

    if outfile_map:
        outfile_map.close()

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i, with_discarded=%i, discarded_taxa=%i, discarded_branches=%i.\n" %
            (ninput, noutput, nskipped, ndiscarded, ndiscarded_taxa,
             ndiscarded_branches))

    E.Stop()
def main(argv=None):
    """Extract ortholog sets from trees read from stdin.

    Command line options are parsed through ``E.Start``.  A reference
    tree (``--reference-tree``, either a file name or a Newick string
    starting with ``(``) is required; it is pruned with
    ``TreeTools.PruneTree`` to the species that occur in the input trees.
    For every method selected with ``--method`` (``strict`` if none is
    given) ``writeOrthologSets`` is called with ``options.stdout`` as
    output.

    Giving ``--species-set`` switches the enumeration to ``explicit``.
    The column order of organisms is taken from ``--organisms`` or, if
    that is missing, from the terminals of the pruned reference tree.

    *argv* defaults to ``sys.argv``.  Note that it is not forwarded to
    ``E.Start`` by this function.

    Raises ValueError if method ``outgroup`` is selected without
    ``--outgroups``, if no reference tree is given, or if the species
    cannot be extracted from the taxa of an input tree.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: trees2sets.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t", "--reference-tree", dest="reference_tree",
                      type="string",
                      help="reference tree to read.")

    parser.add_option("-e", "--enumeration", dest="enumeration",
                      type="choice",
                      choices=("monophyletic", "full", "pairwise",
                               "exhaustive", "explicit", "lineage"),
                      help="enumeration of ortholog groups.")

    parser.add_option("-o", "--organisms", dest="column2org", type="string",
                      help="sorted list of organisms.")

    parser.add_option("-p", "--filename-patterns", dest="filename_patterns",
                      type="string",
                      help="filename with patterns to output.")

    parser.add_option("-u", "--filename-summary", dest="filename_summary",
                      type="string",
                      help="filename with summary to output.")

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("strict", "degenerate", "any", "outgroup",
                               "lineage"),
                      help="sets to extract.")

    parser.add_option("-s", "--species-set", dest="species_set",
                      type="string",
                      help="comma separated list of species.")

    parser.add_option("-g", "--outgroups", dest="outgroups", type="string",
                      help="comma separated list of outgroup species.")

    parser.add_option(
        "--species-regex", dest="species_regex", type="string",
        help="regular expression to extract species from identifier.")

    parser.add_option(
        "--gene-regex", dest="gene_regex", type="string",
        help="regular expression to extract gene from identifier.")

    parser.add_option("--reroot", dest="reroot", type="choice",
                      choices=("outgroup", "midpoint"),
                      help="reroot trees before computing sets.")

    parser.set_defaults(
        reference_tree=None,
        enumeration="full",
        column2org=None,
        separator="|",
        species_regex=r"^([^|]+)\|",
        gene_regex=r"^[^|]+\|[^|]+\|([^|]+)\|",
        filename_summary=None,
        methods=[],
        species_set=None,
        outgroups=None,
        reroot=None,
    )

    (options, args) = E.Start(parser)

    if len(options.methods) == 0:
        options.methods.append("strict")

    if options.species_set:
        options.species_set = options.species_set.split(",")
        options.enumeration = "explicit"

    #######################################################################
    # warning: outgroup method is useless, as it requires
    # only a single outgroup per tree and the tree rooted
    # with the outgroup.
    if "outgroup" in options.methods and not options.outgroups:
        raise ValueError(
            "please supply --outgroups if method 'outgroup' is chosen.")

    if options.outgroups:
        options.outgroups = options.outgroups.split(",")

    #######################################################################
    # read the reference tree
    if not options.reference_tree:
        raise ValueError("please supply a reference tree")

    if options.reference_tree[0] == "(":
        # the option holds the tree itself
        reference_tree = TreeTools.Newick2Nexus(
            options.reference_tree).trees[0]
    else:
        # the option names a file holding the tree
        with open(options.reference_tree, "r") as infile:
            reference_tree = TreeTools.Newick2Nexus(infile).trees[0]

    if options.loglevel >= 3:
        options.stdlog.write("# reference tree:\n%s\n" %
                             reference_tree.display())

    #######################################################################
    # read all trees
    nexus = TreeTools.Newick2Nexus(sys.stdin)

    #######################################################################
    # sort out reference tree
    extract_species = lambda x: parseIdentifier(x, options)[0]
    extract_gene = lambda x: parseIdentifier(x, options)[2]

    # prune reference tree to species present
    species_set = set()
    for tree in nexus.trees:
        try:
            species_set.update([extract_species(taxon)
                                for taxon in tree.get_taxa()])
        except AttributeError:
            raise ValueError(
                "parsing error while extracting species from %s" %
                str(tree.get_taxa()))

    TreeTools.PruneTree(reference_tree, species_set)

    if options.loglevel >= 1:
        options.stdlog.write("# reference tree after pruning has %i taxa.\n" %
                             len(reference_tree.get_taxa()))

    if options.column2org:
        options.column2org = options.column2org.split(",")
    elif reference_tree:
        options.column2org = [reference_tree.node(nx).get_data().taxon
                              for nx in reference_tree.get_terminals()]

    # reverse lookup: organism -> column index
    options.org2column = {}
    for column, organism in enumerate(options.column2org):
        options.org2column[organism] = column

    for method in options.methods:
        ###################################################################
        # print out a list of ortholog clusters
        writeOrthologSets(options.stdout,
                          nexus,
                          extract_species,
                          extract_gene,
                          options=options,
                          reference_tree=reference_tree,
                          method=method,
                          outgroups=options.outgroups)

    E.Stop()
def main(argv=None):
    """Script main: apply one or more transformations to Newick trees.

    Parses command line options in sys.argv, unless *argv* is given.

    Trees are read from stdin. With ``--filter`` the trees are only filtered
    and written back. Otherwise the input is handed to :func:`Process`,
    either as a whole or, if it contains lines starting with ``>``, chunk by
    chunk with each ``>`` header line echoed before its chunk.

    Raises:
        ValueError: if ``--filename-tree2`` is given but contains no trees.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: tree2tree.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-d", "--value", dest="value", type="float",
                      help="normalizing value.")

    # The list mirrors the method names that Process() actually tests for.
    parser.add_option(
        "-m", "--method", dest="methods", type="string",
        help="methods to apply [normalize|divide-by-tree|rename|"
        "set-uniform-branchlength|extract-with-pattern|build-map|"
        "remove-pattern|unroot|midpoint-root|balanced-root|"
        "add-node-names|phylip|newick2nhx]")

    parser.add_option("-2", "--filename-tree2", dest="filename_tree2",
                      type="string",
                      help="filename with second tree.")

    parser.add_option("-o", "--outgroup", dest="outgroup", type="string",
                      help="reroot with outgroup before processing.")

    parser.add_option("-p", "--parameters", dest="parameters", type="string",
                      help="parameters for methods.")

    parser.add_option(
        "-e", "--template-identifier", dest="template_identifier",
        type="string",
        help="""template identifier [%default]. A %i is replaced by the position of the sequence in the file.""")

    parser.add_option("-i", "--invert-map", dest="invert_map",
                      action="store_true",
                      help="""invert map.""")

    parser.add_option("-f", "--filter", dest="filter", type="choice",
                      choices=("max-branch-length", ),
                      help="filter trees")

    parser.add_option("--output-format", dest="output_format", type="choice",
                      choices=("nh", "nhx"),
                      help=("output format for trees."))

    parser.add_option(
        "-b", "--no-branch-lengths", dest="with_branchlengths",
        action="store_false",
        help="do not write branchlengths. "
        "Per default, 0 branch lengths are added.")

    parser.set_defaults(
        value=0,
        methods="",
        filename_tree2=None,
        outgroup=None,
        parameters="",
        template_identifier="ID%06i",
        write_map=False,
        invert_map=False,
        filter=None,
        output_format="nh",
        with_branchlengths=True,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    options.methods = options.methods.split(",")
    options.parameters = options.parameters.split(",")

    # read other trees
    other_trees = []
    if options.filename_tree2:
        infile = open(options.filename_tree2, "r")
        try:
            other_nexus = TreeTools.Newick2Nexus(infile)
        finally:
            infile.close()
        if not other_nexus.trees:
            raise ValueError(
                "no trees found in %s" % options.filename_tree2)
        other_trees = other_nexus.trees

    lines = sys.stdin.readlines()

    ntotal, nskipped, ntree = 0, 0, 0
    map_old2new = {}

    if options.filter:

        nexus = TreeTools.Newick2Nexus(lines)

        # the first parameter is the threshold and is consumed here
        value = float(options.parameters[0])
        del options.parameters[0]

        # decision function: return true, if tree is to be skipped
        if options.filter == "max-branch-length":
            def is_too_long(branchlength):
                return branchlength >= value

        new_trees = []
        for tree in nexus.trees:
            ntotal += 1
            # a single offending branch is enough to drop the tree
            if any(is_too_long(node.data.branchlength)
                   for node in tree.chain.values()):
                nskipped += 1
            else:
                new_trees.append(tree)
                ntree += 1

        nexus.trees = new_trees

        options.stdout.write(
            TreeTools.Nexus2Newick(nexus, with_names=True) + "\n")

    else:
        # iterate over chunks; a chunk starts at a line beginning with ">"
        chunks = [i for i, line in enumerate(lines) if line[0] == ">"]

        if chunks:
            boundaries = chunks + [len(lines)]
            for start, end in zip(boundaries[:-1], boundaries[1:]):
                options.stdout.write(lines[start])
                # Accumulate the counters of every chunk, not only the last
                # one: ntree is also the index used to pair trees with the
                # trees of --filename-tree2.
                t, s, ntree = Process(lines[start + 1:end], other_trees,
                                      options, map_old2new, ntree)
                ntotal += t
                nskipped += s
        else:
            ntotal, nskipped, ntree = Process(
                lines, other_trees, options, map_old2new, ntree)

    if options.write_map:
        # the next unused parameter, if any, names the map output file
        if options.parameters:
            p = options.parameters[0]
        else:
            p = ""

        if p:
            outfile = open(p, "w")
        else:
            outfile = options.stdout

        try:
            outfile.write("old\tnew\n")
            for old_id, new_id in map_old2new.items():
                outfile.write("%s\t%s\n" % (old_id, new_id))
        finally:
            if p:
                outfile.close()

    if options.loglevel >= 1:
        options.stdlog.write("# ntotal=%i, nskipped=%i\n" %
                             (ntotal, nskipped))

    E.Stop()
def main(argv=None):
    """Script main: collect node heights (distance to root) of gene trees.

    Parses command line options in sys.argv, unless *argv* is given.

    Gene trees are read from stdin. Each tree is filtered, optionally
    rerooted with the outgroup species, and the distance to root of every
    node in the analysis set is recorded per species and per tree, both
    absolute and relative to the tree's reference height. Per-tree counts
    are printed with ``--print-subtotals``, accumulated counts with
    ``--print-totals``.

    Raises:
        ValueError: if location based filtering is requested without
            ``--filename-locations``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/analyze_genetrees.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option(
        "-r", "--species-regex", dest="species_regex", type="string",
        help="regular expression to extractspecies from identifier.")

    parser.add_option(
        "--gene-regex", dest="gene_regex", type="string",
        help="regular expression to extract gene from identifier.")

    parser.add_option("--filename-filter-positives",
                      dest="filename_filter_positives", type="string",
                      help="filename with positive list of trees to analyze.")

    parser.add_option("-s", "--filename-species-tree",
                      dest="filename_species_tree", type="string",
                      help="filename with species tree.")

    parser.add_option(
        "--filename-species2colour", dest="filename_species2colour",
        type="string",
        help="filename with map of species to colours. If not given, random colours are assigned to species.")

    parser.add_option("-t", "--species-tree", dest="species_tree",
                      type="string",
                      help="species tree.")

    parser.add_option(
        "-e", "--filename-locations", dest="filename_locations",
        type="string",
        help="filename with map of transcript information to location information.")

    parser.add_option("--no-create", dest="create", action="store_false",
                      help="do not create files, but append to them.")

    parser.add_option(
        "--max-separation", dest="max_separation", type="int",
        help="maximum allowable separation between syntenic segments for border plot (set to 0, if syntey is enough).")

    parser.add_option(
        "--filename-species2url", dest="filename_species2url", type="string",
        help="filename with mapping information of species to URL.")

    parser.add_option("--prefix", dest="prefix", type="string",
                      help="prefix to add as first column.")

    parser.add_option(
        "--outgroup-species", dest="outgroup_species", type="string",
        help="species to used as outgroups. Separate multiple species by ','.")

    parser.add_option("--subtrees-trees", dest="subtrees_trees",
                      action="store_true",
                      help="write trees for subtrees.")

    parser.add_option("--subtrees-identifiers", dest="subtrees_identifiers",
                      action="store_true",
                      help="write identifiers of subtrees.")

    parser.add_option("--svg-add-ids", dest="svg_add_ids",
                      action="store_true",
                      help="add node ids to svg plot.")

    parser.add_option("--svg-otus", dest="svg_otus", type="string",
                      help="otus to output in svg species tree.")

    # The misspelt option name is part of the command line interface and is
    # therefore kept as it is.
    parser.add_option("--svg-branch-lenghts", dest="svg_branch_lengths",
                      type="choice",
                      choices=("contemporary", "uniform", "median"),
                      help="branch lengths in species tree.")

    parser.add_option("--print-totals", dest="print_totals",
                      action="store_true",
                      help="output totals sections.")

    parser.add_option("--print-subtotals", dest="print_subtotals",
                      action="store_true",
                      help="output subtotals sections.")

    parser.add_option(
        "--print-best", dest="print_best", action="store_true",
        help="output best node assignment for each node in gene tree.")

    parser.add_option("--print-svg", dest="print_svg", action="store_true",
                      help="output svg files.")

    parser.add_option("--print-species-svg", dest="print_species_svg",
                      action="store_true",
                      help="output species svg files.")

    parser.add_option(
        "--output-pattern", dest="output_pattern", type="string",
        help="output pattern for separate output of sections "
        "[default: %default]. Set to None, if output to stdout. "
        "Can contain one %s to be substituted with section.")

    parser.add_option(
        "--output-pattern-svg", dest="output_pattern_svg", type="string",
        help="filename for svg output. If it contains %s, this is replaced by gene_tree name.")

    parser.add_option(
        "--filename-node-types", dest="filename_node_types", type="string",
        help="filename with node type information from a previous run.")

    parser.add_option("--analyze-resolution-data",
                      dest="analyze_resolution_data", type="choice",
                      action="append", choices=("stats", "histograms"),
                      help="stdin is resolution data.")

    parser.add_option("--filter-quality", dest="filter_quality",
                      type="choice",
                      choices=("all", "genes", "pseudogenes"),
                      help="filter predictions by gene type.")

    parser.add_option("--filter-location", dest="filter_location",
                      type="choice",
                      choices=("all", "local", "non-local", "cis",
                               "unplaced"),
                      help="filter predictions by location.")

    parser.add_option("--remove-unplaced", dest="remove_unplaced",
                      action="store_true",
                      help="remove predictions on unplaced contigs.")

    parser.add_option("--skip-without-outgroups",
                      dest="skip_without_outgroups", action="store_true",
                      help="skip clusters without outgroups.")

    parser.set_defaults(
        filter_quality="all",
        filter_location="all",
        remove_unplaced=False,
        species_regex=r"^([^|]+)\|",
        gene_regex=r"^[^|]+\|[^|]+\|([^|]+)\|",
        filename_species_tree=None,
        priority={"Speciation": 0,
                  "SpeciationDeletion": 1,
                  "Transcripts": 2,
                  "DuplicationLineage": 3,
                  "Duplication": 4,
                  "DuplicationDeletion": 5,
                  "DuplicationInconsistency": 6,
                  "Outparalogs": 7,
                  "InconsistentTranscripts": 8,
                  "Inconsistency": 9,
                  "Masked": 10},
        species_tree=None,
        filename_species2colour=None,
        filename_locations=None,
        max_separation=0,
        filename_species2url=None,
        separator="|",
        prefix=None,
        output_pattern=None,
        output_pattern_svg=None,
        outgroup_species=None,
        svg_add_ids=False,
        svg_branch_lengths="median",
        svg_otus=None,
        subtrees=False,
        print_svg=False,
        print_subtotals=False,
        print_totals=False,
        print_best=False,
        subtrees_identifiers=False,
        create=True,
        min_branch_length=0.00,
        filename_node_types=None,
        format_branch_length="%6.4f",
        nodetypes_inconsistency=("InconsistentTranscripts", "Inconsistency"),
        analyze_resolution_data=None,
        warning_small_branch_length=0.01,
        filename_filter_positives=None,
        skip_without_outgroups=False,
    )

    (options, args) = E.Start(parser,
                              add_psql_options=True,
                              add_csv_options=True)

    if options.outgroup_species:
        options.outgroup_species = set(options.outgroup_species.split(","))

    if options.svg_otus:
        options.svg_otus = set(options.svg_otus.split(","))

    rx_species = re.compile(options.species_regex)
    extract_species = lambda x: rx_species.match(x).groups()[0]
    if options.gene_regex:
        rx_gene = re.compile(options.gene_regex)
        extract_gene = lambda x: rx_gene.match(x).groups()[0]
    else:
        extract_gene = None

    # the quality code is the fourth field of an identifier
    extract_quality = lambda x: x.split(options.separator)[3]

    #########################################################################
    # read positive list of malis
    #########################################################################
    if options.filename_filter_positives:
        infile = open(options.filename_filter_positives, "r")
        try:
            filter_positives, nerrors = IOTools.ReadList(infile)
        finally:
            infile.close()
        filter_positives = set(filter_positives)
    else:
        filter_positives = None

    #########################################################################
    # read location info
    #########################################################################
    if options.filename_locations:
        infile = open(options.filename_locations, "r")
        try:
            map_id2location = TreeReconciliation.readLocations(
                infile, extract_species)
        finally:
            infile.close()
    else:
        map_id2location = {}

    if (options.remove_unplaced or options.filter_location != "all") \
            and not options.filename_locations:
        # string exceptions are not valid; raise a proper exception instead
        raise ValueError("please supply a file with location information.")

    #########################################################################
    # delete output files
    #########################################################################
    if options.create and options.output_pattern:
        for section in ("details", "subtrees", "subids", "trees", "nodes",
                        "categories"):
            fn = options.output_pattern % section
            if os.path.exists(fn):
                if options.loglevel >= 1:
                    options.stdlog.write("# deleting file %s.\n" % fn)
                os.remove(fn)

    if options.loglevel >= 1:
        options.stdlog.write("# reading gene trees.\n")
        options.stdlog.flush()

    gene_nexus = TreeTools.Newick2Nexus(sys.stdin)

    Tree.updateNexus(gene_nexus)

    if options.loglevel >= 1:
        options.stdlog.write("# read %i gene trees from stdin.\n" %
                             len(gene_nexus.trees))
        options.stdlog.flush()

    #########################################################################
    # main loop over gene trees
    #########################################################################
    ninput, nfiltered, nskipped, noutput = 0, 0, 0, 0
    nskipped_filter, nskipped_outgroups = 0, 0

    # total counts
    total_heights_per_species = {}
    total_relheights_per_species = {}
    total_heights_per_tree = []
    total_relheights_per_tree = []

    for gene_tree in gene_nexus.trees:

        ninput += 1

        xname = re.sub("_tree.*", "", gene_tree.name)
        xname = re.sub("subtree_", "", xname)

        if filter_positives and xname not in filter_positives:
            nskipped_filter += 1
            continue

        if options.loglevel >= 6:
            gene_tree.display()

        ###################################################################
        # get identifier for this tree and update prefixes accordingly
        ###################################################################
        # NOTE(review): the original tested len(gene_nexus.trees) > 0, which
        # always holds inside this loop, so the tree name has always been
        # part of the prefix. That behaviour is kept; confirm whether "> 1"
        # was intended.
        if options.prefix:
            prefix_header = "prefix1\tprefix2\t"
            prefix_row = options.prefix + "\t" + gene_tree.name + "\t"
        else:
            prefix_header = "prefix\t"
            prefix_row = gene_tree.name + "\t"

        ###################################################################
        # apply filters to gene tree
        ###################################################################
        TreeReconciliation.filterTree(gene_tree, options, map_id2location)

        otus = TreeTools.GetTaxa(gene_tree)

        if len(otus) <= 1:
            nfiltered += 1
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: empty after filtering - skipped.\n" %
                    gene_tree.name)
            continue

        this_species = set([extract_species(x) for x in otus])

        # check, if only outgroups
        if options.outgroup_species:
            if not this_species.difference(options.outgroup_species):
                nfiltered += 1
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# tree %s: only outgroups after filtering - skipped.\n" %
                        gene_tree.name)
                continue

            if options.skip_without_outgroups and \
                    not this_species.intersection(options.outgroup_species):
                nskipped_outgroups += 1
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# tree %s: no outgroups - skipped.\n" %
                        gene_tree.name)
                continue

        ###################################################################
        # reroot gene tree, if outgroups have been given.
        ###################################################################
        if options.outgroup_species:
            TreeReconciliation.rerootTree(gene_tree, extract_species, options)

        ###################################################################
        # compute distance to root for each node
        ###################################################################
        distance_to_root = TreeTools.GetDistanceToRoot(gene_tree)

        ###################################################################
        # compute counts
        ###################################################################
        # heights per tree
        heights_per_tree = []
        # relative heights per tree
        relheights_per_tree = []
        # distance to root
        heights_per_species = {}
        # distance to root (relative to the reference height)
        relheights_per_species = {}

        analysis_set, gene_set, pseudogene_set, other_set = \
            TreeReconciliation.getAnalysisSets(
                gene_tree, extract_quality, options)

        if len(analysis_set) == 0:
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: empty analysis set - skipped.\n" %
                    gene_tree.name)
            nskipped += 1
            continue

        reference_height = TreeReconciliation.getReferenceHeight(
            distance_to_root, gene_tree, gene_set, options, extract_species,
            method="median")

        if reference_height is None:
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: reference height not computable or 0 - skipped.\n" %
                    gene_tree.name)
            nskipped += 1
            continue

        for node_id in analysis_set:

            node = gene_tree.node(node_id)
            species = extract_species(node.data.taxon)
            height = distance_to_root[node_id]

            if height < options.warning_small_branch_length:
                options.stdlog.write(
                    "# tree %s: small distance %s to root at node %i: %s\n" %
                    (gene_tree.name,
                     options.format_branch_length % height,
                     node_id,
                     node.data.taxon))

            relheight = height / reference_height

            heights_per_species.setdefault(species, []).append(height)
            relheights_per_species.setdefault(species, []).append(relheight)

            # do not use outgroup species for the per-tree counts
            if options.outgroup_species and \
                    species in options.outgroup_species:
                continue

            heights_per_tree.append(height)
            relheights_per_tree.append(relheight)

        if options.loglevel >= 1:
            options.stdlog.write(
                "# tree %s: reference_height=%s\n" %
                (gene_tree.name,
                 options.format_branch_length % reference_height))
            options.stdlog.flush()

        if options.print_subtotals:
            printCounts(heights_per_species, relheights_per_species,
                        heights_per_tree, relheights_per_tree,
                        options, prefix_header, prefix_row)

        ###################################################################
        # update total counts
        ###################################################################
        TreeReconciliation.appendCounts(total_heights_per_species,
                                        heights_per_species)
        TreeReconciliation.appendCounts(total_relheights_per_species,
                                        relheights_per_species)
        TreeReconciliation.appendCounts(total_heights_per_tree,
                                        heights_per_tree)
        TreeReconciliation.appendCounts(total_relheights_per_tree,
                                        relheights_per_tree)

        noutput += 1

    if options.print_totals:

        if options.prefix:
            prefix_header = "prefix1\tprefix2\t"
            prefix_row = options.prefix + "\t" + "total" + "\t"
        else:
            prefix_header = "prefix\t"
            prefix_row = "total" + "\t"

        printCounts(total_heights_per_species, total_relheights_per_species,
                    total_heights_per_tree, total_relheights_per_tree,
                    options, prefix_header, prefix_row)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, nfiltered=%i, nskipped=%i, nskipped_filter=%i, nskipped_outgroups=%i, noutput=%i\n" %
            (ninput, nfiltered, nskipped, nskipped_filter,
             nskipped_outgroups, noutput))

    E.Stop()
def Process(lines, other_trees, options, map_old2new, ntree):
    """Apply the methods in ``options.methods`` to a chunk of Newick trees.

    The trees are parsed from *lines*, each method is applied to each tree in
    the order given, and the resulting trees are written to
    ``options.stdout`` in the format selected by ``options.output_format``.

    Arguments:
        lines: lines of Newick formatted trees.
        other_trees: second set of trees, used by ``divide-by-tree``. If it
            holds more than one tree, tree number *ntree* is used.
        options: parsed command line options. Methods consume their
            arguments from the front of ``options.parameters``;
            ``build-map`` sets ``options.write_map``.
        map_old2new: dictionary mapping old to new identifiers. It is
            updated in place by ``rename`` and ``build-map`` so that the
            caller and later chunks see the same map.
        ntree: number of trees processed before this chunk.

    Returns:
        A tuple ``(ntotal, nskipped, ntree)`` with the number of trees in
        this chunk, the number of trees for which ``divide-by-tree`` was
        skipped, and the updated running tree counter.

    Raises:
        ValueError: if ``divide-by-tree`` is requested and no matching
            second tree is available.
    """
    # Strip only the line terminator. Slicing off the last character would
    # damage a final line that has no trailing newline.
    nexus = TreeTools.Newick2Nexus([x.rstrip("\n") for x in lines])

    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees.\n" % len(nexus.trees))

    nskipped = 0
    ntotal = len(nexus.trees)
    extract_pattern = None
    species2remove = None
    phylip_executable = None
    phylip_options = None

    index = 0

    for tree in nexus.trees:

        if options.outgroup:
            tree.root_with_outgroup(options.outgroup)

        for method in options.methods:

            if options.loglevel >= 3:
                options.stdlog.write("# applying method %s to tree %i.\n" %
                                     (method, index))

            if method == "midpoint-root":
                tree.root_midpoint()

            elif method == "balanced-root":
                tree.root_balanced()

            elif method == "unroot":
                TreeTools.Unroot(tree)

            elif method == "phylip":
                # executable and options are consumed only once
                if not phylip_executable:
                    phylip_executable = options.parameters[0]
                    del options.parameters[0]
                    phylip_options = re.split("@", options.parameters[0])
                    del options.parameters[0]

                phylip = WrapperPhylip.Phylip()
                phylip.setProgram(phylip_executable)
                phylip.setOptions(phylip_options)
                phylip.setTree(tree)

                result = phylip.run()

                nexus.trees[index] = result.mNexus.trees[0]

            elif method == "normalize":
                # without an explicit value, normalize by the longest branch
                if options.value == 0:
                    v = 0
                    for n in tree.chain.keys():
                        v = max(v, tree.node(n).data.branchlength)
                else:
                    v = options.value

                if v == 0:
                    # all branches have zero length: nothing to scale
                    options.stdlog.write(
                        "# Warning: tree %i has no branch lengths: not normalized\n" %
                        index)
                    continue

                # divide by the computed divisor v, not by options.value,
                # which is 0 in the default case
                for n in tree.chain.keys():
                    tree.node(n).data.branchlength /= float(v)

            elif method == "divide-by-tree":

                if not other_trees:
                    raise ValueError(
                        "method divide-by-tree requires a second tree "
                        "(--filename-tree2).")

                if len(other_trees) > 1:
                    if ntree >= len(other_trees):
                        raise ValueError(
                            "no second tree for tree number %i" % ntree)
                    other_tree = other_trees[ntree]
                else:
                    other_tree = other_trees[0]

                # the trees have to be exactly the same!!
                if options.loglevel >= 2:
                    sys.stdout.write(str(tree.display()) + "\n")
                    sys.stdout.write(str(other_tree.display()) + "\n")

                if not tree.is_identical(other_tree):
                    nskipped += 1
                    continue

                # even if the trees are the same (in topology), the node
                # numbering might not be the same. Thus build a map of
                # node ids.
                map_a2b = TreeTools.GetNodeMap(tree, other_tree)

                for n in tree.chain.keys():
                    try:
                        tree.node(n).data.branchlength /= float(
                            other_tree.node(map_a2b[n]).data.branchlength)
                    except ZeroDivisionError:
                        options.stdlog.write(
                            "# Warning: branch for nodes %i and %i in tree-pair %i: divide by zero\n" %
                            (n, map_a2b[n], ntree))
                        continue

            elif method == "rename":
                if not map_old2new:
                    infile = open(options.parameters[0], "r")
                    try:
                        loaded = IOTools.ReadMap(infile, columns=(0, 1))
                    finally:
                        infile.close()

                    if options.invert_map:
                        loaded = IOTools.getInvertedDictionary(
                            loaded, make_unique=True)

                    # Fill the caller's dictionary in place. Rebinding the
                    # local name would lose the map between chunks and read
                    # the (already consumed) parameter again.
                    map_old2new.update(loaded)

                    del options.parameters[0]

                unknown = []
                for n, node in tree.chain.items():
                    if node.data.taxon:
                        try:
                            node.data.taxon = map_old2new[node.data.taxon]
                        except KeyError:
                            unknown.append(node.data.taxon)

                # taxa without a new name are removed from the tree
                for taxon in unknown:
                    tree.prune(taxon)

            # reformat terminals
            elif method == "extract-with-pattern":

                if not extract_pattern:
                    extract_pattern = re.compile(options.parameters[0])
                    del options.parameters[0]

                for n in tree.get_terminals():
                    node = tree.node(n)
                    node.data.taxon = extract_pattern.search(
                        node.data.taxon).groups()[0]

            # both spellings are accepted: the second one has been
            # advertised in the command line help
            elif method in ("set-uniform-branchlength",
                            "set-uniform-branch-length"):
                for n in tree.chain.keys():
                    tree.node(n).data.branchlength = options.value

            elif method == "build-map":
                # build a map of identifiers
                options.write_map = True
                for n in tree.get_terminals():
                    node = tree.node(n)
                    if node.data.taxon not in map_old2new:
                        new = options.template_identifier % (
                            len(map_old2new) + 1)
                        map_old2new[node.data.taxon] = new
                    node.data.taxon = map_old2new[node.data.taxon]

            elif method == "remove-pattern":
                if species2remove is None:
                    species2remove = re.compile(options.parameters[0])
                    # consume only the pattern, not the whole attribute
                    del options.parameters[0]

                # keep all taxa that do not match the pattern
                taxa = [tree.node(n).data.taxon
                        for n in tree.get_terminals()
                        if not species2remove.search(
                            tree.node(n).data.taxon)]

                TreeTools.PruneTree(tree, taxa)

            elif method == "add-node-names":
                inode = 0
                for n, node in tree.chain.items():
                    if not node.data.taxon:
                        node.data.taxon = "inode%i" % inode
                        inode += 1

            elif method == "newick2nhx":
                # convert names to species names
                for n in tree.get_terminals():
                    t = tree.node(n).data.taxon
                    d = t.split("|")
                    if len(d) >= 2:
                        tree.node(n).data.species = d[0]

        index += 1
        ntree += 1

    if options.output_format == "nh":
        # internal node names are always written
        options.stdout.write(
            TreeTools.Nexus2Newick(
                nexus,
                write_all_taxa=True,
                with_branchlengths=options.with_branchlengths) + "\n")
    else:
        for tree in nexus.trees:
            tree.writeToFile(options.stdout, format=options.output_format)

    return ntotal, nskipped, ntree
def main(argv=None):
    """Script main: count genes and transcripts per organism and cluster.

    Parses command line options in sys.argv, unless *argv* is given.

    Clusters are read from stdin in one of three formats (``map``, ``links``
    or ``trees``). For every cluster a row with the presence pattern and the
    number of genes and transcripts per organism is written to
    ``options.stdout``, followed by a summary of patterns and a summary of
    counts per species.

    Raises:
        ValueError: if the input is empty, if an identifier has neither two
            nor four fields, or if it refers to an unknown species.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/count_orgs.py 1706 2007-12-11 16:46:11Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t", "--reference-tree", dest="reference_tree",
                      type="string",
                      help="reference tree to read.")

    parser.add_option("-p", "--filename-patterns", dest="filename_patterns",
                      type="string",
                      help="filename with patterns to output.")

    parser.add_option("-u", "--filename-summary", dest="filename_summary",
                      type="string",
                      help="filename with summary to output.")

    parser.add_option("-f", "--format", dest="format", type="choice",
                      choices=("map", "links", "trees"),
                      help="output format.")

    parser.add_option("-o", "--organisms", dest="column2org", type="string",
                      help="sorted list of organisms.")

    parser.add_option(
        "-s", "--species-regex", dest="species_regex", type="string",
        help="regular expression to extract species from identifier.")

    parser.add_option(
        "-g", "--gene-regex", dest="gene_regex", type="string",
        help="regular expression to extract gene from identifier.")

    parser.set_defaults(
        reference_tree=None,
        format="map",
        filename_patterns=None,
        column2org=None,
        species_regex=r"^([^|]+)\|",
        gene_regex=r"^[^|]+\|[^|]+\|([^|]+)\|",
        separator="|",
        filename_summary=None,
    )

    (options, args) = E.Start(parser)

    if options.reference_tree:
        # the option is either a Newick string or a filename
        if options.reference_tree[0] == "(":
            nexus = TreeTools.Newick2Nexus(options.reference_tree)
        else:
            infile = open(options.reference_tree, "r")
            try:
                nexus = TreeTools.Newick2Nexus(infile)
            finally:
                infile.close()
        reference_tree = nexus.trees[0]

        if options.loglevel >= 3:
            sys.stdout.write("# reference tree:" + "\n")
            sys.stdout.write(str(reference_tree.display()) + "\n")
    else:
        reference_tree = None

    ########################################################################
    # read clusters
    ########################################################################
    clusters = {}
    if options.format == "map":

        for line in sys.stdin:
            if line[0] == "#":
                continue
            member_id, cluster_id = line[:-1].split("\t")
            clusters.setdefault(cluster_id, []).append(member_id)

    elif options.format == "trees":

        nexus = TreeTools.Newick2Nexus(sys.stdin)
        for tree in nexus.trees:
            clusters[tree.name] = tree.get_taxa()

    elif options.format == "links":

        members = set()
        cluster_id = None
        for line in sys.stdin:
            if line[0] == "#":
                continue

            if line[0] == ">":
                # a header line closes the previous cluster
                if cluster_id:
                    clusters[cluster_id] = members
                x = re.match(r">cluster #(\d+)", line[:-1])
                if x:
                    cluster_id = x.groups()[0]
                else:
                    cluster_id = line[1:-1]
                members = set()
                continue

            data = line[:-1].split("\t")[:2]
            members.add(data[0])
            members.add(data[1])

        if cluster_id:
            clusters[cluster_id] = members

    if len(clusters) == 0:
        # string exceptions are not valid; raise a proper exception instead
        raise ValueError("empty input.")

    ########################################################################
    # sort out reference tree
    ########################################################################
    rs = re.compile(options.species_regex)
    extract_species = lambda x: rs.search(x).groups()[0]

    # prune tree to species present
    species_set = set()
    for members in clusters.values():
        species_set.update([extract_species(m) for m in members])

    if reference_tree:

        TreeTools.PruneTree(reference_tree, species_set)

        if options.loglevel >= 1:
            options.stdlog.write("# Tree after pruning: %i taxa.\n" %
                                 len(reference_tree.get_taxa()))

    if options.column2org:
        options.column2org = options.column2org.split(",")
    elif reference_tree:
        options.column2org = [
            reference_tree.node(nx).get_data().taxon
            for nx in reference_tree.get_terminals()]
    else:
        options.column2org = list(species_set)

    options.org2column = {}
    for column, org in enumerate(options.column2org):
        options.org2column[org] = column

    if reference_tree:
        reference_patterns = TreeTools.calculatePatternsFromTree(
            reference_tree, options.column2org)

        if options.loglevel >= 3:
            sys.stdout.write("# reference patterns:" + "\n")
            sys.stdout.write(str(reference_patterns) + "\n")

    ########################################################################
    # count genes and transcripts per cluster
    ########################################################################
    notus = len(options.column2org)
    patterns = {}
    species_counts = [SpeciesCounts() for x in options.column2org]

    # first genes, then transcripts
    options.stdout.write(
        "mali\tpattern\tpresent\tngenes\t%s\tntranscripts\t%s\n" %
        ("\t".join(options.column2org), "\t".join(options.column2org)))

    for cluster in sorted(clusters.keys()):
        members = clusters[cluster]

        count_genes = [{} for x in range(len(options.org2column))]
        count_transcripts = [0] * len(options.org2column)

        for m in members:
            data = m.split(options.separator)

            if len(data) == 4:
                s, t, g, q = data
            elif len(data) == 2:
                s, g = data
                t = g
            else:
                # do not silently reuse the fields of the previous member
                raise ValueError(
                    "can not parse identifier %s: expected 2 or 4 fields" % m)

            if s not in options.org2column:
                raise ValueError("unknown species %s" % s)

            col = options.org2column[s]

            count_transcripts[col] += 1
            count_genes[col][g] = count_genes[col].get(g, 0) + 1

            species_counts[col].mGenes.add(g)
            species_counts[col].mTranscripts.add(t)
            species_counts[col].mTrees.add(cluster)

        genes_per_column = [len(genes) for genes in count_genes]

        # sum() also copes with an empty list of columns
        ntotal_transcripts = sum(count_transcripts)
        ntotal_genes = sum(genes_per_column)
        npresent_genes = len([x for x in genes_per_column if x > 0])

        pattern = GetPattern(count_transcripts, notus)
        patterns[pattern] = patterns.get(pattern, 0) + 1

        options.stdout.write(
            "\t".join((cluster,
                       pattern,
                       str(npresent_genes),
                       str(ntotal_genes),
                       "\t".join(map(str, genes_per_column)),
                       str(ntotal_transcripts),
                       "\t".join(map(str, count_transcripts)))) + "\n")

    ########################################################################
    # write pattern summary
    ########################################################################
    if options.filename_patterns:
        outfile = open(options.filename_patterns, "w")
    else:
        outfile = sys.stdout

    for column, org in enumerate(options.column2org):
        outfile.write("# %i = %s\n" % (column, org))

    if reference_tree:
        outfile.write("pattern\tcounts\tisok\n")
    else:
        outfile.write("pattern\tcounts\n")

    for x in sorted(patterns.keys()):
        if reference_tree:
            if x in reference_patterns:
                is_ok = "1"
            else:
                is_ok = "0"
            outfile.write("%s\t%s\t%s\n" % (x, patterns[x], is_ok))
        else:
            outfile.write("%s\t%s\n" % (x, patterns[x]))

    if outfile != sys.stdout:
        outfile.close()

    ########################################################################
    # write summary counts per species
    ########################################################################
    if options.filename_summary:
        outfile = open(options.filename_summary, "w")
    else:
        outfile = sys.stdout

    outfile.write("species\tntranscripts\tngenes\tntrees\n")

    for species, col in options.org2column.items():
        outfile.write(
            "%s\t%i\t%i\t%i\n" %
            (species,
             len(species_counts[col].mTranscripts),
             len(species_counts[col].mGenes),
             len(species_counts[col].mTrees)))

    if outfile != sys.stdout:
        outfile.close()

    E.Stop()
def AnalysePatterns(patterns,
                    map_id2org,
                    min_cluster_support=100,
                    min_report_support=90):
    """Analyse partitions by comparing them to the reference tree.

    For each partition it is checked whether the left/right side is
    consistent with the reference tree. Partitions that carry the full
    complement of species on both sides are turned into a pair of masks
    (a suggested split); every mask is then evaluated with AnalyseMask
    and the final set is chosen by SelectMasks.

    Arguments
    ---------
    patterns : list
        Non-empty list of ``(support, pattern)`` tuples. ``pattern`` is a
        string with one character per column; ``"*"`` marks membership of
        one side of the partition, any other character the other side.
        The length of the first pattern fixes the number of columns.
        NOTE: the list is modified in place - one single-column pattern
        per column (support 100) is inserted in front of the existing
        entries.
    map_id2org :
        Indexable by column number; each entry unpacks into
        ``(org, name, nid)``. Columns whose ``org`` is ``"unknown"`` are
        ignored when collecting species.
    min_cluster_support : number
        Minimum support a partition needs in order to be accepted as a
        full split in the first pass.
    min_report_support : number
        Passed through unchanged to AnalyseMask and SelectMasks.

    Returns
    -------
    The masks returned by SelectMasks.

    The module-level settings ``param_reference_tree`` and
    ``param_loglevel`` are read; progress is printed to stdout depending
    on the log level.
    """
    # Reread the species tree on every call; only its number of terminal
    # nodes is needed here. The pruned trees used below are rebuilt from
    # param_reference_tree for each mask.
    nexus = TreeTools.Newick2Nexus(param_reference_tree)
    norgs = len(nexus.trees[0].get_terminals())

    notus = len(patterns[0][1])

    # Complement the patterns with single-column patterns. They are put
    # in front of the given patterns, highest column first, modifying the
    # caller's list in place.
    patterns[:0] = [
        (100, "." * pos + "*" + "." * (notus - pos - 1))
        for pos in reversed(range(notus))
    ]

    ##########################################################################
    # first pass: separate well supported full species trees
    masks = []
    present_orgs = set()
    mask_id = 0

    for support, pattern in patterns:

        # species (orgs) and member names on either side of the partition
        orgs1, orgs2 = set(), set()
        names1, names2 = [], []

        for col, flag in enumerate(pattern):
            org, name, _ = map_id2org[col]
            if org == "unknown":
                continue
            present_orgs.add(org)
            if flag == "*":
                orgs1.add(org)
                names1.append(name)
            else:
                orgs2.add(org)
                names2.append(name)

        norgs1, norgs2 = len(orgs1), len(orgs2)

        if param_loglevel >= 4:
            print("#  %s %s %s %s %s" %
                  (pattern, norgs1, norgs2, names1, names2))
            sys.stdout.flush()

        # A full split needs every reference species on both sides and
        # sufficient support.
        if not (norgs1 == norgs2 and
                norgs1 == norgs and
                support >= min_cluster_support):
            continue

        mask1 = [1 if flag == "*" else 0 for flag in pattern]
        mask2 = [1 - bit for bit in mask1]
        notus1 = sum(mask1)
        notus2 = len(mask1) - notus1

        mask_id += 1
        masks.append(Results(mask1, notus1, norgs1, mask_id=mask_id))
        mask_id += 1
        masks.append(Results(mask2, notus2, norgs2, mask_id=mask_id))

        if param_loglevel >= 2:
            print("# split\tfull\t%i\t%s\t%i\t%i\t%s" % (
                support,
                "".join(map(str, mask1)),
                notus1,
                norgs1,
                ";".join(names1)))
            print("# split\tfull\t%i\t%s\t%i\t%i\t%s" % (
                support,
                "".join(map(str, mask2)),
                notus2,
                norgs2,
                ";".join(names2)))

    # No full split found: fall back to a single mask covering all columns.
    if not masks:
        masks.append(
            Results([1] * notus, notus, len(present_orgs), mask_id=1))

    ##########################################################################
    # second pass: check subtrees for each mask
    # external: edges leading to external nodes (i.e., leaves): total number = norgs
    # internal: all other edges: maximum number = 2 * (2 * norgs - 3 - norgs) = 2 * (norgs - 3)
    # 1st factor 2: two directions
    # 2nd factor: 2n-3 is number of edges in unrooted tree.
    # 3rd factor: -n = number of external edges
    for mask in masks:
        reference_tree = GetPrunedReferenceTree(mask,
                                                GetOrgs(map_id2org),
                                                param_reference_tree)
        AnalyseMask(mask, patterns, norgs, reference_tree,
                    map_id2org, min_report_support)

    if param_loglevel >= 1:
        print("# partitions after evaluation:")
        print("# %s" % (Results().printHeader(),))
        for m in masks:
            print("# %s" % str(m))

    # NOTE(review): the pruned tree built here (for the last mask of the
    # loop above) is never used - SelectMasks does not receive it. The
    # call is kept only in case GetPrunedReferenceTree has side effects;
    # confirm and remove.
    GetPrunedReferenceTree(mask, GetOrgs(map_id2org), param_reference_tree)

    new_masks = SelectMasks(masks, patterns, norgs,
                            map_id2org, min_report_support)

    if param_loglevel >= 1:
        print("# partitions after selection:")
        print("# %s" % (Results().printHeader(),))
        for m in new_masks:
            print("# %s" % str(m))

    return new_masks