def filterTree(tree, options, map_id2location=None):
    """Prune taxa that sit on "junk" locations from *tree* (in place).

    When ``options.remove_unplaced`` is set, every taxon whose location
    short name (lower-cased) is a key of ``MAP_CONTIG2JUNK`` is removed.
    Taxa without an entry in *map_id2location* are kept and reported as a
    warning on ``options.stdlog``.

    NOTE(review): an earlier docstring mentioned a type filter and special
    treatment of outgroups; neither is implemented in this function.

    Arguments
    ---------
    tree :
        tree object understood by ``TreeTools``; modified in place.
    options :
        object providing ``remove_unplaced``, ``loglevel`` and ``stdlog``.
    map_id2location :
        mapping of taxon identifier to an object with an ``mShortName``
        attribute. ``None`` (the default) is treated as an empty mapping.
    """
    otus = TreeTools.GetTaxa(tree)
    to_remove = set()

    if options.remove_unplaced:
        # The default of None used to crash the membership test below with
        # a TypeError; treat it as "no location known for any taxon".
        locations = {} if map_id2location is None else map_id2location

        for otu in otus:
            if otu not in locations:
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# WARNING: unknown location for id %s.\n" % otu)
                continue

            if locations[otu].mShortName.lower() in MAP_CONTIG2JUNK:
                to_remove.add(otu)

        if options.loglevel >= 3:
            # sorted() only makes the log line reproducible between runs
            options.stdlog.write(
                "# tree %s: removing %i entries because of location: %s\n" %
                (tree.name, len(to_remove), ";".join(sorted(to_remove))))

    new_otus = list(set(otus).difference(to_remove))

    # only touch the tree if something actually has to go
    if len(new_otus) != len(otus):
        TreeTools.PruneTree(tree, new_otus, keep_distance_to_root=True)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# tree %s: filtering: before=%i, remove=%i, after=%i, final=%i\n" %
            (tree.name, len(otus), len(to_remove), len(new_otus),
             len(TreeTools.GetTaxa(tree))))
        options.stdlog.flush()
def main(argv=None):
    """Count genes and transcripts per organism for clusters read from stdin.

    Input is read from ``sys.stdin`` in the format chosen by ``--format``:

    ``map``
        tab-separated lines of ``identifier<TAB>cluster``.
    ``links``
        blocks introduced by a ``>`` header line followed by tab-separated
        pairs of identifiers; both members of a pair join the cluster.
    ``trees``
        newick trees; each tree is one cluster made of its taxa.

    For every cluster one row with per-organism gene and transcript counts
    is written to ``options.stdout``. Afterwards a summary of presence
    patterns and a per-species summary are written to the files given by
    ``--filename-patterns`` / ``--filename-summary`` (``sys.stdout`` when
    not given).

    Member identifiers are split on ``options.separator`` and must have
    either four fields (species, transcript, gene, quality) or two fields
    (species, gene).

    NOTE(review): *argv* is defaulted to ``sys.argv`` but is not forwarded
    to ``E.Start``; confirm how ``E.Start`` obtains its arguments.

    Raises
    ------
    ValueError
        if no cluster was read, an identifier can not be parsed, or a
        species is not part of the organism list.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/count_orgs.py 1706 2007-12-11 16:46:11Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t", "--reference-tree", dest="reference_tree",
                      type="string",
                      help="reference tree to read.")
    parser.add_option("-p", "--filename-patterns", dest="filename_patterns",
                      type="string",
                      help="filename with patterns to output.")
    parser.add_option("-u", "--filename-summary", dest="filename_summary",
                      type="string",
                      help="filename with summary to output.")
    parser.add_option("-f", "--format", dest="format", type="choice",
                      choices=("map", "links", "trees"),
                      help="output format.")
    parser.add_option("-o", "--organisms", dest="column2org", type="string",
                      help="sorted list of organisms.")
    parser.add_option(
        "-s", "--species-regex", dest="species_regex", type="string",
        help="regular expression to extract species from identifier.")
    parser.add_option(
        "-g", "--gene-regex", dest="gene_regex", type="string",
        help="regular expression to extract gene from identifier.")

    parser.set_defaults(
        reference_tree=None,
        format="map",
        filename_patterns=None,
        column2org=None,
        species_regex=r"^([^|]+)\|",
        gene_regex=r"^[^|]+\|[^|]+\|([^|]+)\|",
        separator="|",
        filename_summary=None,
    )

    (options, args) = E.Start(parser)

    ####################################################################
    # read the reference tree: either a literal newick string (starts
    # with an opening bracket) or the name of a file containing one.
    if options.reference_tree:
        if options.reference_tree[0] == "(":
            nexus = TreeTools.Newick2Nexus(options.reference_tree)
            reference_tree = nexus.trees[0]
        else:
            # parse while the file is open, then release the handle
            with open(options.reference_tree, "r") as infile:
                nexus = TreeTools.Newick2Nexus(infile)
                reference_tree = nexus.trees[0]

        if options.loglevel >= 3:
            print("# reference tree:")
            print(reference_tree.display())
    else:
        reference_tree = None

    ####################################################################
    # read clusters: cluster name -> collection of member identifiers
    clusters = {}
    if options.format == "map":
        for line in sys.stdin:
            if line.startswith("#"):
                continue
            member, cluster = line[:-1].split("\t")
            clusters.setdefault(cluster, []).append(member)

    elif options.format == "trees":
        nexus = TreeTools.Newick2Nexus(sys.stdin)
        for tree in nexus.trees:
            clusters[tree.name] = tree.get_taxa()

    elif options.format == "links":
        members = set()
        cluster_id = None
        for line in sys.stdin:
            if line.startswith("#"):
                continue
            if line.startswith(">"):
                # a new header closes the previous cluster
                if cluster_id:
                    clusters[cluster_id] = members
                match = re.match(r">cluster #(\d+)", line[:-1])
                if match:
                    cluster_id = match.groups()[0]
                else:
                    cluster_id = line[1:-1]
                members = set()
                continue
            data = line[:-1].split("\t")[:2]
            members.add(data[0])
            members.add(data[1])

        # the last cluster has no following header to close it
        if cluster_id:
            clusters[cluster_id] = members

    if len(clusters) == 0:
        # was a string exception, which raises TypeError on python >= 2.6
        raise ValueError("empty input.")

    ####################################################################
    # sort out reference tree
    rs = re.compile(options.species_regex)
    # the gene pattern is not used below; compiling it still rejects a
    # malformed --gene-regex right away, as before.
    re.compile(options.gene_regex)

    def extract_species(identifier):
        """return the first group of the species pattern in *identifier*."""
        match = rs.search(identifier)
        if match is None:
            raise ValueError(
                "could not extract species from identifier %s" % identifier)
        return match.groups()[0]

    # prune tree to species present
    species_set = set()
    for members in clusters.values():
        species_set.update(extract_species(member) for member in members)

    if reference_tree:
        TreeTools.PruneTree(reference_tree, species_set)

        if options.loglevel >= 1:
            options.stdlog.write("# Tree after pruning: %i taxa.\n" %
                                 len(reference_tree.get_taxa()))

    # column order: explicit list > leaves of reference tree > species seen
    if options.column2org:
        options.column2org = options.column2org.split(",")
    elif reference_tree:
        options.column2org = [
            reference_tree.node(nx).get_data().taxon
            for nx in reference_tree.get_terminals()]
    else:
        options.column2org = list(species_set)

    options.org2column = {}
    for column, org in enumerate(options.column2org):
        options.org2column[org] = column

    if reference_tree:
        reference_patterns = TreeTools.calculatePatternsFromTree(
            reference_tree, options.column2org)

        if options.loglevel >= 3:
            print("# reference patterns:")
            print(reference_patterns)

    ####################################################################
    # per-cluster counts
    notus = len(options.column2org)
    patterns = {}
    species_counts = [SpeciesCounts() for x in options.column2org]

    # first genes, then transcripts
    options.stdout.write(
        "mali\tpattern\tpresent\tngenes\t%s\tntranscripts\t%s\n" %
        ("\t".join(options.column2org), "\t".join(options.column2org)))

    for cluster in sorted(clusters):
        members = clusters[cluster]

        # One slot per output column. Sized by the column list (not by
        # org2column) so that rows always line up with the header.
        count_genes = [{} for x in range(notus)]
        count_transcripts = [0] * notus

        for member in members:
            data = member.split(options.separator)

            if len(data) == 4:
                species, transcript, gene, quality = data
            elif len(data) == 2:
                species, gene = data
                transcript = gene
            else:
                # previously fell through and reused the values of the
                # preceding member (or failed with a NameError)
                raise ValueError(
                    "can not parse identifier %s: expected 2 or 4 fields "
                    "separated by '%s'" % (member, options.separator))

            if species not in options.org2column:
                raise ValueError("unknown species %s" % species)

            col = options.org2column[species]

            count_transcripts[col] += 1
            count_genes[col][gene] = count_genes[col].get(gene, 0) + 1

            species_counts[col].mGenes.add(gene)
            species_counts[col].mTranscripts.add(transcript)
            species_counts[col].mTrees.add(cluster)

        genes_per_org = [len(genes) for genes in count_genes]

        ntotal_transcripts = sum(count_transcripts)
        ntotal_genes = sum(genes_per_org)
        npresent_genes = len([n for n in genes_per_org if n > 0])

        pattern = GetPattern(count_transcripts, notus)
        patterns[pattern] = patterns.get(pattern, 0) + 1

        options.stdout.write(
            "\t".join((cluster,
                       pattern,
                       str(npresent_genes),
                       str(ntotal_genes),
                       "\t".join(map(str, genes_per_org)),
                       str(ntotal_transcripts),
                       "\t".join(map(str, count_transcripts)))) + "\n")

    ####################################################################
    # write pattern summary
    if options.filename_patterns:
        outfile = open(options.filename_patterns, "w")
    else:
        outfile = sys.stdout

    try:
        for column, org in enumerate(options.column2org):
            outfile.write("# %i = %s\n" % (column, org))

        if reference_tree:
            outfile.write("pattern\tcounts\tisok\n")
        else:
            outfile.write("pattern\tcounts\n")

        for pattern in sorted(patterns):
            if reference_tree:
                # flag patterns that are compatible with the reference tree
                if pattern in reference_patterns:
                    is_ok = "1"
                else:
                    is_ok = "0"
                outfile.write("%s\t%s\t%s\n" %
                              (pattern, patterns[pattern], is_ok))
            else:
                outfile.write("%s\t%s\n" % (pattern, patterns[pattern]))
    finally:
        if outfile is not sys.stdout:
            outfile.close()

    ####################################################################
    # write summary counts per species
    if options.filename_summary:
        outfile = open(options.filename_summary, "w")
    else:
        outfile = sys.stdout

    try:
        outfile.write("species\tntranscripts\tngenes\tntrees\n")

        for species, col in options.org2column.items():
            outfile.write(
                "%s\t%i\t%i\t%i\n" % (species,
                                      len(species_counts[col].mTranscripts),
                                      len(species_counts[col].mGenes),
                                      len(species_counts[col].mTrees)))
    finally:
        if outfile is not sys.stdout:
            outfile.close()

    E.Stop()
def main(argv=None):
    """Extract ortholog sets from trees read from stdin.

    Trees are read in newick format from ``sys.stdin``. The mandatory
    reference tree (``--reference-tree``; a literal newick string or a
    filename) is pruned to the species found in the input trees and
    defines the column order unless ``--organisms`` is given. For every
    method selected with ``--method`` (default: ``strict``),
    ``writeOrthologSets`` is called with ``options.stdout`` as output.

    Supplying ``--species-set`` switches the enumeration to ``explicit``.

    NOTE(review): *argv* is defaulted to ``sys.argv`` but is not forwarded
    to ``E.Start``; confirm how ``E.Start`` obtains its arguments.

    Raises
    ------
    ValueError
        if method ``outgroup`` is chosen without ``--outgroups``, if no
        reference tree is supplied, or if species can not be extracted
        from the taxa of an input tree.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: trees2sets.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t", "--reference-tree", dest="reference_tree",
                      type="string",
                      help="reference tree to read.")
    parser.add_option("-e", "--enumeration", dest="enumeration",
                      type="choice",
                      choices=("monophyletic", "full", "pairwise",
                               "exhaustive", "explicit", "lineage"),
                      help="enumeration of ortholog groups.")
    parser.add_option("-o", "--organisms", dest="column2org", type="string",
                      help="sorted list of organisms.")
    parser.add_option("-p", "--filename-patterns", dest="filename_patterns",
                      type="string",
                      help="filename with patterns to output.")
    parser.add_option("-u", "--filename-summary", dest="filename_summary",
                      type="string",
                      help="filename with summary to output.")
    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("strict", "degenerate", "any", "outgroup",
                               "lineage"),
                      help="sets to extract.")
    parser.add_option("-s", "--species-set", dest="species_set",
                      type="string",
                      help="comma separated list of species.")
    parser.add_option("-g", "--outgroups", dest="outgroups", type="string",
                      help="comma separated list of outgroup species.")
    parser.add_option(
        "--species-regex", dest="species_regex", type="string",
        help="regular expression to extract species from identifier.")
    parser.add_option(
        "--gene-regex", dest="gene_regex", type="string",
        help="regular expression to extract gene from identifier.")
    parser.add_option("--reroot", dest="reroot", type="choice",
                      choices=("outgroup", "midpoint"),
                      help="reroot trees before computing sets.")

    parser.set_defaults(
        reference_tree=None,
        enumeration="full",
        column2org=None,
        separator="|",
        species_regex=r"^([^|]+)\|",
        gene_regex=r"^[^|]+\|[^|]+\|([^|]+)\|",
        filename_summary=None,
        methods=[],
        species_set=None,
        outgroups=None,
        reroot=None,
    )

    (options, args) = E.Start(parser)

    if not options.methods:
        options.methods.append("strict")

    # an explicit species list implies explicit enumeration
    if options.species_set:
        options.species_set = options.species_set.split(",")
        options.enumeration = "explicit"

    ####################################################################
    # warning: outgroup method is useless, as it requires
    # only a single outgroup per tree and the tree rooted
    # with the outgroup.
    if "outgroup" in options.methods and not options.outgroups:
        # was a string exception, which raises TypeError on python >= 2.6
        raise ValueError(
            "please supply --outgroups if method 'outgroup' is chosen.")

    if options.outgroups:
        options.outgroups = options.outgroups.split(",")

    ####################################################################
    # read the reference tree: either a literal newick string (starts
    # with an opening bracket) or the name of a file containing one.
    if not options.reference_tree:
        raise ValueError("please supply a reference tree")

    if options.reference_tree[0] == "(":
        nexus = TreeTools.Newick2Nexus(options.reference_tree)
        reference_tree = nexus.trees[0]
    else:
        # parse while the file is open, then release the handle
        with open(options.reference_tree, "r") as infile:
            nexus = TreeTools.Newick2Nexus(infile)
            reference_tree = nexus.trees[0]

    if options.loglevel >= 3:
        options.stdlog.write("# reference tree:\n%s\n" %
                             reference_tree.display())

    ####################################################################
    # read all trees
    nexus = TreeTools.Newick2Nexus(sys.stdin)

    ####################################################################
    # sort out reference tree
    #
    # The compiled patterns are not used in this function (identifiers
    # are parsed by parseIdentifier); compiling them still rejects
    # malformed patterns right away, as before.
    re.compile(options.species_regex)
    re.compile(options.gene_regex)

    def extract_species(identifier):
        return parseIdentifier(identifier, options)[0]

    def extract_gene(identifier):
        return parseIdentifier(identifier, options)[2]

    # prune reference tree to species present
    species_set = set()
    for tree in nexus.trees:
        try:
            species_set.update(
                extract_species(taxon) for taxon in tree.get_taxa())
        except AttributeError:
            raise ValueError(
                "parsing error while extracting species from %s" %
                str(tree.get_taxa()))

    TreeTools.PruneTree(reference_tree, species_set)

    if options.loglevel >= 1:
        options.stdlog.write("# reference tree after pruning has %i taxa.\n" %
                             len(reference_tree.get_taxa()))

    # column order: explicit list, otherwise leaves of the reference tree
    if options.column2org:
        options.column2org = options.column2org.split(",")
    elif reference_tree:
        options.column2org = [
            reference_tree.node(nx).get_data().taxon
            for nx in reference_tree.get_terminals()]

    options.org2column = {}
    for column, org in enumerate(options.column2org):
        options.org2column[org] = column

    ####################################################################
    # print out a list of ortholog clusters, once per requested method
    for method in options.methods:
        writeOrthologSets(options.stdout,
                          nexus,
                          extract_species,
                          extract_gene,
                          options=options,
                          reference_tree=reference_tree,
                          method=method,
                          outgroups=options.outgroups)

    E.Stop()
def Process(lines, other_trees, options, map_old2new, ntree):
    """Apply the methods in ``options.methods`` to a chunk of newick trees.

    *lines* are parsed into trees (the last character of every line is
    stripped first). Each tree is optionally rooted with
    ``options.outgroup`` and then every method is applied in order.
    Methods that need an argument (``phylip``, ``rename``,
    ``extract-with-pattern``, ``remove-pattern``) take it from the front of
    ``options.parameters`` the first time they run and reuse it for the
    remaining trees of this call. Finally all trees are written to
    ``options.stdout`` in ``options.output_format``.

    Arguments
    ---------
    lines : list of str
        lines with newick formatted trees.
    other_trees : list
        trees used by method ``divide-by-tree``. If it holds more than one
        tree, the tree at position *ntree* is used, otherwise the first.
    options :
        parsed command line options.
    map_old2new : dict
        taxon map used by ``rename`` (read from file if empty) and filled
        in place by ``build-map``.
    ntree : int
        running number of the first tree in *lines*.

    Returns
    -------
    tuple
        ``(ntotal, nskipped, ntree)``: number of trees read, number of
        trees for which ``divide-by-tree`` was skipped because the
        topologies differ, and the updated running tree number.
    """
    nexus = TreeTools.Newick2Nexus([line[:-1] for line in lines])

    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees.\n" % len(nexus.trees))

    nskipped = 0
    ntotal = len(nexus.trees)

    # arguments that are consumed from options.parameters on first use
    extract_pattern = None
    species2remove = None
    phylip_executable = None
    phylip_options = None

    # NOTE(review): the original tracked a ``write_all_taxa`` flag (set by
    # "add-node-names") that was never read; the newick output below has
    # always been written with write_all_taxa=True.

    for index, tree in enumerate(nexus.trees):

        if options.outgroup:
            tree.root_with_outgroup(options.outgroup)

        for method in options.methods:

            if options.loglevel >= 3:
                options.stdlog.write("# applying method %s to tree %i.\n" %
                                     (method, index))

            if method == "midpoint-root":
                tree.root_midpoint()

            elif method == "balanced-root":
                tree.root_balanced()

            elif method == "unroot":
                TreeTools.Unroot(tree)

            elif method == "phylip":
                if not phylip_executable:
                    phylip_executable = options.parameters[0]
                    del options.parameters[0]
                    phylip_options = re.split("@", options.parameters[0])
                    del options.parameters[0]

                phylip = WrapperPhylip.Phylip()
                phylip.setProgram(phylip_executable)
                phylip.setOptions(phylip_options)
                phylip.setTree(tree)

                result = phylip.run()

                # NOTE(review): only the list entry is replaced; methods
                # following "phylip" still operate on the input tree.
                nexus.trees[index] = result.mNexus.trees[0]

            elif method == "normalize":
                # a value of 0 means: scale by the longest branch
                if options.value == 0:
                    v = 0
                    for n in tree.chain.keys():
                        v = max(v, tree.node(n).data.branchlength)
                else:
                    v = options.value

                if v == 0:
                    # nothing to scale by; dividing would raise
                    options.stdlog.write(
                        "# Warning: tree %i has no branch longer than 0: "
                        "not normalized\n" % index)
                else:
                    # Divide by the computed scale. This used to divide by
                    # options.value and thus always failed for value 0.
                    for n in tree.chain.keys():
                        tree.node(n).data.branchlength /= float(v)

            elif method == "divide-by-tree":

                if len(other_trees) > 1:
                    other_tree = other_trees[ntree]
                else:
                    other_tree = other_trees[0]

                # the trees have to be exactly the same!!
                if options.loglevel >= 2:
                    print(tree.display())
                    print(other_tree.display())

                if not tree.is_identical(other_tree):
                    nskipped += 1
                    # skips this method only; later methods still run
                    continue

                # even if the trees are the same (in topology), the node
                # numbering might not be the same. Thus build a map of
                # node ids.
                map_a2b = TreeTools.GetNodeMap(tree, other_tree)

                for n in tree.chain.keys():
                    try:
                        tree.node(n).data.branchlength /= float(
                            other_tree.node(map_a2b[n]).data.branchlength)
                    except ZeroDivisionError:
                        options.stdlog.write(
                            "# Warning: branch for nodes %i and %i in tree-pair %i: divide by zero\n" %
                            (n, map_a2b[n], ntree))
                        continue

            elif method == "rename":
                if not map_old2new:
                    with open(options.parameters[0], "r") as infile:
                        map_old2new = IOTools.ReadMap(infile, columns=(0, 1))

                    if options.invert_map:
                        map_old2new = IOTools.getInvertedDictionary(
                            map_old2new, make_unique=True)

                    del options.parameters[0]

                # taxa without a new name are pruned from the tree
                unknown = []
                for n, node in tree.chain.items():
                    if node.data.taxon:
                        try:
                            node.data.taxon = map_old2new[node.data.taxon]
                        except KeyError:
                            unknown.append(node.data.taxon)

                for taxon in unknown:
                    tree.prune(taxon)

            # reformat terminals
            elif method == "extract-with-pattern":
                if not extract_pattern:
                    extract_pattern = re.compile(options.parameters[0])
                    del options.parameters[0]

                for n in tree.get_terminals():
                    node = tree.node(n)
                    node.data.taxon = extract_pattern.search(
                        node.data.taxon).groups()[0]

            elif method == "set-uniform-branchlength":
                for n in tree.chain.keys():
                    tree.node(n).data.branchlength = options.value

            elif method == "build-map":
                # build a map of identifiers
                options.write_map = True
                for n in tree.get_terminals():
                    node = tree.node(n)
                    if node.data.taxon not in map_old2new:
                        new = options.template_identifier % (
                            len(map_old2new) + 1)
                        map_old2new[node.data.taxon] = new
                    node.data.taxon = map_old2new[node.data.taxon]

            elif method == "remove-pattern":
                if species2remove is None:
                    species2remove = re.compile(options.parameters[0])
                    # Consume only the pattern. This used to delete the
                    # whole ``parameters`` attribute, breaking every
                    # later access to it.
                    del options.parameters[0]

                # keep all terminals that do not match the pattern
                taxa = []
                for n in tree.get_terminals():
                    taxon = tree.node(n).data.taxon
                    if not species2remove.search(taxon):
                        taxa.append(taxon)

                TreeTools.PruneTree(tree, taxa)

            elif method == "add-node-names":
                # label unnamed nodes inode0, inode1, ...
                inode = 0
                for n, node in tree.chain.items():
                    if not node.data.taxon:
                        node.data.taxon = "inode%i" % inode
                        inode += 1

            elif method == "newick2nhx":
                # convert names to species names
                for n in tree.get_terminals():
                    fields = tree.node(n).data.taxon.split("|")
                    if len(fields) >= 2:
                        tree.node(n).data.species = fields[0]

        ntree += 1

    if options.output_format == "nh":
        options.stdout.write(
            TreeTools.Nexus2Newick(
                nexus,
                write_all_taxa=True,
                with_branchlengths=options.with_branchlengths) + "\n")
    else:
        for tree in nexus.trees:
            tree.writeToFile(options.stdout, format=options.output_format)

    return ntotal, nskipped, ntree
def prepareRun(self):
    """Write the input files for a phylip run into a fresh temporary directory.

    The directory is created with ``tempfile.mkdtemp`` and stored in
    ``self.mTempdir``. Depending on which inputs are set, the following
    files are written:

    ``infile``
        from ``self.mInputMatrix`` (lines in phylip-like format; the
        first line is copied verbatim, identifiers in the other lines are
        mapped and whitespace is collapsed to single spaces), from
        ``self.mInputData`` (rows of ``[identifier, value, ...]``) or
        from ``self.mInputMali`` (written in phylip format).
    ``intree``
        from ``self.mInputTree`` or ``self.mInputTrees``, one newick tree
        per line. Trees with duplicate taxa after identifier mapping are
        skipped. ``self.mNInputTrees`` is set to the number of trees
        written.

    Identifiers are translated through ``self.mMapInput2Phylip``, which
    is maintained by ``self.updateMaps``.

    NOTE(review): supplying both input data and a multiple alignment is
    not rejected; the alignment then overwrites ``infile``.

    Raises
    ------
    ValueError
        if both an input matrix and input data, or both an input matrix
        and a multiple alignment, are supplied.
    UsageError
        if both a single tree and a list of trees are supplied.
    """
    self.__reset()

    # Check for conflicting inputs before anything is created on disk so
    # that a usage error leaves neither a temporary directory nor a
    # half-written file behind.
    if self.mInputMatrix and self.mInputData:
        raise ValueError(
            "please specify either input matrix or input data, but not both."
        )

    if self.mInputMali and self.mInputMatrix:
        # was a string exception, which raises TypeError on python >= 2.6
        raise ValueError("both mali and matrix supplied - infile conflict.")

    if self.mInputTree and self.mInputTrees:
        raise UsageError(
            "please supply either one or mupltiple trees, but not both."
        )

    self.mTempdir = tempfile.mkdtemp()

    infile_path = self.mTempdir + "/infile"
    intree_path = self.mTempdir + "/intree"

    # prepare input matrix. Should already be in phylip like
    # format, but long identifiers are shortened and tabs are
    # replaced by spaces.
    ntaxa = None
    if self.mInputMatrix:
        rows = [re.split(r"\s+", line[:-1])
                for line in self.mInputMatrix[1:]]
        identifiers = [row[0] for row in rows]
        self.updateMaps(identifiers)

        with open(infile_path, "w") as outfile:
            outfile.write(self.mInputMatrix[0])
            for row in rows:
                outfile.write(self.mMapInput2Phylip[row[0]] +
                              " " + " ".join(row[1:]) + "\n")

        ntaxa = len(identifiers)

    elif self.mInputData:
        identifiers = [row[0] for row in self.mInputData]
        self.updateMaps(identifiers)

        with open(infile_path, "w") as outfile:
            # header: number of rows and number of values per row
            outfile.write("%i %i\n" % (len(self.mInputData),
                                       len(self.mInputData[0]) - 1))
            for row in self.mInputData:
                outfile.write("%-10s %s\n" %
                              (self.mMapInput2Phylip[row[0]],
                               " ".join(row[1:])))

        ntaxa = len(identifiers)

    if ntaxa is not None and self.mLogLevel >= 1:
        print("# written input matrix with %i taxa to %s" %
              (ntaxa, infile_path))
        # echo the file that was just written
        os.system("cat %s" % infile_path)

    # prepare input tree or trees
    self.mNInputTrees = 0
    if self.mInputTree or self.mInputTrees:

        if self.mInputTree:
            trees = [self.mInputTree]
        else:
            trees = self.mInputTrees

        with open(intree_path, "w") as outfile:
            for tree in trees:

                if self.mPruneTree:
                    # restrict the tree to taxa that are already mapped
                    TreeTools.PruneTree(tree, list(self.mMapInput2Phylip))

                self.updateMaps(TreeTools.GetTaxa(tree))
                TreeTools.MapTaxa(tree, self.mMapInput2Phylip)

                # check if taxa are unique
                taxa = tree.get_taxa()
                if len(set(taxa)) != len(taxa):
                    if self.mLogLevel >= 1:
                        print("# skipping tree %s because of duplicate taxa." %
                              (tree.name))
                    continue

                newick = TreeTools.Tree2Newick(tree)
                outfile.write(newick + "\n")
                self.mNInputTrees += 1

                if self.mLogLevel >= 1:
                    print("# written input tree with %i taxa to %s" %
                          (len(TreeTools.GetTaxa(tree)), intree_path))
                    print("# %s" % newick)

    # prepare input multiple alignment
    if self.mInputMali:
        identifiers = self.mInputMali.getIdentifiers()
        self.updateMaps(identifiers)
        self.mInputMali.mapIdentifiers(self.mMapInput2Phylip)

        with open(infile_path, "w") as outfile:
            self.mInputMali.writeToFile(outfile, format="phylip")

        if self.mLogLevel >= 1:
            # the alignment goes to infile; the message used to name intree
            print("# written input multiple alignments with %i taxa and with %i to %s" %
                  (self.mInputMali.getLength(),
                   self.mInputMali.getWidth(),
                   infile_path))