def testReroot(self):
    """Exercise NXTree.reroot on a small eight-node tree.

    The tree is rerooted (on deep copies) at nodes 1, 2 and 7, and the
    breadth-first order of each result is checked level by level.
    """
    graph = NX.DiGraph()
    for parent, child in [(1, 2), (1, 3), (2, 4), (2, 5),
                          (5, 6), (5, 7), (5, 8)]:
        graph.add_edge(parent, child)
    nxTree = NXTree(graph)
    nxTree.setWeight(1, 2, 3.1)
    originalOrder = list(nxTree.breadthFirstTraversal())

    # Rerooting at node 1 is expected to leave the BFS order untouched.
    atOne = copy.deepcopy(nxTree)
    atOne.reroot(1)
    order = list(atOne.breadthFirstTraversal())
    assert order == originalOrder

    # Rerooting at node 2: the old edge 1->2 is now reached as 2->1 and
    # must still carry its weight.
    atTwo = copy.deepcopy(nxTree)
    atTwo.reroot(2)
    order = list(atTwo.breadthFirstTraversal())
    assert order[0] == 2
    assert sorted(order[1:4]) == [1, 4, 5]
    assert sorted(order[4:]) == [3, 6, 7, 8]
    assert atTwo.getWeight(2, 1) == 3.1

    # Rerooting at leaf 7: levels are checked as sorted slices because
    # sibling order within a level is not pinned down by the test.
    atSeven = copy.deepcopy(nxTree)
    atSeven.reroot(7)
    order = list(atSeven.breadthFirstTraversal())
    assert order[0] == 7
    assert order[1] == 5
    assert sorted(order[2:5]) == [2, 6, 8]
    assert sorted(order[5:7]) == [1, 4]
    assert order[7] == 3
def testReroot(self):
    """Check the breadth-first order of a tree after rerooting it.

    Three reroots are tried, each on a fresh deep copy of the same tree:
    at node 1, at internal node 2 and at leaf 7.
    """
    graph = NX.DiGraph()
    for src, dst in ((1, 2), (1, 3), (2, 4), (2, 5), (5, 6), (5, 7), (5, 8)):
        graph.add_edge(src, dst)
    nxTree = NXTree(graph)
    nxTree.setWeight(1, 2, 3.1)
    baseline = list(nxTree.breadthFirstTraversal())

    def rerootedCopy(newRoot):
        # Work on a copy so every scenario starts from the same tree.
        clone = copy.deepcopy(nxTree)
        clone.reroot(newRoot)
        return clone, list(clone.breadthFirstTraversal())

    testTree, bfs = rerootedCopy(1)
    assert bfs == baseline

    testTree, bfs = rerootedCopy(2)
    assert bfs[0] == 2
    assert sorted(bfs[1:4]) == [1, 4, 5]
    assert sorted(bfs[4:]) == [3, 6, 7, 8]
    # The weight set on 1->2 must be readable on the flipped edge 2->1.
    assert testTree.getWeight(2, 1) == 3.1

    testTree, bfs = rerootedCopy(7)
    assert bfs[0] == 7
    assert bfs[1] == 5
    assert sorted(bfs[2:5]) == [2, 6, 8]
    assert sorted(bfs[5:7]) == [1, 4]
    assert bfs[7] == 3
def parseString(self, newickString, addImpliedRoots=True):
    """Parse a Newick string into a fresh NXTree and return it.

    newickString    -- the Newick text; after whitespace filtering it must
                       end with ';' (enforced with an assert).
    addImpliedRoots -- forwarded unchanged to the node-building routine.

    The parser state (nxTree, inString, bracket table, nextId) is reset on
    every call.
    """
    # Reset all per-parse state before touching the input.
    self.nxTree = NXTree()
    filtered = self.__filterWhitespace(newickString)
    self.inString = filtered
    self.__createBracketTable()
    self.nextId = 0

    assert self.inString[-1] == ';'
    # The terminating ';' is excluded from the range handed to __addNode.
    lengthWithoutTerminator = len(self.inString) - 1
    self.__addNode(0, lengthWithoutTerminator, None, addImpliedRoots)

    # NOTE(review): the result of isTree() is discarded; presumably it
    # validates internally -- confirm against NXTree.
    self.nxTree.isTree()
    return self.nxTree
def testTraversals(self):
    """Each traversal must yield as many distinct ids as the graph has nodes.

    Checked for pre-order, post-order and breadth-first traversal of every
    tree in self.trees.
    """
    traversalNames = ("preOrderTraversal",
                      "postOrderTraversal",
                      "breadthFirstTraversal")
    for tree in self.trees:
        nxTree = NXTree(tree)
        for traversalName in traversalNames:
            visited = list(getattr(nxTree, traversalName)())
            # Compare the number of distinct visited ids to the node count.
            assert len(set(visited)) == len(nxTree.nxDg.nodes())
def starTree(self):
    """Replace self.tree with a star tree built from self.pathMap.

    Node 0 becomes the root; every name in pathMap becomes a child of the
    root (labelled 1, 2, ...) attached with weight SeqFile.branchLen.
    """
    self.tree = NXTree()
    star = self.tree
    rootLabel = 0
    star.nxDg.add_node(rootLabel)
    star.rootId = rootLabel
    # Child labels start at 1 because 0 is taken by the root.
    for childLabel, name in enumerate(self.pathMap.keys(), 1):
        star.nxDg.add_edge(0, childLabel)
        star.setName(childLabel, name)
        star.setWeight(0, childLabel, SeqFile.branchLen)
def __init__(self, tree=None):
    """Initialise from an NXTree, or from whatever NXTree.__init__ accepts.

    When `tree` is an NXTree, only its underlying graph (tree.nxDg) is
    handed to the base initialiser; any other value is passed through.
    """
    graphOrNone = tree.nxDg if isinstance(tree, NXTree) else tree
    NXTree.__init__(self, graphOrNone)
    # ids of all subtree roots for fast checking
    self.subtreeRoots = set()
    # map of names to node ids
    self.nameToId = dict()
    for nodeId in self.breadthFirstTraversal():
        if not self.hasName(nodeId):
            continue
        self.nameToId[self.getName(nodeId)] = nodeId
def __init__(self, tree=None, subtreeSize=2):
    """Initialise from an NXTree, or from whatever NXTree.__init__ accepts.

    tree        -- an NXTree (its nxDg graph is reused) or a value passed
                   straight to NXTree.__init__.
    subtreeSize -- size of a subtree, in number of leaves.
    """
    graphOrNone = tree.nxDg if isinstance(tree, NXTree) else tree
    NXTree.__init__(self, graphOrNone)
    # ids of all subtree roots for fast checking
    self.subtreeRoots = set()
    # map of names to node ids
    self.nameToId = dict()
    for nodeId in self.breadthFirstTraversal():
        if not self.hasName(nodeId):
            continue
        self.nameToId[self.getName(nodeId)] = nodeId
    # size of a subtree (in number of leaves)
    self.subtreeSize = subtreeSize
def parseString(self, newickString, addImpliedRoots=True):
    """Build a new NXTree from Newick text and return it.

    newickString    -- Newick text; once whitespace is filtered out, its
                       last character must be ';' (checked by an assert).
    addImpliedRoots -- passed through to the recursive node builder.

    All parser state is re-initialised, so one parser object can be used
    for several strings in sequence.
    """
    self.nxTree = NXTree()
    cleaned = self.__filterWhitespace(newickString)
    self.inString = cleaned
    self.__createBracketTable()
    self.nextId = 0

    assert self.inString[-1] == ';'
    # Parse everything up to, but not including, the trailing ';'.
    endOfTree = len(self.inString) - 1
    self.__addNode(0, endOfTree, None, addImpliedRoots)

    # NOTE(review): isTree()'s return value is ignored here; presumably
    # the call checks the structure internally -- confirm.
    self.nxTree.isTree()
    return self.nxTree
class NXNewick(object):
    """Convert between Newick text and NXTree objects.

    Reading: parseString / parseFile build a new NXTree, numbering nodes
    with consecutive integer ids in the order they are created.
    Writing: writeString / writeFile serialise an NXTree back to Newick.
    """

    def __init__(self, nxTree=None):
        """Create a parser/writer, optionally bound to an existing tree.

        nxTree -- tree used by writeString()/writeFile() when they are
                  called without an explicit tree. (This argument used to
                  be silently discarded.)
        """
        self.nxTree = nxTree
        self.bracketMatch = None
        self.inString = None
        self.nextId = 0
        self.outString = None

    def parseFile(self, path):
        """Parse the Newick text stored in the file at `path`.

        Returns the parsed NXTree. The file is closed even when reading
        or parsing raises.
        """
        with open(path) as inFile:
            self.parseString(inFile.read())
        return self.nxTree

    def parseString(self, newickString, addImpliedRoots=True):
        """Parse a Newick string into a fresh NXTree and return it.

        newickString    -- Newick text; after whitespace filtering it must
                           end with ';' (enforced with an assert).
        addImpliedRoots -- when True and the outermost node carries a
                           branch length, an extra root node is created
                           above it to hold that edge.
        """
        self.nxTree = NXTree()
        self.inString = self.__filterWhitespace(newickString)
        self.__createBracketTable()
        self.nextId = 0
        assert self.inString[-1] == ';'
        # The trailing ';' is excluded from the range that is parsed.
        self.__addNode(0, len(self.inString) - 1, None, addImpliedRoots)
        # NOTE(review): the return value of isTree() is discarded;
        # presumably it validates internally -- confirm against NXTree.
        self.nxTree.isTree()
        return self.nxTree

    def writeString(self, nxTree=None):
        """Serialise a tree to a Newick string (terminated by ';').

        nxTree -- tree to write; when falsy, the tree already held by this
                  object is written instead.
        """
        if nxTree:
            self.nxTree = nxTree
        self.outString = ""
        self.__writeNode(self.nxTree.getRootId(), None)
        self.outString += ";"
        return self.outString

    def writeFile(self, path, nxTree=None):
        """Write the Newick form of a tree, plus a newline, to `path`.

        The file is closed even when serialisation raises.
        """
        with open(path, "w") as outFile:
            outFile.write(self.writeString(nxTree))
            outFile.write("\n")
        return None

    #### PRIVATE WRITING FUNCTIONS ####

    def __writeNode(self, node, parent=None):
        """Append the Newick text of the subtree rooted at `node`.

        Emits "(child,child,...)" for internal nodes, then the node name
        (double-quoted when it contains whitespace), then ":weight" when
        the edge from `parent` has a weight.
        """
        children = self.nxTree.getChildren(node)
        if len(children) > 0:
            self.outString += "("
            for child in children:
                self.__writeNode(child, node)
                # Separator after every child except the last one.
                if child != children[-1]:
                    self.outString += ","
            self.outString += ")"

        name = self.nxTree.getName(node)
        if len(name) > 0:
            containsSpace = any(c1 in name for c1 in ws)
            if containsSpace:
                self.outString += "\""
            self.outString += name
            if containsSpace:
                self.outString += "\""

        if parent is not None:
            weight = self.nxTree.getWeight(parent, node, defaultValue=None)
            if weight is not None:
                self.outString += ":%s" % str(weight)

    #### PRIVATE READING FUNCTIONS ####

    def __filterWhitespace(self, newickString):
        """Return the input without quote characters and unquoted whitespace.

        Single and double quotes both toggle the quoted state and are
        dropped; whitespace is kept only while inside quotes.
        """
        kept = []
        inQuote = False
        for c in newickString:
            if c == "\'" or c == "\"":
                inQuote = not inQuote
            elif inQuote or c not in ws:
                kept.append(c)
        # Join once instead of growing a string character by character.
        return "".join(kept)

    def __createBracketTable(self):
        """Fill self.bracketMatch with {index of '(': index of matching ')'}."""
        bracketStack = []
        self.bracketMatch = dict()
        for index, c in enumerate(self.inString):
            if c == '(':
                bracketStack.append(index)
            elif c == ')':
                leftIndex = bracketStack.pop()
                self.bracketMatch[leftIndex] = index
        # Every '(' must have been closed.
        assert len(bracketStack) == 0

    def __childRanges(self, start, length):
        """Split a bracket interior into (start, length) ranges, one per child.

        start  -- index of the first character after the opening '('.
        length -- number of characters between the brackets.

        Only top-level commas split children: nested bracket groups are
        skipped over using the bracket table.
        """
        ranges = []
        currentStart = start
        i = currentStart
        while i < start + length + 1:
            # A child ends at a top-level ',' or at the closing bracket.
            if self.inString[i] == ',' or i == start + length:
                ranges.append((currentStart, i - currentStart))
                currentStart = i + 1
            if i in self.bracketMatch:
                # Jump to the matching ')' so nested commas are ignored.
                i = self.bracketMatch[i]
            i += 1
        return ranges

    def __parseName(self, nameString):
        """Split "name:weight" into a (name, weight) pair of strings.

        A missing weight is returned as ''; the bare terminator ';' yields
        ('', '').
        """
        if nameString == ';':
            return ('', '')
        tokens = nameString.split(':')
        assert len(tokens) == 1 or len(tokens) == 2
        name = tokens[0]
        weight = ''
        if len(tokens) == 2:
            weight = tokens[1]
        return (name, weight)

    def __addNode(self, start, length, parent=None, addImpliedRoots=True):
        """Recursively add the node described by inString[start:start+length].

        start, length   -- range of the node's text within self.inString.
        parent          -- id of the parent node, or None for the outermost
                           node.
        addImpliedRoots -- see parseString.
        """
        # parse the children (..,...,..)
        children = []
        if self.inString[start] == '(':
            assert start in self.bracketMatch
            chLength = self.bracketMatch[start] - start - 1
            children = self.__childRanges(start + 1, chLength)
            # Continue with what follows the closing bracket: "name:weight".
            start = self.bracketMatch[start] + 1
            length -= (chLength + 2)

        # parse the name abc:123
        name, weight = self.__parseName(self.inString[start:start + length])
        nodeId = self.nextId
        self.nextId += 1
        self.nxTree.nxDg.add_node(nodeId)
        if len(name) > 0:
            # NOTE(review): `DiGraph.node` is gone from recent networkx
            # releases (`nodes` replaces it) -- confirm the pinned version.
            self.nxTree.nxDg.node[nodeId]['name'] = name

        # update the graph
        if parent is not None:
            self.nxTree.nxDg.add_edge(parent, nodeId)
            if len(weight) > 0:
                self.nxTree.nxDg[parent][nodeId]['weight'] = float(weight)

        # update the root (implied roots are added as a new node)
        if self.nxTree.getRootId() is None:
            assert parent is None
            root = nodeId
            if len(weight) > 0:
                if addImpliedRoots:
                    root = self.nextId
                    self.nextId += 1
                    self.nxTree.nxDg.add_edge(root, nodeId)
                    # Store a float, like every other edge weight; the raw
                    # string used to be stored here.
                    self.nxTree.setWeight(root, nodeId, float(weight))
            self.nxTree.rootId = root

        # recurse on children
        for childStart, childLength in children:
            self.__addNode(childStart, childLength, nodeId, addImpliedRoots)
class NXNewick(object):
    """Convert between Newick text and NXTree objects.

    Reading: parseString / parseFile build a new NXTree, numbering nodes
    with consecutive integer ids in the order they are created.
    Writing: writeString / writeFile serialise an NXTree back to Newick.
    """

    def __init__(self, nxTree=None):
        """Create a parser/writer, optionally bound to an existing tree.

        nxTree -- tree used by writeString()/writeFile() when they are
                  called without an explicit tree. (This argument used to
                  be silently discarded.)
        """
        self.nxTree = nxTree
        self.bracketMatch = None
        self.inString = None
        self.nextId = 0
        self.outString = None

    def parseFile(self, path):
        """Parse the Newick text stored in the file at `path`.

        Returns the parsed NXTree. The file is closed even when reading
        or parsing raises.
        """
        with open(path) as inFile:
            self.parseString(inFile.read())
        return self.nxTree

    def parseString(self, newickString, addImpliedRoots=True):
        """Parse a Newick string into a fresh NXTree and return it.

        newickString    -- Newick text; after whitespace filtering it must
                           end with ';' (enforced with an assert).
        addImpliedRoots -- when True and the outermost node carries a
                           branch length, an extra root node is created
                           above it to hold that edge.
        """
        self.nxTree = NXTree()
        self.inString = self.__filterWhitespace(newickString)
        self.__createBracketTable()
        self.nextId = 0
        assert self.inString[-1] == ';'
        # The trailing ';' is excluded from the range that is parsed.
        self.__addNode(0, len(self.inString) - 1, None, addImpliedRoots)
        # NOTE(review): the return value of isTree() is discarded;
        # presumably it validates internally -- confirm against NXTree.
        self.nxTree.isTree()
        return self.nxTree

    def writeString(self, nxTree=None):
        """Serialise a tree to a Newick string (terminated by ';').

        nxTree -- tree to write; when falsy, the tree already held by this
                  object is written instead.
        """
        if nxTree:
            self.nxTree = nxTree
        self.outString = ""
        self.__writeNode(self.nxTree.getRootId(), None)
        self.outString += ";"
        return self.outString

    def writeFile(self, path, nxTree=None):
        """Write the Newick form of a tree, plus a newline, to `path`.

        The file is closed even when serialisation raises.
        """
        with open(path, "w") as outFile:
            outFile.write(self.writeString(nxTree))
            outFile.write("\n")
        return None

    #### PRIVATE WRITING FUNCTIONS ####

    def __writeNode(self, node, parent=None):
        """Append the Newick text of the subtree rooted at `node`.

        Emits "(child,child,...)" for internal nodes, then the node name
        (double-quoted when it contains whitespace), then ":weight" when
        the edge from `parent` has a weight.
        """
        children = self.nxTree.getChildren(node)
        if len(children) > 0:
            self.outString += "("
            for child in children:
                self.__writeNode(child, node)
                # Separator after every child except the last one.
                if child != children[-1]:
                    self.outString += ","
            self.outString += ")"

        name = self.nxTree.getName(node)
        if len(name) > 0:
            containsSpace = any(c1 in name for c1 in ws)
            if containsSpace:
                self.outString += "\""
            self.outString += name
            if containsSpace:
                self.outString += "\""

        if parent is not None:
            weight = self.nxTree.getWeight(parent, node, defaultValue=None)
            if weight is not None:
                self.outString += ":%s" % str(weight)

    #### PRIVATE READING FUNCTIONS ####

    def __filterWhitespace(self, newickString):
        """Return the input without quote characters and unquoted whitespace.

        Single and double quotes both toggle the quoted state and are
        dropped; whitespace is kept only while inside quotes.
        """
        kept = []
        inQuote = False
        for c in newickString:
            if c == "\'" or c == "\"":
                inQuote = not inQuote
            elif inQuote or c not in ws:
                kept.append(c)
        # Join once instead of growing a string character by character.
        return "".join(kept)

    def __createBracketTable(self):
        """Fill self.bracketMatch with {index of '(': index of matching ')'}."""
        bracketStack = []
        self.bracketMatch = dict()
        for index, c in enumerate(self.inString):
            if c == '(':
                bracketStack.append(index)
            elif c == ')':
                leftIndex = bracketStack.pop()
                self.bracketMatch[leftIndex] = index
        # Every '(' must have been closed.
        assert len(bracketStack) == 0

    def __childRanges(self, start, length):
        """Split a bracket interior into (start, length) ranges, one per child.

        start  -- index of the first character after the opening '('.
        length -- number of characters between the brackets.

        Only top-level commas split children: nested bracket groups are
        skipped over using the bracket table.
        """
        ranges = []
        currentStart = start
        i = currentStart
        while i < start + length + 1:
            # A child ends at a top-level ',' or at the closing bracket.
            if self.inString[i] == ',' or i == start + length:
                ranges.append((currentStart, i - currentStart))
                currentStart = i + 1
            if i in self.bracketMatch:
                # Jump to the matching ')' so nested commas are ignored.
                i = self.bracketMatch[i]
            i += 1
        return ranges

    def __parseName(self, nameString):
        """Split "name:weight" into a (name, weight) pair of strings.

        A missing weight is returned as ''; the bare terminator ';' yields
        ('', '').
        """
        if nameString == ';':
            return ('', '')
        tokens = nameString.split(':')
        assert len(tokens) == 1 or len(tokens) == 2
        name = tokens[0]
        weight = ''
        if len(tokens) == 2:
            weight = tokens[1]
        return (name, weight)

    def __addNode(self, start, length, parent=None, addImpliedRoots=True):
        """Recursively add the node described by inString[start:start+length].

        start, length   -- range of the node's text within self.inString.
        parent          -- id of the parent node, or None for the outermost
                           node.
        addImpliedRoots -- see parseString.
        """
        # parse the children (..,...,..)
        children = []
        if self.inString[start] == '(':
            assert start in self.bracketMatch
            chLength = self.bracketMatch[start] - start - 1
            children = self.__childRanges(start + 1, chLength)
            # Continue with what follows the closing bracket: "name:weight".
            start = self.bracketMatch[start] + 1
            length -= (chLength + 2)

        # parse the name abc:123
        name, weight = self.__parseName(self.inString[start:start + length])
        nodeId = self.nextId
        self.nextId += 1
        self.nxTree.nxDg.add_node(nodeId)
        if len(name) > 0:
            # NOTE(review): `DiGraph.node` is gone from recent networkx
            # releases (`nodes` replaces it) -- confirm the pinned version.
            self.nxTree.nxDg.node[nodeId]['name'] = name

        # update the graph
        if parent is not None:
            self.nxTree.nxDg.add_edge(parent, nodeId)
            if len(weight) > 0:
                self.nxTree.nxDg[parent][nodeId]['weight'] = float(weight)

        # update the root (implied roots are added as a new node)
        if self.nxTree.getRootId() is None:
            assert parent is None
            root = nodeId
            if len(weight) > 0:
                if addImpliedRoots:
                    root = self.nextId
                    self.nextId += 1
                    self.nxTree.nxDg.add_edge(root, nodeId)
                    # Store a float, like every other edge weight; the raw
                    # string used to be stored here.
                    self.nxTree.setWeight(root, nodeId, float(weight))
            self.nxTree.rootId = root

        # recurse on children
        for childStart, childLength in children:
            self.__addNode(childStart, childLength, nodeId, addImpliedRoots)
class SeqFile:
    """In-memory form of a sequence file: a species tree plus name -> path.

    The file may contain one Newick tree line and any number of
    "name path" lines; '#' starts a comment line. A name prefixed with
    '*' is recorded as an outgroup. When no tree line is present, a star
    tree over all named sequences is generated.
    """

    # Branch length used for generated edges and for edges lacking one.
    branchLen = 1

    def __init__(self, path=None):
        """Create an empty SeqFile, or parse `path` when it is given."""
        # Always define the attributes so an unparsed instance is usable.
        self.tree = None
        self.pathMap = dict()
        self.outgroups = []
        if path is not None:
            self.parseFile(path)

    def parseFile(self, path):
        """Parse the sequence file at `path`, replacing any previous state.

        Raises RuntimeError when the file is missing, the tree line cannot
        be parsed, a name is duplicated, or the result fails cleanTree /
        validate.
        """
        if not os.path.isfile(path):
            raise RuntimeError("File not found: %s" % path)
        self.tree = None
        self.pathMap = dict()
        self.outgroups = []
        with open(path, "r") as seqFile:
            for rawLine in seqFile:
                line = rawLine.strip()
                if not line or line[0] == "#":
                    continue
                tokens = line.split()
                if self.tree is None and (len(tokens) == 1 or
                                          line[0] == '('):
                    # First single-token or '('-prefixed line is the tree.
                    try:
                        self.tree = NXNewick().parseString(line)
                    except Exception:
                        raise RuntimeError(
                            "Failed to parse newick tree: %s" % line)
                elif len(tokens) > 0 and tokens[0] == '*':
                    sys.stderr.write("Skipping line %s\n" % rawLine)
                elif line[0] != '(' and len(tokens) >= 2:
                    name = tokens[0]
                    if name[0] == '*':
                        # A leading '*' marks the sequence as an outgroup.
                        name = name[1:]
                        self.outgroups.append(name)
                    # Everything after the name is the path; runs of
                    # whitespace collapse to single spaces.
                    seqPath = " ".join(tokens[1:])
                    if name in self.pathMap:
                        raise RuntimeError(
                            "Duplicate name found: %s" % name)
                    self.pathMap[name] = seqPath
                elif len(tokens) > 0:
                    sys.stderr.write("Skipping line %s\n" % rawLine)

        if self.tree is None:
            self.starTree()
        self.cleanTree()
        self.validate()

    def starTree(self):
        """Set self.tree to a star tree: root 0 with one child per name."""
        self.tree = NXTree()
        label = 0
        self.tree.nxDg.add_node(label)
        self.tree.rootId = label
        for name in self.pathMap.keys():
            label += 1
            self.tree.nxDg.add_edge(0, label)
            self.tree.setName(label, name)
            self.tree.setWeight(0, label, SeqFile.branchLen)

    def validate(self):
        """Raise RuntimeError unless the tree and the path map agree.

        Checks that the tree has more than two nodes, that every leaf has
        a sequence path, and that each of those paths exists on disk.
        """
        if len(list(self.tree.postOrderTraversal())) <= 2:
            raise RuntimeError("At least two valid leaf genomes required in"
                               " input tree")
        for node in self.tree.postOrderTraversal():
            if self.tree.isLeaf(node):
                name = self.tree.getName(node)
                if name not in self.pathMap:
                    raise RuntimeError("No sequence specified for %s" % name)
                seqPath = self.pathMap[name]
                # The function must be called: the bare `os.path.exists`
                # object is always truthy, so this check never fired.
                if not os.path.exists(seqPath):
                    raise RuntimeError(
                        "Sequence path not found: %s" % seqPath)

    # remove leaves that do not have sequence data associated with them
    def cleanTree(self):
        """Drop leaves without a sequence path and fill in branch lengths.

        Raises RuntimeError when the tree has fewer than two leaves or when
        none of its leaves has a sequence path.
        """
        numLeaves = 0
        removeList = []
        for node in self.tree.postOrderTraversal():
            if self.tree.isLeaf(node):
                name = self.tree.getName(node)
                if name not in self.pathMap:
                    removeList.append(node)
                # Counts every leaf, including those about to be removed.
                numLeaves += 1
        if numLeaves < 2:
            raise RuntimeError("At least two valid leaf genomes required in"
                               " input tree")
        if len(removeList) == numLeaves:
            raise RuntimeError(
                "No sequence path specified for any leaves in the tree")
        for leaf in removeList:
            sys.stderr.write("No sequence path found for %s: skipping\n" % (
                self.tree.getName(leaf)))
            self.tree.removeLeaf(leaf)

        for node in self.tree.postOrderTraversal():
            if self.tree.hasParent(node):
                parent = self.tree.getParent(node)
                if self.tree.getWeight(parent, node) is None:
                    sys.stderr.write(
                        "No branch length for %s: setting to %d\n" % (
                            self.tree.getName(node), SeqFile.branchLen))
                    self.tree.setWeight(parent, node, SeqFile.branchLen)

    # create the cactus_workflow_experiment xml element which serves as
    # the root node of the experiment template file needed by
    # cactus_createMultiCactusProject. Note the element is incomplete
    # until the cactus_disk child element has been added
    def toXMLElement(self):
        """Return the cactus_workflow_experiment element for this file.

        The "sequences" attribute lists the leaf paths (through
        absSymPath) in post-order, each followed by a space.
        """
        assert self.tree is not None
        elem = ET.Element("cactus_workflow_experiment")
        seqString = ""
        for node in self.tree.postOrderTraversal():
            if self.tree.isLeaf(node):
                name = self.tree.getName(node)
                seqPath = self.pathMap[name]
                # NOTE(review): a `path.replace(" ", "\ ")` statement stood
                # here, but its result was discarded, so spaces were never
                # escaped. It is removed as dead code; output is unchanged.
                seqString += absSymPath(seqPath) + " "
        elem.attrib["sequences"] = seqString
        elem.attrib["species_tree"] = NXNewick().writeString(self.tree)
        elem.attrib["config"] = "defaultProgressive"
        return elem
def testRoot(self):
    """Every tree must report a root id, and that root must have no parent."""
    for graph in self.trees:
        wrapped = NXTree(graph)
        root = wrapped.getRootId()
        assert root is not None
        assert wrapped.getParent(root) is None
class SeqFile:
    """In-memory form of a sequence file: a species tree plus name -> path.

    The file may contain one Newick tree line and any number of
    "name path" lines; '#' starts a comment line. A name prefixed with
    '*' is recorded as an outgroup. When no tree line is present, a star
    tree over all named sequences is generated.
    """

    # Branch length used for generated edges and for edges lacking one.
    branchLen = 1

    def __init__(self, path=None):
        """Create an empty SeqFile, or parse `path` when it is given."""
        # Always define the attributes so an unparsed instance is usable.
        self.tree = None
        self.pathMap = dict()
        self.outgroups = []
        if path is not None:
            self.parseFile(path)

    def parseFile(self, path):
        """Parse the sequence file at `path`, replacing any previous state.

        Raises RuntimeError when the file is missing, the tree line cannot
        be parsed, a name is duplicated, or the result fails cleanTree /
        validate.
        """
        if not os.path.isfile(path):
            raise RuntimeError("File not found: %s" % path)
        self.tree = None
        self.pathMap = dict()
        self.outgroups = []
        with open(path, "r") as seqFile:
            for rawLine in seqFile:
                line = rawLine.strip()
                if not line or line[0] == "#":
                    continue
                tokens = line.split()
                if self.tree is None and (len(tokens) == 1 or
                                          line[0] == '('):
                    # First single-token or '('-prefixed line is the tree.
                    try:
                        self.tree = NXNewick().parseString(line)
                    except Exception:
                        raise RuntimeError(
                            "Failed to parse newick tree: %s" % line)
                elif len(tokens) > 0 and tokens[0] == '*':
                    sys.stderr.write("Skipping line %s\n" % rawLine)
                elif line[0] != '(' and len(tokens) >= 2:
                    name = tokens[0]
                    if name[0] == '*':
                        # A leading '*' marks the sequence as an outgroup.
                        name = name[1:]
                        self.outgroups.append(name)
                    # Everything after the name is the path; runs of
                    # whitespace collapse to single spaces.
                    seqPath = " ".join(tokens[1:])
                    if name in self.pathMap:
                        raise RuntimeError(
                            "Duplicate name found: %s" % name)
                    self.pathMap[name] = seqPath
                elif len(tokens) > 0:
                    sys.stderr.write("Skipping line %s\n" % rawLine)

        if self.tree is None:
            self.starTree()
        self.cleanTree()
        self.validate()

    def starTree(self):
        """Set self.tree to a star tree: root 0 with one child per name."""
        self.tree = NXTree()
        label = 0
        self.tree.nxDg.add_node(label)
        self.tree.rootId = label
        for name in self.pathMap.keys():
            label += 1
            self.tree.nxDg.add_edge(0, label)
            self.tree.setName(label, name)
            self.tree.setWeight(0, label, SeqFile.branchLen)

    def validate(self):
        """Raise RuntimeError unless the tree and the path map agree.

        Checks that the tree has more than two nodes and that every leaf
        has a sequence path.
        """
        if len(list(self.tree.postOrderTraversal())) <= 2:
            raise RuntimeError("At least two valid leaf genomes required in"
                               " input tree")
        for node in self.tree.postOrderTraversal():
            if self.tree.isLeaf(node):
                name = self.tree.getName(node)
                if name not in self.pathMap:
                    raise RuntimeError("No sequence specified for %s" % name)
                # The path-existence check and the sanityCheckSequence()
                # call were already disabled (commented out) here, so
                # neither is performed.

    def sanityCheckSequence(self, path):
        """Warns the user about common problems with the input sequences."""
        # Relies on cactus_analyseAssembly output staying in the
        # format it's currently in.
        # NOTE(review): `path` is interpolated into a shell command line
        # without quoting -- paths containing spaces or shell
        # metacharacters will misbehave.
        cmdline = "cactus_analyseAssembly"
        if os.path.isdir(path):
            cmdline = "cat %s/* | %s -" % (path, cmdline)
        else:
            cmdline += " %s" % path
        output = popenCatch(cmdline)

        repeatMatch = re.search(r'Proportion-repeat-masked: ([0-9.]*)',
                                output)
        nMatch = re.search(r'ProportionNs: ([0-9.]*)', output)
        if repeatMatch is None or nMatch is None:
            # The tool output lacks the expected fields; this is only a
            # best-effort warning pass, so give up quietly rather than
            # crash with AttributeError.
            sys.stderr.write("WARNING: could not parse "
                             "cactus_analyseAssembly output for sequence "
                             "path %s; skipping sanity checks.\n\n" % path)
            return
        try:
            repeatMaskedFrac = float(repeatMatch.group(1))
            nFrac = float(nMatch.group(1))
        except ValueError:
            # This can happen if the genome has 0 length, making the
            # fractions NaN. We warn the user but return afterwards, as
            # the rest of the checks are dependent on the fraction values.
            sys.stderr.write("WARNING: sequence path %s has 0 length. "
                             "Consider removing it from your input "
                             "file.\n\n" % path)
            return

        # These thresholds are pretty arbitrary, but should be good for
        # badly- to well-assembled vertebrate genomes.
        if repeatMaskedFrac > 0.70:
            sys.stderr.write("WARNING: sequence path %s has an extremely high "
                             "proportion of masked bases: %f. progressiveCactus"
                             " expects a soft-masked genome, i.e. all lowercase"
                             " characters are considered masked. The process "
                             "will proceed normally, but make sure you haven't "
                             "accidentally provided an all-lowercase genome, "
                             "in which case nothing will be aligned to "
                             "it!\n\n" % (path, repeatMaskedFrac))
        if nFrac > 0.30:
            sys.stderr.write("WARNING: sequence path %s has an extremely high "
                             "proportion of 'N' bases: %f. The process will "
                             "proceed normally, but make sure your genome "
                             "isn't hard-masked! Alignments to hard-masked "
                             "genomes are much worse than to soft-masked "
                             "genomes. If the genome just has a lot of "
                             "poorly assembled regions, feel free to "
                             "ignore this message.\n\n" % (path, nFrac))

    # remove leaves that do not have sequence data associated with them
    def cleanTree(self):
        """Drop leaves without a sequence path and fill in branch lengths.

        Raises RuntimeError when the tree has fewer than two leaves or when
        none of its leaves has a sequence path.
        """
        numLeaves = 0
        removeList = []
        for node in self.tree.postOrderTraversal():
            if self.tree.isLeaf(node):
                name = self.tree.getName(node)
                if name not in self.pathMap:
                    removeList.append(node)
                # Counts every leaf, including those about to be removed.
                numLeaves += 1
        if numLeaves < 2:
            raise RuntimeError("At least two valid leaf genomes required in"
                               " input tree")
        if len(removeList) == numLeaves:
            raise RuntimeError(
                "No sequence path specified for any leaves in the tree")
        for leaf in removeList:
            sys.stderr.write("No sequence path found for %s: skipping\n" % (
                self.tree.getName(leaf)))
            self.tree.removeLeaf(leaf)

        for node in self.tree.postOrderTraversal():
            if self.tree.hasParent(node):
                parent = self.tree.getParent(node)
                if self.tree.getWeight(parent, node) is None:
                    sys.stderr.write(
                        "No branch length for %s: setting to %d\n" % (
                            self.tree.getName(node), SeqFile.branchLen))
                    self.tree.setWeight(parent, node, SeqFile.branchLen)

    # create the cactus_workflow_experiment xml element which serves as
    # the root node of the experiment template file needed by
    # cactus_createMultiCactusProject. Note the element is incomplete
    # until the cactus_disk child element has been added
    def toXMLElement(self):
        """Return the cactus_workflow_experiment element for this file.

        The "sequences" attribute lists the leaf paths in post-order, each
        followed by a space.
        """
        assert self.tree is not None
        elem = ET.Element("cactus_workflow_experiment")
        seqString = ""
        for node in self.tree.postOrderTraversal():
            if self.tree.isLeaf(node):
                name = self.tree.getName(node)
                seqString += self.pathMap[name] + " "
        elem.attrib["sequences"] = seqString
        elem.attrib["species_tree"] = NXNewick().writeString(self.tree)
        elem.attrib["config"] = "defaultProgressive"
        return elem
class SeqFile:
    """In-memory form of a sequence file: a species tree plus name -> path.

    The file may contain one Newick tree line and any number of
    "name path" lines; '#' starts a comment line. A name prefixed with
    '*' is recorded as an outgroup. When no tree line is present, a star
    tree over all named sequences is generated.
    """

    # Branch length used for generated edges and for edges lacking one.
    branchLen = 1

    def __init__(self, path=None):
        """Create an empty SeqFile, or parse `path` when it is given."""
        # Always define the attributes so an unparsed instance is usable.
        self.tree = None
        self.pathMap = dict()
        self.outgroups = []
        if path is not None:
            self.parseFile(path)

    def parseFile(self, path):
        """Parse the sequence file at `path`, replacing any previous state.

        Raises RuntimeError when the file is missing, the tree line cannot
        be parsed, a name is duplicated, or the result fails cleanTree /
        validate.
        """
        if not os.path.isfile(path):
            raise RuntimeError("File not found: %s" % path)
        self.tree = None
        self.pathMap = dict()
        self.outgroups = []
        with open(path, "r") as seqFile:
            for rawLine in seqFile:
                line = rawLine.strip()
                if not line or line[0] == "#":
                    continue
                tokens = line.split()
                if self.tree is None and (len(tokens) == 1 or
                                          line[0] == '('):
                    # First single-token or '('-prefixed line is the tree.
                    try:
                        self.tree = NXNewick().parseString(line)
                    except Exception:
                        raise RuntimeError(
                            "Failed to parse newick tree: %s" % line)
                elif len(tokens) > 0 and tokens[0] == '*':
                    sys.stderr.write("Skipping line %s\n" % rawLine)
                elif line[0] != '(' and len(tokens) >= 2:
                    name = tokens[0]
                    if name[0] == '*':
                        # A leading '*' marks the sequence as an outgroup.
                        name = name[1:]
                        self.outgroups.append(name)
                    # Everything after the name is the path; runs of
                    # whitespace collapse to single spaces.
                    seqPath = " ".join(tokens[1:])
                    if name in self.pathMap:
                        raise RuntimeError(
                            "Duplicate name found: %s" % name)
                    self.pathMap[name] = seqPath
                elif len(tokens) > 0:
                    sys.stderr.write("Skipping line %s\n" % rawLine)

        if self.tree is None:
            self.starTree()
        self.cleanTree()
        self.validate()

    def starTree(self):
        """Set self.tree to a star tree: root 0 with one child per name."""
        self.tree = NXTree()
        label = 0
        self.tree.nxDg.add_node(label)
        self.tree.rootId = label
        for name in self.pathMap.keys():
            label += 1
            self.tree.nxDg.add_edge(0, label)
            self.tree.setName(label, name)
            self.tree.setWeight(0, label, SeqFile.branchLen)

    def validate(self):
        """Raise RuntimeError unless the tree and the path map agree.

        Checks that the tree has more than two nodes and that every leaf
        has a sequence path.
        """
        if len(list(self.tree.postOrderTraversal())) <= 2:
            raise RuntimeError("At least two valid leaf genomes required in"
                               " input tree")
        for node in self.tree.postOrderTraversal():
            if self.tree.isLeaf(node):
                name = self.tree.getName(node)
                if name not in self.pathMap:
                    raise RuntimeError("No sequence specified for %s" % name)
                # The path-existence check and the sanityCheckSequence()
                # call were already disabled (commented out) here, so
                # neither is performed.

    def sanityCheckSequence(self, path):
        """Warns the user about common problems with the input sequences."""
        # Relies on cactus_analyseAssembly output staying in the
        # format it's currently in.
        # NOTE(review): `path` is interpolated into a shell command line
        # without quoting -- paths containing spaces or shell
        # metacharacters will misbehave.
        cmdline = "cactus_analyseAssembly"
        if os.path.isdir(path):
            cmdline = "cat %s/* | %s -" % (path, cmdline)
        else:
            cmdline += " %s" % path
        output = popenCatch(cmdline)

        repeatMatch = re.search(r'Proportion-repeat-masked: ([0-9.]*)',
                                output)
        nMatch = re.search(r'ProportionNs: ([0-9.]*)', output)
        if repeatMatch is None or nMatch is None:
            # The tool output lacks the expected fields; this is only a
            # best-effort warning pass, so give up quietly rather than
            # crash with AttributeError.
            sys.stderr.write("WARNING: could not parse "
                             "cactus_analyseAssembly output for sequence "
                             "path %s; skipping sanity checks.\n\n" % path)
            return
        try:
            repeatMaskedFrac = float(repeatMatch.group(1))
            nFrac = float(nMatch.group(1))
        except ValueError:
            # This can happen if the genome has 0 length, making the
            # fractions NaN. We warn the user but return afterwards, as
            # the rest of the checks are dependent on the fraction values.
            sys.stderr.write("WARNING: sequence path %s has 0 length. "
                             "Consider removing it from your input "
                             "file.\n\n" % path)
            return

        # These thresholds are pretty arbitrary, but should be good for
        # badly- to well-assembled vertebrate genomes.
        if repeatMaskedFrac > 0.70:
            sys.stderr.write("WARNING: sequence path %s has an extremely high "
                             "proportion of masked bases: %f. progressiveCactus"
                             " expects a soft-masked genome, i.e. all lowercase"
                             " characters are considered masked. The process "
                             "will proceed normally, but make sure you haven't "
                             "accidentally provided an all-lowercase genome, "
                             "in which case nothing will be aligned to "
                             "it!\n\n" % (path, repeatMaskedFrac))
        if nFrac > 0.30:
            sys.stderr.write("WARNING: sequence path %s has an extremely high "
                             "proportion of 'N' bases: %f. The process will "
                             "proceed normally, but make sure your genome "
                             "isn't hard-masked! Alignments to hard-masked "
                             "genomes are much worse than to soft-masked "
                             "genomes. If the genome just has a lot of "
                             "poorly assembled regions, feel free to "
                             "ignore this message.\n\n" % (path, nFrac))

    # remove leaves that do not have sequence data associated with them
    def cleanTree(self):
        """Drop leaves without a sequence path and fill in branch lengths.

        Raises RuntimeError when the tree has fewer than two leaves or when
        none of its leaves has a sequence path.
        """
        numLeaves = 0
        removeList = []
        for node in self.tree.postOrderTraversal():
            if self.tree.isLeaf(node):
                name = self.tree.getName(node)
                if name not in self.pathMap:
                    removeList.append(node)
                # Counts every leaf, including those about to be removed.
                numLeaves += 1
        if numLeaves < 2:
            raise RuntimeError("At least two valid leaf genomes required in"
                               " input tree")
        if len(removeList) == numLeaves:
            raise RuntimeError(
                "No sequence path specified for any leaves in the tree")
        for leaf in removeList:
            sys.stderr.write("No sequence path found for %s: skipping\n" % (
                self.tree.getName(leaf)))
            self.tree.removeLeaf(leaf)

        for node in self.tree.postOrderTraversal():
            if self.tree.hasParent(node):
                parent = self.tree.getParent(node)
                if self.tree.getWeight(parent, node) is None:
                    sys.stderr.write(
                        "No branch length for %s: setting to %d\n" % (
                            self.tree.getName(node), SeqFile.branchLen))
                    self.tree.setWeight(parent, node, SeqFile.branchLen)

    # create the cactus_workflow_experiment xml element which serves as
    # the root node of the experiment template file needed by
    # cactus_createMultiCactusProject. Note the element is incomplete
    # until the cactus_disk child element has been added
    def toXMLElement(self):
        """Return the cactus_workflow_experiment element for this file.

        The "sequences" attribute lists the leaf paths in post-order, each
        followed by a space.
        """
        assert self.tree is not None
        elem = ET.Element("cactus_workflow_experiment")
        seqString = ""
        for node in self.tree.postOrderTraversal():
            if self.tree.isLeaf(node):
                name = self.tree.getName(node)
                seqString += self.pathMap[name] + " "
        elem.attrib["sequences"] = seqString
        elem.attrib["species_tree"] = NXNewick().writeString(self.tree)
        elem.attrib["config"] = "defaultProgressive"
        return elem