def __init__(self):
    """Set up the base parser, an empty PhyloTree and the phyloxml tag map."""
    super(Phyloxml_Parser, self).__init__()
    self.phyloTree = PhyloTree()
    # Maps a phyloxml element name to the attribute name used on our nodes.
    # "clade" maps to the empty string.
    self.tagsOfInterest = dict(
        clade="",
        name="name",
        branch_length="length",
        confidence="bootstrap",
        events="events",
    )
def _parseNewickToJson(self, newickString, treeName=None, nameMap=None):
    """Build a fresh PhyloTree from a newick string and return it as a jsonable dict.

    The string is cleaned first. When nameMap is truthy, the terms in the
    string are replaced through it before parsing, and treeName is attached
    to the root afterwards. Without a nameMap, treeName is not used.
    """
    self.phyloTree = PhyloTree()

    cleaned = self.cleanNewickString(newickString)
    cleaned = self._mapName(cleaned, nameMap) if nameMap else cleaned

    self.phyloTree.root = self.parseNode(cleaned, 0)

    # The tree name is only recorded on the name-mapped path.
    if nameMap:
        self.phyloTree.addAttributesToRoot(dict(treeName=treeName))

    return self.phyloTree.generateJsonableDict()
def __init__(self):
    """Set up the base parser, an empty PhyloTree and the phyloxml tag map."""
    super(Phyloxml_Parser, self).__init__()
    self.phyloTree = PhyloTree()
    # Maps a phyloxml element name to the attribute name used on our nodes.
    # "clade" maps to the empty string.
    self.tagsOfInterest = dict(
        clade="",
        name="name",
        branch_length="length",
        confidence="bootstrap",
        events="events",
    )
class Newick_Parser(Base_Parser):
    """Parser for trees stored in the newick format (.nhx).

    It is more involved than newick alone would require because this parser
    is later extended by the Nexus parser, which also needs to parse newick.
    """

    # A term (name) ends at the first of these symbols.
    _TERM_END = re.compile(r"[):,]")
    # Symbols that bound the label written right after a closing bracket.
    _NODE_BOUNDARY = re.compile(r"[\)\,\(]")

    def __init__(self):
        super(Newick_Parser, self).__init__()

    def parseFile(self, filePath):
        """Read a newick file and parse the string inside it.

        Returns a tuple ``([jsonableDict], "Success")``.
        """
        with open(filePath, "r") as newickFile:
            newickString = newickFile.read()
        newickString = newickString.replace("\n", "").replace("\r", "")
        return [self.parseData(newickString)], "Success"

    def parseData(self, newickString):
        """Parse a newick string directly. Returns: jsonableDict"""
        return self._parseNewickToJson(newickString)

    def _parseNewickToJson(self, newickString, treeName=None, nameMap=None):
        """Parse a newick string into a PhyloTree and return its jsonable dict.

        When nameMap is truthy the terms of the string are replaced through
        it before parsing and treeName is attached to the root afterwards;
        otherwise treeName is not used.
        """
        self.phyloTree = PhyloTree()
        newickString = self.cleanNewickString(newickString)
        if nameMap:
            newickString = self._mapName(newickString, nameMap)

        self.phyloTree.root = self.parseNode(newickString, 0)
        if nameMap:
            self.phyloTree.addAttributesToRoot({"treeName": treeName})

        return self.phyloTree.generateJsonableDict()

    def cleanNewickString(self, rawNewick):
        """Remove whitespace, semicolons, single and double quotes, and backslashes."""
        return re.sub(r'\s|;|\"|\'|\\', '', rawNewick)

    def _makeNodesFromString(self, string, depth):
        """Make one node per non-empty, comma separated element of a flat string.

        Each element is either ``name`` or ``name:length``. Raises Exception
        when the string still contains an opening bracket.
        """
        if "(" in string:
            raise Exception("Tree is not well form, location: " + string)

        childrenNodes = []
        for childString in string.split(","):
            if not childString:
                # elements separated by commas may be empty
                continue

            nodeInfo = childString.split(":")
            name, length, bootstrap = "", None, -1
            if len(nodeInfo) == 2:
                # has length info
                length = nodeInfo[1]
                try:
                    # Nexus may put the bootstrap value in the name position
                    value = float(nodeInfo[0])
                except ValueError:
                    name = nodeInfo[0]
                else:
                    # NOTE: every label float() accepts is treated as a
                    # support value and the name is blanked, even when the
                    # value is outside 0-100 (bootstrap then stays -1).
                    if 0 <= value <= 1:
                        bootstrap = value
                    elif 1 <= value <= 100:
                        bootstrap = value / 100
                    name = ""
            else:
                # string only contains name
                name = nodeInfo[0]

            childrenNodes.append(
                self.phyloTree.makeNode(name, length=length, depth=depth, bootstrap=bootstrap)
            )
        return childrenNodes

    def _mapName(self, newickString, nameMap):
        """Replace the terms of a newick string through nameMap.

        A term starts right after a "(" or "," (unless another "(" or ","
        follows immediately) and ends at the next ")", ":" or ",". Terms that
        are missing from nameMap are kept unchanged instead of raising
        KeyError.
        """
        pieces = []
        start = 0

        for i, symbol in enumerate(newickString):
            if symbol not in ("(", ","):
                continue
            if newickString[i + 1:i + 2] in (",", "("):
                # no term between two consecutive opening symbols
                continue

            end = i + 1  # first character of the term to be replaced
            terminator = self._TERM_END.search(newickString, end)
            if terminator is None:
                continue

            j = terminator.start()  # position right after the term
            termToReplace = newickString[end:j]
            pieces.append(newickString[start:end])
            pieces.append(nameMap.get(termToReplace, termToReplace))
            start = j

        pieces.append(newickString[start:])
        return "".join(pieces)

    def parseNode(self, string, depth):
        """Recursively parse a newick string into nodes.

        Works by stripping the string down to the substrings enclosed in
        brackets and calling itself on them.
        Eg ... ( A, B, (D, E)C, F, G ) ...
        The preceding nodes A, B are made first, then the internal node C
        with its children D, E, and finally the succeeding nodes F, G.

        Returns None for an empty string, a list of nodes for a string
        without brackets or for depth > 0, and the first node on the level
        when depth is 0.
        """
        # Base case: nothing to parse.
        if string == "":
            return
        # Base case: no brackets, so this is a flat list of nodes.
        if "(" not in string:
            return self._makeNodesFromString(string, depth)

        nodes = []  # nodes on this level, in order of appearance
        start = 0
        lenOfPreceedingInternalNodeString = 0
        bracketStack = []  # positions of the currently open brackets

        for j, symbol in enumerate(string):
            if symbol == "(":
                bracketStack.append(j)
                continue
            if symbol != ")":
                continue

            i = bracketStack.pop()
            if bracketStack:
                # closes a deeper clade; the recursive call deals with it
                continue

            InternalNode = None

            # First make the nodes of this depth that precede the bracket,
            # skipping the label of the previous internal node.
            startSubstring = string[start + lenOfPreceedingInternalNodeString:i]
            nodes += self._makeNodesFromString(startSubstring, depth)

            # Then look for the label of the internal node right after the
            # bracket. Eg. '(b:0.4,a:0.3)c:0.3,' -> 'c:0.3'
            if j + 1 < len(string):
                stringRightOfBracket = string[j + 1:]
                match = self._NODE_BOUNDARY.search(stringRightOfBracket)
                if match:
                    stringRepOfInternalNode = stringRightOfBracket[:match.start()]
                    internalNodes = self._makeNodesFromString(stringRepOfInternalNode, depth)
                    if len(internalNodes) > 0:
                        InternalNode = internalNodes[0]
                    lenOfPreceedingInternalNodeString = len(stringRepOfInternalNode)
                else:
                    # the label can be the last element of the string
                    InternalNode = self._makeNodesFromString(stringRightOfBracket, depth)[0]
                    lenOfPreceedingInternalNodeString = len(string) - j

            if InternalNode is None:
                # unnamed clade: create a generic internal node
                InternalNode = self.phyloTree.makeNode("", depth=depth, isInternal=True)
                lenOfPreceedingInternalNodeString = 0

            # recursive call to make the content of the clade
            childSubString = string[i + 1:j]
            InternalNode.addChildNode(self.parseNode(childSubString, depth + 1))

            # appended after the preceding nodes to preserve order
            nodes.append(InternalNode)
            start = j + 1

        if depth == 0:
            # the root level only returns its first node
            return nodes[0]

        # Add the nodes that follow the last bracket. The label of the
        # internal node created above is dropped from the remaining string.
        endString = string[start:]
        if string[start - 1] == ")":
            match = self._NODE_BOUNDARY.search(endString)
            if match:
                endOfNodeName = start + match.start() + 1
                endString = string[endOfNodeName:]
                nodes += self._makeNodesFromString(endString, depth)

        return nodes
class Phyloxml_Parser(Base_Parser):
    """Parses a phyloxml file into a jsonable dict that will be passed to PhyloViz for display"""

    def __init__(self):
        super(Phyloxml_Parser, self).__init__()
        self.phyloTree = PhyloTree()
        # Length of the "{namespace}" prefix of element tags. parseFile sets
        # the real value; 0 keeps cleanTag usable before a file is parsed.
        self.nameSpaceIndex = 0
        # Maps a phyloxml element name to the attribute name used on our nodes.
        self.tagsOfInterest = {
            "clade": "",
            "name": "name",
            "branch_length": "length",
            "confidence": "bootstrap",
            "events": "events",
        }

    def parseFile(self, filePath):
        """Parse a phyloxml file and extract its phylogeny tree content.

        Only the first child of the document root is read. Returns a tuple
        ``([jsonableDict], "Success")``.
        """
        # The file is closed as soon as the document has been parsed.
        with open(filePath, "r") as phyloXmlFile:
            xmlTree = ElementTree.parse(phyloXmlFile)

        xmlRoot = xmlTree.getroot()[0]
        # used later by cleanTag to remove the name space in every element.tag
        self.nameSpaceIndex = xmlRoot.tag.rfind("}") + 1

        phyloRoot = None
        for child in xmlRoot:
            childTag = self.cleanTag(child.tag)
            if childTag == "clade":
                phyloRoot = child
            elif childTag == "name":
                self.phyloTree.title = child.text

        # NOTE: phyloRoot is still None here when there is no clade element,
        # in which case parseNode fails on node.tag.
        self.phyloTree.root = self.parseNode(phyloRoot, 0)
        jsonDict = self.phyloTree.generateJsonableDict()
        return [jsonDict], "Success"

    def parseNode(self, node, depth):
        """Parse an element of a phyloxml tree into a node.

        Only clade elements create nodes, internal or leaf; None is returned
        for any other element.
        """
        if self.cleanTag(node.tag) != "clade":
            return None

        # a clade that contains another clade is an internal node
        hasInnerClade = any(self.cleanTag(child.tag) == "clade" for child in node)
        if not hasInnerClade:
            # NOTE(review): leaf nodes are created with depth + 1, as in the
            # original code - confirm this is intended.
            return self._makeLeafNode(node, depth=depth + 1)

        currentNode = self._makeInternalNode(node, depth=depth)
        for child in node:
            childNode = self.parseNode(child, depth + 1)
            if isinstance(childNode, Node):
                currentNode.addChildNode(childNode)
        return currentNode

    def _makeLeafNode(self, leafNode, depth=0):
        """Make a leaf node from an element by calling PhyloTree.makeNode."""
        node = {}
        for child in leafNode:
            childTag = self.cleanTag(child.tag)
            if childTag in self.tagsOfInterest:
                # need to map phyloxml terms to ours
                node[self.tagsOfInterest[childTag]] = child.text

        node["depth"] = depth
        return self.phyloTree.makeNode(self._getNodeName(leafNode), **node)

    def _getNodeName(self, node, depth=-1):
        """Get the name of a clade, also when it is given by a taxonomy element.

        The first name or taxonomy child decides the result; an empty string
        is returned when there is neither. depth is not used.
        """
        def getTagFromTaxonomyNode(node):
            """Return the name of a taxonomy element.

            It is treated differently as the name is embedded one level deeper.
            """
            phyloxmlTaxoNames = ("common_name", "scientific_name", "code")
            for child in node:
                if self.cleanTag(child.tag) in phyloxmlTaxoNames:
                    return child.text
            return ""

        for child in node:
            childTag = self.cleanTag(child.tag)
            if childTag == "name":
                return child.text
            if childTag == "taxonomy":
                return getTagFromTaxonomyNode(child)
        return ""

    def _makeInternalNode(self, internalNode, depth=0):
        """Make an internal node from an element that is guaranteed to be a parent.

        Collects the values of interest, such as events, and passes them to
        PhyloTree.makeNode. depth is only forwarded to _getNodeName.
        """
        node = {}
        for child in internalNode:
            childTag = self.cleanTag(child.tag)
            if childTag == "clade" or childTag not in self.tagsOfInterest:
                continue
            if childTag == "events":
                # events is nested 1 more level deeper than others
                node["events"] = self.cleanTag(child[0].tag)
            else:
                node[self.tagsOfInterest[childTag]] = child.text

        return self.phyloTree.makeNode(self._getNodeName(internalNode, depth), **node)

    def cleanTag(self, tagString):
        """Return tagString without its leading name space prefix."""
        return tagString[self.nameSpaceIndex:]
class Newick_Parser(Base_Parser):
    """Parser for trees stored in the newick format (.nhx).

    It is more involved than newick alone would require because this parser
    is later extended by the Nexus parser, which also needs to parse newick.
    """

    # A term (name) ends at the first of these symbols.
    _TERM_END = re.compile(r"[):,]")
    # Symbols that bound the label written right after a closing bracket.
    _NODE_BOUNDARY = re.compile(r"[\)\,\(]")

    def __init__(self):
        super(Newick_Parser, self).__init__()

    def parseFile(self, filePath):
        """Read a newick file and parse the string inside it.

        Returns a tuple ``([jsonableDict], "Success")``.
        """
        with open(filePath, "r") as newickFile:
            newickString = newickFile.read()
        newickString = newickString.replace("\n", "").replace("\r", "")
        return [self.parseData(newickString)], "Success"

    def parseData(self, newickString):
        """Parse a newick string directly. Returns: jsonableDict"""
        return self._parseNewickToJson(newickString)

    def _parseNewickToJson(self, newickString, treeName=None, nameMap=None):
        """Parse a newick string into a PhyloTree and return its jsonable dict.

        When nameMap is truthy the terms of the string are replaced through
        it before parsing and treeName is attached to the root afterwards;
        otherwise treeName is not used.
        """
        self.phyloTree = PhyloTree()
        newickString = self.cleanNewickString(newickString)
        if nameMap:
            newickString = self._mapName(newickString, nameMap)

        self.phyloTree.root = self.parseNode(newickString, 0)
        if nameMap:
            self.phyloTree.addAttributesToRoot({"treeName": treeName})

        return self.phyloTree.generateJsonableDict()

    def cleanNewickString(self, rawNewick):
        """Remove whitespace, semicolons, single and double quotes, and backslashes."""
        return re.sub(r'\s|;|\"|\'|\\', '', rawNewick)

    def _makeNodesFromString(self, string, depth):
        """Make one node per non-empty, comma separated element of a flat string.

        Each element is either ``name`` or ``name:length``. Raises Exception
        when the string still contains an opening bracket.
        """
        if "(" in string:
            raise Exception("Tree is not well form, location: " + string)

        childrenNodes = []
        for childString in string.split(","):
            if not childString:
                # elements separated by commas may be empty
                continue

            nodeInfo = childString.split(":")
            name, length, bootstrap = "", None, -1
            if len(nodeInfo) == 2:
                # has length info
                length = nodeInfo[1]
                try:
                    # Nexus may put the bootstrap value in the name position
                    value = float(nodeInfo[0])
                except ValueError:
                    name = nodeInfo[0]
                else:
                    # NOTE: every label float() accepts is treated as a
                    # support value and the name is blanked, even when the
                    # value is outside 0-100 (bootstrap then stays -1).
                    if 0 <= value <= 1:
                        bootstrap = value
                    elif 1 <= value <= 100:
                        bootstrap = value / 100
                    name = ""
            else:
                # string only contains name
                name = nodeInfo[0]

            childrenNodes.append(
                self.phyloTree.makeNode(name, length=length, depth=depth, bootstrap=bootstrap)
            )
        return childrenNodes

    def _mapName(self, newickString, nameMap):
        """Replace the terms of a newick string through nameMap.

        A term starts right after a "(" or "," (unless another "(" or ","
        follows immediately) and ends at the next ")", ":" or ",". Terms that
        are missing from nameMap are kept unchanged instead of raising
        KeyError.
        """
        pieces = []
        start = 0

        for i, symbol in enumerate(newickString):
            if symbol not in ("(", ","):
                continue
            if newickString[i + 1:i + 2] in (",", "("):
                # no term between two consecutive opening symbols
                continue

            end = i + 1  # first character of the term to be replaced
            terminator = self._TERM_END.search(newickString, end)
            if terminator is None:
                continue

            j = terminator.start()  # position right after the term
            termToReplace = newickString[end:j]
            pieces.append(newickString[start:end])
            pieces.append(nameMap.get(termToReplace, termToReplace))
            start = j

        pieces.append(newickString[start:])
        return "".join(pieces)

    def parseNode(self, string, depth):
        """Recursively parse a newick string into nodes.

        Works by stripping the string down to the substrings enclosed in
        brackets and calling itself on them.
        Eg ... ( A, B, (D, E)C, F, G ) ...
        The preceding nodes A, B are made first, then the internal node C
        with its children D, E, and finally the succeeding nodes F, G.

        Returns None for an empty string, a list of nodes for a string
        without brackets or for depth > 0, and the first node on the level
        when depth is 0.
        """
        # Base case: nothing to parse.
        if string == "":
            return
        # Base case: no brackets, so this is a flat list of nodes.
        if "(" not in string:
            return self._makeNodesFromString(string, depth)

        nodes = []  # nodes on this level, in order of appearance
        start = 0
        lenOfPreceedingInternalNodeString = 0
        bracketStack = []  # positions of the currently open brackets

        for j, symbol in enumerate(string):
            if symbol == "(":
                bracketStack.append(j)
                continue
            if symbol != ")":
                continue

            i = bracketStack.pop()
            if bracketStack:
                # closes a deeper clade; the recursive call deals with it
                continue

            InternalNode = None

            # First make the nodes of this depth that precede the bracket,
            # skipping the label of the previous internal node.
            startSubstring = string[start + lenOfPreceedingInternalNodeString:i]
            nodes += self._makeNodesFromString(startSubstring, depth)

            # Then look for the label of the internal node right after the
            # bracket. Eg. '(b:0.4,a:0.3)c:0.3,' -> 'c:0.3'
            if j + 1 < len(string):
                stringRightOfBracket = string[j + 1:]
                match = self._NODE_BOUNDARY.search(stringRightOfBracket)
                if match:
                    stringRepOfInternalNode = stringRightOfBracket[:match.start()]
                    internalNodes = self._makeNodesFromString(stringRepOfInternalNode, depth)
                    if len(internalNodes) > 0:
                        InternalNode = internalNodes[0]
                    lenOfPreceedingInternalNodeString = len(stringRepOfInternalNode)
                else:
                    # the label can be the last element of the string
                    InternalNode = self._makeNodesFromString(stringRightOfBracket, depth)[0]
                    lenOfPreceedingInternalNodeString = len(string) - j

            if InternalNode is None:
                # unnamed clade: create a generic internal node
                InternalNode = self.phyloTree.makeNode("", depth=depth, isInternal=True)
                lenOfPreceedingInternalNodeString = 0

            # recursive call to make the content of the clade
            childSubString = string[i + 1:j]
            InternalNode.addChildNode(self.parseNode(childSubString, depth + 1))

            # appended after the preceding nodes to preserve order
            nodes.append(InternalNode)
            start = j + 1

        if depth == 0:
            # the root level only returns its first node
            return nodes[0]

        # Add the nodes that follow the last bracket. The label of the
        # internal node created above is dropped from the remaining string.
        endString = string[start:]
        if string[start - 1] == ")":
            match = self._NODE_BOUNDARY.search(endString)
            if match:
                endOfNodeName = start + match.start() + 1
                endString = string[endOfNodeName:]
                nodes += self._makeNodesFromString(endString, depth)

        return nodes
class Phyloxml_Parser(Base_Parser):
    """Parses a phyloxml file into a jsonable dict that will be passed to PhyloViz for display"""

    def __init__(self):
        super(Phyloxml_Parser, self).__init__()
        self.phyloTree = PhyloTree()
        # Length of the "{namespace}" prefix of element tags. parseFile sets
        # the real value; 0 keeps cleanTag usable before a file is parsed.
        self.nameSpaceIndex = 0
        # Maps a phyloxml element name to the attribute name used on our nodes.
        self.tagsOfInterest = {
            "clade": "",
            "name": "name",
            "branch_length": "length",
            "confidence": "bootstrap",
            "events": "events",
        }

    def parseFile(self, filePath):
        """Parse a phyloxml file and extract its phylogeny tree content.

        Only the first child of the document root is read. Returns a tuple
        ``([jsonableDict], "Success")``.
        """
        # The file is closed as soon as the document has been parsed.
        with open(filePath, "r") as phyloXmlFile:
            xmlTree = ElementTree.parse(phyloXmlFile)

        xmlRoot = xmlTree.getroot()[0]
        # used later by cleanTag to remove the name space in every element.tag
        self.nameSpaceIndex = xmlRoot.tag.rfind("}") + 1

        phyloRoot = None
        for child in xmlRoot:
            childTag = self.cleanTag(child.tag)
            if childTag == "clade":
                phyloRoot = child
            elif childTag == "name":
                self.phyloTree.title = child.text

        # NOTE: phyloRoot is still None here when there is no clade element,
        # in which case parseNode fails on node.tag.
        self.phyloTree.root = self.parseNode(phyloRoot, 0)
        jsonDict = self.phyloTree.generateJsonableDict()
        return [jsonDict], "Success"

    def parseNode(self, node, depth):
        """Parse an element of a phyloxml tree into a node.

        Only clade elements create nodes, internal or leaf; None is returned
        for any other element.
        """
        if self.cleanTag(node.tag) != "clade":
            return None

        # a clade that contains another clade is an internal node
        hasInnerClade = any(self.cleanTag(child.tag) == "clade" for child in node)
        if not hasInnerClade:
            # NOTE(review): leaf nodes are created with depth + 1, as in the
            # original code - confirm this is intended.
            return self._makeLeafNode(node, depth=depth + 1)

        currentNode = self._makeInternalNode(node, depth=depth)
        for child in node:
            childNode = self.parseNode(child, depth + 1)
            if isinstance(childNode, Node):
                currentNode.addChildNode(childNode)
        return currentNode

    def _makeLeafNode(self, leafNode, depth=0):
        """Make a leaf node from an element by calling PhyloTree.makeNode."""
        node = {}
        for child in leafNode:
            childTag = self.cleanTag(child.tag)
            if childTag in self.tagsOfInterest:
                # need to map phyloxml terms to ours
                node[self.tagsOfInterest[childTag]] = child.text

        node["depth"] = depth
        return self.phyloTree.makeNode(self._getNodeName(leafNode), **node)

    def _getNodeName(self, node, depth=-1):
        """Get the name of a clade, also when it is given by a taxonomy element.

        The first name or taxonomy child decides the result; an empty string
        is returned when there is neither. depth is not used.
        """
        def getTagFromTaxonomyNode(node):
            """Return the name of a taxonomy element.

            It is treated differently as the name is embedded one level deeper.
            """
            phyloxmlTaxoNames = ("common_name", "scientific_name", "code")
            for child in node:
                if self.cleanTag(child.tag) in phyloxmlTaxoNames:
                    return child.text
            return ""

        for child in node:
            childTag = self.cleanTag(child.tag)
            if childTag == "name":
                return child.text
            if childTag == "taxonomy":
                return getTagFromTaxonomyNode(child)
        return ""

    def _makeInternalNode(self, internalNode, depth=0):
        """Make an internal node from an element that is guaranteed to be a parent.

        Collects the values of interest, such as events, and passes them to
        PhyloTree.makeNode. depth is only forwarded to _getNodeName.
        """
        node = {}
        for child in internalNode:
            childTag = self.cleanTag(child.tag)
            if childTag == "clade" or childTag not in self.tagsOfInterest:
                continue
            if childTag == "events":
                # events is nested 1 more level deeper than others
                node["events"] = self.cleanTag(child[0].tag)
            else:
                node[self.tagsOfInterest[childTag]] = child.text

        return self.phyloTree.makeNode(self._getNodeName(internalNode, depth), **node)

    def cleanTag(self, tagString):
        """Return tagString without its leading name space prefix."""
        return tagString[self.nameSpaceIndex:]