Ejemplo n.º 1
0
    def taxid2tree(self, taxid_list, out_fmt="newick"):
        """Build a tree from a list of taxids and return it as a string.

        The path of every taxid is obtained from ``self.get_path``. The first
        element of each path is treated as the root, so all paths must share
        the same first element. The paths are merged into a single tree, which
        is serialised with ``bp.write``.

        Args:
            taxid_list: iterable of taxids; each one is passed to
                ``self.get_path``.
            out_fmt: format name handed to ``bp.write``
                (e.g. "newick" or "phyloxml").

        Returns:
            The serialised tree as a string.

        Raises:
            ValueError: if no root node could be determined (for example
                when ``taxid_list`` is empty).
            AssertionError: if a path starts from a different root than the
                first path.
        """
        tree_file = StringIO()

        # One ";"-joined path string per taxid.
        path_list = [";".join(str(item) for item in self.get_path(taxid))
                     for taxid in taxid_list]

        nodes = {}  # data format {"node_name": Clade_object}
        root = None

        for i, path in enumerate(path_list):
            line = path.strip().split(";")
            if root is None:
                root = line[0]
            else:
                assert root == line[0], "The %d-th line is from a different root"%(i+1)

            # Walk from leaf to root so each node can record the name of the
            # node that follows it (its parent).
            leaf2root = line[::-1]
            last_index = len(leaf2root) - 1

            for j, child_node in enumerate(leaf2root):
                if child_node in nodes:
                    # Already registered by an earlier path (or earlier in
                    # this path); keep the first parent that was recorded.
                    continue
                # The root node is marked by being its own parent.
                if j == last_index:
                    parent_node = child_node
                else:
                    parent_node = leaf2root[j + 1]
                clade = Newick.Clade(name=child_node)
                # Temporary attribute, removed again once the tree is wired up.
                clade.parent = parent_node
                nodes[child_node] = clade

        root_node = None
        for node_name, node_clade in nodes.items():
            if node_name == node_clade.parent:
                # The root node is the one whose parent is itself.
                root_node = node_clade
                print("root node is %s, constructing tree ..." % (str(node_name)))
            else:
                # Attach every other node to its parent's clades.
                nodes[node_clade.parent].clades.append(node_clade)
            del node_clade.parent

        if root_node is None:
            raise ValueError("No root node found: taxid_list produced no "
                             "usable path")

        tree = Newick.Tree(root=root_node)
        bp.write(tree, tree_file, out_fmt)
        return tree_file.getvalue()
Ejemplo n.º 2
0
	def test_simple_manual_tree(self):
		"""Manually constructed tree

		Creates a clade named "root", appends two child clades named "kid1"
		and "kid2", and prints the result to stdout via phyloutil.printTree.
		"""
		root = Newick.Clade()
		root.name = "root"
		for kid_name in ("kid1", "kid2"):
			kid = Newick.Clade()
			kid.name = kid_name
			root.clades.append(kid)
		phyloutil.printTree(root, sys.stdout)
Ejemplo n.º 3
0
 def node2clade(nxtree, node):
     """Recursively convert a Nexus tree node into a Newick.Clade.

     Every id in ``node.succ`` is resolved through ``nxtree.node`` and
     converted first; the resulting clades become the children of the clade
     built from ``node.data``.
     """
     children = []
     for succ_id in node.succ:
         children.append(node2clade(nxtree, nxtree.node(succ_id)))
     return Newick.Clade(
         branch_length=node.data.branchlength,
         name=node.data.taxon,
         clades=children,
         confidence=node.data.support,
         comment=node.data.comment,
     )
Ejemplo n.º 4
0
 def new_clade(self, parent=None):
     """Create an empty Newick.Clade.

     When a truthy ``parent`` is given, it is stored on the new clade as a
     temporary ``parent`` attribute; otherwise no such attribute is set.
     """
     fresh = Newick.Clade()
     if not parent:
         return fresh
     fresh.parent = parent
     return fresh
Ejemplo n.º 5
0
def parse(handle):
    """Yield the trees of a Nexus file as Newick.Tree objects.

    The file is read with the old ``Nexus.Nexus`` parser, and each of its
    trees is converted node by node into ``Newick.Clade`` objects. This
    avoids modifying the Nexus module for now. (Perhaps Nexus will eventually
    use the new NewickIO parser directly.)
    """
    nexus_data = Nexus.Nexus(handle)

    # NB: Once Nexus.Trees is modified to use Tree.Newick objects, do this:
    # return iter(nex.trees)
    # Until then, convert the Nexus.Trees.Tree object hierarchy:
    def _to_clade(src_tree, src_node):
        # Convert the successors first, then wrap them in this node's clade.
        children = []
        for succ_id in src_node.succ:
            children.append(_to_clade(src_tree, src_tree.node(succ_id)))
        return Newick.Clade(
            branch_length=src_node.data.branchlength,
            name=src_node.data.taxon,
            clades=children,
            confidence=src_node.data.support,
            comment=src_node.data.comment,
        )

    for src_tree in nexus_data.trees:
        root_clade = _to_clade(src_tree, src_tree.node(src_tree.root))
        yield Newick.Tree(
            root=root_clade,
            rooted=src_tree.rooted,
            name=src_tree.name,
            weight=src_tree.weight,
        )
Ejemplo n.º 6
0
	def test_print(self):
		"""Print tree

		Grows a depth-3, breadth-2 tree below a clade named 'root', prints it
		with phyloutil.printTree and then prints the root clade itself.
		"""
		label = 'root'
		top = Newick.Clade()
		top.name = label
		buildTree(3, 2, top, label)
		phyloutil.printTree(top, sys.stdout)
		print(top)
Ejemplo n.º 7
0
def buildTree(depth, breadth, parent, id):
	"""Recursively attach `breadth` children per node, `depth` levels deep.

	Each child is a new Newick.Clade named "<id>.<k>" with k counted from 1.
	It is appended to `parent.clades` and then used as the parent for the
	next level. Nothing is added when `depth` is not positive.
	"""
	if not depth > 0:
		return
	for ordinal in range(1, breadth + 1):
		child = Newick.Clade()
		child.name = "{}.{}".format(id, ordinal)
		parent.clades.append(child)
		buildTree(depth - 1, breadth, child, child.name)
Ejemplo n.º 8
0
	def test_reading_from_guide_table(self):
		"""Read table

		Reads the classification table fixture, builds a tree from it with
		phyloutil.treeFromClassificationTable and prints the tree to stdout.
		"""
		# The context manager closes the fixture even if reading or tree
		# construction raises.
		with open("./test-phyloutil/test1/Pseudozyma-antarctica-1.txt", 'r') as inf:
			table = util.readTable(inf, header=True)
			tree = phyloutil.treeFromClassificationTable(table)
		phyloutil.printTree(tree, sys.stdout)
Ejemplo n.º 9
0
	def test_search(self):
		"""Find by name

		Builds a depth-3, breadth-2 tree and checks that
		phyloutil.findNodeByName returns the node with the requested name.
		"""
		root = Newick.Clade()
		root.name = 'root'
		buildTree(3, 2, root, 'root')
		target_name = "root.1.2.1"
		target = phyloutil.findNodeByName(target_name, root)
		# Report a missing node as a test failure rather than an AttributeError.
		self.assertIsNotNone(target)
		# assertEqual shows both values when the comparison fails.
		self.assertEqual(target.name, target_name)
Ejemplo n.º 10
0
def treeFromClassificationTable(table):
	"""Build a single-lineage tree from the 'name' column of `table`.

	The first name becomes the top clade and each following name becomes the
	only child of the one before it, so the last name ends up as the leaf.

	`table['name']` is only read: it is walked with reversed() instead of
	being reversed in place, so the caller's data is left unchanged.
	# assumes table['name'] is a list-like sequence — TODO confirm against util.readTable

	Returns the top Newick.Clade, or None when there are no names.
	"""
	node = None
	# Walk from the last name to the first so that every new clade can adopt
	# the clade built just before it as its child.
	for name in reversed(table['name']):
		parent = Newick.Clade()
		parent.name = name
		if node is not None:
			parent.clades.append(node)
		node = parent
	return node
Ejemplo n.º 11
0
	def test_reading_from_class_table(self):
		"""Read table

		Reads the classification table fixture, builds a tree from it and
		checks the name of the first terminal clade.
		"""
		# The context manager closes the fixture even if reading or tree
		# construction raises.
		with open("./test-phyloutil/test1/Pseudozyma-antarctica-1.txt", 'r') as inf:
			table = util.readTable(inf, header=True)
			tree = phyloutil.treeFromClassificationTable(table)
		termlist = list(tree.get_terminals())
		# assertEqual shows both values when the comparison fails.
		self.assertEqual(termlist[0].name, 'Moesziomyces antarcticus T-34')
Ejemplo n.º 12
0
    def _parse_tag(self, text):
        """Extract the data for a node from text.

        An optional comment delimited by NODECOMMENT_START / NODECOMMENT_END
        is cut out first. The remaining text is split on ':'; parts that
        parse as floats are collected as numeric values and the one remaining
        non-numeric part becomes the clade name.

        :param text: the tag text of a single node
        :returns: Clade instance containing any available data
        :raises NewickError: if a comment is opened but not closed after the
            opener, or if more than two numeric values are present
        :raises AssertionError: if more than one non-numeric part is found
        """
        # Extract the comment
        comment = None
        comment_start = text.find(NODECOMMENT_START)
        if comment_start != -1:
            body_start = comment_start + len(NODECOMMENT_START)
            # Look for the terminator only after the opener; a terminator
            # that appears earlier in the text does not close this comment.
            comment_end = text.find(NODECOMMENT_END, body_start)
            if comment_end == -1:
                raise NewickError('Error in tree description: '
                                  'Found %s without matching %s' %
                                  (NODECOMMENT_START, NODECOMMENT_END))
            comment = text[body_start:comment_end]
            text = (text[:comment_start] +
                    text[comment_end + len(NODECOMMENT_END):])
        clade = Newick.Clade(comment=comment)
        # Extract name (taxon), and optionally support, branch length
        # Float values are support and branch length, the string is name/taxon
        values = []
        for part in (t.strip() for t in text.split(':')):
            if part:
                try:
                    values.append(float(part))
                except ValueError:
                    assert clade.name is None, "Two string taxonomies?"
                    clade.name = part
        if len(values) == 1:
            # Real branch length, or support as branch length
            if self.values_are_confidence:
                clade.confidence = values[0]
            else:
                clade.branch_length = values[0]
        elif len(values) == 2:
            # Two non-taxon values: support comes first. (Is that always so?)
            clade.confidence, clade.branch_length = values
        elif len(values) > 2:
            raise NewickError("Too many colons in tag: " + text)
        return clade
Ejemplo n.º 13
0
 def _parse_tree(self, text):
     """Build a Newick.Tree whose root is the clade parsed from ``text``."""
     # XXX what global info do we have here? Any? Use **kwargs?
     root_clade = self._parse_subtree(text)
     return Newick.Tree(root=root_clade)
Ejemplo n.º 14
0
    def _parse_tree(self, text):
        """Parse the text representation into a Tree object.

        The stripped text is tokenized with the ``tokenizer`` pattern (defined
        outside this method). The tokens are then walked one by one: clades
        are created through ``self.new_clade`` and closed through
        ``self.process_clade``.

        :param text: Newick string to parse
        :returns: ``Newick.Tree`` built on the root clade, with ``rooted``
            taken from ``self.rooted``
        :raises NewickError: if ``process_clade`` returns no parent on a ')',
            if the numbers of '(' and ')' differ, or if tokens remain after
            the terminating ';'
        """
        tokens = re.finditer(tokenizer, text.strip())

        # Local alias for the clade factory used throughout the loop.
        new_clade = self.new_clade
        root_clade = new_clade()

        current_clade = root_clade
        # NOTE(review): this flag is assigned here and below but never read
        # within this method.
        entering_branch_length = False

        # Parenthesis counters, compared after the loop.
        lp_count = 0
        rp_count = 0
        for match in tokens:
            token = match.group()

            if token.startswith("'"):
                # quoted label; use the text between the quotes as clade name
                current_clade.name = token[1:-1]

            elif token.startswith('['):
                # comment; stored without its first and last character
                current_clade.comment = token[1:-1]
                if self.comments_are_confidence:
                    # Try to use this comment as a numeric support value
                    current_clade.confidence = _parse_confidence(current_clade.comment)

            elif token == '(':
                # start a new clade, which is a child of the current clade
                current_clade = new_clade(current_clade)
                entering_branch_length = False
                lp_count += 1

            elif token == ',':
                # if the current clade is the root, then the external parentheses
                # are missing and a new root should be created
                if current_clade is root_clade:
                    root_clade = new_clade()
                    current_clade.parent = root_clade
                # start a new child clade at the same level as the current clade
                # (process_clade presumably returns the parent of the clade it
                # was given — confirm against its definition)
                parent = self.process_clade(current_clade)
                current_clade = new_clade(parent)
                entering_branch_length = False

            elif token == ')':
                # done adding children for this parent clade
                parent = self.process_clade(current_clade)
                if not parent:
                    raise NewickError('Parenthesis mismatch.')
                current_clade = parent
                entering_branch_length = False
                rp_count += 1

            elif token == ';':
                # end of the tree; leftover tokens are checked after the loop
                break

            elif token.startswith(':'):
                # branch length or confidence, depending on the parser setting
                value = float(token[1:])
                if self.values_are_confidence:
                    current_clade.confidence = value
                else:
                    current_clade.branch_length = value

            elif token == '\n':
                # newline tokens carry no information
                pass

            else:
                # unquoted node label
                current_clade.name = token

        if not lp_count == rp_count:
            raise NewickError('Number of open/close parentheses do not match.')

        # if ; token broke out of for loop, there should be no remaining tokens
        try:
            next_token = next(tokens)
            raise NewickError('Text after semicolon in Newick tree: %s'
                              % next_token.group())
        except StopIteration:
            # iterator exhausted: nothing follows the tree, as expected
            pass

        # Finalize the clade that was still open, then the root itself.
        self.process_clade(current_clade)
        self.process_clade(root_clade)
        return Newick.Tree(root=root_clade, rooted=self.rooted)
Ejemplo n.º 15
0
 def _parse_tree(self, text, rooted):
     """Build a Newick.Tree from ``text``, using ``self.rooted`` as its rootedness."""
     # XXX Pass **kwargs along from Parser.parse?
     # NOTE(review): the ``rooted`` argument is not read here; the value
     # stored on the parser instance is used instead.
     root_clade = self._parse_subtree(text)
     return Newick.Tree(root=root_clade, rooted=self.rooted)
Ejemplo n.º 16
0
		data_outs.addStream(sys.stdout)

	# Write out parameters
	data_outs.write("# Run started {}\n".format(util.timestamp()))
	data_outs.write("# Command: {}\n".format(' '.join(sys.argv)))
	data_outs.write("# Parameters:\n")
	optdict = vars(options)
	for (k,v) in optdict.items():
		data_outs.write("#\t{k}: {v}\n".format(k=k, v=v))

	# Read input
	fname =os.path.expanduser(options.in_fname)
	if not os.path.isfile(fname):
	 	raise IOError("# Error: file {} does not exist".format(fname))

	tree_root = Newick.Clade()
	tree_root.parent = None
	tree_root.name = "cellular organisms"


	# Get directory of guide file
	path = os.path.dirname(fname)
	curwd = os.getcwd()

	species_names = []
	with open(fname,'r') as inf:
		os.chdir(path)
		tab = util.readTable(inf, header=True)
		rows = tab.dictrows
		if options.debug:
			rows = [x for x in tab.dictrows][:2]
Ejemplo n.º 17
0
    def path2newick(self, path2pathFile, node_fmt="taxid", out_fmt="newick"):
        """Convert a taxonomic path file into a tree file.

        Each line of the input holds one ";"-separated path of taxonomic ids.
        Ids are used rather than scientific names because some names are
        shared between ranks, while ids are unique. The paths are merged into
        one tree, which is written next to the input file as
        ``<basename>2tree_<node_fmt>.<out_fmt>``.

        Args:
            path2pathFile: path of the input path file.
            node_fmt: "taxid" to keep ids as node names, or "sciName" to
                convert them through ``self.get_sciName``.
            out_fmt: format name handed to ``bp.write``
                (e.g. "newick" or "phyloxml").

        Raises:
            ValueError: if no root node could be determined (for example
                when the file is empty).
            AssertionError: if a line's second element differs from that of
                the first line, or if ``node_fmt`` is neither "taxid" nor
                "sciName".
        """
        path, fileName = os.path.split(path2pathFile)
        basename = os.path.splitext(fileName)[0]
        outFile = os.path.join(path,
                               basename + "2tree_" + node_fmt + "." + out_fmt)

        nodes = {}  # data format {"node_name": Clade_object}
        root = None

        with open(path2pathFile, "r") as pathFile:
            for i, raw_line in enumerate(pathFile):
                # Drop surrounding whitespace and any trailing ";" before
                # splitting the path into its node names.
                line = raw_line.strip().rstrip(";").strip().split(";")
                # The second element is the one compared across lines.
                if root is None:
                    root = line[1]
                else:
                    assert root == line[
                        1], "The %d-th line is from a different root" % (i + 1)

                # Walk from leaf to root so each node can record the name of
                # the node that follows it (its parent).
                leaf2root = line[::-1]
                last_index = len(leaf2root) - 1

                for j, child_node in enumerate(leaf2root):
                    if child_node in nodes:
                        # Already registered; keep the first recorded parent.
                        continue
                    # The root node is marked by being its own parent.
                    if j == last_index:
                        parent_node = child_node
                    else:
                        parent_node = leaf2root[j + 1]
                    clade = Newick.Clade(name=child_node)
                    # Temporary attribute, removed once the tree is wired up.
                    clade.parent = parent_node
                    nodes[child_node] = clade

        root_node = None
        for node_name, node_clade in nodes.items():
            if node_name == node_clade.parent:
                # The root node is the one whose parent is itself.
                root_node = node_clade
                print(node_clade)
                print("root node found!! ")
            else:
                # Attach every other node to its parent's clades.
                nodes[node_clade.parent].clades.append(node_clade)
            del node_clade.parent

        if root_node is None:
            raise ValueError("No root node found in %s" % path2pathFile)

        # transform between output node format
        if node_fmt != "taxid":
            assert node_fmt == "sciName", "The node_fmt should be taxid or sciName"
            # Rename every attached child, then the root, from taxid to
            # scientific name.
            for node in nodes.values():
                for child in node.clades:
                    if child:
                        child.name = self.get_sciName(child.name)
            root_node.name = self.get_sciName(root_node.name)
        tree = Newick.Tree(root=root_node)

        # write tree to file
        print('Writing %s tree to %s...' % (out_fmt, outFile))

        bp.write(tree, outFile, out_fmt)