# Load the newick tree given as the first command-line argument
phylo = phyloxml.PhyloxmlTree(newick=sys.argv[1])

# Set basic tree info on the tree's phyloxml phylogeny object
phylo.phyloxml_phylogeny.set_name("test_tree")
# A root with at most two children is flagged as rooted, anything else as unrooted
if len(phylo.children) <= 2:
    phylo.phyloxml_phylogeny.set_rooted("true")
else:
    phylo.phyloxml_phylogeny.set_rooted("false")

# Add the tree to the phyloxml project
project.add_phylogeny(phylo)

# Export the phyloxml document into an in-memory buffer so it can be patched
OUTPUT = StringIO()
project.export(OUTPUT)

# Some ad-hoc changes to the phyloxml formatted document to meet the schema definition:
#  - drop the "phy:" namespace prefix everywhere
#  - strip every branch_length_attr="..." attribute
#  - replace the opening <Phyloxml ...> tag with a header that points at the schema
#  - lower-case the remaining "Phyloxml" occurrences (e.g. the closing tag)
text = OUTPUT.getvalue()
text = text.replace("phy:", "")
text = re.sub(r'branch_length_attr="[^"]+"', "", text)
header = """
 <phyloxml xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 xsi:schemaLocation="http://www.phyloxml.org http://www.phyloxml.org/1.10/phyloxml.xsd"
 xmlns="http://www.phyloxml.org">
"""
text = re.sub(r'<Phyloxml[^>]+>', header, text)
text = text.replace('Phyloxml', 'phyloxml')

# Save the result next to the input file; the context manager guarantees the
# handle is flushed and closed even if the write fails
with open(sys.argv[1] + ".phyloxml", "w") as out_handle:
    out_handle.write(text)
def MakePlot(x, org_names, ckm30, ckm50, outgroup, outfile, outfilexml, sum_x):
	"""Build a neighbor-joining tree from two common-kmer matrices, decorate it with abundance bubbles, render it and export it as phyloXML.

	Parameters:
		x: flat sequence of abundances. Entry i (i < len(org_names)) belongs to
			organism i; entry i + (k+1)*len(org_names) belongs to the hypothetical
			node of organism i at cutoff k. It is normalized by its sum before use.
		org_names: list of organism names. Duplicates are renamed IN PLACE
			(suffix "_1", "_2", ...), so the caller's list is modified.
		ckm30, ckm50: square common-kmer matrices, indexed [i, j] and passed to
			np.diag / np.multiply (presumably 2-D numpy arrays; confirm with caller).
		outgroup: name of the leaf to use as outgroup; a warning is printed and
			no outgroup is set when it is not among the names.
		outfile: path handed to the tree's render() call.
		outfilexml: path of the phyloXML file to write.
		sum_x: value shown (first 8 characters of its str()) in the legend.

	Raises:
		ValueError: from insert_node when a hypothetical node would have to be
			placed further along a branch than the branch is long.
	"""

	#Make sure names are unique.
	#NOTE: names is the very same list object as org_names; the renaming below is
	#therefore visible through both, and the tree/name lookups further down rely on that.
	names = org_names
	for name in names:
		if names.count(name)>1:
			#Rename every occurrence but the last one, just to make sure we don't conflict with the outgroup
			for suffix in range(1,names.count(name)):
				names[names.index(name)] = name + "_" + str(suffix)

	#Normalize the x vector (the sum is computed once instead of once per element)
	total = sum(x)
	x = [y/total for y in x]
	#Multiply entry [i,j] by 1/diagonal[j] (numpy broadcasting over the last axis)
	ckm30_norm = np.multiply(ckm30,1/np.diag(ckm30))
	ckm50_norm = np.multiply(ckm50,1/np.diag(ckm50))
	num_rows = ckm30_norm.shape[0]
	#Lower-triangular distance matrix (row i has i+1 entries): average over both
	#kmer sizes of one minus the symmetrized normalized common-kmer value
	matrix=list()
	for i in range(num_rows):
		matrix.append([.5*(1-.5*ckm30_norm[i,j]-.5*ckm30_norm[j,i])+.5*(1-.5*ckm50_norm[i,j]-.5*ckm50_norm[j,i]) for j in range(i+1)])

	#Make the list of distances (ave of the two ckm matrices)
	ckm_ave_train = .5*ckm30_norm+.5*ckm50_norm
	ckm_ave_train_dist = dict()
	for i, org_name in enumerate(org_names):
		ckm_ave_train_dist[org_name] = [.5*ckm_ave_train[i,j]+.5*ckm_ave_train[j,i] for j in range(len(org_names))]

	#Construct the tree. Note I could use RapidNJ here, but a few tests have shown that the trees that RapidNJ creates are rubbish.
	dm = _DistanceMatrix(names, matrix)
	constructor = DistanceTreeConstructor()
	tree = constructor.nj(dm)
	#Round-trip through newick to obtain a Tree object that supports the node surgery below
	t=Tree(tree.format('newick'),format=1)

	#Now I will put internal nodes in a certain phylogenetic distance between the root and a given node.
	#Function to insert a node at a given distance
	def insert_node(t, name_to_insert, insert_above, dist_along):
		"""Insert a node called name_to_insert on the branch above the node named insert_above, dist_along away from that node's parent."""
		insert_at_node = t.search_nodes(name=insert_above)[0]
		parent = (t&insert_above).up
		orig_branch_length = t.get_distance(insert_at_node,parent)
		if orig_branch_length < dist_along:
			raise ValueError("error: dist_along larger than orig_branch_length in PlotPackage.py")
		#Split the branch: parent --dist_along--> new node --remainder--> original node
		removed_node = insert_at_node.detach()
		removed_node.dist = orig_branch_length - dist_along
		added_node = parent.add_child(name=name_to_insert, dist=dist_along)
		added_node.add_child(removed_node)

	#Function to insert a node some % along a branch, taking into account the ckm distances and nodes already created in the NJ tree (and what distance their descendants are from everyone else)
	def insert_hyp_node(t, leaf_name, percent, ckm_ave_train_dist, org_names):
		"""Insert the hypothetical node leaf_name + "_" + str(percent) on the path between the root and leaf_name."""
		dists = [abs(y-percent) for y in ckm_ave_train_dist[leaf_name]]
		nearby_indicies = list()
		#Add all the organisms that are within 0.05 of the given percent
		#(currently disabled, so nearby_indicies stays empty and the fallback below is always taken)
		#for i in range(len(dists)):
		#	if dists[i]<=.05:
		#		nearby_indicies.append(i)
		nearby_names = list()
		#If there are no nearby indicies, add the closest organism to the given percent
		if nearby_indicies==[]:
			nearby_names.append(org_names[dists.index(min(dists))])
		else:
			#Look the organisms up by the collected indices (not by their position in nearby_indicies)
			for idx in nearby_indicies:
				nearby_names.append(org_names[idx])
		mean_dist = np.mean([ckm_ave_train_dist[leaf_name][org_names.index(y)] for y in nearby_names])
		nearby_names.append(leaf_name)
		LCA = t.get_common_ancestor(nearby_names)
		LCA_to_leaf_dist = t.get_distance(LCA,leaf_name)
		#divide the dist to the right/left of the LCA node by the number of percentage points in there
		if LCA.name==t.name:
			percent_dist = percent*LCA_to_leaf_dist
			if mean_dist <= percent:
				child_node = (t&leaf_name)
			else:
				child_node = (t&nearby_names[0])#This means "go up from root" in the direction of the nearest guy
			ancestor_node = (t&child_node.name).up
		elif mean_dist <= percent:
			percent_dist = t.get_distance(LCA) + abs(percent-mean_dist)*(LCA_to_leaf_dist)/(1-mean_dist)
			child_node = (t&leaf_name)
			ancestor_node = (t&child_node.name).up
		else:
			percent_dist = t.get_distance(LCA) - abs(percent-mean_dist)*(t.get_distance(LCA))/(mean_dist)
			child_node = (t&leaf_name)
			ancestor_node = (t&child_node.name).up
		#Climb towards the root until the ancestor is no further from the root than the target distance
		while t.get_distance(t.name, ancestor_node) > percent_dist:
			child_node = ancestor_node
			ancestor_node = (t&child_node.name).up
		insert_node(t, leaf_name+"_"+str(percent), child_node.name, percent_dist-t.get_distance(t.name, ancestor_node))

	#Set outgroup
	if outgroup in names:
		t.set_outgroup(t&outgroup)
	else:
		print("WARNING: the chosen outgroup " + outgroup + " is not in the given taxonomy: ")
		print(names)
		print("Proceeding without setting an outgroup. This may cause results to be uninterpretable.")

	#Insert hypothetical nodes
	hyp_node_names = dict()
	cutoffs = [.9,.8,.7,.6,.5,.4,.3,.2,.1]
	#Map each cutoff through a fixed cubic polynomial
	cutoffs = [-.5141*(val**3)+1.0932*(val**2)+0.3824*val for val in cutoffs]
	for i in range(len(org_names)):
		#Every len(org_names)-th entry of x starting at i: entry 0 is the leaf itself, entry k+1 belongs to cutoff k
		xi = x[i:len(x):len(org_names)]
		for k, cutoff in enumerate(cutoffs):
			if xi[k+1]>0:
				insert_hyp_node(t, org_names[i], cutoff, ckm_ave_train_dist, org_names)
				#Remember base name, cutoff and cutoff index explicitly, in case there are "_" in the file names
				hyp_node_names[org_names[i]+"_"+str(cutoff)] = [org_names[i], cutoff, k]

	size_factor=250
	font_size=55

	#Now put the bubbles on the nodes
	def layout(node):
		"""Layout callback: thicken the branch lines and attach an abundance bubble (plus the name, for leaves)."""
		node_style = NodeStyle()
		node_style["hz_line_width"] = 10
		node_style["vt_line_width"] = 10
		node.set_style(node_style)
		if node.is_leaf():
			if node.name in org_names:
				#make reconstructed bubble
				size = x[org_names.index(node.name)]
				F = CircleFace(radius=size_factor*math.sqrt(size), color="RoyalBlue", style="sphere")
				F.border.width = None
				F.opacity = 0.6
				faces.add_face_to_node(F,node, 0, position="branch-right")
				#Denote that this was a training organism
				nameFace = AttrFace("name", fsize=font_size, fgcolor='black')
				faces.add_face_to_node(nameFace, node, 0, position="branch-right")
		elif node.name in hyp_node_names: #Otherwise it's a hypothetical node, just use recon x
			node_base_name = hyp_node_names[node.name][0]
			if node_base_name in org_names:
				idx = hyp_node_names[node.name][2]
				size = x[org_names.index(node_base_name)+(idx+1)*len(org_names)]
				F = CircleFace(radius=size_factor*math.sqrt(size), color="RoyalBlue", style="sphere")
				F.border.width = None
				F.opacity = 0.6
				faces.add_face_to_node(F,node, 0, position="branch-right")
				#This is if I want the names of the hypothetical nodes to be printed as well
				#nameFace = AttrFace("name", fsize=font_size, fgcolor='black')
				#faces.add_face_to_node(nameFace, node, 0, position="branch-right")

	ts = TreeStyle()
	ts.layout_fn = layout
	ts.mode = "r"
	#ts.mode = "c"
	ts.scale = 2*1000
	ts.show_leaf_name = False
	ts.min_leaf_separation = 50
	#Legend: one reference bubble plus two text lines
	F = CircleFace(radius=.87*size_factor, color="RoyalBlue", style="sphere")
	F.border.width = None
	F.opacity = 0.6
	ts.legend.add_face(F,0)
	ts.legend.add_face(TextFace("  Inferred relative abundance",fsize=1.5*font_size,fgcolor="Blue"),1)
	ts.legend.add_face(TextFace("  Total absolute abundance depicted " + str(sum_x)[0:8], fsize=1.5*font_size,fgcolor="Black"),1)
	ts.legend_position=4
	#t.show(tree_style=ts)
	t.render(outfile, w=550, units="mm", tree_style=ts)

	#Render the XML file; the context manager makes sure the file is flushed and closed
	project = Phyloxml()
	phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
	project.add_phylogeny(phylo)
	with open(outfilexml,'w') as xml_handle:
		project.export(xml_handle)
Exemple #3
0
def main(argv):
	"""Command-line entry point of PlotNJTree: build a neighbor-joining tree from the common-kmer matrices, draw abundance bubbles on it, render it and export phyloXML.

	Parameters:
		argv: list of command-line arguments (without the program name), parsed with getopt.

	Files read: the x vector (-i, one float per line) and, below the directory
	given with -D, "Taxonomy.txt", "FileNames.txt", "CommonKmerMatrix-30mers.h5"
	and "CommonKmerMatrix-50mers.h5" (dataset "common_kmers").
	Files written: the rendered tree (-o) and the phyloXML document (-x).

	Exits with status 2 after printing the usage on -h/--Help or on an option parsing error.
	"""
	usage = './PlotNJTree.py -i <InputCommonKmerXFile> -D <CommonKmerDataPath> -l <LabelLeavesFlag> -n <LabelInternalNodesFlag> -r <RectangularPlotFlag> -t <TaxonomicNamesOnLeavesFlag> -o <OutFile.png> -x <Outfile.xml> -w <Width>'
	input_file=''
	title='Title'
	#The two label flags are accepted on the command line but not used by the plot below
	label_internal_nodes = False
	label_leaves = False
	out_file=''
	width=750
	out_file_xml=''
	plot_rectangular = False
	common_kmer_data_path=''
	taxonomic_names_on_leaves = False
	try:
		opts, args = getopt.getopt(argv,"h:i:lnrto:w:x:D:",["Help=","InputCommonKmerXFile=","LabelLeaves=", "LabelInternalNodes=","Rectangular=", "TaxonomicNamesOnLeaves=", "OutFile=","Width=","OutFileXML=","CommonKmerDataPath="])
	except getopt.GetoptError:
		print('Unknown option, call using: ' + usage)
		sys.exit(2)
	for opt, arg in opts:
		if opt in ("-h", "--Help"):
			print(usage)
			sys.exit(2)
		elif opt in ("-i", "--InputCommonKmerXFile"):
			input_file = arg
		elif opt in ("-l", "--LabelLeaves"):
			label_leaves = True
		elif opt in ("-n","--LabelInternalNodes"):
			label_internal_nodes = True
		elif opt in ("-o", "--OutFile"):
			out_file = arg
		elif opt in ("-w", "--Width"):
			width = int(arg)
		elif opt in ("-x", "--OutFileXML"):
			out_file_xml = arg
		elif opt in ("-D", "--CommonKmerDataPath"):
			common_kmer_data_path = arg
		elif opt in ("-r", "--Rectangular"):
			plot_rectangular = True
		elif opt in ("-t", "--TaxonomicNamesOnLeaves"):
			taxonomic_names_on_leaves = True

	#Read in the x vector (one float per line); it is used un-normalized
	with open(input_file,'r') as fid:
		x = [float(y) for y in fid]

	#Read in the taxonomy
	taxonomy = list()
	with open(os.path.join(common_kmer_data_path,"Taxonomy.txt"),'r') as fid:
		for line in fid:
			#Keep the first whitespace-separated field of each line, minus its leading "<taxID>_" part
			taxonomy.append('_'.join(line.split()[0].split("_")[1:]))

	#Read in the basis for the ckm matrices
	with open(os.path.join(common_kmer_data_path,"FileNames.txt"),'r') as fid:
		x_file_names = [os.path.basename(line.strip()) for line in fid]

	#Read in the common kmer matrices; the with-blocks close the HDF5 files even if reading fails
	with h5py.File(os.path.join(common_kmer_data_path,'CommonKmerMatrix-30mers.h5'),'r') as f:
		ckm30=np.array(f['common_kmers'],dtype=np.float64)
	with h5py.File(os.path.join(common_kmer_data_path,'CommonKmerMatrix-50mers.h5'),'r') as f:
		ckm50=np.array(f['common_kmers'],dtype=np.float64)
	#Multiply entry [i,j] by 1/diagonal[j] (numpy broadcasting over the last axis)
	ckm30_norm = np.multiply(ckm30,1/np.diag(ckm30))
	ckm50_norm = np.multiply(ckm50,1/np.diag(ckm50))
	num_rows = ckm30_norm.shape[0]
	names = x_file_names
	#Lower-triangular distance matrix (row i has i+1 entries): average over both
	#kmer sizes of one minus the symmetrized normalized common-kmer value
	matrix=list()
	for i in range(num_rows):
		matrix.append([.5*(1-.5*ckm30_norm[i,j]-.5*ckm30_norm[j,i])+.5*(1-.5*ckm50_norm[i,j]-.5*ckm50_norm[j,i]) for j in range(i+1)])

	#Construct the tree. Note I could use RapidNJ here, but a few tests have shown that the trees that RapidNJ creates are rubbish.
	dm = _DistanceMatrix(names, matrix)
	constructor = DistanceTreeConstructor()
	tree = constructor.nj(dm)
	#Round-trip through newick to obtain a Tree object that supports the node surgery below
	t=Tree(tree.format('newick'),format=1)

	#Now I will put internal nodes in a certain phylogenetic distance between the root and a given node.
	#Function to insert a node at a given distance
	def insert_node(t, name_to_insert, insert_above, dist_along):
		"""Insert a node called name_to_insert on the branch above the node named insert_above, dist_along away from that node's parent."""
		insert_at_node = t.search_nodes(name=insert_above)[0]
		parent = (t&insert_above).up
		orig_branch_length = t.get_distance(insert_at_node,parent)
		if orig_branch_length < dist_along:
			raise ValueError("error: dist_along larger than orig_branch_length")
		#Split the branch: parent --dist_along--> new node --remainder--> original node
		removed_node = insert_at_node.detach()
		removed_node.dist = orig_branch_length - dist_along
		added_node = parent.add_child(name=name_to_insert, dist=dist_along)
		added_node.add_child(removed_node)

	#Function to insert a node some % along a branch
	def insert_hyp_node(t, leaf_name, percent):
		"""Insert the node leaf_name + "_" + str(percent) at the fraction percent of the root-to-leaf distance."""
		total_dist = t.get_distance(t.name,leaf_name)
		percent_dist = percent*total_dist
		child_node = (t&leaf_name)
		ancestor_node = (t&child_node.name).up
		#Climb towards the root until the ancestor is no further from the root than the target distance
		while t.get_distance(t.name, ancestor_node) > percent_dist:
			child_node = ancestor_node
			ancestor_node = (t&child_node.name).up
		insert_node(t, leaf_name+"_"+str(percent), child_node.name, percent_dist-t.get_distance(t.name, ancestor_node))

	#Insert hypothetical nodes
	hyp_node_names = dict()
	cutoffs = [.9,.8,.7,.6,.5,.4,.3,.2,.1]
	cutoffs = [y**1.5 for y in cutoffs]
	for i in range(len(x_file_names)):
		#Every len(x_file_names)-th entry of x starting at i: entry 0 is the leaf itself, entry k+1 belongs to cutoff k
		xi = x[i:len(x):len(x_file_names)]
		for k, cutoff in enumerate(cutoffs):
			if xi[k+1]>0:
				insert_hyp_node(t, x_file_names[i], cutoff)
				#Remember base name, cutoff and cutoff index explicitly, in case there are "_" in the file names
				hyp_node_names[x_file_names[i]+"_"+str(cutoff)] = [x_file_names[i], cutoff, k]

	#Now put the bubbles on the nodes
	def layout(node):
		"""Layout callback: attach an abundance bubble to known leaves (plus their name) and to hypothetical nodes."""
		if node.is_leaf():
			if node.name in x_file_names:
				#make reconstructed bubble
				size = x[x_file_names.index(node.name)]
				F = CircleFace(radius=500*math.sqrt(size), color="RoyalBlue", style="sphere")
				F.border.width = None
				F.opacity = 0.6
				faces.add_face_to_node(F,node, 0, position="branch-right")
				if taxonomic_names_on_leaves:
					#Append the taxonomy entry at the same index as the file name to the leaf label
					nameFace = AttrFace("name", fsize=25, fgcolor='black',text_suffix="_"+taxonomy[x_file_names.index(node.name)])
				else:
					nameFace = AttrFace("name", fsize=25, fgcolor='black')
				faces.add_face_to_node(nameFace, node, 0, position="branch-right")
		elif node.name in hyp_node_names: #Otherwise it's a hypothetical node, just use recon x
			node_base_name = hyp_node_names[node.name][0]
			if node_base_name in x_file_names:
				idx = hyp_node_names[node.name][2]
				size = x[x_file_names.index(node_base_name)+(idx+1)*len(x_file_names)]
				F = CircleFace(radius=500*math.sqrt(size), color="RoyalBlue", style="sphere")
				F.border.width = None
				F.opacity = 0.6
				faces.add_face_to_node(F,node, 0, position="branch-right")

	ts = TreeStyle()
	ts.layout_fn = layout
	if plot_rectangular:
		ts.mode = "r"
	else:
		ts.mode = "c"
	ts.show_leaf_name = False
	ts.min_leaf_separation = 50

	#Export the tree to a png image
	t.render(out_file, w=width, units="mm", tree_style=ts)

	#Export the xml file; the context manager makes sure the file is flushed and closed
	project = Phyloxml()
	phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
	phylo.phyloxml_phylogeny.set_name(title)
	project.add_phylogeny(phylo)
	with open(out_file_xml,'w') as xml_handle:
		project.export(xml_handle)
Exemple #4
0
#          /-iajom
#     /---|
#    |     \-wiszh
#----|
#    |     /-xrygw
#     \---|
#         |     /-gjlwx
#          \---|
#               \-ijvnk

# Trees can be operated on like normal ETE trees; the ASCII drawing above is
# the tree this example works with.
# NOTE(review): show() presumably opens ETE's interactive viewer - confirm.
phylo.show()


# Export the project in phyloXML format. No output handle is passed here;
# presumably the document goes to standard output (sample output below).
project.export()

# <phy:Phyloxml xmlns:phy="http://www.phyloxml.org/1.10/phyloxml.xsd">
#     <phy:phylogeny>
#         <phy:name>test_tree</phy:name>
#         <phy:clade>
#             <phy:name>NoName</phy:name>
#             <phy:branch_length>0.000000e+00</phy:branch_length>
#             <phy:confidence type="branch_support">1.0</phy:confidence>
#             <phy:clade>
#                 <phy:name>NoName</phy:name>
#                 <phy:branch_length>1.665083e-01</phy:branch_length>
#                 <phy:confidence type="branch_support">0.938507980435</phy:confidence>
#                 <phy:clade>
#                     <phy:name>NoName</phy:name>
#                     <phy:branch_length>1.366655e-01</phy:branch_length>
def MakePlot(x, org_names, ckm30, ckm50, outgroup, outfile, outfilexml, sum_x):
    """Build a neighbor-joining tree from two common-kmer matrices, decorate
    it with abundance bubbles, render it and export it as phyloXML.

    Parameters:
        x: flat sequence of abundances. Entry i (i < len(org_names)) belongs
            to organism i; entry i + (k+1)*len(org_names) belongs to the
            hypothetical node of organism i at cutoff k. It is normalized by
            its sum before use.
        org_names: list of organism names. Duplicates are renamed IN PLACE
            (suffix "_1", "_2", ...), so the caller's list is modified.
        ckm30, ckm50: square common-kmer matrices, indexed [i, j] and passed
            to np.diag / np.multiply (presumably 2-D numpy arrays; confirm
            with caller).
        outgroup: name of the leaf to use as outgroup; a warning is printed
            and no outgroup is set when it is not among the names.
        outfile: path handed to the tree's render() call.
        outfilexml: path of the phyloXML file to write.
        sum_x: value shown (first 8 characters of its str()) in the legend.

    Raises:
        ValueError: from insert_node when a hypothetical node would have to
            be placed further along a branch than the branch is long.
    """

    #Make sure names are unique.
    #NOTE: names is the very same list object as org_names; the renaming
    #below is therefore visible through both, and the tree/name lookups
    #further down rely on that.
    names = org_names
    for name in names:
        if names.count(name) > 1:
            #Rename every occurrence but the last one, just to make sure we
            #don't conflict with the outgroup
            for suffix in range(1, names.count(name)):
                names[names.index(name)] = name + "_" + str(suffix)

    #Normalize the x vector (the sum is computed once, not once per element)
    total = sum(x)
    x = [y / total for y in x]
    #Multiply entry [i,j] by 1/diagonal[j] (numpy broadcasting, last axis)
    ckm30_norm = np.multiply(ckm30, 1 / np.diag(ckm30))
    ckm50_norm = np.multiply(ckm50, 1 / np.diag(ckm50))
    num_rows = ckm30_norm.shape[0]
    #Lower-triangular distance matrix (row i has i+1 entries): average over
    #both kmer sizes of one minus the symmetrized normalized value
    matrix = list()
    for i in range(num_rows):
        matrix.append([
            .5 * (1 - .5 * ckm30_norm[i, j] - .5 * ckm30_norm[j, i]) + .5 *
            (1 - .5 * ckm50_norm[i, j] - .5 * ckm50_norm[j, i])
            for j in range(i + 1)
        ])

    #Make the list of distances (ave of the two ckm matrices)
    ckm_ave_train = .5 * ckm30_norm + .5 * ckm50_norm
    ckm_ave_train_dist = dict()
    for i, org_name in enumerate(org_names):
        ckm_ave_train_dist[org_name] = [
            .5 * ckm_ave_train[i, j] + .5 * ckm_ave_train[j, i]
            for j in range(len(org_names))
        ]

    #Construct the tree. Note I could use RapidNJ here, but a few tests have shown that the trees that RapidNJ creates are rubbish.
    dm = _DistanceMatrix(names, matrix)
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm)
    #Round-trip through newick to obtain a Tree object that supports the
    #node surgery below
    t = Tree(tree.format('newick'), format=1)

    #Now I will put internal nodes in a certain phylogenetic distance between the root and a given node.
    #Function to insert a node at a given distance
    def insert_node(t, name_to_insert, insert_above, dist_along):
        """Insert a node called name_to_insert on the branch above the node
        named insert_above, dist_along away from that node's parent."""
        insert_at_node = t.search_nodes(name=insert_above)[0]
        parent = (t & insert_above).up
        orig_branch_length = t.get_distance(insert_at_node, parent)
        if orig_branch_length < dist_along:
            raise ValueError(
                "error: dist_along larger than orig_branch_length in PlotPackage.py"
            )
        #Split the branch:
        #parent --dist_along--> new node --remainder--> original node
        removed_node = insert_at_node.detach()
        removed_node.dist = orig_branch_length - dist_along
        added_node = parent.add_child(name=name_to_insert, dist=dist_along)
        added_node.add_child(removed_node)

    #Function to insert a node some % along a branch, taking into account the ckm distances and nodes already created in the NJ tree (and what distance their descendants are from everyone else)
    def insert_hyp_node(t, leaf_name, percent, ckm_ave_train_dist, org_names):
        """Insert the hypothetical node leaf_name + "_" + str(percent) on the
        path between the root and leaf_name."""
        dists = [abs(y - percent) for y in ckm_ave_train_dist[leaf_name]]
        nearby_indicies = list()
        #Add all the organisms that are within 0.05 of the given percent
        #(currently disabled, so nearby_indicies stays empty and the fallback
        #below is always taken)
        #for i in range(len(dists)):
        #    if dists[i]<=.05:
        #        nearby_indicies.append(i)
        nearby_names = list()
        #If there are no nearby indicies, add the closest organism to the given percent
        if nearby_indicies == []:
            nearby_names.append(org_names[dists.index(min(dists))])
        else:
            #Look the organisms up by the collected indices (not by their
            #position in nearby_indicies)
            for idx in nearby_indicies:
                nearby_names.append(org_names[idx])
        mean_dist = np.mean([
            ckm_ave_train_dist[leaf_name][org_names.index(y)]
            for y in nearby_names
        ])
        nearby_names.append(leaf_name)
        LCA = t.get_common_ancestor(nearby_names)
        LCA_to_leaf_dist = t.get_distance(LCA, leaf_name)
        #divide the dist to the right/left of the LCA node by the number of percentage points in there
        if LCA.name == t.name:
            percent_dist = percent * LCA_to_leaf_dist
            if mean_dist <= percent:
                child_node = (t & leaf_name)
            else:
                #This means "go up from root" in the direction of the nearest guy
                child_node = (t & nearby_names[0])
            ancestor_node = (t & child_node.name).up
        elif mean_dist <= percent:
            percent_dist = t.get_distance(LCA) + abs(percent - mean_dist) * (
                LCA_to_leaf_dist) / (1 - mean_dist)
            child_node = (t & leaf_name)
            ancestor_node = (t & child_node.name).up
        else:
            percent_dist = t.get_distance(LCA) - abs(percent - mean_dist) * (
                t.get_distance(LCA)) / (mean_dist)
            child_node = (t & leaf_name)
            ancestor_node = (t & child_node.name).up
        #Climb towards the root until the ancestor is no further from the
        #root than the target distance
        while t.get_distance(t.name, ancestor_node) > percent_dist:
            child_node = ancestor_node
            ancestor_node = (t & child_node.name).up
        insert_node(t, leaf_name + "_" + str(percent), child_node.name,
                    percent_dist - t.get_distance(t.name, ancestor_node))

    #Set outgroup
    if outgroup in names:
        t.set_outgroup(t & outgroup)
    else:
        print("WARNING: the chosen outgroup " + outgroup +
              " is not in the given taxonomy: ")
        print(names)
        print(
            "Proceeding without setting an outgroup. This may cause results to be uninterpretable."
        )

    #Insert hypothetical nodes
    hyp_node_names = dict()
    cutoffs = [.9, .8, .7, .6, .5, .4, .3, .2, .1]
    #Map each cutoff through a fixed cubic polynomial
    cutoffs = [
        -.5141 * (val**3) + 1.0932 * (val**2) + 0.3824 * val for val in cutoffs
    ]
    for i in range(len(org_names)):
        #Every len(org_names)-th entry of x starting at i: entry 0 is the
        #leaf itself, entry k+1 belongs to cutoff k
        xi = x[i:len(x):len(org_names)]
        for k, cutoff in enumerate(cutoffs):
            if xi[k + 1] > 0:
                insert_hyp_node(t, org_names[i], cutoff, ckm_ave_train_dist,
                                org_names)
                #Remember base name, cutoff and cutoff index explicitly, in
                #case there are "_" in the file names
                hyp_node_names[org_names[i] + "_" + str(cutoff)] = [
                    org_names[i], cutoff, k
                ]

    size_factor = 250
    font_size = 55

    #Now put the bubbles on the nodes
    def layout(node):
        """Layout callback: thicken the branch lines and attach an abundance
        bubble (plus the name, for leaves)."""
        node_style = NodeStyle()
        node_style["hz_line_width"] = 10
        node_style["vt_line_width"] = 10
        node.set_style(node_style)
        if node.is_leaf():
            if node.name in org_names:
                #make reconstructed bubble
                size = x[org_names.index(node.name)]
                F = CircleFace(radius=size_factor * math.sqrt(size),
                               color="RoyalBlue",
                               style="sphere")
                F.border.width = None
                F.opacity = 0.6
                faces.add_face_to_node(F, node, 0, position="branch-right")
                #Denote that this was a training organism
                nameFace = AttrFace("name", fsize=font_size, fgcolor='black')
                faces.add_face_to_node(nameFace,
                                       node,
                                       0,
                                       position="branch-right")
        elif node.name in hyp_node_names:  #Otherwise it's a hypothetical node, just use recon x
            node_base_name = hyp_node_names[node.name][0]
            if node_base_name in org_names:
                idx = hyp_node_names[node.name][2]
                size = x[org_names.index(node_base_name) +
                         (idx + 1) * len(org_names)]
                F = CircleFace(radius=size_factor * math.sqrt(size),
                               color="RoyalBlue",
                               style="sphere")
                F.border.width = None
                F.opacity = 0.6
                faces.add_face_to_node(F, node, 0, position="branch-right")
                #This is if I want the names of the hypothetical nodes to be printed as well
                #nameFace = AttrFace("name", fsize=font_size, fgcolor='black')
                #faces.add_face_to_node(nameFace, node, 0, position="branch-right")

    ts = TreeStyle()
    ts.layout_fn = layout
    ts.mode = "r"
    #ts.mode = "c"
    ts.scale = 2 * 1000
    ts.show_leaf_name = False
    ts.min_leaf_separation = 50
    #Legend: one reference bubble plus two text lines
    F = CircleFace(radius=.87 * size_factor, color="RoyalBlue", style="sphere")
    F.border.width = None
    F.opacity = 0.6
    ts.legend.add_face(F, 0)
    ts.legend.add_face(
        TextFace("  Inferred relative abundance",
                 fsize=1.5 * font_size,
                 fgcolor="Blue"), 1)
    ts.legend.add_face(
        TextFace("  Total absolute abundance depicted " + str(sum_x)[0:8],
                 fsize=1.5 * font_size,
                 fgcolor="Black"), 1)
    ts.legend_position = 4
    #t.show(tree_style=ts)
    t.render(outfile, w=550, units="mm", tree_style=ts)

    #Render the XML file; the context manager makes sure the file is flushed
    #and closed
    project = Phyloxml()
    phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
    project.add_phylogeny(phylo)
    with open(outfilexml, 'w') as xml_handle:
        project.export(xml_handle)
Exemple #6
0
def main(argv):
	"""Command-line entry point of PlotTree: turn a taxonomic profile into a tree with one bubble per taxon, render it and export phyloXML.

	Parameters:
		argv: list of command-line arguments (without the program name), parsed with getopt.

	The input file (-i) is read line by line; lines starting with '#', '@' or a
	newline are skipped. On every other line the fourth whitespace-separated
	field is taken as a "|"-separated taxonomic path and the last field as the
	weight of the path's final name.
	Files written: the rendered tree (-o) and the phyloXML document (-x).

	Exits with status 2 after printing the usage on -h/--Help or on an option parsing error.
	"""
	input_file=''
	title='Title'
	label_internal_nodes = False
	label_leaves = False
	out_file=''
	width=750
	out_file_xml=''
	try:
		opts, args = getopt.getopt(argv,"h:i:t:lno:w:x:",["Help=","InputFile=","Title=","LabelLeaves=", "LabelInternalNodes=","OutFile=","Width=","OutFileXML="])
	except getopt.GetoptError:
		print('Unknown option, call using: ./PlotTree.py -i <InputCAMIFile> -t <Title> -l <LabelLeavesFlag> -n <LabelInternalNodesFlag> -o <OutFile.png> -x <Outfile.xml> -w <Width>')
		sys.exit(2)
	for opt, arg in opts:
		if opt in ("-h", "--Help"):
			print('./PlotTree.py -i <InputCAMIFile> -t <Title> -l <LabelLeavesFlag> -n <LabelInternalNodesFlag> -o <OutFile> -x <OutFile.xml> -w <Width>')
			sys.exit(2)
		elif opt in ("-i", "--InputFile"):
			input_file = arg
		elif opt in ("-t", "--Title"):
			title = arg
		elif opt in ("-l", "--LabelLeaves"):
			label_leaves = True
		elif opt in ("-n","--LabelInternalNodes"):
			label_internal_nodes = True
		elif opt in ("-o", "--OutFile"):
			out_file = arg
		elif opt in ("-w", "--Width"):
			width = int(arg)
		elif opt in ("-x", "--OutFileXML"):
			out_file_xml = arg

	#Read the common kmer profile
	with open(input_file,'r') as fid:
		raw_lines = fid.readlines()

	#Put placeholders in for missing names like: "||" -> "|NONAME|"
	#(one replacement at a time, so runs such as "|||" are filled completely)
	lines_noblank = list()
	for line in raw_lines:
		while "||" in line:
			line = line.replace("||","|NONAME|",1)
		lines_noblank.append(line)

	#Get the names and weights
	ckm_tax_paths = []
	ckm_name_to_perc = dict()
	for line in lines_noblank:
		if line[0] not in ('#','@','\n'): #Don't parse comments or blank lines
			fields = line.split()
			tax_path = fields[3] #Get the names
			ckm_tax_paths.append(tax_path)
			ckm_name_to_perc[tax_path.split("|")[-1]] = fields[-1] #Get the weights

	#Create the tree
	t=Tree()
	names_to_nodes = dict()
	for tax_path in ckm_tax_paths:
		split_tax_path = tax_path.split("|")
		leaf_name = split_tax_path[-1]
		if len(split_tax_path)==1: #If len==1, then it's a superkingdom
			names_to_nodes[leaf_name] = t.add_child(name=leaf_name) #connect directly to tree
		elif split_tax_path[-2] in names_to_nodes: #If the parent is already in the tree, add to tree
			names_to_nodes[leaf_name] = names_to_nodes[split_tax_path[-2]].add_child(name=leaf_name)
		else: #Otherwise iterate up until we have something that is in the tree
			#NOTE: this assumes a named ancestor exists and was added earlier;
			#otherwise the lookups below raise IndexError/KeyError
			j=2
			while split_tax_path[-j]=="NONAME":
				j = j + 1
			#This skips over the NONAMES
			names_to_nodes[leaf_name] = names_to_nodes[split_tax_path[-j]].add_child(name=leaf_name)

	#Show the tree
	#print t.get_ascii(show_internal=True)

	def layout(node):
		"""Layout callback: draw a bubble scaled by the square root of the node's weight (0 when the node has none) and optionally label the node."""
		if node.name in ckm_name_to_perc:
			ckm_perc = float(ckm_name_to_perc[node.name])
		else:
			ckm_perc = 0
		F = CircleFace(radius=3.14*math.sqrt(ckm_perc), color="RoyalBlue", style="sphere")
		F.border.width = None
		F.opacity = 0.6
		faces.add_face_to_node(F,node, 0, position="branch-right")
		if label_internal_nodes:
			faces.add_face_to_node(TextFace(node.name, fsize=7),node, 0, position="branch-top")

	ts = TreeStyle()
	ts.layout_fn = layout
	ts.mode = "r"
	ts.show_leaf_name = label_leaves
	ts.min_leaf_separation = 50
	ts.title.add_face(TextFace(title, fsize=20), column=0)

	#Export the tree to a png image
	t.render(out_file, w=width, units="mm", tree_style=ts)

	#Export the xml file; the context manager makes sure the file is flushed and closed
	project = Phyloxml()
	phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
	phylo.phyloxml_phylogeny.set_name(title)
	project.add_phylogeny(phylo)
	with open(out_file_xml,'w') as xml_handle:
		project.export(xml_handle)
# Load the newick tree given as the first command-line argument
phylo = phyloxml.PhyloxmlTree(newick=sys.argv[1])

# Set basic tree info on the tree's phyloxml phylogeny object
phylo.phyloxml_phylogeny.set_name("test_tree")
# A root with at most two children is flagged as rooted, anything else as unrooted
rooted_flag = "true" if len(phylo.children) <= 2 else "false"
phylo.phyloxml_phylogeny.set_rooted(rooted_flag)

# Add the tree to the phyloxml project
project.add_phylogeny(phylo)

# Export the phyloxml document into an in-memory buffer so it can be patched
OUTPUT = StringIO()
project.export(OUTPUT)

# Some ad-hoc changes to the phyloxml formatted document to meet the schema definition:
#  - drop the "phy:" namespace prefix everywhere
#  - strip every branch_length_attr="..." attribute
#  - replace the opening <Phyloxml ...> tag with a header that points at the schema
#  - lower-case the remaining "Phyloxml" occurrences (e.g. the closing tag)
text = OUTPUT.getvalue()
text = text.replace("phy:", "")
text = re.sub(r'branch_length_attr="[^"]+"', "", text)
header = """
 <phyloxml xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 xsi:schemaLocation="http://www.phyloxml.org http://www.phyloxml.org/1.10/phyloxml.xsd"
 xmlns="http://www.phyloxml.org">
"""
text = re.sub(r'<Phyloxml[^>]+>', header, text)
text = text.replace('Phyloxml', 'phyloxml')

# Save the result next to the input file; the context manager guarantees the
# handle is flushed and closed even if the write fails
with open(sys.argv[1] + ".phyloxml", "w") as out_handle:
    out_handle.write(text)
Exemple #8
0
def main(argv):
    """Build a neighbor-joining tree from common-kmer matrices and plot it.

    Parses the command-line options in *argv*, reads the x vector and the
    common-kmer data, constructs an NJ tree over the training file names,
    inserts "hypothetical" nodes along root-to-leaf paths wherever the
    corresponding x entry is positive, renders the tree to an image and
    exports it as phyloxml.

    Options (short / long form):
        -h / --Help                     print usage and exit (status 2)
        -i / --InputCommonKmerXFile     file with one float per line (x vector)
        -D / --CommonKmerDataPath       directory holding Taxonomy.txt,
                                        FileNames.txt and the two
                                        CommonKmerMatrix-*mers.h5 files
        -o / --OutFile                  rendered image path
        -x / --OutFileXML               phyloxml output path
        -w / --Width                    image width in mm (default 750)
        -r / --Rectangular              rectangular instead of circular layout
        -t / --TaxonomicNamesOnLeaves   append taxonomy to leaf labels
        -l / --LabelLeaves, -n / --LabelInternalNodes
                                        accepted, but not consulted by the
                                        plotting code in this function

    NOTE: for backward compatibility the long forms of the flag options are
    still declared with a trailing "=" and therefore require a value
    (e.g. --Rectangular=1); the short forms take none.

    The x vector is read as a flat list: entry i is used for leaf i, and
    entry i + k*len(file names) (k = 1..9) for the hypothetical node of
    leaf i at the k-th cutoff.

    Raises:
        SystemExit: with status 2 on -h/--Help or on an unparsable option.
    """
    usage = './PlotNJTree.py -i <InputCommonKmerXFile> -D <CommonKmerDataPath> -l <LabelLeavesFlag> -n <LabelInternalNodesFlag> -r <RectangularPlotFlag> -t <TaxonomicNamesOnLeavesFlag> -o <OutFile.png> -x <Outfile.xml> -w <Width>'
    input_file = ''
    title = 'Title'
    label_internal_nodes = False  # parsed for CLI compatibility only
    label_leaves = False  # parsed for CLI compatibility only
    out_file = ''
    width = 750
    out_file_xml = ''
    plot_rectangular = False
    common_kmer_data_path = ''
    taxonomic_names_on_leaves = False
    try:
        # "h" carries no colon: -h is a plain flag. With "h:" a bare -h was
        # rejected as "requires argument" and reported as an unknown option.
        opts, _ = getopt.getopt(argv, "hi:lnrto:w:x:D:", [
            "Help", "InputCommonKmerXFile=", "LabelLeaves=",
            "LabelInternalNodes=", "Rectangular=", "TaxonomicNamesOnLeaves=",
            "OutFile=", "Width=", "OutFileXML=", "CommonKmerDataPath="
        ])
    except getopt.GetoptError:
        print('Unknown option, call using: ' + usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--Help"):
            print(usage)
            sys.exit(2)
        elif opt in ("-i", "--InputCommonKmerXFile"):
            input_file = arg
        elif opt in ("-l", "--LabelLeaves"):
            label_leaves = True
        elif opt in ("-n", "--LabelInternalNodes"):
            label_internal_nodes = True
        elif opt in ("-o", "--OutFile"):
            out_file = arg
        elif opt in ("-w", "--Width"):
            width = int(arg)
        elif opt in ("-x", "--OutFileXML"):
            out_file_xml = arg
        elif opt in ("-D", "--CommonKmerDataPath"):
            common_kmer_data_path = arg
        elif opt in ("-r", "--Rectangular"):
            plot_rectangular = True
        elif opt in ("-t", "--TaxonomicNamesOnLeaves"):
            taxonomic_names_on_leaves = True

    #Read in the x vector (one float per line); it is deliberately not normalized
    with open(input_file, 'r') as fid:
        x = [float(line) for line in fid]

    #Read in the taxonomy: keep the first whitespace-separated field of each
    #line and drop its leading "_"-delimited component (the taxID)
    with open(os.path.join(common_kmer_data_path, "Taxonomy.txt"), 'r') as fid:
        taxonomy = ['_'.join(line.split()[0].split("_")[1:]) for line in fid]

    #Read in the basis for the ckm matrices
    with open(os.path.join(common_kmer_data_path, "FileNames.txt"), 'r') as fid:
        x_file_names = [os.path.basename(line.strip()) for line in fid]

    #Read in the common kmer matrices
    def load_ckm(file_name):
        # Load the 'common_kmers' dataset as float64, closing the HDF5 file afterwards.
        with h5py.File(os.path.join(common_kmer_data_path, file_name), 'r') as f:
            return np.array(f['common_kmers'], dtype=np.float64)

    ckm30 = load_ckm('CommonKmerMatrix-30mers.h5')
    ckm50 = load_ckm('CommonKmerMatrix-50mers.h5')
    ckm30_norm = np.multiply(ckm30, 1 / np.diag(ckm30))
    ckm50_norm = np.multiply(ckm50, 1 / np.diag(ckm50))
    num_rows = ckm30_norm.shape[0]
    names = x_file_names
    #Lower-triangular distance matrix: the average over the 30-mer and 50-mer
    #matrices of one minus the symmetrized normalized common-kmer value
    matrix = [
        [
            .5 * (1 - .5 * ckm30_norm[i, j] - .5 * ckm30_norm[j, i]) + .5 *
            (1 - .5 * ckm50_norm[i, j] - .5 * ckm50_norm[j, i])
            for j in range(i + 1)
        ]
        for i in range(num_rows)
    ]

    #Construct the tree. Note I could use RapidNJ here, but a few tests have shown that the trees that RapidNJ creates are rubbish.
    dm = _DistanceMatrix(names, matrix)
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm)
    t = Tree(tree.format('newick'), format=1)

    #Now I will put internal nodes in a certain phylogenetic distance between the root and a given node.
    #Function to insert a node at a given distance
    def insert_node(t, name_to_insert, insert_above, dist_along):
        # Split the branch above `insert_above`: the new node sits `dist_along`
        # below the parent and the original node hangs beneath it with the
        # remaining branch length.
        insert_at_node = t.search_nodes(name=insert_above)[0]
        parent = (t & insert_above).up
        orig_branch_length = t.get_distance(insert_at_node, parent)
        if orig_branch_length < dist_along:
            raise ValueError(
                "error: dist_along larger than orig_branch_length")
        removed_node = insert_at_node.detach()
        removed_node.dist = orig_branch_length - dist_along
        added_node = parent.add_child(name=name_to_insert, dist=dist_along)
        added_node.add_child(removed_node)

    #Function to insert a node some % along a branch
    def insert_hyp_node(t, leaf_name, percent):
        # Walk up from the leaf until the ancestor is no farther from the
        # root than percent * (root-to-leaf distance), then split that branch.
        total_dist = t.get_distance(t.name, leaf_name)
        percent_dist = percent * total_dist
        child_node = (t & leaf_name)
        ancestor_node = (t & child_node.name).up
        while t.get_distance(t.name, ancestor_node) > percent_dist:
            child_node = ancestor_node
            ancestor_node = (t & child_node.name).up
        insert_node(t, leaf_name + "_" + str(percent), child_node.name,
                    percent_dist - t.get_distance(t.name, ancestor_node))

    #Insert hypothetical nodes
    hyp_node_names = dict()
    cutoffs = [c**1.5 for c in (.9, .8, .7, .6, .5, .4, .3, .2, .1)]
    num_names = len(x_file_names)
    for i, file_name in enumerate(x_file_names):
        #Every num_names-th entry of x, starting at i, belongs to organism i
        xi = x[i::num_names]
        for j, cutoff in enumerate(cutoffs, 1):
            if xi[j] > 0:
                insert_hyp_node(t, file_name, cutoff)
                #Keep the parts separately, in case there are "_" in the file names
                hyp_node_names[file_name + "_" + str(cutoff)] = [
                    file_name, cutoff, j - 1
                ]

    #Position of each file name in x_file_names (first occurrence wins, like
    #list.index), so the layout callback does not rescan the list per node
    name_to_index = dict()
    for position, file_name in enumerate(x_file_names):
        name_to_index.setdefault(file_name, position)

    def add_bubble(node, size):
        # Attach a translucent circle whose radius scales with sqrt(size).
        F = CircleFace(radius=500 * math.sqrt(size),
                       color="RoyalBlue",
                       style="sphere")
        F.border.width = None
        F.opacity = 0.6
        faces.add_face_to_node(F, node, 0, position="branch-right")

    #Now put the bubbles on the nodes
    def layout(node):
        if node.is_leaf():
            if node.name in name_to_index:
                #make reconstructed bubble
                leaf_idx = name_to_index[node.name]
                add_bubble(node, x[leaf_idx])
                name_kwargs = dict()
                if taxonomic_names_on_leaves:
                    name_kwargs['text_suffix'] = "_" + taxonomy[leaf_idx]
                nameFace = AttrFace("name",
                                    fsize=25,
                                    fgcolor='black',
                                    **name_kwargs)
                faces.add_face_to_node(nameFace,
                                       node,
                                       0,
                                       position="branch-right")
        elif node.name in hyp_node_names:  #Otherwise it's a hypothetical node, just use recon x
            #Base names were taken from x_file_names, so the lookup cannot miss
            node_base_name, _, cutoff_idx = hyp_node_names[node.name]
            add_bubble(
                node,
                x[name_to_index[node_base_name] +
                  (cutoff_idx + 1) * num_names])

    ts = TreeStyle()
    ts.layout_fn = layout
    ts.mode = "r" if plot_rectangular else "c"
    ts.show_leaf_name = False
    ts.min_leaf_separation = 50

    #Export the tree to a png image
    t.render(out_file, w=width, units="mm", tree_style=ts)

    #Export the xml file; the with-block makes sure the output is flushed and closed
    project = Phyloxml()
    phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
    phylo.phyloxml_phylogeny.set_name(title)
    project.add_phylogeny(phylo)
    with open(out_file_xml, 'w') as xml_fid:
        project.export(xml_fid)