# Loads newick tree (the first command line argument is handed to
# PhyloxmlTree as its ``newick`` argument)
phylo = phyloxml.PhyloxmlTree(newick=sys.argv[1])

# Set basic tree info as a phyloxml phylogeny object.
# A root with at most two children is flagged as rooted.
phylo.phyloxml_phylogeny.set_name("test_tree")
phylo.phyloxml_phylogeny.set_rooted(
    "true" if len(phylo.children) <= 2 else "false")

# Add the tree to the phyloxml project
project.add_phylogeny(phylo)

# Export phyloxml document into an in-memory buffer so it can be patched
OUTPUT = StringIO()
project.export(OUTPUT)

# Some ad-hoc changes to the phyloxml formatted document to meet the schema
# definition: drop the "phy:" namespace prefix, strip the
# branch_length_attr attributes and replace the root tag by a schema-aware one
text = OUTPUT.getvalue()
text = text.replace("phy:", "")
text = re.sub('branch_length_attr="[^"]+"', "", text)
header = """
<phyloxml xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.phyloxml.org http://www.phyloxml.org/1.10/phyloxml.xsd" xmlns="http://www.phyloxml.org">
"""
text = re.sub('<Phyloxml[^>]+>', header, text)
text = text.replace('Phyloxml', 'phyloxml')

# Save result next to the input; the ``with`` block guarantees the file is
# flushed and closed even if the write fails
with open(sys.argv[1] + ".phyloxml", "w") as out_handle:
    out_handle.write(text)
def MakePlot(x, org_names, ckm30, ckm50, outgroup, outfile, outfilexml, sum_x):
    """Draw a neighbor-joining tree with abundance bubbles and export it.

    A distance matrix is derived from the column-normalized 30-mer and 50-mer
    common kmer matrices, a neighbor-joining tree is built from it, extra
    ("hypothetical") nodes are inserted along the branches wherever ``x`` is
    positive, and the tree is rendered with one circle per populated node
    (circle radius is proportional to the square root of the normalized
    entry of ``x``). The final tree is also exported as PhyloXML.

    Parameters:
        x: sequence of abundances. It is read as consecutive groups of
            ``len(org_names)`` entries: the first group belongs to the
            organisms themselves, group ``k`` (k >= 1) to the k-th cutoff.
            Nine cutoffs are used, so at least ten groups are required.
            The values are normalized to sum to one before use.
        org_names: list of organism names, in the row order of the matrices.
            NOTE: duplicated names are made unique IN PLACE (all but the last
            occurrence get a ``_1``, ``_2``, ... suffix), so the caller's
            list is modified.
        ckm30, ckm50: square numpy arrays of common kmer counts (indexed
            ``[i, j]`` and normalized by their diagonal).
        outgroup: name of the leaf used as outgroup; if it is not one of the
            names a warning is printed and no outgroup is set.
        outfile: path passed to ``Tree.render`` for the image.
        outfilexml: path of the PhyloXML file to write.
        sum_x: total abundance; the first 8 characters of ``str(sum_x)`` are
            shown in the legend.

    Raises:
        ValueError: if a hypothetical node would have to be placed further
            along a branch than the branch is long.
    """
    # Make sure names are unique. ``names`` is deliberately the SAME list
    # object as ``org_names`` so every later lookup by name (distance table,
    # tree leaves, bubble sizes) sees the de-duplicated names.
    names = org_names
    for name in names:
        if names.count(name) > 1:
            temp_name = name
            i = 1
            # Don't change the last one, just to make sure we don't conflict
            # with the outgroup
            for dummy in range(0, names.count(name) - 1):
                names[names.index(temp_name)] = temp_name + "_" + str(i)
                i = i + 1

    # Normalize the x vector. The total is computed once (the original
    # re-evaluated sum(x) for every element) and the result is a real list
    # so that it can be sliced and indexed below.
    total = sum(x)
    x = [y / total for y in x]

    # Divide every column by the corresponding diagonal entry
    ckm30_norm = np.multiply(ckm30, 1 / np.diag(ckm30))
    ckm50_norm = np.multiply(ckm50, 1 / np.diag(ckm50))
    num_rows = ckm30_norm.shape[0]

    # Lower triangular distance matrix (row i holds columns 0..i): one minus
    # the symmetrized normalized value, averaged over both kmer sizes
    matrix = [
        [.5 * (1 - .5 * ckm30_norm[i, j] - .5 * ckm30_norm[j, i])
         + .5 * (1 - .5 * ckm50_norm[i, j] - .5 * ckm50_norm[j, i])
         for j in range(i + 1)]
        for i in range(num_rows)
    ]

    # Make the list of distances (ave of the two ckm matrices): for each
    # organism, the symmetrized average of the two normalized matrices
    # against every organism
    ckm_ave_train = .5 * ckm30_norm + .5 * ckm50_norm
    ckm_ave_train_dist = dict()
    for i, org_name in enumerate(org_names):
        ckm_ave_train_dist[org_name] = [
            .5 * ckm_ave_train[i, j] + .5 * ckm_ave_train[j, i]
            for j in range(len(org_names))
        ]

    # Construct the tree. Note I could use RapidNJ here, but a few tests have
    # shown that the trees that RapidNJ creates are rubbish.
    dm = _DistanceMatrix(names, matrix)
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm)
    t = Tree(tree.format('newick'), format=1)

    # Now I will put internal nodes in a certain phylogenetic distance
    # between the root and a given node.

    # Function to insert a node at a given distance
    def insert_node(t, name_to_insert, insert_above, dist_along):
        """Split the branch above ``insert_above``, ``dist_along`` below its parent."""
        insert_at_node = t.search_nodes(name=insert_above)[0]
        parent = (t & insert_above).up
        orig_branch_length = t.get_distance(insert_at_node, parent)
        if orig_branch_length < dist_along:
            raise ValueError("error: dist_along larger than orig_branch_length in PlotPackage.py")
        removed_node = insert_at_node.detach()
        removed_node.dist = orig_branch_length - dist_along
        added_node = parent.add_child(name=name_to_insert, dist=dist_along)
        added_node.add_child(removed_node)

    # Function to insert a node some % along a branch, taking into account
    # the ckm distances and nodes already created in the NJ tree (and what
    # distance their descendants are from everyone else)
    def insert_hyp_node(t, leaf_name, percent, ckm_ave_train_dist, org_names):
        """Insert the node ``leaf_name + "_" + str(percent)`` above ``leaf_name``."""
        # A list (not a lazy map object) because .index() is used below
        dists = [abs(y - percent) for y in ckm_ave_train_dist[leaf_name]]
        # Intentionally left empty: collecting all organisms within 0.05 of
        # the given percent was disabled by the original author.
        nearby_indicies = list()
        if nearby_indicies:
            # Index through nearby_indicies (the original used the loop
            # counter itself, which would have picked the wrong organisms)
            nearby_names = [org_names[k] for k in nearby_indicies]
        else:
            # If there are no nearby indicies, add the closest organism to
            # the given percent
            nearby_names = [org_names[dists.index(min(dists))]]
        mean_dist = np.mean([
            ckm_ave_train_dist[leaf_name][org_names.index(y)]
            for y in nearby_names
        ])
        nearby_names.append(leaf_name)
        LCA = t.get_common_ancestor(nearby_names)
        LCA_to_leaf_dist = t.get_distance(LCA, leaf_name)
        # divide the dist to the right/left of the LCA node by the number of
        # percentage points in there
        if LCA.name == t.name:
            percent_dist = percent * LCA_to_leaf_dist
            if mean_dist <= percent:
                child_node = (t & leaf_name)
            else:
                # This means "go up from root" in the direction of the
                # nearest guy
                child_node = (t & nearby_names[0])
            ancestor_node = (t & child_node.name).up
        elif mean_dist <= percent:
            percent_dist = t.get_distance(LCA) + abs(percent - mean_dist) * (LCA_to_leaf_dist) / (1 - mean_dist)
            child_node = (t & leaf_name)
            ancestor_node = (t & child_node.name).up
        else:
            percent_dist = t.get_distance(LCA) - abs(percent - mean_dist) * (t.get_distance(LCA)) / (mean_dist)
            child_node = (t & leaf_name)
            ancestor_node = (t & child_node.name).up
        # Climb towards the root until the ancestor is no further from the
        # root than the target distance
        while t.get_distance(t.name, ancestor_node) > percent_dist:
            child_node = ancestor_node
            ancestor_node = (t & child_node.name).up
        insert_node(t, leaf_name + "_" + str(percent), child_node.name,
                    percent_dist - t.get_distance(t.name, ancestor_node))

    # Set outgroup
    if outgroup in names:
        t.set_outgroup(t & outgroup)
    else:
        print("WARNING: the chosen outgroup " + outgroup + " is not in the given taxonomy: ")
        print(names)
        print("Proceeding without setting an outgroup. This may cause results to be uninterpretable.")

    # Insert hypothetical nodes
    hyp_node_names = dict()
    cutoffs = [.9, .8, .7, .6, .5, .4, .3, .2, .1]
    cutoffs = [-.5141 * (val**3) + 1.0932 * (val**2) + 0.3824 * val for val in cutoffs]
    for i in range(len(org_names)):
        # Entries of x that belong to organism i: position 0 is the organism
        # itself, position j (j >= 1) belongs to cutoffs[j - 1]
        xi = x[i:len(x):len(org_names)]
        for j in range(1, len(cutoffs) + 1):
            if xi[j] > 0:
                insert_hyp_node(t, org_names[i], cutoffs[j - 1], ckm_ave_train_dist, org_names)
                # Remember base name, cutoff and cutoff index explicitly, in
                # case there are "_" in the file names
                hyp_node_names[org_names[i] + "_" + str(cutoffs[j - 1])] = [org_names[i], cutoffs[j - 1], j - 1]

    size_factor = 250
    font_size = 55

    # Now put the bubbles on the nodes
    def layout(node):
        """ETE layout callback: style the branch and attach the abundance bubble."""
        node_style = NodeStyle()
        node_style["hz_line_width"] = 10
        node_style["vt_line_width"] = 10
        node.set_style(node_style)
        if node.is_leaf():
            if node.name in org_names:
                # make reconstructed bubble
                size = x[org_names.index(node.name)]
                F = CircleFace(radius=size_factor * math.sqrt(size), color="RoyalBlue", style="sphere")
                F.border.width = None
                F.opacity = 0.6
                faces.add_face_to_node(F, node, 0, position="branch-right")
                # Denote that this was a training organism
                nameFace = AttrFace("name", fsize=font_size, fgcolor='black')
                faces.add_face_to_node(nameFace, node, 0, position="branch-right")
        elif node.name in hyp_node_names:
            # Otherwise it's a hypothetical node, just use recon x
            node_base_name = hyp_node_names[node.name][0]
            if node_base_name in org_names:
                idx = hyp_node_names[node.name][2]
                size = x[org_names.index(node_base_name) + (idx + 1) * len(org_names)]
                F = CircleFace(radius=size_factor * math.sqrt(size), color="RoyalBlue", style="sphere")
                F.border.width = None
                F.opacity = 0.6
                faces.add_face_to_node(F, node, 0, position="branch-right")
                # The names of hypothetical nodes are intentionally not drawn

    ts = TreeStyle()
    ts.layout_fn = layout
    ts.mode = "r"
    ts.scale = 2 * 1000
    ts.show_leaf_name = False
    ts.min_leaf_separation = 50
    F = CircleFace(radius=.87 * size_factor, color="RoyalBlue", style="sphere")
    F.border.width = None
    F.opacity = 0.6
    ts.legend.add_face(F, 0)
    ts.legend.add_face(TextFace(" Inferred relative abundance", fsize=1.5 * font_size, fgcolor="Blue"), 1)
    ts.legend.add_face(TextFace(" Total absolute abundance depicted " + str(sum_x)[0:8], fsize=1.5 * font_size, fgcolor="Black"), 1)
    ts.legend_position = 4
    t.render(outfile, w=550, units="mm", tree_style=ts)

    # Render the XML file; the ``with`` block closes the handle so the
    # document is completely flushed to disk
    project = Phyloxml()
    phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
    project.add_phylogeny(phylo)
    with open(outfilexml, 'w') as xml_handle:
        project.export(xml_handle)
def main(argv):
    """Plot a neighbor-joining tree of the training organisms with abundance bubbles.

    Command line options (``argv`` excludes the program name):
        -i  file holding the x vector, one float per line
        -D  directory holding Taxonomy.txt, FileNames.txt,
            CommonKmerMatrix-30mers.h5 and CommonKmerMatrix-50mers.h5
        -l / -n  accepted flags (label leaves / label internal nodes); they
            are parsed but not used by this plot
        -r  draw a rectangular instead of a circular tree
        -t  append the taxonomy entry to every leaf name
        -o  output image, -x output PhyloXML file, -w image width in mm
        -h  print the usage line and exit

    The x vector is read as consecutive groups of ``len(FileNames)`` entries:
    the first group belongs to the organisms themselves and group ``k``
    (k >= 1) to the k-th cutoff; nine cutoffs are used.

    Exits with status 2 on an unknown option and after printing the help.
    """
    usage = './PlotNJTree.py -i <InputCommonKmerXFile> -D <CommonKmerDataPath> -l <LabelLeavesFlag> -n <LabelInternalNodesFlag> -r <RectangularPlotFlag> -t <TaxonomicNamesOnLeavesFlag> -o <OutFile.png> -x <Outfile.xml> -w <Width>'
    input_file = ''
    title = 'Title'
    label_internal_nodes = False
    label_leaves = False
    out_file = ''
    width = 750
    out_file_xml = ''
    plot_rectangular = False
    common_kmer_data_path = ''
    taxonomic_names_on_leaves = False
    try:
        # "-h" takes no argument (it used to be declared as "h:", so a bare
        # "-h" was reported as an unknown option)
        opts, args = getopt.getopt(argv, "hi:lnrto:w:x:D:", ["Help=", "InputCommonKmerXFile=", "LabelLeaves=", "LabelInternalNodes=", "Rectangular=", "TaxonomicNamesOnLeaves=", "OutFile=", "Width=", "OutFileXML=", "CommonKmerDataPath="])
    except getopt.GetoptError:
        print('Unknown option, call using: ' + usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit(2)
        elif opt in ("-i", "--InputCommonKmerXFile"):
            input_file = arg
        elif opt in ("-l", "--LabelLeaves"):
            label_leaves = True
        elif opt in ("-n", "--LabelInternalNodes"):
            label_internal_nodes = True
        elif opt in ("-o", "--OutFile"):
            out_file = arg
        elif opt in ("-w", "--Width"):
            width = int(arg)
        elif opt in ("-x", "--OutFileXML"):
            out_file_xml = arg
        elif opt in ("-D", "--CommonKmerDataPath"):
            common_kmer_data_path = arg
        elif opt in ("-r", "--Rectangular"):
            plot_rectangular = True
        elif opt in ("-t", "--TaxonomicNamesOnLeaves"):
            taxonomic_names_on_leaves = True

    # Read in the x vector (blank lines are ignored). A real list is needed
    # because x is sliced and indexed below.
    with open(input_file, 'r') as fid:
        x = [float(line) for line in fid if line.strip()]

    # Read in the taxonomy: first whitespace-delimited token of every line,
    # with everything up to the first "_" (the taxID) erased
    taxonomy = list()
    with open(os.path.join(common_kmer_data_path, "Taxonomy.txt"), 'r') as fid:
        for line in fid:
            taxonomy.append('_'.join(line.split()[0].split("_")[1:]))

    # Read in the basis for the ckm matrices (base name of each listed path)
    x_file_names = list()
    with open(os.path.join(common_kmer_data_path, "FileNames.txt"), 'r') as fid:
        for line in fid:
            x_file_names.append(os.path.basename(line.strip()))

    # Read in the common kmer matrices
    def load_common_kmers(file_name):
        """Return the 'common_kmers' dataset of an HDF5 file as a float64 array."""
        with h5py.File(os.path.join(common_kmer_data_path, file_name), 'r') as f:
            return np.array(f['common_kmers'], dtype=np.float64)

    ckm30 = load_common_kmers('CommonKmerMatrix-30mers.h5')
    ckm50 = load_common_kmers('CommonKmerMatrix-50mers.h5')

    # Divide every column by the corresponding diagonal entry
    ckm30_norm = np.multiply(ckm30, 1 / np.diag(ckm30))
    ckm50_norm = np.multiply(ckm50, 1 / np.diag(ckm50))
    num_rows = ckm30_norm.shape[0]
    names = x_file_names

    # Lower triangular distance matrix (row i holds columns 0..i): one minus
    # the symmetrized normalized value, averaged over both kmer sizes
    matrix = [
        [.5 * (1 - .5 * ckm30_norm[i, j] - .5 * ckm30_norm[j, i])
         + .5 * (1 - .5 * ckm50_norm[i, j] - .5 * ckm50_norm[j, i])
         for j in range(i + 1)]
        for i in range(num_rows)
    ]

    # Construct the tree. Note I could use RapidNJ here, but a few tests have
    # shown that the trees that RapidNJ creates are rubbish.
    dm = _DistanceMatrix(names, matrix)
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm)
    t = Tree(tree.format('newick'), format=1)

    # Now I will put internal nodes in a certain phylogenetic distance
    # between the root and a given node.

    # Function to insert a node at a given distance
    def insert_node(t, name_to_insert, insert_above, dist_along):
        """Split the branch above ``insert_above``, ``dist_along`` below its parent."""
        insert_at_node = t.search_nodes(name=insert_above)[0]
        parent = (t & insert_above).up
        orig_branch_length = t.get_distance(insert_at_node, parent)
        if orig_branch_length < dist_along:
            raise ValueError("error: dist_along larger than orig_branch_length")
        removed_node = insert_at_node.detach()
        removed_node.dist = orig_branch_length - dist_along
        added_node = parent.add_child(name=name_to_insert, dist=dist_along)
        added_node.add_child(removed_node)

    # Function to insert a node some % along a branch
    def insert_hyp_node(t, leaf_name, percent):
        """Insert a node at ``percent`` of the root-to-leaf distance of ``leaf_name``."""
        total_dist = t.get_distance(t.name, leaf_name)
        percent_dist = percent * total_dist
        child_node = (t & leaf_name)
        ancestor_node = (t & child_node.name).up
        # Climb towards the root until the ancestor is no further from the
        # root than the target distance
        while t.get_distance(t.name, ancestor_node) > percent_dist:
            child_node = ancestor_node
            ancestor_node = (t & child_node.name).up
        insert_node(t, leaf_name + "_" + str(percent), child_node.name,
                    percent_dist - t.get_distance(t.name, ancestor_node))

    # Insert hypothetical nodes
    hyp_node_names = dict()
    cutoffs = [.9, .8, .7, .6, .5, .4, .3, .2, .1]
    cutoffs = [y**1.5 for y in cutoffs]
    for i in range(len(x_file_names)):
        # Entries of x that belong to organism i: position 0 is the organism
        # itself, position j (j >= 1) belongs to cutoffs[j - 1]
        xi = x[i:len(x):len(x_file_names)]
        for j in range(1, len(cutoffs) + 1):
            if xi[j] > 0:
                insert_hyp_node(t, x_file_names[i], cutoffs[j - 1])
                # Remember base name, cutoff and cutoff index explicitly, in
                # case there are "_" in the file names
                hyp_node_names[x_file_names[i] + "_" + str(cutoffs[j - 1])] = [x_file_names[i], cutoffs[j - 1], j - 1]

    # Now put the bubbles on the nodes
    def layout(node):
        """ETE layout callback: attach the abundance bubble (and leaf label)."""
        if node.is_leaf():
            if node.name in x_file_names:
                # make reconstructed bubble
                size = x[x_file_names.index(node.name)]
                F = CircleFace(radius=500 * math.sqrt(size), color="RoyalBlue", style="sphere")
                F.border.width = None
                F.opacity = 0.6
                faces.add_face_to_node(F, node, 0, position="branch-right")
                if taxonomic_names_on_leaves:
                    nameFace = AttrFace("name", fsize=25, fgcolor='black', text_suffix="_" + taxonomy[x_file_names.index(node.name)])
                else:
                    nameFace = AttrFace("name", fsize=25, fgcolor='black')
                faces.add_face_to_node(nameFace, node, 0, position="branch-right")
        elif node.name in hyp_node_names:
            # Otherwise it's a hypothetical node, just use recon x
            node_base_name = hyp_node_names[node.name][0]
            if node_base_name in x_file_names:
                idx = hyp_node_names[node.name][2]
                size = x[x_file_names.index(node_base_name) + (idx + 1) * len(x_file_names)]
                F = CircleFace(radius=500 * math.sqrt(size), color="RoyalBlue", style="sphere")
                F.border.width = None
                F.opacity = 0.6
                faces.add_face_to_node(F, node, 0, position="branch-right")

    ts = TreeStyle()
    ts.layout_fn = layout
    ts.mode = "r" if plot_rectangular else "c"
    ts.show_leaf_name = False
    ts.min_leaf_separation = 50

    # Export the tree to an image
    t.render(out_file, w=width, units="mm", tree_style=ts)

    # Export the xml file; the ``with`` block closes the handle so the
    # document is completely flushed to disk
    project = Phyloxml()
    phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
    phylo.phyloxml_phylogeny.set_name(title)
    project.add_phylogeny(phylo)
    with open(out_file_xml, 'w') as xml_handle:
        project.export(xml_handle)
# /-iajom # /---| # | \-wiszh #----| # | /-xrygw # \---| # | /-gjlwx # \---| # \-ijvnk # Trees can be operated as normal ETE trees phylo.show() # Export the project as phyloXML format project.export() # <phy:Phyloxml xmlns:phy="http://www.phyloxml.org/1.10/phyloxml.xsd"> # <phy:phylogeny> # <phy:name>test_tree</phy:name> # <phy:clade> # <phy:name>NoName</phy:name> # <phy:branch_length>0.000000e+00</phy:branch_length> # <phy:confidence type="branch_support">1.0</phy:confidence> # <phy:clade> # <phy:name>NoName</phy:name> # <phy:branch_length>1.665083e-01</phy:branch_length> # <phy:confidence type="branch_support">0.938507980435</phy:confidence> # <phy:clade> # <phy:name>NoName</phy:name> # <phy:branch_length>1.366655e-01</phy:branch_length>
def MakePlot(x, org_names, ckm30, ckm50, outgroup, outfile, outfilexml, sum_x):
    """Draw a neighbor-joining tree with abundance bubbles and export it.

    A distance matrix is derived from the column-normalized 30-mer and 50-mer
    common kmer matrices, a neighbor-joining tree is built from it, extra
    ("hypothetical") nodes are inserted along the branches wherever ``x`` is
    positive, and the tree is rendered with one circle per populated node
    (circle radius is proportional to the square root of the normalized
    entry of ``x``). The final tree is also exported as PhyloXML.

    Parameters:
        x: sequence of abundances. It is read as consecutive groups of
            ``len(org_names)`` entries: the first group belongs to the
            organisms themselves, group ``k`` (k >= 1) to the k-th cutoff.
            Nine cutoffs are used, so at least ten groups are required.
            The values are normalized to sum to one before use.
        org_names: list of organism names, in the row order of the matrices.
            NOTE: duplicated names are made unique IN PLACE (all but the last
            occurrence get a ``_1``, ``_2``, ... suffix), so the caller's
            list is modified.
        ckm30, ckm50: square numpy arrays of common kmer counts (indexed
            ``[i, j]`` and normalized by their diagonal).
        outgroup: name of the leaf used as outgroup; if it is not one of the
            names a warning is printed and no outgroup is set.
        outfile: path passed to ``Tree.render`` for the image.
        outfilexml: path of the PhyloXML file to write.
        sum_x: total abundance; the first 8 characters of ``str(sum_x)`` are
            shown in the legend.

    Raises:
        ValueError: if a hypothetical node would have to be placed further
            along a branch than the branch is long.
    """
    # Make sure names are unique. ``names`` is deliberately the SAME list
    # object as ``org_names`` so every later lookup by name (distance table,
    # tree leaves, bubble sizes) sees the de-duplicated names.
    names = org_names
    for name in names:
        if names.count(name) > 1:
            temp_name = name
            i = 1
            # Don't change the last one, just to make sure we don't conflict
            # with the outgroup
            for dummy in range(0, names.count(name) - 1):
                names[names.index(temp_name)] = temp_name + "_" + str(i)
                i = i + 1

    # Normalize the x vector. The total is computed once (the original
    # re-evaluated sum(x) for every element) and the result is a real list
    # so that it can be sliced and indexed below.
    total = sum(x)
    x = [y / total for y in x]

    # Divide every column by the corresponding diagonal entry
    ckm30_norm = np.multiply(ckm30, 1 / np.diag(ckm30))
    ckm50_norm = np.multiply(ckm50, 1 / np.diag(ckm50))
    num_rows = ckm30_norm.shape[0]

    # Lower triangular distance matrix (row i holds columns 0..i): one minus
    # the symmetrized normalized value, averaged over both kmer sizes
    matrix = [
        [.5 * (1 - .5 * ckm30_norm[i, j] - .5 * ckm30_norm[j, i])
         + .5 * (1 - .5 * ckm50_norm[i, j] - .5 * ckm50_norm[j, i])
         for j in range(i + 1)]
        for i in range(num_rows)
    ]

    # Make the list of distances (ave of the two ckm matrices): for each
    # organism, the symmetrized average of the two normalized matrices
    # against every organism
    ckm_ave_train = .5 * ckm30_norm + .5 * ckm50_norm
    ckm_ave_train_dist = dict()
    for i, org_name in enumerate(org_names):
        ckm_ave_train_dist[org_name] = [
            .5 * ckm_ave_train[i, j] + .5 * ckm_ave_train[j, i]
            for j in range(len(org_names))
        ]

    # Construct the tree. Note I could use RapidNJ here, but a few tests have
    # shown that the trees that RapidNJ creates are rubbish.
    dm = _DistanceMatrix(names, matrix)
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm)
    t = Tree(tree.format('newick'), format=1)

    # Now I will put internal nodes in a certain phylogenetic distance
    # between the root and a given node.

    # Function to insert a node at a given distance
    def insert_node(t, name_to_insert, insert_above, dist_along):
        """Split the branch above ``insert_above``, ``dist_along`` below its parent."""
        insert_at_node = t.search_nodes(name=insert_above)[0]
        parent = (t & insert_above).up
        orig_branch_length = t.get_distance(insert_at_node, parent)
        if orig_branch_length < dist_along:
            raise ValueError(
                "error: dist_along larger than orig_branch_length in PlotPackage.py"
            )
        removed_node = insert_at_node.detach()
        removed_node.dist = orig_branch_length - dist_along
        added_node = parent.add_child(name=name_to_insert, dist=dist_along)
        added_node.add_child(removed_node)

    # Function to insert a node some % along a branch, taking into account
    # the ckm distances and nodes already created in the NJ tree (and what
    # distance their descendants are from everyone else)
    def insert_hyp_node(t, leaf_name, percent, ckm_ave_train_dist, org_names):
        """Insert the node ``leaf_name + "_" + str(percent)`` above ``leaf_name``."""
        # A list (not a lazy map object) because .index() is used below
        dists = [abs(y - percent) for y in ckm_ave_train_dist[leaf_name]]
        # Intentionally left empty: collecting all organisms within 0.05 of
        # the given percent was disabled by the original author.
        nearby_indicies = list()
        if nearby_indicies:
            # Index through nearby_indicies (the original used the loop
            # counter itself, which would have picked the wrong organisms)
            nearby_names = [org_names[k] for k in nearby_indicies]
        else:
            # If there are no nearby indicies, add the closest organism to
            # the given percent
            nearby_names = [org_names[dists.index(min(dists))]]
        mean_dist = np.mean([
            ckm_ave_train_dist[leaf_name][org_names.index(y)]
            for y in nearby_names
        ])
        nearby_names.append(leaf_name)
        LCA = t.get_common_ancestor(nearby_names)
        LCA_to_leaf_dist = t.get_distance(LCA, leaf_name)
        # divide the dist to the right/left of the LCA node by the number of
        # percentage points in there
        if LCA.name == t.name:
            percent_dist = percent * LCA_to_leaf_dist
            if mean_dist <= percent:
                child_node = (t & leaf_name)
            else:
                # This means "go up from root" in the direction of the
                # nearest guy
                child_node = (t & nearby_names[0])
            ancestor_node = (t & child_node.name).up
        elif mean_dist <= percent:
            percent_dist = t.get_distance(LCA) + abs(percent - mean_dist) * (
                LCA_to_leaf_dist) / (1 - mean_dist)
            child_node = (t & leaf_name)
            ancestor_node = (t & child_node.name).up
        else:
            percent_dist = t.get_distance(LCA) - abs(percent - mean_dist) * (
                t.get_distance(LCA)) / (mean_dist)
            child_node = (t & leaf_name)
            ancestor_node = (t & child_node.name).up
        # Climb towards the root until the ancestor is no further from the
        # root than the target distance
        while t.get_distance(t.name, ancestor_node) > percent_dist:
            child_node = ancestor_node
            ancestor_node = (t & child_node.name).up
        insert_node(t, leaf_name + "_" + str(percent), child_node.name,
                    percent_dist - t.get_distance(t.name, ancestor_node))

    # Set outgroup
    if outgroup in names:
        t.set_outgroup(t & outgroup)
    else:
        print("WARNING: the chosen outgroup " + outgroup +
              " is not in the given taxonomy: ")
        print(names)
        print(
            "Proceeding without setting an outgroup. This may cause results to be uninterpretable."
        )

    # Insert hypothetical nodes
    hyp_node_names = dict()
    cutoffs = [.9, .8, .7, .6, .5, .4, .3, .2, .1]
    cutoffs = [
        -.5141 * (val**3) + 1.0932 * (val**2) + 0.3824 * val for val in cutoffs
    ]
    for i in range(len(org_names)):
        # Entries of x that belong to organism i: position 0 is the organism
        # itself, position j (j >= 1) belongs to cutoffs[j - 1]
        xi = x[i:len(x):len(org_names)]
        for j in range(1, len(cutoffs) + 1):
            if xi[j] > 0:
                insert_hyp_node(t, org_names[i], cutoffs[j - 1],
                                ckm_ave_train_dist, org_names)
                # Remember base name, cutoff and cutoff index explicitly, in
                # case there are "_" in the file names
                hyp_node_names[org_names[i] + "_" + str(cutoffs[j - 1])] = [
                    org_names[i], cutoffs[j - 1], j - 1
                ]

    size_factor = 250
    font_size = 55

    # Now put the bubbles on the nodes
    def layout(node):
        """ETE layout callback: style the branch and attach the abundance bubble."""
        node_style = NodeStyle()
        node_style["hz_line_width"] = 10
        node_style["vt_line_width"] = 10
        node.set_style(node_style)
        if node.is_leaf():
            if node.name in org_names:
                # make reconstructed bubble
                size = x[org_names.index(node.name)]
                F = CircleFace(radius=size_factor * math.sqrt(size),
                               color="RoyalBlue",
                               style="sphere")
                F.border.width = None
                F.opacity = 0.6
                faces.add_face_to_node(F, node, 0, position="branch-right")
                # Denote that this was a training organism
                nameFace = AttrFace("name", fsize=font_size, fgcolor='black')
                faces.add_face_to_node(nameFace,
                                       node,
                                       0,
                                       position="branch-right")
        elif node.name in hyp_node_names:
            # Otherwise it's a hypothetical node, just use recon x
            node_base_name = hyp_node_names[node.name][0]
            if node_base_name in org_names:
                idx = hyp_node_names[node.name][2]
                size = x[org_names.index(node_base_name) +
                         (idx + 1) * len(org_names)]
                F = CircleFace(radius=size_factor * math.sqrt(size),
                               color="RoyalBlue",
                               style="sphere")
                F.border.width = None
                F.opacity = 0.6
                faces.add_face_to_node(F, node, 0, position="branch-right")
                # The names of hypothetical nodes are intentionally not drawn

    ts = TreeStyle()
    ts.layout_fn = layout
    ts.mode = "r"
    ts.scale = 2 * 1000
    ts.show_leaf_name = False
    ts.min_leaf_separation = 50
    F = CircleFace(radius=.87 * size_factor, color="RoyalBlue", style="sphere")
    F.border.width = None
    F.opacity = 0.6
    ts.legend.add_face(F, 0)
    ts.legend.add_face(
        TextFace(" Inferred relative abundance",
                 fsize=1.5 * font_size,
                 fgcolor="Blue"), 1)
    ts.legend.add_face(
        TextFace(" Total absolute abundance depicted " + str(sum_x)[0:8],
                 fsize=1.5 * font_size,
                 fgcolor="Black"), 1)
    ts.legend_position = 4
    t.render(outfile, w=550, units="mm", tree_style=ts)

    # Render the XML file; the ``with`` block closes the handle so the
    # document is completely flushed to disk
    project = Phyloxml()
    phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
    project.add_phylogeny(phylo)
    with open(outfilexml, 'w') as xml_handle:
        project.export(xml_handle)
def main(argv):
    """Plot a taxonomy tree with one bubble per taxon from a profile file.

    Command line options (``argv`` excludes the program name):
        -i  input profile file
        -t  title shown above the tree and stored in the PhyloXML document
        -l  show leaf names, -n  label every node with its name
        -o  output image, -x  output PhyloXML file, -w  image width in mm
        -h  print the usage line and exit

    Input format as read by this function: lines starting with '#' or '@'
    and blank lines are skipped; in every other line the 4th
    whitespace-delimited column is a '|'-separated taxonomic path and the
    last column is the weight of the last name of that path. Empty path
    elements ("||") are replaced by the placeholder NONAME.

    Exits with status 2 on an unknown option and after printing the help.
    """
    input_file = ''
    title = 'Title'
    label_internal_nodes = False
    label_leaves = False
    out_file = ''
    width = 750
    out_file_xml = ''
    try:
        # "-h" takes no argument (it used to be declared as "h:", so a bare
        # "-h" was reported as an unknown option)
        opts, args = getopt.getopt(argv, "hi:t:lno:w:x:", ["Help=", "InputFile=", "Title=", "LabelLeaves=", "LabelInternalNodes=", "OutFile=", "Width=", "OutFileXML="])
    except getopt.GetoptError:
        print('Unknown option, call using: ./PlotTree.py -i <InputCAMIFile> -t <Title> -l <LabelLeavesFlag> -n <LabelInternalNodesFlag> -o <OutFile.png> -x <Outfile.xml> -w <Width>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('./PlotTree.py -i <InputCAMIFile> -t <Title> -l <LabelLeavesFlag> -n <LabelInternalNodesFlag> -o <OutFile> -x <OutFile.xml> -w <Width>')
            sys.exit(2)
        elif opt in ("-i", "--InputFile"):
            input_file = arg
        elif opt in ("-t", "--Title"):
            title = arg
        elif opt in ("-l", "--LabelLeaves"):
            label_leaves = True
        elif opt in ("-n", "--LabelInternalNodes"):
            label_internal_nodes = True
        elif opt in ("-o", "--OutFile"):
            out_file = arg
        elif opt in ("-w", "--Width"):
            width = int(arg)
        elif opt in ("-x", "--OutFileXML"):
            out_file_xml = arg

    # Read the common kmer profile
    ckm_tax_paths = []
    ckm_name_to_perc = dict()
    with open(input_file, 'r') as fid:
        profile_lines = fid.readlines()

    # Get the names and weights
    for line in profile_lines:
        # Put placeholders in for missing names like: "||" -> "|NONAME|".
        # Replacing one occurrence at a time also handles runs like "|||".
        while "||" in line:
            line = line.replace("||", "|NONAME|", 1)
        # Don't parse comments or blank lines (whitespace-only lines, e.g.
        # "\r\n", used to crash on the column lookup below)
        if line[0] == '#' or line[0] == '@' or not line.strip():
            continue
        columns = line.split()
        tax_path = columns[3]  # Get the names
        ckm_tax_paths.append(tax_path)
        ckm_name_to_perc[tax_path.split("|")[-1]] = columns[-1]  # Get the weights

    # Create the tree
    t = Tree()
    names_to_nodes = dict()
    for tax_path in ckm_tax_paths:
        split_tax_path = tax_path.split("|")
        if len(split_tax_path) == 1:
            # If len==1, then it's a superkingdom: connect directly to tree
            names_to_nodes[split_tax_path[0]] = t.add_child(name=split_tax_path[0])
        elif split_tax_path[-2] in names_to_nodes:
            # If the parent is already in the tree, add to tree
            names_to_nodes[split_tax_path[-1]] = names_to_nodes[split_tax_path[-2]].add_child(name=split_tax_path[-1])
        else:
            # Otherwise iterate up until we have something that is in the tree
            j = 2
            while split_tax_path[-j] == "NONAME":
                j = j + 1  # This skips over the NONAMES
            names_to_nodes[split_tax_path[-1]] = names_to_nodes[split_tax_path[-j]].add_child(name=split_tax_path[-1])

    def layout(node):
        """ETE layout callback: bubble sized by the node's weight, optional label."""
        if node.name in ckm_name_to_perc:
            ckm_perc = float(ckm_name_to_perc[node.name])
        else:
            ckm_perc = 0
        F = CircleFace(radius=3.14 * math.sqrt(ckm_perc), color="RoyalBlue", style="sphere")
        F.border.width = None
        F.opacity = 0.6
        faces.add_face_to_node(F, node, 0, position="branch-right")
        if label_internal_nodes:
            faces.add_face_to_node(TextFace(node.name, fsize=7), node, 0, position="branch-top")

    ts = TreeStyle()
    ts.layout_fn = layout
    ts.mode = "r"
    ts.show_leaf_name = label_leaves
    ts.min_leaf_separation = 50
    ts.title.add_face(TextFace(title, fsize=20), column=0)

    # Export the tree to an image
    t.render(out_file, w=width, units="mm", tree_style=ts)

    # Export the xml file; the ``with`` block closes the handle so the
    # document is completely flushed to disk
    project = Phyloxml()
    phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
    phylo.phyloxml_phylogeny.set_name(title)
    project.add_phylogeny(phylo)
    with open(out_file_xml, 'w') as xml_handle:
        project.export(xml_handle)
# Loads newick tree (the first command line argument is handed to
# PhyloxmlTree as its ``newick`` argument)
phylo = phyloxml.PhyloxmlTree(newick=sys.argv[1])

# Set basic tree info as a phyloxml phylogeny object.
# A root with at most two children is flagged as rooted.
phylo.phyloxml_phylogeny.set_name("test_tree")
phylo.phyloxml_phylogeny.set_rooted(
    "true" if len(phylo.children) <= 2 else "false")

# Add the tree to the phyloxml project
project.add_phylogeny(phylo)

# Export phyloxml document into an in-memory buffer so it can be patched
OUTPUT = StringIO()
project.export(OUTPUT)

# Some ad-hoc changes to the phyloxml formatted document to meet the schema
# definition: drop the "phy:" namespace prefix, strip the
# branch_length_attr attributes and replace the root tag by a schema-aware one
text = OUTPUT.getvalue()
text = text.replace("phy:", "")
text = re.sub('branch_length_attr="[^"]+"', "", text)
header = """
<phyloxml xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.phyloxml.org http://www.phyloxml.org/1.10/phyloxml.xsd" xmlns="http://www.phyloxml.org">
"""
text = re.sub('<Phyloxml[^>]+>', header, text)
text = text.replace('Phyloxml', 'phyloxml')

# Save result next to the input; the ``with`` block guarantees the file is
# flushed and closed even if the write fails
with open(sys.argv[1] + ".phyloxml", "w") as out_handle:
    out_handle.write(text)
def main(argv):
    """Plot a neighbor-joining tree of the training organisms with abundance bubbles.

    Command line options (``argv`` excludes the program name):
        -i  file holding the x vector, one float per line
        -D  directory holding Taxonomy.txt, FileNames.txt,
            CommonKmerMatrix-30mers.h5 and CommonKmerMatrix-50mers.h5
        -l / -n  accepted flags (label leaves / label internal nodes); they
            are parsed but not used by this plot
        -r  draw a rectangular instead of a circular tree
        -t  append the taxonomy entry to every leaf name
        -o  output image, -x output PhyloXML file, -w image width in mm
        -h  print the usage line and exit

    The x vector is read as consecutive groups of ``len(FileNames)`` entries:
    the first group belongs to the organisms themselves and group ``k``
    (k >= 1) to the k-th cutoff; nine cutoffs are used.

    Exits with status 2 on an unknown option and after printing the help.
    """
    usage = './PlotNJTree.py -i <InputCommonKmerXFile> -D <CommonKmerDataPath> -l <LabelLeavesFlag> -n <LabelInternalNodesFlag> -r <RectangularPlotFlag> -t <TaxonomicNamesOnLeavesFlag> -o <OutFile.png> -x <Outfile.xml> -w <Width>'
    input_file = ''
    title = 'Title'
    label_internal_nodes = False
    label_leaves = False
    out_file = ''
    width = 750
    out_file_xml = ''
    plot_rectangular = False
    common_kmer_data_path = ''
    taxonomic_names_on_leaves = False
    try:
        # "-h" takes no argument (it used to be declared as "h:", so a bare
        # "-h" was reported as an unknown option)
        opts, args = getopt.getopt(argv, "hi:lnrto:w:x:D:", [
            "Help=", "InputCommonKmerXFile=", "LabelLeaves=",
            "LabelInternalNodes=", "Rectangular=", "TaxonomicNamesOnLeaves=",
            "OutFile=", "Width=", "OutFileXML=", "CommonKmerDataPath="
        ])
    except getopt.GetoptError:
        print('Unknown option, call using: ' + usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit(2)
        elif opt in ("-i", "--InputCommonKmerXFile"):
            input_file = arg
        elif opt in ("-l", "--LabelLeaves"):
            label_leaves = True
        elif opt in ("-n", "--LabelInternalNodes"):
            label_internal_nodes = True
        elif opt in ("-o", "--OutFile"):
            out_file = arg
        elif opt in ("-w", "--Width"):
            width = int(arg)
        elif opt in ("-x", "--OutFileXML"):
            out_file_xml = arg
        elif opt in ("-D", "--CommonKmerDataPath"):
            common_kmer_data_path = arg
        elif opt in ("-r", "--Rectangular"):
            plot_rectangular = True
        elif opt in ("-t", "--TaxonomicNamesOnLeaves"):
            taxonomic_names_on_leaves = True

    # Read in the x vector (blank lines are ignored). A real list is needed
    # because x is sliced and indexed below.
    with open(input_file, 'r') as fid:
        x = [float(line) for line in fid if line.strip()]

    # Read in the taxonomy: first whitespace-delimited token of every line,
    # with everything up to the first "_" (the taxID) erased
    taxonomy = list()
    with open(os.path.join(common_kmer_data_path, "Taxonomy.txt"), 'r') as fid:
        for line in fid:
            taxonomy.append('_'.join(line.split()[0].split("_")[1:]))

    # Read in the basis for the ckm matrices (base name of each listed path)
    x_file_names = list()
    with open(os.path.join(common_kmer_data_path, "FileNames.txt"), 'r') as fid:
        for line in fid:
            x_file_names.append(os.path.basename(line.strip()))

    # Read in the common kmer matrices
    def load_common_kmers(file_name):
        """Return the 'common_kmers' dataset of an HDF5 file as a float64 array."""
        with h5py.File(os.path.join(common_kmer_data_path, file_name), 'r') as f:
            return np.array(f['common_kmers'], dtype=np.float64)

    ckm30 = load_common_kmers('CommonKmerMatrix-30mers.h5')
    ckm50 = load_common_kmers('CommonKmerMatrix-50mers.h5')

    # Divide every column by the corresponding diagonal entry
    ckm30_norm = np.multiply(ckm30, 1 / np.diag(ckm30))
    ckm50_norm = np.multiply(ckm50, 1 / np.diag(ckm50))
    num_rows = ckm30_norm.shape[0]
    names = x_file_names

    # Lower triangular distance matrix (row i holds columns 0..i): one minus
    # the symmetrized normalized value, averaged over both kmer sizes
    matrix = [
        [.5 * (1 - .5 * ckm30_norm[i, j] - .5 * ckm30_norm[j, i])
         + .5 * (1 - .5 * ckm50_norm[i, j] - .5 * ckm50_norm[j, i])
         for j in range(i + 1)]
        for i in range(num_rows)
    ]

    # Construct the tree. Note I could use RapidNJ here, but a few tests have
    # shown that the trees that RapidNJ creates are rubbish.
    dm = _DistanceMatrix(names, matrix)
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm)
    t = Tree(tree.format('newick'), format=1)

    # Now I will put internal nodes in a certain phylogenetic distance
    # between the root and a given node.

    # Function to insert a node at a given distance
    def insert_node(t, name_to_insert, insert_above, dist_along):
        """Split the branch above ``insert_above``, ``dist_along`` below its parent."""
        insert_at_node = t.search_nodes(name=insert_above)[0]
        parent = (t & insert_above).up
        orig_branch_length = t.get_distance(insert_at_node, parent)
        if orig_branch_length < dist_along:
            raise ValueError(
                "error: dist_along larger than orig_branch_length")
        removed_node = insert_at_node.detach()
        removed_node.dist = orig_branch_length - dist_along
        added_node = parent.add_child(name=name_to_insert, dist=dist_along)
        added_node.add_child(removed_node)

    # Function to insert a node some % along a branch
    def insert_hyp_node(t, leaf_name, percent):
        """Insert a node at ``percent`` of the root-to-leaf distance of ``leaf_name``."""
        total_dist = t.get_distance(t.name, leaf_name)
        percent_dist = percent * total_dist
        child_node = (t & leaf_name)
        ancestor_node = (t & child_node.name).up
        # Climb towards the root until the ancestor is no further from the
        # root than the target distance
        while t.get_distance(t.name, ancestor_node) > percent_dist:
            child_node = ancestor_node
            ancestor_node = (t & child_node.name).up
        insert_node(t, leaf_name + "_" + str(percent), child_node.name,
                    percent_dist - t.get_distance(t.name, ancestor_node))

    # Insert hypothetical nodes
    hyp_node_names = dict()
    cutoffs = [.9, .8, .7, .6, .5, .4, .3, .2, .1]
    cutoffs = [y**1.5 for y in cutoffs]
    for i in range(len(x_file_names)):
        # Entries of x that belong to organism i: position 0 is the organism
        # itself, position j (j >= 1) belongs to cutoffs[j - 1]
        xi = x[i:len(x):len(x_file_names)]
        for j in range(1, len(cutoffs) + 1):
            if xi[j] > 0:
                insert_hyp_node(t, x_file_names[i], cutoffs[j - 1])
                # Remember base name, cutoff and cutoff index explicitly, in
                # case there are "_" in the file names
                hyp_node_names[x_file_names[i] + "_" + str(cutoffs[j - 1])] = [
                    x_file_names[i], cutoffs[j - 1], j - 1
                ]

    # Now put the bubbles on the nodes
    def layout(node):
        """ETE layout callback: attach the abundance bubble (and leaf label)."""
        if node.is_leaf():
            if node.name in x_file_names:
                # make reconstructed bubble
                size = x[x_file_names.index(node.name)]
                F = CircleFace(radius=500 * math.sqrt(size),
                               color="RoyalBlue",
                               style="sphere")
                F.border.width = None
                F.opacity = 0.6
                faces.add_face_to_node(F, node, 0, position="branch-right")
                if taxonomic_names_on_leaves:
                    nameFace = AttrFace(
                        "name",
                        fsize=25,
                        fgcolor='black',
                        text_suffix="_" +
                        taxonomy[x_file_names.index(node.name)])
                else:
                    nameFace = AttrFace("name", fsize=25, fgcolor='black')
                faces.add_face_to_node(nameFace,
                                       node,
                                       0,
                                       position="branch-right")
        elif node.name in hyp_node_names:
            # Otherwise it's a hypothetical node, just use recon x
            node_base_name = hyp_node_names[node.name][0]
            if node_base_name in x_file_names:
                idx = hyp_node_names[node.name][2]
                size = x[x_file_names.index(node_base_name) +
                         (idx + 1) * len(x_file_names)]
                F = CircleFace(radius=500 * math.sqrt(size),
                               color="RoyalBlue",
                               style="sphere")
                F.border.width = None
                F.opacity = 0.6
                faces.add_face_to_node(F, node, 0, position="branch-right")

    ts = TreeStyle()
    ts.layout_fn = layout
    ts.mode = "r" if plot_rectangular else "c"
    ts.show_leaf_name = False
    ts.min_leaf_separation = 50

    # Export the tree to an image
    t.render(out_file, w=width, units="mm", tree_style=ts)

    # Export the xml file; the ``with`` block closes the handle so the
    # document is completely flushed to disk
    project = Phyloxml()
    phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
    phylo.phyloxml_phylogeny.set_name(title)
    project.add_phylogeny(phylo)
    with open(out_file_xml, 'w') as xml_handle:
        project.export(xml_handle)