Exemple #1
0
def read_phyloxml(input_file):
    """
    Load a phyloXML file and hand back the phylogenies it contains.

    :param str input_file: path to the phyloXML file
    :return: the result of ``Phyloxml.get_phylogeny()`` for the loaded
        document (the phylogenies parsed from the file)
    """
    from ete2 import Phyloxml

    phyloxml_doc = Phyloxml()
    phyloxml_doc.build_from_file(input_file)
    phylogenies = phyloxml_doc.get_phylogeny()
    return phylogenies
Exemple #2
0
 def getRelevantEdges( adjGraph, t1, t2 ):
   """
   Restrict a graph to the extant (non-'LOST') leaves of two phyloxml trees.

   :param adjGraph: graph whose edges() yields (u, v) pairs
   :param t1: path of the first phyloxml tree file
   :param t2: path of the second phyloxml tree file
   :return: (newGraph, crossValidationEdges) where newGraph is an nx.Graph
            holding every extant leaf of both trees plus the edges of
            adjGraph whose two endpoints are extant leaves, and
            crossValidationEdges lists the edges of adjGraph joining a leaf
            of the first tree with a leaf of the second
   """
   pxml = Phyloxml()
   pxml.build_from_file(t1)
   pxml.build_from_file(t2)
   # Leaves whose name contains 'LOST' are not extant and are dropped
   la = [ name for name in pxml.phylogeny[0].get_leaf_names() if 'LOST' not in name ]
   lb = [ name for name in pxml.phylogeny[1].get_leaf_names() if 'LOST' not in name ]

   # Sets give constant-time membership tests while scanning the edges
   inA = set(la)
   inB = set(lb)
   extant = inA | inB

   edges = adjGraph.edges()
   crossValidationEdges = [ e for e in edges
                            if (e[0] in inA and e[1] in inB) or (e[1] in inA and e[0] in inB) ]
   relevantEdges = [ e for e in edges if e[0] in extant and e[1] in extant ]

   newGraph = nx.Graph()
   newGraph.add_nodes_from( la + lb )
   newGraph.add_edges_from( relevantEdges )
   return newGraph, crossValidationEdges
def getTreeFromPhyloxml(xml, saveToFile="default.xml", delFile=True):
	"""
	Read phylogeny tree(s) from a phyloxml string.

	The string is first written to saveToFile because the parser is fed
	from a file path.

	xml        -- phyloxml document as a string
	saveToFile -- path of the scratch file the document is written to
	delFile    -- if true, the scratch file is removed afterwards, even
	              when parsing or conversion fails

	Returns a single TreeClass object when the document holds exactly one
	phylogeny, otherwise a list of TreeClass objects.
	"""
	project = Phyloxml()
	# The context manager closes (and flushes) the handle before parsing,
	# also when write() raises
	with open(saveToFile, "w+") as fo:
		fo.write(xml)
	try:
		project.build_from_file(saveToFile)
		treeList = [TreeClass.import_from_PhyloxmlTree(tree)
					for tree in project.get_phylogeny()]
	finally:
		# Do not leave the scratch file behind on a parse error
		if delFile:
			os.remove(saveToFile)

	if len(treeList)==1:
		return treeList[0]
	return treeList
def readScoreFile(fname, noself, randomize=False):
  """
  Rank the held-out ("target") edge among the candidate edges scored in a file.

  The file name carries its own metadata: the second-to-last '@'-separated
  field of fname holds the two target proteins separated by '#', and the
  first two '@'-separated fields of the base name are the orthology groups.

  :param str fname: path of the score file; every line is whitespace
                    separated with the two endpoints in columns 0 and 1 and
                    the score in column 3
  :param bool noself: if true, self loops (u == v) are never considered
  :param bool randomize: if true, every score read is replaced by a
                         uniform random value in [0, 1]
  :return: (None, None) when either tree file is missing; otherwise
           (rank, n) with rank the 0-based position of the target edge when
           the scored edges are sorted by decreasing score and
           n = float(number of scored edges - 1)
  :raises ValueError: if the target edge is not among the scored edges
  """
  import random

  # The strings naming the proteins whose interaction was removed in
  # this input, upper-cased and turned into an edge
  tstring = fname.split('@')[-2].split("#")
  tedge = (tstring[0].upper(), tstring[1].upper())

  # Locate the phylogenies of the two orthology groups
  treeDir = "../../Parana2Data/HerpesPPIs/trees/rearranged"
  baseFile = fname.split('/')[-1]
  orthoGroup1 = baseFile.split('@')[0]
  orthoGroup2 = baseFile.split('@')[1]
  treeName = "{0}/{1}.xml.rooting.0.ntg.reconciled.0.ntg.rearrange.0.ntg"
  t1 = treeName.format(treeDir, orthoGroup1)
  t2 = treeName.format(treeDir, orthoGroup2)

  if not (os.path.exists(t1) and os.path.exists(t2)):
    return None, None

  pxml = Phyloxml()
  pxml.build_from_file(t1)
  pxml.build_from_file(t2)

  def extantLeaves(tree):
    # upper-cased leaf names, minus those marked as lost
    names = [ n.upper() for n in tree.get_leaf_names() ]
    return [ n for n in names if 'LOST' not in n ]

  la = extantLeaves(pxml.phylogeny[0])
  lb = extantLeaves(pxml.phylogeny[1])

  # The set of all possible interactions among the two homology groups
  if orthoGroup1 == orthoGroup2:
    possibleEndpoints = combinationsWithSelf(la)
  else:
    possibleEndpoints = combinationsWithSelf(la) + list(itertools.product(la,lb)) + combinationsWithSelf(lb)

  # Only protein pairs sharing the suffix after the last '_' (the species
  # tag, per the original comment) represent a potential edge
  allPossibleEdges = [ e for e in possibleEndpoints
                       if e[0].split('_')[-1] == e[1].split('_')[-1] ]

  # Potential edges absent from the input network, plus the target edge
  nonPresentEdgesMinusTarget = list(set( e for e in allPossibleEdges
                                         if not ExtantNetwork.has_edge(e[0],e[1]) ))
  nonPresentEdges = nonPresentEdgesMinusTarget + [tedge]
  # Immutable copy for constant-time lookups while reading the file
  candidateEdges = frozenset(nonPresentEdges)

  # Is u,v one of the edges we wish to consider?
  def inPotentialEdges(u,v):
    if noself and u == v:
      return False
    return (u,v) in candidateEdges or (v,u) in candidateEdges

  # Does the scored edge se join p1 and p2 (in either orientation)?
  def isEdge( se, p1, p2 ):
    return (se.p1 == p1 and se.p2 == p2) or (se.p1 == p2 and se.p2 == p1)

  scoredEdges = []

  # Candidates for which the file has not (yet) provided a score
  nonEdgesWithProb = set( nonPresentEdges )
  with open(fname,'r') as ifile:
    for l in ifile:
      toks = l.rstrip().split()
      p1 = toks[0].upper()
      p2 = toks[1].upper()
      s = float(toks[3])
      if inPotentialEdges(p1,p2):
        if randomize: s = random.uniform(0.0,1.0)
        scoredEdges.append( ScoredEdge(p1,p2,s) )
        nonEdgesWithProb.discard((p1,p2))
        nonEdgesWithProb.discard((p2,p1))

  # NOTE(review): nonEdgesWithProb only ever shrinks from the candidate set,
  # so this difference is always empty and the loop body never runs.  The
  # intent was presumably to give candidates missing from the file a default
  # score (i.e. iterate over nonEdgesWithProb itself); left unchanged because
  # that would alter the reported ranks -- confirm with the author.
  for u,v in (nonEdgesWithProb - candidateEdges):
    s = random.uniform(0.0,1.0) if randomize else 0.0
    scoredEdges.append(ScoredEdge(u, v, s))

  # Shuffling before the (stable) sort breaks score ties at random
  random.shuffle(scoredEdges)
  scoredEdges = list(enumerate(sorted( scoredEdges, key=lambda e: e.score, reverse=True )))

  res = [ x for x in scoredEdges if isEdge(x[1], tedge[0], tedge[1]) ]

  if not res:
    # The original raised a bare string, which is itself a TypeError
    raise ValueError("target edge {0} not found among the scored edges of {1}".format(tedge, fname))

  print(res)
  return (res[0][0], float(len(scoredEdges)-1))
def main ():

    global options, args

    if options.verbose: print time.asctime(),
    if options.verbose: print "load and parse newick file"
    # TODO: read newick file
    tree = Phylo.read(args[0],'newick') 
    # TODO: convert newick to phyloxml
    treeXML = StringIO()
    Phylo.write(tree,treeXML,'phyloxml')
    # TODO: read phyloxml as ete object
    hPhylotree = Phyloxml()
    with tempinput(treeXML.getvalue()) as tempfilename:   
        hPhylotree.build_from_file(tempfilename)
    # TODO: get the tree
    tree2 = hPhylotree.get_phylogeny()[0]
       
    if options.verbose: print time.asctime(),
    if options.verbose: print "load and parse taxonomy file"
    # TODO: read taxonomy file
    tax = get_taxonomy(args[1])
    # TODO: refine taxonomy annotation of internal node
    tree2 = add_taxonomy_for_internal_branch(tree2,tax)
    # TODO: refine tree node label
    #tree2 = add_node_label(tree2,tax)
    for node in tree2.traverse():
        if not node.is_leaf():
            label = "null"
            for t in ['kingdom','phylum','class','order','family','genus','species']:
                if len(tax[node.id][t])>3:
                    label = tax[node.id][t]
            node.add_feature("mylabel",label)

    # TODO: add node depth
    depth={}
    if options.depth:
        with open(options.depth) as f:
            for line in f:
                (id,dep) = line.split()
                depth[id] = float(dep)
   
    # TODO: add color attribute
    if options.depth:
        for node in tree2.iter_leaves():
            if depth[node.id] >= 10 and depth[node.id] < 100:
                node.add_feature("color","#D8BFD8")
            elif depth[node.id] >= 100 and depth[node.id] < 1000:
                node.add_feature("color","#DDA0DD")
            elif depth[node.id] >= 1000 and depth[node.id] < 5000:
                node.add_feature("color","#EE82EE")
            elif depth[node.id] >= 5000:
                node.add_feature("color","#DA70D6")
            else:
                node.add_feature("color","#E6E6FA")
        
    # TODO: set tree style
    ts = TreeStyle()
    ts.show_leaf_name = False
    ts.layout_fn = tree_layout
    # TODO: show tree2
    #tree2.show(tree_style=ts)
    tree2.render(args[0]+".png",dpi=2048,tree_style=ts)
import sys
import re
from StringIO import StringIO

from ete2 import Phyloxml, phyloxml

# Start from an empty phyloxml document
project = Phyloxml()

# Build a phyloxml tree from the newick given as first command-line argument
phylo = phyloxml.PhyloxmlTree(newick=sys.argv[1])

# Basic phylogeny metadata: the tree is flagged as rooted when its root
# has at most two children
phylo.phyloxml_phylogeny.set_name("test_tree")
phylo.phyloxml_phylogeny.set_rooted("true" if len(phylo.children) <= 2 else "false")

# Register the tree in the phyloxml project
project.add_phylogeny(phylo)

# Serialise the document into an in-memory buffer
OUTPUT = StringIO()
project.export(OUTPUT)

# Ad-hoc clean-up so the document meets the schema definition: drop the
# "phy:" prefix and every branch_length_attr attribute
text = re.sub('branch_length_attr="[^"]+"', "", OUTPUT.getvalue().replace("phy:", ""))
header = """
Exemple #7
0
def main(argv):
	"""
	Command-line entry point: build a neighbor-joining tree from the common
	k-mer matrices, decorate it with abundance bubbles, render it to an image
	and export it as phyloXML.

	argv -- command-line arguments without the program name; the recognised
	        options are listed in the usage text below.

	Prints the usage text and exits with status 2 when -h/--Help is given or
	when an option cannot be parsed.
	"""
	input_file=''
	title='Title'
	label_internal_nodes = False
	label_leaves = False
	out_file=''
	width=750
	out_file_xml=''
	plot_rectangular = False
	common_kmer_data_path=''
	taxonomic_names_on_leaves = False
	usage = './PlotNJTree.py -i <InputCommonKmerXFile> -D <CommonKmerDataPath> -l <LabelLeavesFlag> -n <LabelInternalNodesFlag> -r <RectangularPlotFlag> -t <TaxonomicNamesOnLeavesFlag> -o <OutFile.png> -x <Outfile.xml> -w <Width>'
	try:
		# "h" (not "h:") so that a plain -h is accepted; previously -h demanded
		# an argument and a bare -h was reported as an unknown option
		opts, args = getopt.getopt(argv,"hi:lnrto:w:x:D:",["Help","InputCommonKmerXFile=","LabelLeaves=", "LabelInternalNodes=","Rectangular=", "TaxonomicNamesOnLeaves=", "OutFile=","Width=","OutFileXML=","CommonKmerDataPath="])
	except getopt.GetoptError:
		print('Unknown option, call using: ' + usage)
		sys.exit(2)
	for opt, arg in opts:
		if opt in ("-h", "--Help"):
			print(usage)
			sys.exit(2)
		elif opt in ("-i", "--InputCommonKmerXFile"):
			input_file = arg
		elif opt in ("-l", "--LabelLeaves"):
			label_leaves = True
		elif opt in ("-n","--LabelInternalNodes"):
			label_internal_nodes = True
		elif opt in ("-o", "--OutFile"):
			out_file = arg
		elif opt in ("-w", "--Width"):
			width = int(arg)
		elif opt in ("-x", "--OutFileXML"):
			out_file_xml = arg
		elif opt in ("-D", "--CommonKmerDataPath"):
			common_kmer_data_path = arg
		elif opt in ("-r", "--Rectangular"):
			plot_rectangular = True
		elif opt in ("-t", "--TaxonomicNamesOnLeaves"):
			taxonomic_names_on_leaves = True

	#Read in the x vector (one float per line)
	with open(input_file,'r') as fid:
		x = [float(line) for line in fid]

	#Read in the taxonomy
	taxonomy = list()
	with open(os.path.join(common_kmer_data_path,"Taxonomy.txt"),'r') as fid:
		for line in fid:
			#Keep the first whitespace-separated token, minus its leading "<taxID>_" part
			taxonomy.append('_'.join(line.split()[0].split("_")[1:]))

	#Read in the basis for the ckm matrices
	with open(os.path.join(common_kmer_data_path,"FileNames.txt"),'r') as fid:
		x_file_names = [os.path.basename(line.strip()) for line in fid]

	#Read in the common kmer matrices
	def load_ckm(file_name):
		with h5py.File(os.path.join(common_kmer_data_path,file_name),'r') as f:
			return np.array(f['common_kmers'],dtype=np.float64)

	ckm30 = load_ckm('CommonKmerMatrix-30mers.h5')
	ckm50 = load_ckm('CommonKmerMatrix-50mers.h5')
	ckm30_norm = np.multiply(ckm30,1/np.diag(ckm30))
	ckm50_norm = np.multiply(ckm50,1/np.diag(ckm50))
	num_rows = ckm30_norm.shape[0]
	names = x_file_names
	#Lower-triangular distance matrix: average over both k-mer sizes of the
	#symmetrised (1 - normalised common k-mer) values
	matrix=list()
	for i in range(num_rows):
		matrix.append([.5*(1-.5*ckm30_norm[i,j]-.5*ckm30_norm[j,i])+.5*(1-.5*ckm50_norm[i,j]-.5*ckm50_norm[j,i]) for j in range(i+1)])

	#Construct the tree. Note I could use RapidNJ here, but a few tests have shown that the trees that RapidNJ creates are rubbish.
	dm = _DistanceMatrix(names, matrix)
	constructor = DistanceTreeConstructor()
	tree = constructor.nj(dm)
	t=Tree(tree.format('newick'),format=1)

	#Now I will put internal nodes in a certain phylogenetic distance between the root and a given node.
	#Function to insert a node at a given distance
	def insert_node(t, name_to_insert, insert_above, dist_along):
		insert_at_node = t.search_nodes(name=insert_above)[0]
		parent = (t&insert_above).up
		orig_branch_length = t.get_distance(insert_at_node,parent)
		if orig_branch_length < dist_along:
			raise ValueError("error: dist_along larger than orig_branch_length")
		removed_node = insert_at_node.detach()
		removed_node.dist = orig_branch_length - dist_along
		added_node = parent.add_child(name=name_to_insert, dist=dist_along)
		added_node.add_child(removed_node)

	#Function to insert a node some % along a branch
	def insert_hyp_node(t, leaf_name, percent):
		total_dist = t.get_distance(t.name,leaf_name)
		percent_dist = percent*total_dist
		child_node = (t&leaf_name)
		ancestor_node = (t&child_node.name).up
		#Climb towards the root until the ancestor is no farther than percent_dist from it
		while t.get_distance(t.name, ancestor_node) > percent_dist:
			child_node = ancestor_node
			ancestor_node = (t&child_node.name).up
		insert_node(t, leaf_name+"_"+str(percent), child_node.name, percent_dist-t.get_distance(t.name, ancestor_node))

	#Insert hypothetical nodes
	hyp_node_names = dict()
	cutoffs = [.9,.8,.7,.6,.5,.4,.3,.2,.1]
	cutoffs = [c**1.5 for c in cutoffs]
	for i, file_name in enumerate(x_file_names):
		#Entries of x belonging to this organism: the leaf itself first, then one per cutoff
		xi = x[i:len(x):len(x_file_names)]
		for j, cutoff in enumerate(cutoffs):
			if xi[j+1]>0:
				insert_hyp_node(t, file_name, cutoff)
				hyp_node_names[file_name+"_"+str(cutoff)] = [file_name, cutoff, j] #in case there are "_" in the file names

	#Now put the bubbles on the nodes
	def layout(node):
		if node.is_leaf():
			if node.name in x_file_names:
				#make reconstructed bubble
				size = x[x_file_names.index(node.name)]
				F = CircleFace(radius=500*math.sqrt(size), color="RoyalBlue", style="sphere")
				F.border.width = None
				F.opacity = 0.6
				faces.add_face_to_node(F,node, 0, position="branch-right")
				if taxonomic_names_on_leaves:
					nameFace = AttrFace("name", fsize=25, fgcolor='black',text_suffix="_"+taxonomy[x_file_names.index(node.name)])
				else:
					nameFace = AttrFace("name", fsize=25, fgcolor='black')
				faces.add_face_to_node(nameFace, node, 0, position="branch-right")
		elif node.name in hyp_node_names: #Otherwise it's a hypothetical node, just use recon x
			node_base_name = hyp_node_names[node.name][0]
			if node_base_name in x_file_names:
				idx = hyp_node_names[node.name][2]
				size = x[x_file_names.index(node_base_name)+(idx+1)*len(x_file_names)]
				F = CircleFace(radius=500*math.sqrt(size), color="RoyalBlue", style="sphere")
				F.border.width = None
				F.opacity = 0.6
				faces.add_face_to_node(F,node, 0, position="branch-right")

	ts = TreeStyle()
	ts.layout_fn = layout
	ts.mode = "r" if plot_rectangular else "c"
	ts.show_leaf_name = False
	ts.min_leaf_separation = 50

	#Export the tree to a png image
	t.render(out_file, w=width, units="mm", tree_style=ts)

	#Export the xml file
	project = Phyloxml()
	phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
	phylo.phyloxml_phylogeny.set_name(title)
	project.add_phylogeny(phylo)
	#Close the output handle explicitly so the document is flushed to disk
	with open(out_file_xml,'w') as xml_out:
		project.export(xml_out)
Exemple #8
0
def readScoreFile(fname, noself, randomize=False):
    """
    Rank the held-out ("target") edge among the candidate edges scored in a file.

    The file name carries its own metadata: the second-to-last '@'-separated
    field of fname holds the two target proteins separated by '#', and the
    first two '@'-separated fields of the base name are the orthology groups.

    :param str fname: path of the score file; every line is whitespace
                      separated with the two endpoints in columns 0 and 1
                      and the score in column 3
    :param bool noself: if true, self loops (u == v) are never considered
    :param bool randomize: if true, every score read is replaced by a
                           uniform random value in [0, 1]
    :return: (None, None) when either tree file is missing; otherwise
             (rank, n) with rank the 0-based position of the target edge
             when the scored edges are sorted by decreasing score and
             n = float(number of scored edges - 1)
    :raises ValueError: if the target edge is not among the scored edges
    """
    import random

    # The strings naming the proteins whose interaction was removed in
    # this input, upper-cased and turned into an edge
    tstring = fname.split('@')[-2].split("#")
    tedge = (tstring[0].upper(), tstring[1].upper())

    # Locate the phylogenies of the two orthology groups
    treeDir = "../../Parana2Data/HerpesPPIs/trees/rearranged"
    baseFile = fname.split('/')[-1]
    orthoGroup1 = baseFile.split('@')[0]
    orthoGroup2 = baseFile.split('@')[1]
    treeName = "{0}/{1}.xml.rooting.0.ntg.reconciled.0.ntg.rearrange.0.ntg"
    t1 = treeName.format(treeDir, orthoGroup1)
    t2 = treeName.format(treeDir, orthoGroup2)

    if not (os.path.exists(t1) and os.path.exists(t2)):
        return None, None

    pxml = Phyloxml()
    pxml.build_from_file(t1)
    pxml.build_from_file(t2)

    def extantLeaves(tree):
        # upper-cased leaf names, minus those marked as lost
        names = [n.upper() for n in tree.get_leaf_names()]
        return [n for n in names if 'LOST' not in n]

    la = extantLeaves(pxml.phylogeny[0])
    lb = extantLeaves(pxml.phylogeny[1])

    # The set of all possible interactions among the two homology groups
    if orthoGroup1 == orthoGroup2:
        possibleEndpoints = combinationsWithSelf(la)
    else:
        possibleEndpoints = (combinationsWithSelf(la) +
                             list(itertools.product(la, lb)) +
                             combinationsWithSelf(lb))

    # Only protein pairs sharing the suffix after the last '_' (the species
    # tag, per the original comment) represent a potential edge
    allPossibleEdges = [
        e for e in possibleEndpoints
        if e[0].split('_')[-1] == e[1].split('_')[-1]
    ]

    # Potential edges absent from the input network, plus the target edge
    nonPresentEdgesMinusTarget = list(
        set(e for e in allPossibleEdges
            if not ExtantNetwork.has_edge(e[0], e[1])))
    nonPresentEdges = nonPresentEdgesMinusTarget + [tedge]
    # Immutable copy for constant-time lookups while reading the file
    candidateEdges = frozenset(nonPresentEdges)

    # Is u,v one of the edges we wish to consider?
    def inPotentialEdges(u, v):
        if noself and u == v:
            return False
        return (u, v) in candidateEdges or (v, u) in candidateEdges

    # Does the scored edge se join p1 and p2 (in either orientation)?
    def isEdge(se, p1, p2):
        return ((se.p1 == p1 and se.p2 == p2) or
                (se.p1 == p2 and se.p2 == p1))

    scoredEdges = []

    # Candidates for which the file has not (yet) provided a score
    nonEdgesWithProb = set(nonPresentEdges)
    with open(fname, 'r') as ifile:
        for l in ifile:
            toks = l.rstrip().split()
            p1 = toks[0].upper()
            p2 = toks[1].upper()
            s = float(toks[3])
            if inPotentialEdges(p1, p2):
                if randomize: s = random.uniform(0.0, 1.0)
                scoredEdges.append(ScoredEdge(p1, p2, s))
                nonEdgesWithProb.discard((p1, p2))
                nonEdgesWithProb.discard((p2, p1))

    # NOTE(review): nonEdgesWithProb only ever shrinks from the candidate
    # set, so this difference is always empty and the loop body never runs.
    # The intent was presumably to give candidates missing from the file a
    # default score (i.e. iterate over nonEdgesWithProb itself); left
    # unchanged because that would alter the reported ranks -- confirm with
    # the author.
    for u, v in (nonEdgesWithProb - candidateEdges):
        s = random.uniform(0.0, 1.0) if randomize else 0.0
        scoredEdges.append(ScoredEdge(u, v, s))

    # Shuffling before the (stable) sort breaks score ties at random
    random.shuffle(scoredEdges)
    scoredEdges = list(
        enumerate(sorted(scoredEdges, key=lambda e: e.score, reverse=True)))

    res = [x for x in scoredEdges if isEdge(x[1], tedge[0], tedge[1])]

    if not res:
        # The original raised a bare string, which is itself a TypeError
        raise ValueError(
            "target edge {0} not found among the scored edges of {1}".format(
                tedge, fname))

    print(res)
    return (res[0][0], float(len(scoredEdges) - 1))
Exemple #9
0
from ete2 import Phyloxml
project = Phyloxml()
project.build_from_file("apaf.xml")

# Each tree contains the same methods as a PhyloTree object
for tree in project.get_phylogeny():
    print tree
    # you can even use rendering options
    tree.show()
    # PhyloXML features are stored in the phyloxml_clade attribute
    for node in tree: 
        print "Node name:", node.name
        for seq in node.phyloxml_clade.get_sequence(): 
            for domain in seq.domain_architecture.get_domain():
                domain_data = [domain.valueOf_, domain.get_from(), domain.get_to()]
                print "  Domain:", '\t'.join(map(str, domain_data))
def MakePlot(x, org_names, ckm30, ckm50, outgroup, outfile, outfilexml, sum_x):
	"""
	Build a neighbor-joining tree from two common k-mer matrices, add
	hypothetical ancestor nodes, draw an abundance bubble on every node with
	a positive entry in x, render the tree to outfile and export it as
	phyloXML to outfilexml.

	x          -- abundance vector; it is normalised to sum to 1 here.  Entry
	              i belongs to organism i, entry i + k*len(org_names)
	              (k >= 1) to its hypothetical node at the k-th cutoff.
	org_names  -- list of organism names.  NOTE: modified in place, duplicate
	              names get "_1", "_2", ... appended (all but the last one).
	ckm30, ckm50 -- common k-mer matrices (assumed square numpy arrays
	              ordered like org_names -- TODO confirm with caller)
	outgroup   -- name of the leaf to root the tree on; a warning is printed
	              and no outgroup is set when it is not among the names
	outfile    -- path of the rendered image
	outfilexml -- path of the exported phyloXML document
	sum_x      -- value shown (first 8 characters of its str) in the legend
	"""
	#Make sure names are unique
	#names deliberately aliases org_names: the renaming must be visible in both,
	#since org_names is later used to look the leaves up in the tree
	names = org_names
	for name in names:
		if names.count(name)>1:
			temp_name = name
			i=1
			for dummy in range(0,names.count(name)-1): #Don't change the last one, just to make sure we don't conflict with the outgroup
				names[names.index(temp_name)] = temp_name + "_" + str(i)
				i = i +1

	#Normalize the x vector (the total is computed once, not once per element)
	total_x = sum(x)
	x = [y/total_x for y in x]
	ckm30_norm = np.multiply(ckm30,1/np.diag(ckm30))
	ckm50_norm = np.multiply(ckm50,1/np.diag(ckm50))
	num_rows = ckm30_norm.shape[0]
	#Lower-triangular distance matrix: average over both k-mer sizes of the
	#symmetrised (1 - normalised common k-mer) values
	matrix=list()
	for i in range(num_rows):
		matrix.append([.5*(1-.5*ckm30_norm[i,j]-.5*ckm30_norm[j,i])+.5*(1-.5*ckm50_norm[i,j]-.5*ckm50_norm[j,i]) for j in range(i+1)])

	#Make the list of distances (ave of the two ckm matrices)
	ckm_ave_train = .5*ckm30_norm+.5*ckm50_norm
	ckm_ave_train_dist = dict()
	for i in range(len(org_names)):
		ckm_ave_train_dist[org_names[i]] = [.5*ckm_ave_train[i,j]+.5*ckm_ave_train[j,i] for j in range(len(org_names))]

	#Construct the tree. Note I could use RapidNJ here, but a few tests have shown that the trees that RapidNJ creates are rubbish.
	dm = _DistanceMatrix(names, matrix)
	constructor = DistanceTreeConstructor()
	tree = constructor.nj(dm)
	t=Tree(tree.format('newick'),format=1)

	#Now I will put internal nodes in a certain phylogenetic distance between the root and a given node.
	#Function to insert a node at a given distance
	def insert_node(t, name_to_insert, insert_above, dist_along):
		insert_at_node = t.search_nodes(name=insert_above)[0]
		parent = (t&insert_above).up
		orig_branch_length = t.get_distance(insert_at_node,parent)
		if orig_branch_length < dist_along:
			raise ValueError("error: dist_along larger than orig_branch_length in PlotPackage.py")
		removed_node = insert_at_node.detach()
		removed_node.dist = orig_branch_length - dist_along
		added_node = parent.add_child(name=name_to_insert, dist=dist_along)
		added_node.add_child(removed_node)

	#Function to insert a node some % along a branch, taking into account the ckm distances and nodes already created in the NJ tree (and what distance their descendants are from everyone else)
	def insert_hyp_node(t, leaf_name, percent, ckm_ave_train_dist, org_names):
		dists = [abs(y-percent) for y in ckm_ave_train_dist[leaf_name]]
		nearby_indicies = list()
		#Add all the organisms that are within 0.05 of the given percent
	#	for i in range(len(dists)):
	#		if dists[i]<=.05:
	#			nearby_indicies.append(i)
		if not nearby_indicies:
			#If there are no nearby indicies, add the closest organism to the given percent
			nearby_names = [org_names[dists.index(min(dists))]]
		else:
			#Look the names up through the stored indices (the original used the loop counter itself)
			nearby_names = [org_names[idx] for idx in nearby_indicies]
		mean_dist = np.mean([ckm_ave_train_dist[leaf_name][org_names.index(y)] for y in nearby_names])
		nearby_names.append(leaf_name)
		LCA = t.get_common_ancestor(nearby_names)
		LCA_to_leaf_dist = t.get_distance(LCA,leaf_name)
		#divide the dist to the right/left of the LCA node by the number of percentage points in there
		if LCA.name==t.name:
			percent_dist = percent*LCA_to_leaf_dist
			if mean_dist <= percent:
				child_node = (t&leaf_name)
			else:
				child_node = (t&nearby_names[0])#This means "go up from root" in the direction of the nearest guy
		elif mean_dist <= percent:
			percent_dist = t.get_distance(LCA) + abs(percent-mean_dist)*(LCA_to_leaf_dist)/(1-mean_dist)
			child_node = (t&leaf_name)
		else:
			percent_dist = t.get_distance(LCA) - abs(percent-mean_dist)*(t.get_distance(LCA))/(mean_dist)
			child_node = (t&leaf_name)
		ancestor_node = (t&child_node.name).up
		#Climb towards the root until the ancestor is no farther than percent_dist from it
		while t.get_distance(t.name, ancestor_node) > percent_dist:
			child_node = ancestor_node
			ancestor_node = (t&child_node.name).up
		insert_node(t, leaf_name+"_"+str(percent), child_node.name, percent_dist-t.get_distance(t.name, ancestor_node))

	#Set outgroup
	if outgroup in names:
		t.set_outgroup(t&outgroup) #I will need to check that this outgroup is actually one of the names...
	else:
		print("WARNING: the chosen outgroup " + outgroup + " is not in the given taxonomy: ")
		print(names)
		print("Proceeding without setting an outgroup. This may cause results to be uninterpretable.")

	#Insert hypothetical nodes
	hyp_node_names = dict()
	cutoffs = [.9,.8,.7,.6,.5,.4,.3,.2,.1]
	cutoffs = [-.5141*(val**3)+1.0932*(val**2)+0.3824*val for val in cutoffs]
	for i in range(len(org_names)):
		#Entries of x belonging to this organism: the leaf itself first, then one per cutoff
		xi = x[i:len(x):len(org_names)]
		for j in range(1,len(cutoffs)+1):
			if xi[j]>0:
				insert_hyp_node(t, org_names[i], cutoffs[j-1],ckm_ave_train_dist, org_names)
				hyp_node_names[org_names[i]+"_"+str(cutoffs[j-1])] = [org_names[i], cutoffs[j-1], j-1] #in case there are "_" in the file names

	size_factor=250
	font_size=55

	#Now put the bubbles on the nodes
	def layout(node):
		node_style = NodeStyle()
		node_style["hz_line_width"] = 10
		node_style["vt_line_width"] = 10
		node.set_style(node_style)
		if node.is_leaf():
			if node.name in org_names:
				#make reconstructed bubble
				size = x[org_names.index(node.name)]
				F = CircleFace(radius=size_factor*math.sqrt(size), color="RoyalBlue", style="sphere")
				F.border.width = None
				F.opacity = 0.6
				faces.add_face_to_node(F,node, 0, position="branch-right")
				#Denote that this was a training organism
				nameFace = AttrFace("name", fsize=font_size, fgcolor='black')
				faces.add_face_to_node(nameFace, node, 0, position="branch-right")
		elif node.name in hyp_node_names: #Otherwise it's a hypothetical node, just use recon x
			node_base_name = hyp_node_names[node.name][0]
			if node_base_name in org_names:
				idx = hyp_node_names[node.name][2]
				size = x[org_names.index(node_base_name)+(idx+1)*len(org_names)]
				F = CircleFace(radius=size_factor*math.sqrt(size), color="RoyalBlue", style="sphere")
				F.border.width = None
				F.opacity = 0.6
				faces.add_face_to_node(F,node, 0, position="branch-right")
				#This is if I want the names of the hypothetical nodes to be printed as well
				#nameFace = AttrFace("name", fsize=font_size, fgcolor='black')
				#faces.add_face_to_node(nameFace, node, 0, position="branch-right")

	ts = TreeStyle()
	ts.layout_fn = layout
	ts.mode = "r"
	#ts.mode = "c"
	ts.scale = 2*1000
	ts.show_leaf_name = False
	ts.min_leaf_separation = 50
	F = CircleFace(radius=.87*size_factor, color="RoyalBlue", style="sphere")
	F.border.width = None
	F.opacity = 0.6
	ts.legend.add_face(F,0)
	ts.legend.add_face(TextFace("  Inferred relative abundance",fsize=1.5*font_size,fgcolor="Blue"),1)
	ts.legend.add_face(TextFace("  Total absolute abundance depicted " + str(sum_x)[0:8], fsize=1.5*font_size,fgcolor="Black"),1)
	ts.legend_position=4
	#t.show(tree_style=ts)
	t.render(outfile, w=550, units="mm", tree_style=ts)

	#Render the XML file
	project = Phyloxml()
	phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
	project.add_phylogeny(phylo)
	#Close the output handle explicitly so the document is flushed to disk
	with open(outfilexml,'w') as xml_out:
		project.export(xml_out)
Exemple #11
0
from ete2 import Phyloxml
project = Phyloxml()
project.build_from_file("testTree.xml")

# Each tree contains the same methods as a PhyloTree object
for tree in project.get_phylogeny():
    print tree
    # you can even use rendering options
    tree.show()
    # PhyloXML features are stored in the phyloxml_clade attribute
    for node in tree: 
        print "Node name:", node.name
        for seq in node.phyloxml_clade.get_sequence(): 
            for domain in seq.domain_architecture.get_domain():
                domain_data = [domain.valueOf_, domain.get_from(), domain.get_to()]
                print "  Domain:", '\t'.join(map(str, domain_data))
Exemple #12
0
from ete2 import Phyloxml, phyloxml
import random 
# Start from an empty phyloxml project
project = Phyloxml()

# Create a phyloxml tree and populate it randomly (size 5, random branch
# lengths), then name its phylogeny
phylo = phyloxml.PhyloxmlTree()
phylo.populate(5, random_branches=True)
phylo.phyloxml_phylogeny.set_name("test_tree")
# Add the tree to the phyloxml project
project.add_phylogeny(phylo)

# Print the first (and only) phylogeny of the project
print project.get_phylogeny()[0]

# Sample output (leaf names are random, so they differ between runs):
#
#          /-iajom
#     /---|
#    |     \-wiszh
#----|
#    |     /-xrygw
#     \---|
#         |     /-gjlwx
#          \---|
#               \-ijvnk

# Trees can be operated as normal ETE trees
phylo.show()


# Export the project in phyloXML format; no output handle is passed here
# (the sample line that follows suggests the document goes to stdout)
project.export()

# <phy:Phyloxml xmlns:phy="http://www.phyloxml.org/1.10/phyloxml.xsd">
def MakePlot(x, org_names, ckm30, ckm50, outgroup, outfile, outfilexml, sum_x):
    """
    Build a neighbor-joining tree from two common-kmer matrices, decorate it
    with abundance bubbles and write it out as an image and as phyloXML.

    A hypothetical internal node is inserted into the tree for every
    (organism, cutoff) pair that has a positive entry in ``x``.

    :param x: flat sequence of abundances laid out in consecutive chunks of
        ``len(org_names)`` entries: chunk 0 belongs to the organisms
        themselves and chunk ``k`` (1..9) to the ``k``-th cutoff, so at least
        10 chunks are required. A normalized copy (summing to 1) is used
        inside this function; the caller's object is not modified.
    :param list org_names: organism names, one per matrix row. NOTE: modified
        in place when it contains duplicates (every occurrence but the last
        gets a ``_<n>`` suffix).
    :param ckm30: 2-D NumPy array indexed ``[i, j]`` over the organisms
        (presumably the 30-mer common-kmer matrix -- confirm with caller).
    :param ckm50: same layout as ``ckm30`` (presumably the 50-mer matrix).
    :param str outgroup: name of the organism used to root the tree; a
        warning is printed and no outgroup is set if it is not a known name.
    :param str outfile: path the tree image is rendered to.
    :param str outfilexml: path the phyloXML export is written to.
    :param sum_x: value shown in the legend (first 8 characters of its
        string form).
    :raises ValueError: if a hypothetical node would have to be inserted
        further along a branch than the branch is long.
    """

    #Make sure names are unique.
    #NOTE: ``names`` is the same list object as ``org_names``, so the renaming
    #below also changes ``org_names`` (and the caller's list). The tree is
    #built from ``names`` and queried through ``org_names`` afterwards, so the
    #two have to stay in sync.
    names = org_names
    for name in names:
        duplicates = names.count(name)
        if duplicates > 1:
            #Don't change the last one, just to make sure we don't conflict with the outgroup
            for suffix in range(1, duplicates):
                names[names.index(name)] = name + "_" + str(suffix)

    #Normalize the x vector. The sum is computed once, and as a float so that
    #integer input is not truncated by Python 2 integer division.
    total = float(sum(x))
    x = [value / total for value in x]

    #Scale each matrix by the reciprocal of its diagonal (NumPy broadcasting)
    ckm30_norm = np.multiply(ckm30, 1 / np.diag(ckm30))
    ckm50_norm = np.multiply(ckm50, 1 / np.diag(ckm50))
    num_rows = ckm30_norm.shape[0]
    num_orgs = len(org_names)

    #Lower-triangular distance matrix (diagonal included): the average over
    #both matrices of one minus the symmetrized normalized value
    matrix = [[
        .5 * (1 - .5 * ckm30_norm[i, j] - .5 * ckm30_norm[j, i]) + .5 *
        (1 - .5 * ckm50_norm[i, j] - .5 * ckm50_norm[j, i])
        for j in range(i + 1)
    ] for i in range(num_rows)]

    #For every organism, the symmetrized average of the two normalized
    #matrices against all organisms
    ckm_ave_train = .5 * ckm30_norm + .5 * ckm50_norm
    ckm_ave_train_dist = dict()
    for i, org_name in enumerate(org_names):
        ckm_ave_train_dist[org_name] = [
            .5 * ckm_ave_train[i, j] + .5 * ckm_ave_train[j, i]
            for j in range(num_orgs)
        ]

    #Construct the tree. Note I could use RapidNJ here, but a few tests have shown that the trees that RapidNJ creates are rubbish.
    dm = _DistanceMatrix(names, matrix)
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm)
    t = Tree(tree.format('newick'), format=1)

    #Now I will put internal nodes in a certain phylogenetic distance between the root and a given node.
    #Function to insert a node at a given distance
    def insert_node(t, name_to_insert, insert_above, dist_along):
        #Split the branch above ``insert_above``: the new node sits
        #``dist_along`` below the parent and takes over the detached subtree
        insert_at_node = t.search_nodes(name=insert_above)[0]
        parent = insert_at_node.up
        orig_branch_length = t.get_distance(insert_at_node, parent)
        if orig_branch_length < dist_along:
            raise ValueError(
                "error: dist_along larger than orig_branch_length in PlotPackage.py"
            )
        removed_node = insert_at_node.detach()
        removed_node.dist = orig_branch_length - dist_along
        added_node = parent.add_child(name=name_to_insert, dist=dist_along)
        added_node.add_child(removed_node)

    #Function to insert a node some % along a branch, taking into account the ckm distances and nodes already created in the NJ tree (and what distance their descendants are from everyone else)
    def insert_hyp_node(t, leaf_name, percent, ckm_ave_train_dist, org_names):
        leaf_values = ckm_ave_train_dist[leaf_name]
        dists = [abs(y - percent) for y in leaf_values]
        #The reference is the single organism whose value is closest to the
        #given percent (a "within 0.05 of percent" neighbourhood was disabled
        #in the original code, so only this fallback was ever used)
        nearby_names = [org_names[dists.index(min(dists))]]
        mean_dist = np.mean(
            [leaf_values[org_names.index(y)] for y in nearby_names])
        nearby_names.append(leaf_name)
        LCA = t.get_common_ancestor(nearby_names)
        LCA_to_leaf_dist = t.get_distance(LCA, leaf_name)
        #divide the dist to the right/left of the LCA node by the number of percentage points in there
        if LCA.name == t.name:
            percent_dist = percent * LCA_to_leaf_dist
            if mean_dist <= percent:
                child_node = t & leaf_name
            else:
                #This means "go up from root" in the direction of the nearest guy
                child_node = t & nearby_names[0]
        else:
            root_to_LCA_dist = t.get_distance(LCA)
            if mean_dist <= percent:
                percent_dist = root_to_LCA_dist + abs(
                    percent - mean_dist) * LCA_to_leaf_dist / (1 - mean_dist)
            else:
                percent_dist = root_to_LCA_dist - abs(
                    percent - mean_dist) * root_to_LCA_dist / mean_dist
            child_node = t & leaf_name
        #Nodes are looked up by name on purpose (as the original did): the
        #first node carrying that name is used
        ancestor_node = (t & child_node.name).up
        #Climb until the branch containing the target distance is found
        while t.get_distance(t.name, ancestor_node) > percent_dist:
            child_node = ancestor_node
            ancestor_node = (t & child_node.name).up
        insert_node(t, leaf_name + "_" + str(percent), child_node.name,
                    percent_dist - t.get_distance(t.name, ancestor_node))

    #Set outgroup
    if outgroup in names:
        t.set_outgroup(t & outgroup)
    else:
        print("WARNING: the chosen outgroup " + outgroup +
              " is not in the given taxonomy: ")
        print(names)
        print(
            "Proceeding without setting an outgroup. This may cause results to be uninterpretable."
        )

    #Insert hypothetical nodes
    hyp_node_names = dict()
    cutoffs = [.9, .8, .7, .6, .5, .4, .3, .2, .1]
    cutoffs = [
        -.5141 * (val**3) + 1.0932 * (val**2) + 0.3824 * val for val in cutoffs
    ]
    for i, org_name in enumerate(org_names):
        #Entries of x belonging to this organism: [leaf, cutoff 0, cutoff 1, ...]
        xi = x[i::num_orgs]
        for j, cutoff in enumerate(cutoffs):
            if xi[j + 1] > 0:
                insert_hyp_node(t, org_name, cutoff, ckm_ave_train_dist,
                                org_names)
                #Store the parts explicitly, in case there are "_" in the file names
                hyp_node_names[org_name + "_" + str(cutoff)] = [
                    org_name, cutoff, j
                ]

    size_factor = 250
    font_size = 55

    #Attach a bubble whose radius scales with the square root of ``size``
    def add_bubble(node, size):
        bubble = CircleFace(radius=size_factor * math.sqrt(size),
                            color="RoyalBlue",
                            style="sphere")
        bubble.border.width = None
        bubble.opacity = 0.6
        faces.add_face_to_node(bubble, node, 0, position="branch-right")

    #Now put the bubbles on the nodes
    def layout(node):
        node_style = NodeStyle()
        node_style["hz_line_width"] = 10
        node_style["vt_line_width"] = 10
        node.set_style(node_style)
        if node.is_leaf():
            if node.name in org_names:
                #make reconstructed bubble
                add_bubble(node, x[org_names.index(node.name)])
                #Denote that this was a training organism
                nameFace = AttrFace("name", fsize=font_size, fgcolor='black')
                faces.add_face_to_node(nameFace,
                                       node,
                                       0,
                                       position="branch-right")
        elif node.name in hyp_node_names:  #Otherwise it's a hypothetical node, just use recon x
            node_base_name, _, idx = hyp_node_names[node.name]
            if node_base_name in org_names:
                add_bubble(
                    node,
                    x[org_names.index(node_base_name) + (idx + 1) * num_orgs])

    ts = TreeStyle()
    ts.layout_fn = layout
    ts.mode = "r"
    ts.scale = 2 * 1000
    ts.show_leaf_name = False
    ts.min_leaf_separation = 50
    legend_bubble = CircleFace(radius=.87 * size_factor,
                               color="RoyalBlue",
                               style="sphere")
    legend_bubble.border.width = None
    legend_bubble.opacity = 0.6
    ts.legend.add_face(legend_bubble, 0)
    ts.legend.add_face(
        TextFace("  Inferred relative abundance",
                 fsize=1.5 * font_size,
                 fgcolor="Blue"), 1)
    ts.legend.add_face(
        TextFace("  Total absolute abundance depicted " + str(sum_x)[0:8],
                 fsize=1.5 * font_size,
                 fgcolor="Black"), 1)
    ts.legend_position = 4
    t.render(outfile, w=550, units="mm", tree_style=ts)

    #Render the XML file; the context manager makes sure it is flushed and closed
    project = Phyloxml()
    phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
    project.add_phylogeny(phylo)
    with open(outfilexml, 'w') as xml_out:
        project.export(xml_out)
Exemple #14
0
def main(argv):
    """
    Plot a taxonomy tree with abundance bubbles from a profile file.

    Command line options (``argv`` excludes the program name):
    ``-i`` input file, ``-t`` plot title, ``-l`` label leaves, ``-n`` label
    internal nodes, ``-o`` output image, ``-x`` output phyloXML file,
    ``-w`` image width in mm, ``-h`` usage.

    Input lines starting with ``#`` or ``@`` and blank lines are ignored. On
    every other line the 4th whitespace-separated field is the ``|``-separated
    taxonomic path and the last field is the weight of the path's last name.

    :param list argv: command line arguments
    """
    input_file = ''
    title = 'Title'
    label_internal_nodes = False
    label_leaves = False
    out_file = ''
    width = 750
    out_file_xml = ''
    try:
        # "-h" takes no argument, so a bare "-h" reaches the help branch.
        # The long option names are left untouched for backward compatibility.
        opts, args = getopt.getopt(argv, "hi:t:lno:w:x:", [
            "Help=", "InputFile=", "Title=", "LabelLeaves=",
            "LabelInternalNodes=", "OutFile=", "Width=", "OutFileXML="
        ])
    except getopt.GetoptError:
        print('Unknown option, call using: ./PlotTree.py -i <InputCAMIFile> -t <Title> -l <LabelLeavesFlag> -n <LabelInternalNodesFlag> -o <OutFile.png> -x <Outfile.xml> -w <Width>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--Help"):
            print('./PlotTree.py -i <InputCAMIFile> -t <Title> -l <LabelLeavesFlag> -n <LabelInternalNodesFlag> -o <OutFile> -x <OutFile.xml> -w <Width>')
            sys.exit(2)
        elif opt in ("-i", "--InputFile"):
            input_file = arg
        elif opt in ("-t", "--Title"):
            title = arg
        elif opt in ("-l", "--LabelLeaves"):
            label_leaves = True
        elif opt in ("-n", "--LabelInternalNodes"):
            label_internal_nodes = True
        elif opt in ("-o", "--OutFile"):
            out_file = arg
        elif opt in ("-w", "--Width"):
            width = int(arg)
        elif opt in ("-x", "--OutFileXML"):
            out_file_xml = arg

    # Read the common kmer profile
    with open(input_file, 'r') as fid:
        profile_lines = fid.readlines()

    # Put placeholders in for missing names: "||" -> "|NONAME|".
    # Replacing one occurrence at a time also handles runs such as "|||".
    lines_noblank = list()
    for line in profile_lines:
        while "||" in line:
            line = line.replace("||", "|NONAME|", 1)
        lines_noblank.append(line)

    # Get the names and weights
    ckm_tax_paths = []
    ckm_name_to_perc = dict()
    for line in lines_noblank:
        # Don't parse comments or blank (including whitespace-only) lines
        if not line.strip() or line[0] in ('#', '@'):
            continue
        fields = line.split()
        tax_path = fields[3]
        ckm_tax_paths.append(tax_path)
        ckm_name_to_perc[tax_path.split("|")[-1]] = fields[-1]

    # Create the tree: every path adds its last name below the closest
    # ancestor that has a name.
    t = Tree()
    names_to_nodes = dict()
    for tax_path in ckm_tax_paths:
        split_tax_path = tax_path.split("|")
        node_name = split_tax_path[-1]
        if len(split_tax_path) == 1:
            # A single name is a superkingdom: connect directly to the root
            parent = t
        elif split_tax_path[-2] in names_to_nodes:
            # The direct parent is already in the tree
            parent = names_to_nodes[split_tax_path[-2]]
        else:
            # Otherwise walk up the path, skipping over the NONAMEs
            j = 2
            while split_tax_path[-j] == "NONAME":
                j = j + 1
            parent = names_to_nodes[split_tax_path[-j]]
        names_to_nodes[node_name] = parent.add_child(name=node_name)

    def layout(node):
        # Bubble radius grows with the square root of the node's weight;
        # nodes without a weight get a zero-radius bubble.
        if node.name in ckm_name_to_perc:
            ckm_perc = float(ckm_name_to_perc[node.name])
        else:
            ckm_perc = 0
        bubble = CircleFace(radius=3.14 * math.sqrt(ckm_perc),
                            color="RoyalBlue",
                            style="sphere")
        bubble.border.width = None
        bubble.opacity = 0.6
        faces.add_face_to_node(bubble, node, 0, position="branch-right")
        if label_internal_nodes:
            faces.add_face_to_node(TextFace(node.name, fsize=7),
                                   node,
                                   0,
                                   position="branch-top")

    ts = TreeStyle()
    ts.layout_fn = layout
    ts.mode = "r"
    ts.show_leaf_name = label_leaves
    ts.min_leaf_separation = 50
    ts.title.add_face(TextFace(title, fsize=20), column=0)

    # Export the tree to an image
    t.render(out_file, w=width, units="mm", tree_style=ts)

    # Export the xml file; the context manager makes sure it is flushed and closed
    project = Phyloxml()
    phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
    phylo.phyloxml_phylogeny.set_name(title)
    project.add_phylogeny(phylo)
    with open(out_file_xml, 'w') as xml_out:
        project.export(xml_out)
import sys
import re
from StringIO import StringIO

from ete2 import Phyloxml, phyloxml

# Start from an empty phyloxml document
project = Phyloxml()

# Build the tree from the newick given as first command line argument
phylo = phyloxml.PhyloxmlTree(newick=sys.argv[1])

# Basic phylogeny info: its name, and whether it is rooted
# (flagged "true" when the root has at most two children)
phylo.phyloxml_phylogeny.set_name("test_tree")
phylo.phyloxml_phylogeny.set_rooted(
    "true" if len(phylo.children) <= 2 else "false")

# Register the tree in the phyloxml document
project.add_phylogeny(phylo)

# Export the phyloxml document into an in-memory buffer
OUTPUT = StringIO()
project.export(OUTPUT)

# Ad-hoc clean-up of the exported text to meet the schema definition:
# drop the "phy:" prefix and the branch_length_attr attributes
text = re.sub('branch_length_attr="[^"]+"', "",
              OUTPUT.getvalue().replace("phy:", ""))
header = """
Exemple #16
0
def main(argv):
    """
    Plot a neighbor-joining tree of the training organisms with abundance
    bubbles, and export it as phyloXML.

    Command line options (``argv`` excludes the program name):
    ``-i`` file with one float per line (the x vector), ``-D`` directory
    holding ``Taxonomy.txt``, ``FileNames.txt`` and the two
    ``CommonKmerMatrix-*mers.h5`` files, ``-r`` rectangular instead of
    circular plot, ``-t`` append the taxonomy to the leaf names, ``-o`` output
    image, ``-x`` output phyloXML file, ``-w`` image width in mm, ``-h``
    usage. ``-l`` and ``-n`` are accepted but not used by this function.

    The x vector is read in consecutive chunks of ``len(FileNames)`` entries:
    chunk 0 belongs to the organisms themselves, chunk ``k`` (1..9) to the
    ``k``-th cutoff.

    :param list argv: command line arguments
    :raises ValueError: if a hypothetical node would have to be inserted
        further along a branch than the branch is long.
    """
    usage = './PlotNJTree.py -i <InputCommonKmerXFile> -D <CommonKmerDataPath> -l <LabelLeavesFlag> -n <LabelInternalNodesFlag> -r <RectangularPlotFlag> -t <TaxonomicNamesOnLeavesFlag> -o <OutFile.png> -x <Outfile.xml> -w <Width>'
    input_file = ''
    title = 'Title'
    label_internal_nodes = False
    label_leaves = False
    out_file = ''
    width = 750
    out_file_xml = ''
    plot_rectangular = False
    common_kmer_data_path = ''
    taxonomic_names_on_leaves = False
    try:
        # "-h" takes no argument, so a bare "-h" reaches the help branch.
        # The long option names are left untouched for backward compatibility.
        opts, args = getopt.getopt(argv, "hi:lnrto:w:x:D:", [
            "Help=", "InputCommonKmerXFile=", "LabelLeaves=",
            "LabelInternalNodes=", "Rectangular=", "TaxonomicNamesOnLeaves=",
            "OutFile=", "Width=", "OutFileXML=", "CommonKmerDataPath="
        ])
    except getopt.GetoptError:
        print('Unknown option, call using: ' + usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--Help"):
            print(usage)
            sys.exit(2)
        elif opt in ("-i", "--InputCommonKmerXFile"):
            input_file = arg
        elif opt in ("-l", "--LabelLeaves"):
            label_leaves = True
        elif opt in ("-n", "--LabelInternalNodes"):
            label_internal_nodes = True
        elif opt in ("-o", "--OutFile"):
            out_file = arg
        elif opt in ("-w", "--Width"):
            width = int(arg)
        elif opt in ("-x", "--OutFileXML"):
            out_file_xml = arg
        elif opt in ("-D", "--CommonKmerDataPath"):
            common_kmer_data_path = arg
        elif opt in ("-r", "--Rectangular"):
            plot_rectangular = True
        elif opt in ("-t", "--TaxonomicNamesOnLeaves"):
            taxonomic_names_on_leaves = True

    #Read in the x vector (used as is, without normalization)
    with open(input_file, 'r') as fid:
        x = [float(line) for line in fid]

    #Read in the taxonomy: first token of every line with everything up to
    #the first "_" (the taxID) removed
    with open(os.path.join(common_kmer_data_path, "Taxonomy.txt"),
              'r') as fid:
        taxonomy = ['_'.join(line.split()[0].split("_")[1:]) for line in fid]

    #Read in the basis for the ckm matrices
    with open(os.path.join(common_kmer_data_path, "FileNames.txt"),
              'r') as fid:
        x_file_names = [os.path.basename(line.strip()) for line in fid]
    num_files = len(x_file_names)

    #Read in the common kmer matrices
    def load_ckm(file_name):
        #The array is copied into memory before the HDF5 file is closed
        with h5py.File(os.path.join(common_kmer_data_path, file_name),
                       'r') as f:
            return np.array(f['common_kmers'], dtype=np.float64)

    ckm30 = load_ckm('CommonKmerMatrix-30mers.h5')
    ckm50 = load_ckm('CommonKmerMatrix-50mers.h5')
    #Scale each matrix by the reciprocal of its diagonal (NumPy broadcasting)
    ckm30_norm = np.multiply(ckm30, 1 / np.diag(ckm30))
    ckm50_norm = np.multiply(ckm50, 1 / np.diag(ckm50))
    num_rows = ckm30_norm.shape[0]

    #Lower-triangular distance matrix (diagonal included): the average over
    #both matrices of one minus the symmetrized normalized value
    matrix = [[
        .5 * (1 - .5 * ckm30_norm[i, j] - .5 * ckm30_norm[j, i]) + .5 *
        (1 - .5 * ckm50_norm[i, j] - .5 * ckm50_norm[j, i])
        for j in range(i + 1)
    ] for i in range(num_rows)]

    #Construct the tree. Note I could use RapidNJ here, but a few tests have shown that the trees that RapidNJ creates are rubbish.
    dm = _DistanceMatrix(x_file_names, matrix)
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm)
    t = Tree(tree.format('newick'), format=1)

    #Now I will put internal nodes in a certain phylogenetic distance between the root and a given node.
    #Function to insert a node at a given distance
    def insert_node(t, name_to_insert, insert_above, dist_along):
        #Split the branch above ``insert_above``: the new node sits
        #``dist_along`` below the parent and takes over the detached subtree
        insert_at_node = t.search_nodes(name=insert_above)[0]
        parent = insert_at_node.up
        orig_branch_length = t.get_distance(insert_at_node, parent)
        if orig_branch_length < dist_along:
            raise ValueError(
                "error: dist_along larger than orig_branch_length")
        removed_node = insert_at_node.detach()
        removed_node.dist = orig_branch_length - dist_along
        added_node = parent.add_child(name=name_to_insert, dist=dist_along)
        added_node.add_child(removed_node)

    #Function to insert a node some % along a branch
    def insert_hyp_node(t, leaf_name, percent):
        total_dist = t.get_distance(t.name, leaf_name)
        percent_dist = percent * total_dist
        child_node = (t & leaf_name)
        #Nodes are looked up by name on purpose (as the original did): the
        #first node carrying that name is used
        ancestor_node = (t & child_node.name).up
        #Climb until the branch containing the target distance is found
        while t.get_distance(t.name, ancestor_node) > percent_dist:
            child_node = ancestor_node
            ancestor_node = (t & child_node.name).up
        insert_node(t, leaf_name + "_" + str(percent), child_node.name,
                    percent_dist - t.get_distance(t.name, ancestor_node))

    #Insert hypothetical nodes
    hyp_node_names = dict()
    cutoffs = [.9, .8, .7, .6, .5, .4, .3, .2, .1]
    cutoffs = [y**1.5 for y in cutoffs]
    for i, file_name in enumerate(x_file_names):
        #Entries of x belonging to this organism: [leaf, cutoff 0, cutoff 1, ...]
        xi = x[i::num_files]
        for j, cutoff in enumerate(cutoffs):
            if xi[j + 1] > 0:
                insert_hyp_node(t, file_name, cutoff)
                #Store the parts explicitly, in case there are "_" in the file names
                hyp_node_names[file_name + "_" + str(cutoff)] = [
                    file_name, cutoff, j
                ]

    #Attach a bubble whose radius scales with the square root of ``size``
    def add_bubble(node, size):
        bubble = CircleFace(radius=500 * math.sqrt(size),
                            color="RoyalBlue",
                            style="sphere")
        bubble.border.width = None
        bubble.opacity = 0.6
        faces.add_face_to_node(bubble, node, 0, position="branch-right")

    #Now put the bubbles on the nodes
    def layout(node):
        if node.is_leaf():
            if node.name in x_file_names:
                #make reconstructed bubble
                add_bubble(node, x[x_file_names.index(node.name)])
                if taxonomic_names_on_leaves:
                    nameFace = AttrFace(
                        "name",
                        fsize=25,
                        fgcolor='black',
                        text_suffix="_" +
                        taxonomy[x_file_names.index(node.name)])
                else:
                    nameFace = AttrFace("name", fsize=25, fgcolor='black')
                faces.add_face_to_node(nameFace,
                                       node,
                                       0,
                                       position="branch-right")
        elif node.name in hyp_node_names:  #Otherwise it's a hypothetical node, just use recon x
            node_base_name, _, idx = hyp_node_names[node.name]
            if node_base_name in x_file_names:
                add_bubble(
                    node, x[x_file_names.index(node_base_name) +
                            (idx + 1) * num_files])

    ts = TreeStyle()
    ts.layout_fn = layout
    if plot_rectangular:
        ts.mode = "r"
    else:
        ts.mode = "c"
    ts.show_leaf_name = False
    ts.min_leaf_separation = 50

    #Export the tree to an image
    t.render(out_file, w=width, units="mm", tree_style=ts)

    #Export the xml file; the context manager makes sure it is flushed and closed
    project = Phyloxml()
    phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
    phylo.phyloxml_phylogeny.set_name(title)
    project.add_phylogeny(phylo)
    with open(out_file_xml, 'w') as xml_out:
        project.export(xml_out)