def main(argv):
    """Root a tree on an outgroup and report whether a set of leaves is monophyletic.

    Parameters
    ----------
    argv : list of str
        Command-line arguments without the program name. Recognised options:
        ``-t/--tree`` (tree to load), ``-l/--leaves`` (comma-separated leaf
        names), ``-r/--root`` (outgroup passed to ``set_outgroup``) and ``-h``.

    The value returned by ``check_monophyly`` is printed to stdout.

    Exits with status 2 when the options cannot be parsed or when any of
    ``-t``, ``-l``, ``-r`` is missing, and with status 0 after ``-h``.
    """
    usage = 'test.py -t <treefile> -l <leave,leave,etc.> -r <root>'
    tree = leaves = root = None
    try:
        opts, _args = getopt.getopt(argv, "ht:l:r:", ["tree=", "leaves=", "root="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('test.py -t <tree> -l <leave,leave,etc.> -r <root>')
            sys.exit()
        elif opt in ("-t", "--tree"):
            tree = arg
        elif opt in ("-l", "--leaves"):
            leaves = arg
        elif opt in ("-r", "--root"):
            root = arg
    # All three options are required; without this check a missing one
    # surfaced as an UnboundLocalError further down.
    if tree is None or leaves is None or root is None:
        print(usage)
        sys.exit(2)
    leaf_names = leaves.split(',')
    t = Tree(tree)
    t.set_outgroup(root)
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(t.check_monophyly(values=leaf_names, target_attr="name"))
# Map tip IDs to domain. The metadata table is read as tab-separated text:
# the first column is used as the tip ID and the 22nd column as its domain.
tip_to_domain = {}
with open("genomemetadata.tsv") as inh:  # closed even if a row is malformed
    for line in inh:
        fields = line.rstrip().split("\t")
        tip_to_domain[fields[0]] = fields[21]

# Label every tip with its domain and count archaeal / bacterial tips.
num_arch = 0
num_bact = 0
tree = Tree(target_tree)
for tip in tree:
    the_domain = tip_to_domain[tip.name]
    if the_domain == "Archaea":
        num_arch += 1
    elif the_domain == "Bacteria":
        num_bact += 1
    else:
        print("Problem with " + str(tip.name) + "'s domain assignment.")
    tip.add_feature("domain", the_domain)
print(sys.argv[1] + "\tNum_arch\t" + str(num_arch))
print(sys.argv[1] + "\tNum_bact\t" + str(num_bact))

# Now check domain monophyly; keep only the (is_monophyletic, clade_type)
# part of each result.
arch = tree.check_monophyly(values=["Archaea"], target_attr="domain")[0:2]
bact = tree.check_monophyly(values=["Bacteria"], target_attr="domain")[0:2]
print(sys.argv[1] + "\t" + str(arch[0]) + "\t" + str(arch[1]) + "\t" + str(bact[0]) + "\t" + str(bact[1]))
for j in range(i): line += "\t" line += "\t" + str(100) for j in range(i + 1, len(files)): filej = files[j] tj = Tree(filej) nb_ti = 0 nb_tj = 0 nb_common_clade = 0 for node in ti.traverse("postorder"): if (not node.is_leaf()): nb_ti += 1 leaves = [] for leaf in node: leaves.append(leaf.name) if (tj.check_monophyly(values=leaves, target_attr="name")[0]): nb_common_clade += 1 for node in tj.traverse("postorder"): if (not node.is_leaf()): nb_tj += 1 percent_common_clade = 100.0 * nb_common_clade / min(nb_ti, nb_tj) line += "\t" + str(round(percent_common_clade, 2)) stat.write(line + "\n") stat.close() print("Statistics on common clades generated in " + str(time.time() - start) + " seconds") print(common_clades_stat_file) ############################################################################### # Generate statistics on branch supports
def _root_split(tree):
    """Return the taxa on each side of the root of *tree* as a list of name lists.

    A leaf child contributes only its own name; an internal child contributes
    the names of all of its descendants (internal nodes included).
    """
    sides = []
    for child in tree.children:
        if child.is_leaf():
            sides.append([child.name])
        else:
            sides.append([d.name for d in child.get_descendants()])
    return sides


def _outgroup_root(tree, og):
    """Return the node to root *tree* on: the single outgroup leaf, or the
    common ancestor when several outgroup taxa are given."""
    if len(og) == 1:
        return tree.search_nodes(name=og[0])[0]
    return tree.get_common_ancestor(og)


def _ingroup_is_monophyletic(tree, ingroup):
    """Return True when *ingroup* is monophyletic in *tree*; a failing check
    counts as not monophyletic."""
    try:
        return bool(tree.check_monophyly(values=ingroup, target_attr="name",
                                         ignore_missing=True)[0])
    except Exception:
        return False


def caluclate_rootstrap(treeFile, bootFile, is_rooted, out_group):
    '''
    Compute rootstrap support for every possible root position of a tree.

    Parameters
    ----------
    treeFile: ML tree in newick format (.treefile in IQ-TREE)
    bootFile: bootstrap trees in newick format, one tree per line
        (e.g. .ufboot file in IQ-TREE); blank lines are ignored
    is_rooted: True if the ML tree and the bootstrap trees are already rooted.
        If False, every tree is rooted on the outgroup and pruned to the
        ingroup before the root splits are compared
    out_group: file with the outgroup taxa (read with Read_Nex); required
        when is_rooted is False

    Returns
    -------
    polyphyly: number of bootstrap trees that were skipped because their
        ingroup was not monophyletic after rooting (always 0 when is_rooted)

    Side effects
    ------------
    Writes <treeFile stem>.rootstrap (Nexus) holding the tree annotated with
    a "rootstrap" feature (percentage of bootstrap trees sharing that root
    split). When is_rooted is False it also writes
    <treeFile stem>_rooted.treefile, the outgroup-rooted, ingroup-only ML tree.

    Raises
    ------
    SystemExit: if the outgroup is missing, unreadable or empty, or if the
        ML ingroup is not monophyletic.
    '''
    boottrees = []
    polyphyly = 0
    N_boottrees = 0
    stem = os.path.splitext(treeFile)[0]
    if not is_rooted:
        if out_group is None:
            raise SystemExit('Error: Please provide outgroup taxa in Nexus format')
        ML_tree = Tree(treeFile)
        try:
            og = Read_Nex(out_group)  # get the outgroup taxa
        except Exception:
            raise SystemExit('Error: Cannot find outgroup taxa')
        if len(og) == 0:
            # Reject an empty outgroup before it reaches the tree library.
            raise SystemExit('Error: Please provide outgroup taxa in Nexus format')
        ML_root = _outgroup_root(ML_tree, og)
        if not ML_root.is_root():
            ML_tree.set_outgroup(ML_root)
        ingroup = [n.name for n in ML_tree.get_leaves() if n.name not in og]
        if not _ingroup_is_monophyletic(ML_tree, ingroup):
            raise SystemExit('Error: ML ingroup taxa are not monophyletic')
        # Pruning/writing errors are no longer reported as "not monophyletic".
        ML_tree.prune(ingroup)  # keep ingroup taxa only
        rootedMLtree = stem + '_rooted.treefile'
        ML_tree.write(outfile=rootedMLtree)
        rooted_newicks = []
        with open(bootFile, 'r') as f:
            for line in f:
                newick = line.strip()
                if not newick:
                    continue  # a trailing blank line is not a tree
                N_boottrees += 1
                t = Tree(newick)
                ingroup = [n.name for n in t.get_leaves() if n.name not in og]
                root = _outgroup_root(t, og)
                if not root.is_root():
                    t.set_outgroup(root)
                if _ingroup_is_monophyletic(t, ingroup):
                    rooted_newicks.append(t.write(format=9))
                else:
                    polyphyly += 1
        # As in the original, every kept tree is pruned with the ingroup of
        # the last bootstrap tree read.
        for newick in rooted_newicks:
            t = Tree(newick)
            t.prune(ingroup)
            boottrees.append(t.write(format=9))
    else:
        # Rooted ML tree and rooted bootstrap trees (e.g. NR model).
        with open(bootFile, 'r') as f:
            for line in f:
                newick = line.strip()
                if not newick:
                    continue
                N_boottrees += 1
                boottrees.append(Tree(newick).write(format=9))

    # groupby only merges *consecutive* identical topologies; that is enough
    # here because the counts of matching splits are summed below.
    booted = [(newick, len(list(grp))) for newick, grp in ite.groupby(boottrees)]
    boot_splits = [(count, [set(side) for side in _root_split(Tree(newick))])
                   for newick, count in booted]

    annotated_source = treeFile if is_rooted else rootedMLtree
    roots = all_possible_roots(annotated_source)
    rootstrap_value = dict.fromkeys(roots.keys(), 0)
    # float() keeps the proportion from truncating to 0 under Python 2
    # integer division.
    total = float(N_boottrees)
    for node, rooted in roots.items():
        y = [set(side) for side in _root_split(Tree(rooted))]
        for count, sides in boot_splits:
            if len(y) != len(sides):
                continue
            z = list(sides)
            for group in y:
                if group in z:
                    z.remove(group)
            if not z:  # every side of the bootstrap root split was matched
                rootstrap_value[node] += count / total

    t = Tree(annotated_source)
    k = 1
    for n in t.traverse():
        if n.is_root():
            continue
        if not n.is_leaf():
            # Internal nodes are named n1, n2, ... in traversal order so they
            # can be looked up in rootstrap_value.
            n.add_features(name='n' + str(k))
            k += 1
        n.add_features(rootstrap=rootstrap_value[n.name] * 100)

    temp = stem + '.temp'
    rootstrapTree = stem + '.rootstrap'
    t.write(outfile=temp, features=["rootstrap"])
    try:
        x = dendropy.Tree.get(path=temp, schema='newick')
        x.write(path=rootstrapTree, schema='nexus')
    finally:
        os.remove(temp)  # do not leave the intermediate file behind on failure
    return polyphyly
leaf.add_features(domain="Eukaryote") eukaryote_seqs.append(leaf.name) target_leaf = leaf else: leaf.add_features(domain="Other") #print eukaryote_seqs #test the various phylogenetic criteria for LGT. #euk sequence is a singleton nested within a clade of bacteria, and there is only one eukaryote sequence in the tree if len(eukaryote_seqs) == 1: #this is, I guess, an LGT candidate print sys.argv[1] + "\tSingleton" #euk sequence is a singleton nested within a clade of bacteria, and the eukaryotes are not monophyletic in the tree #print len(eukaryote_seqs) else: try: answer = tree.check_monophyly(values=eukaryote_seqs, target_attr="name") if answer[0] == True: ca = tree.get_common_ancestor(eukaryote_seqs) print sys.argv[1] + "\tEuks monophyletic\t" + str(len(eukaryote_seqs)) + "\t" + str(ca.support) elif answer[0] == False: mono_groups = [] target_group = '' for node in tree.get_monophyletic(values=['Eukaryote'], target_attr="domain"): if target_leaf in node: target_group = node else: mono_groups.append(node) size_target_group = len(target_group) #get distance shortest_distance = 999999999999999.0 closest_other_group = ''
# Reroot the tree given on the command line on the branch separating archaea
# from bacteria and write it to "<input>_rerooted".
tree = Tree(sys.argv[1])
print(tree)  # single-argument print() works on Python 2 and Python 3

archaea = []  # names of the archaea that are in the tree
bacteria = []  # names of every other taxon

# Check the domain of each taxon in the tree.
for taxon in tree:
    domain = id_to_domain[taxon.name]
    print(taxon.name + "\t" + domain)
    if domain == 'Archaea':
        archaea.append(taxon.name)
    else:
        bacteria.append(taxon.name)

# First, check if archaea are monophyletic in the tree.
if tree.check_monophyly(values=archaea, target_attr="name")[0] == True:
    # Find the branch separating archaea and bacteria, and reroot on that.
    archaea_ancestor = tree.get_common_ancestor(archaea)
    tree.set_outgroup(archaea_ancestor)
elif tree.check_monophyly(values=bacteria, target_attr="name")[0] == True:
    bacteria_ancestor = tree.get_common_ancestor(bacteria)
    tree.set_outgroup(bacteria_ancestor)
else:
    # Neither archaea nor bacteria were monophyletic: report it and stop.
    print(sys.argv[1] + ": neither A nor B monophyletic.")
    # sys.exit() replaces quit(), which is only an interactive helper
    # installed by the site module; the exit status (0) is unchanged.
    sys.exit()

outfile_name = sys.argv[1] + "_rerooted"
tree.write(outfile=outfile_name)
class ClusterIdentification(object):
    """Group variants into clusters from patristic distances and tree topology.

    The class reads its configuration from module-level names (TreeFile,
    Spectrum, PatDistOutput, IntraFile, percentile, supportInput, idStart,
    idLen, outlierFlag, outputPath, TreeShort) rather than from arguments.
    """

    def __init__(self):
        self.PercentileThreshold = {}  # "<id>__<id>" -> distance cutoff (str)
        self.dictSharedReads = {}  # "<idA>__<idB>" -> variant names passing the cutoff
        self.dictClusters = {}  # cluster number -> member sample ids
        self.monoFinalRes = []  # sample pairs found mono- or paraphyletic
        self.count = 0
        self.SerialNodes = {}  # sample id -> other ids listed on the same IntraFile line
        self.t = Tree(TreeFile)
        self.nodesRemoved = []  # variant names flagged as outliers
        self.nodecheck = []  # outliers already deleted from self.t

    ##The Split method identifies the percentile threshold for each sample from the results of PatDistSpectrum.py
    ##This threshold is determined from user input in the command line
    def Split(self, infile):
        """Read the per-sample distance cutoff for the requested percentile.

        NOTE(review): *infile* is ignored; the module-level ``Spectrum`` path
        is what gets opened. Returns ``self.PercentileThreshold``. Exits with
        status 1 if ``percentile`` is not a supported value.
        """
        # Percentile label -> column index in the spectrum table.
        Percentiles = {
            "0": "1",
            "1": "2",
            "5": "3",
            "10": "4",
            "20": "5",
            "25": "6",
            "30": "7",
            "35": "8",
            "40": "9",
            "45": "10",
            "50": "11",
            "75": "12",
            "90": "13",
            "99": "14",
            "100": "15",
        }
        with open(Spectrum, "r") as file1:
            for line in file1:
                if "Samples" in line:
                    continue  # header row
                linerep = line.replace(" ", "")
                if percentile in Percentiles:
                    cutoff = Percentiles[percentile]
                else:
                    sys.stdout.write(
                        "Please specify the percentile as a number (0,1,5,10,20,25,30,35,40,50,75,90,100)"
                    )
                    sys.exit(1)
                linesp = linerep.rstrip("\n").split("\t")
                nodesSp = linesp[0].split("__")
                # Only the within-sample rows (A__A) define a threshold.
                if nodesSp[0] == nodesSp[1]:
                    combNode = nodesSp[0] + "__" + nodesSp[1]
                    self.PercentileThreshold[combNode] = linesp[int(cutoff)]
        return self.PercentileThreshold

    def _recordSharedPair(self, comb, comb2, node, mate):
        """Store *node* and *mate* under the pair key, reusing the reversed
        key (*comb2*) when that orientation already exists."""
        key = comb2 if comb2 in self.dictSharedReads else comb
        members = self.dictSharedReads.setdefault(key, [])
        if node not in members:
            members.append(node)
        if mate not in members:
            members.append(mate)

    def _pairUnresolved(self, ncomb, ncombRev):
        """Evaluate the original test ``not ncomb or not ncombRev in monoFinalRes``.

        NOTE(review): because of operator precedence only the reversed key is
        ever looked up (a non-empty *ncomb* makes ``not ncomb`` False). The
        intent may have been "neither orientation recorded"; kept as written
        because changing it would alter clustering results.
        """
        return not ncomb or ncombRev not in self.monoFinalRes

    # Identifies all variants passing the threshold defined in the Split method
    def variantCollection(self):
        """Fill ``self.dictSharedReads`` with between-sample variant pairs whose
        patristic distance is within the stricter of the two samples' cutoffs
        and whose support passes ``supportInput``."""
        PatDistSpec = self.Split(Spectrum)
        with open(PatDistOutput, "r") as file2:
            for line in file2:
                linesp = line.split(",")
                node = linesp[0]
                mate = linesp[1]
                patdist = linesp[2]
                support = linesp[3].rstrip("\n")
                nodeshort = node[idStart:idLen]
                mateshort = mate[idStart:idLen]
                if nodeshort == mateshort:
                    continue  # within-sample pair
                nodedouble = nodeshort + "__" + nodeshort
                matedouble = mateshort + "__" + mateshort
                comb = nodeshort + "__" + mateshort
                comb2 = mateshort + "__" + nodeshort
                # Store variants that are below the respective pat dist
                # threshold defined in PatDistSpec (the smaller of the two).
                target = min(float(PatDistSpec[nodedouble]), float(PatDistSpec[matedouble]))
                if float(patdist) > target:
                    continue
                if str(supportInput) == "PASS":
                    self._recordSharedPair(comb, comb2, node, mate)
                elif not support == "None":
                    if float(supportInput) <= float(support):
                        self._recordSharedPair(comb, comb2, node, mate)

    ##Identifying potential outliers is optional (-oR flag from the command line )
    # Based on the retrieve common ancestor function, it identifies outliers as those which contain < 3 intra-variants associated with a given sample
    def PhylyOutlierRem(self, n, node1, node2, OutlierFile, idStart, idLen):
        """Return the leaf names under the common ancestor of *n*, minus outliers.

        A sample contributing fewer than 3 leaves under that ancestor has its
        leaves treated as outliers unless ``self.SerialNodes`` links the two
        query samples. Outliers are appended to ``self.nodesRemoved`` and
        deleted from ``self.t``. *OutlierFile* is not used here.
        """
        ancestorList = []
        node1short = node1[idStart:idLen]
        node2short = node2[idStart:idLen]
        nodecomb = node1short + "__" + node2short
        # Fixed: the reversed key used node2short on both sides.
        nodecombRev = node2short + "__" + node1short
        if not self._pairUnresolved(nodecomb, nodecombRev):
            return ancestorList
        if node1 in self.nodesRemoved or node2 in self.nodesRemoved:
            return ancestorList

        # Collect all leaves under the common ancestor and count them per sample.
        counts = {}
        for leaf in self.t.get_common_ancestor(n):
            ancestorList.append(leaf.name)
            short = leaf.name[idStart:idLen]
            counts[short] = counts.get(short, 0) + 1

        # If a sample has < 3 variants here, store its variants as outliers.
        for short, total in counts.items():
            if total >= 3:
                continue
            # Iterate over a snapshot: removing from the list being iterated
            # made the original skip the element after each removal.
            for item in list(ancestorList):
                if item[idStart:idLen] != short or item in self.nodesRemoved:
                    continue
                if node1short in self.SerialNodes:
                    is_outlier = node2short not in self.SerialNodes[node1short]
                elif node2short in self.SerialNodes:
                    is_outlier = node1short not in self.SerialNodes[node2short]
                else:
                    is_outlier = True
                if is_outlier:
                    ancestorList.remove(item)
                    self.nodesRemoved.append(item)

        # Delete each outlier from the tree once. The original looked up the
        # wrong name and called .delete() on a string inside a bare except,
        # so nothing was ever deleted.
        for name in self.nodesRemoved:
            if name in self.nodecheck:
                continue
            matches = self.t.search_nodes(name=name)
            if matches:
                matches[0].delete()
                self.nodecheck.append(name)
        return ancestorList

    # Create all combinations of intrahost sample identifiers for each respective sequential sample set
    # These results are used to assist in PhylyOutlierRem
    def intraComb(self, infile):
        """Map every id on a line of the intra-host file to the other ids on
        that line and return ``self.SerialNodes``.

        NOTE(review): *infile* is ignored; the module-level ``IntraFile`` is
        opened instead.
        """
        with open(IntraFile) as f:
            for line in f:
                linesp = line.rstrip("\n").split(",")
                for i in linesp:
                    self.SerialNodes[i] = []
                    for pair in itertools.combinations(linesp, 2):
                        for item in pair:
                            if i != item and item not in self.SerialNodes[i]:
                                self.SerialNodes[i].append(item)
        return self.SerialNodes

    # First step of merging overlapping pairs of connected samples
    def ClusterKeys(self, values, node1, node2):
        """Add the pair (*node1*, *node2*) to ``self.dictClusters``.

        *values* is the list of current cluster member lists. Members are
        appended to every cluster already containing one of the two ids (so
        duplicates are expected; they are collapsed in ``ClusterKeys2``); a
        new cluster is opened when neither id is known. Returns None.
        """
        def clustered(name):
            # Recomputed on every call on purpose: the member lists are
            # extended between the tests below.
            return name in [x for v in values for x in v if isinstance(v, list)]

        # NOTE(review): the two helpers reproduce the original expressions
        # `X in flat or X in values` and `not X in flat or X in values`.
        # The second was probably meant as `not (... or ...)`. The visible
        # caller passes a list of lists and string ids, for which `X in values`
        # is False and both readings agree.
        def known(name):
            return clustered(name) or name in values

        def unknown(name):
            return not clustered(name) or name in values

        if known(node1) and unknown(node2):
            for k in list(self.dictClusters):
                if node1 in self.dictClusters[k]:
                    self.dictClusters[k].append(node2)
        if known(node2) and unknown(node1):
            for k in list(self.dictClusters):
                if node2 in self.dictClusters[k]:
                    self.dictClusters[k].append(node1)
        if known(node1) and known(node2):
            for k in list(self.dictClusters):
                if node1 in self.dictClusters[k]:
                    self.dictClusters[k].append(node2)
                    self.dictClusters[k].append(node1)
                if node2 in self.dictClusters[k]:
                    self.dictClusters[k].append(node2)
                    self.dictClusters[k].append(node1)
        if unknown(node1) and unknown(node2):
            self.count += 1
            if self.count not in self.dictClusters:
                self.dictClusters[self.count] = []
            self.dictClusters[self.count].append(node1)
            self.dictClusters[self.count].append(node2.rstrip("\n"))

    # Second step of merging overlapping pairs of connected samples
    def ClusterKeys2(self, dictClusters):
        """Merge clusters that share members and return ``self.dictClusters``
        with every value replaced by the ``str()`` of its de-duplicated member
        list. The placeholder cluster 0 is dropped.

        NOTE(review): the *dictClusters* argument is ignored; the method works
        on ``self.dictClusters``.
        """
        # Built unconditionally: the original only did this on Python 2.7 and
        # failed with KeyError/AttributeError on every other interpreter.
        Clustvals = dict((k, set(val)) for (k, val) in self.dictClusters.items())
        merged = set()
        srt = sorted(self.dictClusters.keys())
        srt2 = srt[:]
        for key in srt:
            # NOTE(review): srt2 is shrunk while it is being iterated, so the
            # key following a merged one is skipped in this pass. Left as is
            # because changing it changes which clusters get merged.
            for k in srt2:
                if not k == key:
                    if Clustvals[k].intersection(self.dictClusters[key]) and key not in merged:
                        merged.add(k)
                        self.dictClusters[key] = list(Clustvals[k].union(self.dictClusters[key]))
                        srt2.remove(k)
        for k in merged:
            del self.dictClusters[k]
        # Cluster 0 is only the empty seed created by variantAnalysis.
        self.dictClusters.pop(0, None)
        for k, v in list(self.dictClusters.items()):
            members = list(set(v))
            v[:] = []
            self.dictClusters[k] = str(members)
        return self.dictClusters

    # Retrieve common ancestors
    def CommonAncestor(self, nodes):
        """Return the names of all leaves under the common ancestor of *nodes*."""
        return [leaf.name for leaf in self.t.get_common_ancestor(nodes)]

    # Identify poly- , para-, and monophyletic pairs of variants
    def CheckMono(self, ncomb, PhyloVarRemoval, Rejects, monoFinal):
        """Classify the variants in *PhyloVarRemoval* and record the pair *ncomb*.

        Mono- and paraphyletic pairs are stored in *monoFinal* and
        ``self.monoFinalRes``; anything else is added to *Rejects*. Returns
        True only for a monophyletic result, otherwise None.
        """
        # Element 1 of the result is the clade type; the original recovered
        # it by splitting the str() of the whole tuple.
        mR = self.t.check_monophyly(
            values=PhyloVarRemoval, ignore_missing=True, target_attr="name", unrooted=True
        )[1]
        is_mono = "monophyletic" in mR
        if is_mono or "paraphyletic" in mR:
            monoFinal.setdefault(ncomb, []).append(mR)
            if ncomb not in self.monoFinalRes:
                self.monoFinalRes.append(ncomb)
            if is_mono:
                return True
        elif ncomb not in Rejects:
            Rejects.append(ncomb)

    ##Analysis identifies all ancestors to variants passing the required percentile thresholds
    # Following the removal of outliers, it parses through every combination of these variants to determine whether the pair is polyphyletic or not
    def variantAnalysis(self):
        """Run the whole analysis and return the final clusters.

        Prints the mono/paraphyletic pairs and the rejected (polyphyletic)
        pairs. When ``outlierFlag == "TRUE"`` the removed outliers are written
        to ``<outputPath><TreeShort>.<percentile>.<supportInput>.Outlier.txt``.
        Exits with status 1 when no pair could be clustered.
        """
        monoFinal = {}
        self.variantCollection()
        OutlierFile = None
        if outlierFlag == "TRUE":
            OutlierFile = open(outputPath + TreeShort + "." + percentile + "." + supportInput + ".Outlier.txt", "w")
            try:
                self.intraComb(IntraFile)
            except Exception:
                # Best effort, as in the original: the intra-host file is optional.
                pass
        Rejects = []
        try:
            for k in list(self.dictSharedReads):
                clusters = self.dictSharedReads[k]
                for pair in itertools.combinations(clusters, 2):
                    n = list(pair)
                    node1 = n[0]
                    node2 = n[1]
                    if node1 in self.nodesRemoved or node2 in self.nodesRemoved:
                        continue
                    # Short ids are sliced out of the tuple's str() exactly as
                    # before (offset 2 skips the leading "('").
                    node1short = str(pair)[(idStart + 2) : (idLen + 2)]
                    pairSp = str(pair).split(",")
                    node2short = pairSp[1].replace(" ", "").replace("'", "")[idStart:idLen]
                    nshort = [node1short, node2short]
                    ncomb = node1short + "__" + node2short
                    ncombRev = node2short + "__" + node1short
                    if node1short == node2short:
                        continue
                    if not self._pairUnresolved(ncomb, ncombRev):
                        continue
                    if outlierFlag == "TRUE":
                        PhyloVarRemoval = self.PhylyOutlierRem(n, node1, node2, OutlierFile, idStart, idLen)
                    else:
                        PhyloVarRemoval = self.CommonAncestor(n)
                    nodeList = []
                    for i in PhyloVarRemoval:
                        node = i[idStart:idLen]
                        if node not in nodeList:
                            nodeList.append(node)
                    # The pair itself may just have been flagged as an outlier.
                    if node1 in self.nodesRemoved or node2 in self.nodesRemoved:
                        continue
                    if len(nodeList) == 2:
                        if self._pairUnresolved(ncomb, ncombRev):
                            if self.CheckMono(ncomb, PhyloVarRemoval, Rejects, monoFinal):
                                break
                    elif len(nodeList) > 2:
                        # Samples under the ancestor that are neither part of
                        # the pair nor known serial samples.
                        nodeRemoval = [
                            i for i in set(nodeList)
                            if i not in nshort and i not in self.SerialNodes
                        ]
                        if not nodeRemoval:
                            if len(PhyloVarRemoval) > 1:
                                if self._pairUnresolved(ncomb, ncombRev):
                                    if self.CheckMono(ncomb, PhyloVarRemoval, Rejects, monoFinal):
                                        break
                        else:
                            for item in nodeRemoval:
                                nodeRemovalShort = item[idStart:idLen]
                                linked = (
                                    nodeRemovalShort + "__" + node1short in self.dictSharedReads
                                    or node1short + "__" + nodeRemovalShort in self.dictSharedReads
                                    or node2short + "__" + nodeRemovalShort in self.dictSharedReads
                                    or nodeRemovalShort + "__" + node2short in self.dictSharedReads
                                )
                                if not linked:
                                    # Rebuilt in place instead of removing
                                    # while iterating, which skipped entries.
                                    PhyloVarRemoval[:] = [
                                        i for i in PhyloVarRemoval
                                        if i[idStart:idLen] not in nodeRemoval
                                    ]
                            if len(PhyloVarRemoval) > 1:
                                if not ncomb in self.monoFinalRes:
                                    if self.CheckMono(ncomb, PhyloVarRemoval, Rejects, monoFinal):
                                        break
            if OutlierFile is not None:
                for i in set(self.nodesRemoved):
                    OutlierFile.write("%s\n" % i)
        finally:
            if OutlierFile is not None:
                OutlierFile.close()  # the original never closed this file

        self.dictClusters = {}
        self.count = 0
        clustered = False
        for i in self.monoFinalRes:
            if not "polyphyletic" in i:
                if not self.count in self.dictClusters:
                    self.dictClusters[self.count] = []
                values = list(self.dictClusters.values())
                isp = i.split("__")
                node1 = isp[0]
                node2 = isp[1].split("\t")[0]
                self.ClusterKeys(values, node1, node2)
                clustered = True
        if not clustered:
            # The original reached this message through a NameError caught by
            # a bare except; the condition is now tested explicitly.
            print("WARNING: Patristic Distance Data files may be empty...")
            sys.exit(1)
        FinalClustering = self.ClusterKeys2(self.dictClusters)
        for k, v in monoFinal.items():
            print(k + "\t" + str(v))
        print("Clusters that are polyphyletic: " + str(Rejects))
        return FinalClustering
# Load the ML tree and tag every leaf with its favoured taxonomic group, then
# report, per group, whether its members are monophyletic (clade counts and
# sizes are meant to be reported at the end).
# Might need to alter taxonomy assignment so that we check for the presence of
# the believed groups at all levels of the taxonomy.
ml_tree = Tree(sys.argv[1])
for leaf in ml_tree:
    taxonomy = check_for_favourite_taxonomy(leaf.name)
    taxa_names.append(leaf.name)
    # This needs to label with the favoured group, or else "none" or something. TODO.
    leaf.add_feature("tax", taxonomy)
    if taxonomy != "none":
        labels[taxonomy] = 1

# Need to add something above to get a list of the believed labels which are
# actually found in the tree. For the moment, we'll use groups (=labels.keys()).
groups = labels.keys()

# For each of our favourite believed groups, ask whether all sequences from
# that group are monophyletic.
total_believed_groups = len(groups)
mono_believed_groups = 0
for label in groups:
    val = ml_tree.check_monophyly(values=[label], target_attr="tax", unrooted=True)
    print("\t".join([label, str(val[0]), str(val[1])]))
    if val[0] == True:
        mono_believed_groups += 1
        continue
    # Not monophyletic: draw every node returned as the third element.
    for ele in val[2]:
        print(ele.get_ascii())

# Score = fraction of believed groups that came out monophyletic.
score = float(mono_believed_groups) / float(total_believed_groups)
print("".join([sys.argv[1], " score: ", str(score)]))
def CheckMonophyly(self,PDlist):
    """Collect groups of names that are not polyphyletic in the tree.

    Each entry of PDlist is stringified and split on commas into a cluster of
    names. A cluster that is mono- or paraphyletic as a whole contributes all
    of its name pairs to self.monoPairs; otherwise its pairs, and then merged
    groups of pairs, are tested individually. Returns self.monoPairs.
    """
    # The tree is re-read from disk on every call.
    t = Tree(filePath+TreeFile)
    monoShort=[]
    x=0
    for item in PDlist:
        cluster=[]
        pairL=[]
        flag = 0
        y=0
        # Turn the entry's str() into a flat list of unique names by
        # stripping brackets, quotes and spaces.
        clusterRaw=str(item).replace("[","").replace("]","").replace('"',"").replace(" ","").replace("'",'').split(',')
        for i in clusterRaw:
            if not i in cluster:
                cluster.append(i)
        # The whole result tuple is stringified and searched for the clade type.
        monoResult = str(t.check_monophyly(values=cluster, ignore_missing=True,target_attr="name",unrooted=True))
        #Identify poly- , para-, and monophyletic relationships for clusters
        if 'monophyletic' in monoResult:
            for pair in itertools.combinations(cluster,2):
                m = list(pair)
                if not m in self.monoPairs:
                    self.monoPairs.append(m)
        elif 'paraphyletic' in monoResult:
            for pair in itertools.combinations(cluster,2):
                m = list(pair)
                if not m in self.monoPairs:
                    self.monoPairs.append(m)
        else:
            cluster2 = []
            # Polyphyletic as a whole: test every pair on its own. A passing
            # pair replaces its two names in cluster with the pair (a list).
            for pair in itertools.combinations(cluster,2):
                n = list(pair)
                monoResult = str(t.check_monophyly(values=n, ignore_missing=True,target_attr="name",unrooted=True))
                if not 'polyphyletic' in monoResult:
                    if not n in cluster:
                        cluster.append(n)
                    for i in n:
                        if i in cluster:
                            cluster.remove(i)
                    if not n in self.monoPairs:
                        self.monoPairs.append(n)
            #Breaksdown large clusters to identify poly- , para-, and monophyletic sub-clusters
            # Two passes over cluster, which now mixes plain names and lists.
            while y < 2:
                y+=1
                for pair in itertools.combinations(cluster,2):
                    pairL = list(pair)
                    cluster2 = []
                    # Flatten the two entries (each a name or a list of
                    # names) into cluster2.
                    if type(pairL[0]) is list:
                        if type(pairL[1]) is list:
                            cluster2 = pairL[0]+pairL[1]
                        else:
                            for i in pairL[0]:
                                if not i in cluster2:
                                    cluster2.append(i)
                            if not pairL[1] in cluster2:
                                cluster2.append(pairL[1])
                    elif type(pairL[1]) is list:
                        for i in pairL[1]:
                            if not i in cluster2:
                                cluster2.append(i)
                        if not type(pairL[0]) is list:
                            if not pairL[0] in cluster2:
                                cluster2.append(pairL[0])
                    else:
                        if not pairL[0] in cluster2:
                            cluster2.append(pairL[0])
                        if not pairL[1] in cluster2:
                            cluster2.append(pairL[1])
                    monoResult = str(t.check_monophyly(values=cluster2, ignore_missing=True,target_attr="name",unrooted=True))
                    if not 'polyphyletic' in monoResult:
                        x+=1
                        if not cluster2 in cluster:
                            cluster.append(cluster2)
                        # NOTE(review): this rebinds the outer loop variable
                        # `item`, and the inner loop walks the characters of
                        # each name rather than the names themselves --
                        # presumably `cluster.remove(<name>)` was intended;
                        # confirm before changing.
                        for item in cluster2:
                            for i in item:
                                if i in cluster:
                                    cluster.remove(i)
                        if not cluster2 in self.monoPairs:
                            self.monoPairs.append(cluster2)
    return self.monoPairs
node.detach() nb_ti = 0 nb_tj = 0 nb_common_clade = 0 for node in ti.traverse("postorder"): if(not node.is_leaf()): nb_ti += 1 leaves = [] for leaf in node: name = leaf.name leaves.append(name.lower()) if len(leaves) != 0: if(tj.check_monophyly(values=leaves, target_attr="name", ignore_missing=True)[0]): nb_common_clade +=1 for node in tj.traverse("postorder"): if( not node.is_leaf()): nb_tj += 1 percent_common_clade = 100.0*nb_common_clade/min(nb_ti,nb_tj) print(percent_common_clade) local_per.append(int(round(percent_common_clade,0))) all_percentage.append(local_per) percentages_per_method = [[]]*10 for i in range(len(percentages_per_method)): l = [] for j in range(len(all_percentage)): l.append(all_percentage[j][i])
# Read the taxa of the target clade, one name per line.
target_clade = [line.strip("\n") for line in clade]
clade.close()

# Prune taxa that are not in each gene from target clade: keep only the
# target taxa that are leaves of this tree, in tree order.
leaves = [leaf.name for leaf in tree]
pruned_target_clade = [name for name in leaves if name in target_clade]

# Check the monophyly and report the intruding nodes when it fails.
mono = tree.check_monophyly(values=pruned_target_clade, target_attr="name")
if True not in mono:
    print(args.clade + " is NOT monophyletic in " + args.tree)
    print("Intruders into " + args.clade + " are:")
    for leaf in mono[2]:
        print(leaf)
else:
    print(args.clade + " is monophyletic in " + args.tree)
def analyze_tree(tree, outfile, anno, monoout):
    """
    Root a gene tree, check group monophyly and render the tree to a file.

    :param tree: tree file name; also used to derive the gene id for the title
    :param outfile: path handed to ``render`` for the drawing
    :param anno: tab-separated file with gene and annotation columns
    :param monoout: passed through to ``check_monophyly``
    :return: None; the drawing is written to *outfile*
    """
    # taxon -> group table (read from the module-level group_mapping path)
    # and its inverse, group -> [taxa].
    mapping_table = pd.read_csv(group_mapping, sep="\t", header=None, names=['taxon', 'group'])
    taxon_mapping = dict(zip(mapping_table['taxon'], mapping_table['group']))
    group_dict = dict((grp, []) for grp in list(set(mapping_table['group'])))
    for taxon in taxon_mapping:
        grp = taxon_mapping[taxon]
        if grp in group_dict:
            group_dict[grp].append(taxon)

    # Title: "<gene> (<annotation>)" when the gene id derived from the file
    # name is annotated, otherwise a generic label.
    annotations = pd.read_csv(anno, sep="\t", header=None, names=['gene', 'anno'])
    anno_dict = dict(zip(annotations['gene'], annotations['anno']))
    title = "unknown gene"
    gene = None
    if tree.startswith("chloNOG"):
        gene = '.'.join(tree.split(".")[:3])
    elif tree.startswith("OC_"):
        gene = tree.split(".")[0]
    if gene is not None and gene in anno_dict:
        title = gene + " (" + anno_dict[gene] + ")"

    t = Tree(tree)
    outgroup_to_use = determine_outgroup(t, group_dict)
    print(title)

    # outgroup_to_use is indexed as (label, member names, number of members).
    n_members = outgroup_to_use[2]
    if n_members == 0:
        print("None of the outgroups chosen are found in the tree. Rooted with mid-point rooting.")
        t.set_outgroup(t.get_midpoint_outgroup())
    elif n_members == 1:
        t.set_outgroup(outgroup_to_use[1][0])
    elif n_members >= 2:
        ## do mid-point rooting first to get around the problem of some clades
        ## that can't be re-rooted right away
        t.set_outgroup(t.get_midpoint_outgroup())
        verdict = t.check_monophyly(values=outgroup_to_use[1], target_attr="name")
        if verdict[0] == True:
            print("%s %s" % (outgroup_to_use[0], "members are monophyletic"))
            ## now, re-root with actual outgroup
            t.set_outgroup(t.get_common_ancestor(outgroup_to_use[1]))
        else:
            ## keep the mid-point rooting done above if members are not mono
            print("%s %s" % (
                outgroup_to_use[0],
                "members are NOT monophyletic. Can't use as an outgroup. Rooted with mid-point rooting",
            ))

    check_monophyly(t, group_dict, monoout)
    t.ladderize(direction=1)

    # Every node gets the same size-0 square style.
    ns = NodeStyle()
    ns['shape'] = "square"
    ns['size'] = 0
    for n in t.traverse():
        n.set_style(ns)

    # Drawing options: support values shown, leaf names left to tree_layout.
    ts = TreeStyle()
    ts.show_branch_support = True
    ts.extra_branch_line_color = "DarkGrey"
    ts.show_leaf_name = False
    ts.layout_fn = tree_layout
    ts.title.add_face(TextFace(title, fsize=8), column=0)

    t.render(outfile, w=1500, units="px", tree_style=ts)
    print(" ")