def main(argv):
    """Report whether a set of leaves is monophyletic in a rooted tree.

    Parses ``-t/--tree`` (tree file), ``-l/--leaves`` (comma-separated leaf
    names) and ``-r/--root`` (outgroup name) from *argv*, roots the tree on
    the outgroup and prints the result of ``Tree.check_monophyly`` for the
    requested leaves.

    Exits with status 2 on an unrecognised option or when any of the three
    required options is missing; ``-h`` prints the usage line and exits.
    """
    tree = None
    leaves = None
    root = None
    try:
        opts, args = getopt.getopt(argv, "ht:l:r:",
                                   ["tree=", "leaves=", "root="])
    except getopt.GetoptError:
        print('test.py -t <treefile> -l <leave,leave,etc.> -r <root>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('test.py -t <tree> -l <leave,leave,etc.> -r <root>')
            sys.exit()
        elif opt in ("-t", "--tree"):
            tree = arg
        elif opt in ("-l", "--leaves"):
            leaves = arg
        elif opt in ("-r", "--root"):
            root = arg

    # All three options are required; without this check a missing option
    # surfaced later as a NameError on an unbound local.
    if tree is None or leaves is None or root is None:
        print('test.py -t <treefile> -l <leave,leave,etc.> -r <root>')
        sys.exit(2)

    leaf_names = leaves.split(',')
    t = Tree(tree)
    t.set_outgroup(root)
    print(t.check_monophyly(values=leaf_names, target_attr="name"))
# Map tip IDs (column 1 of the tab-separated metadata) to their domain
# (column 22).
tip_to_domain = {}
with open("genomemetadata.tsv") as inh:
    for line in inh:
        if not line.strip():
            # A blank line has no columns to index.
            continue
        fields = line.rstrip().split("\t")
        tip_to_domain[fields[0]] = fields[21]

num_arch = 0
num_bact = 0

# Count the tips per domain and annotate every tip with a "domain" feature so
# that monophyly can be tested on that attribute below.
tree = Tree(target_tree)
for tip in tree:
    the_domain = tip_to_domain[tip.name]
    if the_domain == "Archaea":
        num_arch += 1
    elif the_domain == "Bacteria":
        num_bact += 1
    else:
        print("Problem with " + str(tip.name) + "'s domain assignment.")
    tip.add_feature("domain", the_domain)

print(sys.argv[1] + "\tNum_arch\t" + str(num_arch))
print(sys.argv[1] + "\tNum_bact\t" + str(num_bact))

# Now check domain monophyly; only the first two elements of each result
# (the boolean and the clade type) are reported.
arch = tree.check_monophyly(values=["Archaea"], target_attr="domain")[0:2]
bact = tree.check_monophyly(values=["Bacteria"], target_attr="domain")[0:2]
print(sys.argv[1] + "\t" + str(arch[0]) + "\t" + str(arch[1]) + "\t" + str(bact[0]) + "\t" + str(bact[1]))
	
    # NOTE(review): fragment - the enclosing loop that binds `i`, `ti`, `line`,
    # `files`, `stat` and `start` is not visible here; presumably this builds
    # row `i` of a pairwise tree-comparison table - confirm against the caller.
    # Emit one empty cell for every column before the diagonal.
    for j in range(i):
        line += "\t"
    # Diagonal cell is written as the literal 100.
    line += "\t" + str(100)
    # Compare tree i against every later tree j.
    for j in range(i + 1, len(files)):
        filej = files[j]
        tj = Tree(filej)
        nb_ti = 0
        nb_tj = 0
        nb_common_clade = 0
        # For each internal node of ti, test whether its leaf set is
        # monophyletic in tj; if so it counts as a common clade.
        for node in ti.traverse("postorder"):
            if (not node.is_leaf()):
                nb_ti += 1
                leaves = []
                for leaf in node:
                    leaves.append(leaf.name)
                if (tj.check_monophyly(values=leaves, target_attr="name")[0]):
                    nb_common_clade += 1
        # Count the internal nodes of tj for the normalisation below.
        for node in tj.traverse("postorder"):
            if (not node.is_leaf()):
                nb_tj += 1
        # Percentage relative to the tree with fewer internal nodes.
        # NOTE(review): raises ZeroDivisionError if either tree has no
        # internal node.
        percent_common_clade = 100.0 * nb_common_clade / min(nb_ti, nb_tj)
        line += "\t" + str(round(percent_common_clade, 2))
    stat.write(line + "\n")
stat.close()
print("Statistics on common clades generated in " + str(time.time() - start) +
      " seconds")
print(common_clades_stat_file)

###############################################################################

# Generate statistics on branch supports
# Example #4
# 0
def _outgroup_node(tree, og):
    '''Return the node of *tree* to root on: the single outgroup leaf, or the
    common ancestor of all outgroup taxa when there are several.'''
    if len(og) == 1:
        return tree.search_nodes(name=og[0])[0]
    return tree.get_common_ancestor(og)


def _root_child_groups(tree):
    '''Return one set of node names per child of the root of *tree*.

    A leaf child contributes its own name; an internal child contributes the
    names of all of its descendants (internal node names included).
    '''
    groups = []
    for child in tree.children:
        if child.is_leaf():
            groups.append(set([child.name]))
        else:
            groups.append(set(node.name for node in child.get_descendants()))
    return groups


def caluclate_rootstrap(treeFile, bootFile, is_rooted, out_group):
    '''
    Compute rootstrap support for every branch of a tree and write it to
    "<treeFile without extension>.rootstrap" in Nexus format.

    Parameters
    ----------
    treeFile: tree in newick format (.treefile in IQ-TREE)
    bootFile: bootstrap trees in newick format, one per line (e.g. .ufboot file in IQ-TREE)
    is_rooted: True if the ML tree and the bootstrap trees are already rooted.
        If False, the trees are rooted on the outgroup, the outgroup is pruned,
        and the rooted ML tree is written to "<treeFile without extension>_rooted.treefile"
    out_group: a file with outgroup taxa in Nexus format (required when is_rooted is False)

    Returns
    -------
    polyphyly: number of bootstrap trees skipped because their ingroup was not
        monophyletic after rooting on the outgroup (always 0 when is_rooted is True)

    Raises
    ------
    SystemExit: if the outgroup is missing or unreadable, or the ML ingroup is
        not monophyletic
    '''
    boottrees = []
    polyphyly = 0
    N_boottrees = 0
    if not is_rooted:
        if out_group is None:
            raise SystemExit('Error: Please provide outgroup taxa in Nexus format')
        ML_tree = Tree(treeFile)
        try:
            og = Read_Nex(out_group)  # get the outgroup taxa
        except Exception:
            raise SystemExit('Error: Cannot find outgroup taxa')
        if len(og) == 0:
            raise SystemExit('Error: Please provide outgroup taxa in Nexus format')

        ML_root = _outgroup_node(ML_tree, og)
        if not ML_root.is_root():
            ML_tree.set_outgroup(ML_root)
        ingroup = [n.name for n in ML_tree.get_leaves() if n.name not in og]
        # Only the monophyly test is guarded, so that a failure while pruning
        # or writing is not misreported as a non-monophyletic ingroup.
        try:
            ml_monophyletic = ML_tree.check_monophyly(
                values=ingroup, target_attr="name", ignore_missing=True)[0]
        except Exception:
            ml_monophyletic = False
        if not ml_monophyletic:
            raise SystemExit('Error: ML ingroup taxa are not monophyletic')
        ML_tree.prune(ingroup)  # keep ingroup taxa only
        rootedMLtree = os.path.splitext(treeFile)[0] + '_rooted.treefile'
        ML_tree.write(outfile=rootedMLtree)

        rooted_newicks = []
        with open(bootFile, 'r') as f:
            for newick in f:
                if not newick.strip():
                    continue  # tolerate blank lines in the bootstrap file
                N_boottrees += 1
                t = Tree(newick)
                ingroup = [n.name for n in t.get_leaves() if n.name not in og]
                root = _outgroup_node(t, og)
                if not root.is_root():
                    t.set_outgroup(root)
                try:
                    boot_monophyletic = t.check_monophyly(
                        values=ingroup, target_attr="name", ignore_missing=True)[0]
                except Exception:
                    boot_monophyletic = False
                if boot_monophyletic:
                    rooted_newicks.append(t.write(format=9))
                else:
                    polyphyly += 1

        # NOTE: as before, every tree is pruned to the ingroup of the last
        # bootstrap tree read (all bootstrap trees are expected to share taxa).
        for newick in rooted_newicks:
            t = Tree(newick)
            t.prune(ingroup)
            boottrees.append(t.write(format=9))
    else:  # rooted ML tree and rooted bootstrap trees (e.g. NR model)
        with open(bootFile, 'r') as f:
            for newick in f:
                if not newick.strip():
                    continue
                N_boottrees += 1
                boottrees.append(Tree(newick).write(format=9))

    # Unique bootstrap topologies with their number of occurrences. groupby
    # only merges adjacent duplicates, so the list must be sorted first.
    boot_splits = []
    for newick, group in ite.groupby(sorted(boottrees)):
        boot_splits.append((len(list(group)), _root_child_groups(Tree(newick))))

    rooted_tree_path = treeFile if is_rooted else rootedMLtree
    roots = all_possible_roots(rooted_tree_path)

    rootstrap_value = dict.fromkeys(roots.keys(), 0)
    for node, rooted in roots.items():
        candidate = _root_child_groups(Tree(rooted))
        for count, boot_groups in boot_splits:
            remaining = list(boot_groups)
            if len(candidate) == len(remaining):
                for group in candidate:
                    if group in remaining:
                        remaining.remove(group)
            # The bootstrap tree supports this root when every one of its
            # root-level groups was matched by the candidate rooting.
            if not remaining:
                # float() keeps this a true division under Python 2 as well.
                rootstrap_value[node] += count / float(N_boottrees)

    t = Tree(rooted_tree_path)
    k = 1
    for n in t.traverse():
        if n.is_root():
            continue
        if not n.is_leaf():
            # Internal nodes are named n1, n2, ... in traversal order before
            # their support is looked up under that name.
            n.add_features(name='n' + str(k))
            k += 1
        n.add_features(rootstrap=rootstrap_value[n.name] * 100)

    base = os.path.splitext(treeFile)[0]
    temp = base + '.temp'
    rootstrapTree = base + '.rootstrap'
    t.write(outfile=temp, features=["rootstrap"])
    try:
        x = dendropy.Tree.get(path=temp, schema='newick')
        x.write(path=rootstrapTree, schema='nexus')
    finally:
        os.remove(temp)  # do not leave the intermediate file behind on failure
    return polyphyly
# Example #5
# 0
        # NOTE(review): fragment - the loop over leaves and the condition that
        # selects this branch are not visible here.
        # Tag the leaf as eukaryotic and remember it; the last such leaf is
        # kept as `target_leaf`.
        leaf.add_features(domain="Eukaryote")
        eukaryote_seqs.append(leaf.name)
        target_leaf = leaf
    else:
        leaf.add_features(domain="Other")
#print eukaryote_seqs
#test the various phylogenetic criteria for LGT.

#euk sequence is a singleton nested within a clade of bacteria, and there is only one eukaryote sequence in the tree
if len(eukaryote_seqs) == 1: #this is, I guess, an LGT candidate
    print sys.argv[1] + "\tSingleton"
#euk sequence is a singleton nested within a clade of bacteria, and the eukaryotes are not monophyletic in the tree
#print len(eukaryote_seqs)
else:
    try:
        answer = tree.check_monophyly(values=eukaryote_seqs, target_attr="name")
        if answer[0] == True:
            # All eukaryote sequences form one clade: report its size and the
            # support value of their common ancestor.
            ca = tree.get_common_ancestor(eukaryote_seqs)
            print sys.argv[1] + "\tEuks monophyletic\t" + str(len(eukaryote_seqs)) + "\t" + str(ca.support) 
        elif answer[0] == False:
            # Split the eukaryotes into their monophyletic groups; the group
            # containing target_leaf is kept apart from the others.
            mono_groups = []
            target_group = ''
            for node in tree.get_monophyletic(values=['Eukaryote'], target_attr="domain"):
                if target_leaf in node:
                    target_group = node
                else:
                    mono_groups.append(node)
            size_target_group = len(target_group)
            #get distance
            # Sentinel values, presumably replaced by a search over mono_groups
            # in code that is not visible here.
            shortest_distance = 999999999999999.0
            closest_other_group = ''
# Example #6
# 0
# Reroot the tree given on the command line on the branch separating archaea
# from bacteria, and write it to "<input>_rerooted".
tree = Tree(sys.argv[1])
print(tree)

archaea = []  # archaea that are in the tree
bacteria = []  # every tip whose domain is not 'Archaea'
# Check the domain of each taxon in the tree.
for taxon in tree:
    taxon_domain = id_to_domain[taxon.name]
    print(taxon.name + "\t" + taxon_domain)
    if taxon_domain == 'Archaea':
        archaea.append(taxon.name)
    else:
        bacteria.append(taxon.name)

# First, check if archaea are monophyletic in the tree.
if tree.check_monophyly(values=archaea, target_attr="name")[0]:
    # Find the branch separating archaea and bacteria, and reroot on that.
    archaea_ancestor = tree.get_common_ancestor(archaea)
    tree.set_outgroup(archaea_ancestor)
elif tree.check_monophyly(values=bacteria, target_attr="name")[0]:
    bacteria_ancestor = tree.get_common_ancestor(bacteria)
    tree.set_outgroup(bacteria_ancestor)
else:
    # Neither archaea nor bacteria were monophyletic: report it and stop
    # without writing an output file.
    print(sys.argv[1] + ": neither A nor B monophyletic.")
    # sys.exit() instead of quit(): quit() is installed by the site module for
    # interactive use. The exit status is unchanged.
    sys.exit()

outfile_name = sys.argv[1] + "_rerooted"
tree.write(outfile=outfile_name)
class ClusterIdentification(object):
    """Group samples into clusters from patristic distances and tree topology.

    The class reads its configuration from module-level names defined
    elsewhere in this file: ``TreeFile``, ``Spectrum``, ``PatDistOutput``,
    ``percentile``, ``supportInput``, ``idStart``, ``idLen``, ``outlierFlag``,
    ``outputPath``, ``TreeShort`` and ``IntraFile``.

    A "short" id is ``name[idStart:idLen]`` of a variant name; pairs of short
    ids are joined with ``"__"``.
    """

    # Column of a PatDistSpectrum row that holds each supported percentile.
    _PERCENTILE_COLUMNS = {
        "0": 1,
        "1": 2,
        "5": 3,
        "10": 4,
        "20": 5,
        "25": 6,
        "30": 7,
        "35": 8,
        "40": 9,
        "45": 10,
        "50": 11,
        "75": 12,
        "90": 13,
        "99": 14,
        "100": 15,
    }

    def __init__(self):
        self.PercentileThreshold = {}  # "S__S" -> threshold string for sample S
        self.dictSharedReads = {}  # "A__B" -> variant names linking A and B
        self.dictClusters = {}  # cluster number -> member short ids
        self.monoFinalRes = []  # "A__B" pairs found mono- or paraphyletic
        self.count = 0  # last cluster number handed out by ClusterKeys
        self.SerialNodes = {}  # sample id -> other ids on its intra-host line
        self.t = Tree(TreeFile)
        self.nodesRemoved = []  # variant names flagged as outliers
        self.nodecheck = []  # outliers already deleted from the tree

    ##The Split method identifies the percentile threshold for each sample from the results of PatDistSpectrum.py
    ##This threshold is determined from user input in the command line
    def Split(self, infile):
        """Read the per-sample distance threshold for the chosen percentile.

        *infile* is the tab-separated spectrum table. Rows containing
        "Samples" are skipped; for rows whose first column is ``S__S`` the
        value in the column of the global ``percentile`` is stored. Exits with
        status 1 when ``percentile`` is not a supported value.

        Returns ``self.PercentileThreshold``.
        """
        if percentile not in self._PERCENTILE_COLUMNS:
            choices = ",".join(sorted(self._PERCENTILE_COLUMNS, key=int))
            sys.stdout.write("Please specify the percentile as a number (" + choices + ")")
            sys.exit(1)
        cutoff = self._PERCENTILE_COLUMNS[percentile]

        with open(infile, "r") as file1:
            for line in file1:
                if "Samples" in line:
                    continue
                linesp = line.replace(" ", "").rstrip("\n").split("\t")
                nodesSp = linesp[0].split("__")
                if len(nodesSp) < 2:
                    continue  # blank or malformed row
                if nodesSp[0] == nodesSp[1]:
                    combNode = nodesSp[0] + "__" + nodesSp[1]
                    self.PercentileThreshold[combNode] = linesp[cutoff]

        return self.PercentileThreshold

    def _record_shared(self, comb, comb2, node, mate):
        """Store *node* and *mate* under the pair key already in use (either
        orientation), creating the ``comb`` entry when neither exists."""
        key = comb2 if comb2 in self.dictSharedReads else comb
        members = self.dictSharedReads.setdefault(key, [])
        for name in (node, mate):
            if name not in members:
                members.append(name)

    # Identifies all variants passing the threshold defined in the Split method
    def variantCollection(self):
        """Fill ``self.dictSharedReads`` from the patristic distance file.

        Each comma-separated row holds ``node, mate, distance, support``. A
        row is kept when the two variants come from different samples, the
        distance does not exceed the smaller of the two samples' thresholds,
        and either ``supportInput`` is "PASS" or the row's support is at least
        ``supportInput``.
        """
        PatDistSpec = self.Split(Spectrum)

        with open(PatDistOutput, "r") as file2:
            for line in file2:
                linesp = line.split(",")
                node = linesp[0]
                mate = linesp[1]
                patdist = linesp[2]
                support = linesp[3].rstrip("\n")
                nodeshort = node[idStart:idLen]
                mateshort = mate[idStart:idLen]
                if nodeshort == mateshort:
                    continue

                node_limit = float(PatDistSpec[nodeshort + "__" + nodeshort])
                mate_limit = float(PatDistSpec[mateshort + "__" + mateshort])
                target = node_limit if node_limit <= mate_limit else mate_limit
                if float(patdist) > target:
                    continue

                if str(supportInput) == "PASS":
                    accepted = True
                elif support != "None":
                    accepted = float(supportInput) <= float(support)
                else:
                    accepted = False
                if accepted:
                    self._record_shared(
                        nodeshort + "__" + mateshort, mateshort + "__" + nodeshort, node, mate
                    )

    ##Identifying potential outliers is optional (-oR flag from the command line )
    # Based on the retrieve common ancestor function, it identifies outliers as those which contain < 3 intra-variants associated with a given sample
    def PhylyOutlierRem(self, n, node1, node2, OutlierFile, idStart, idLen):
        """Return the leaves under the common ancestor of *n*, minus outliers.

        A leaf is an outlier when its sample contributes fewer than three
        leaves to the clade, unless the two samples of the pair appear on the
        same line of the intra-host file (``self.SerialNodes``). Outliers are
        appended to ``self.nodesRemoved`` and deleted from ``self.t``.

        *OutlierFile* is accepted for interface compatibility and not used.
        """
        ancestorList = []
        node1short = node1[idStart:idLen]
        node2short = node2[idStart:idLen]
        nodecombRev = node2short + "__" + node1short

        # NOTE(review): the original test `not nodecomb or not nodecombRev in
        # ...` only ever examined the reversed pair; that effective behaviour
        # is kept. Confirm whether both orientations were meant to be checked.
        if nodecombRev in self.monoFinalRes:
            return ancestorList

        if node1 not in self.nodesRemoved and node2 not in self.nodesRemoved:
            # Collect all leaves below the common ancestor and count them per sample.
            per_sample = {}
            for leaf in self.t.get_common_ancestor(n):
                ancestorList.append(leaf.name)
                short = leaf.name[idStart:idLen]
                per_sample[short] = per_sample.get(short, 0) + 1

            # Nothing is removed when the pair belongs to one serial sample set.
            if node1short in self.SerialNodes:
                removable = node2short not in self.SerialNodes[node1short]
            elif node2short in self.SerialNodes:
                removable = node1short not in self.SerialNodes[node2short]
            else:
                removable = True

            if removable:
                sparse = set(s for s, total in per_sample.items() if total < 3)
                # Build a new list instead of removing while iterating, which
                # skipped the element after every removal.
                kept = []
                for item in ancestorList:
                    if item[idStart:idLen] in sparse and item not in self.nodesRemoved:
                        self.nodesRemoved.append(item)
                    else:
                        kept.append(item)
                ancestorList = kept

        # Delete newly flagged outliers from the tree. The original called
        # .delete() on the name string inside a bare except, so nothing was
        # ever deleted.
        for name in self.nodesRemoved:
            if name in self.nodecheck:
                continue
            matches = self.t.search_nodes(name=name)
            if matches:
                matches[0].delete()
                self.nodecheck.append(name)

        return ancestorList

    # Create all combinations of intrahost sample identifiers for each respective sequential sample set
    # These results are used to assist in PhylyOutlierRem
    def intraComb(self, infile):
        """Map every id in *infile* to the other ids on its line.

        *infile* holds one comma-separated set of sample ids per line.
        Returns ``self.SerialNodes``.
        """
        with open(infile) as f:
            for line in f:
                linesp = line.rstrip("\n").split(",")
                unique = []
                for name in linesp:
                    if name not in unique:
                        unique.append(name)
                for name in linesp:
                    self.SerialNodes[name] = [other for other in unique if other != name]

        return self.SerialNodes

    # First step of merging overlapping pairs of connected samples
    def ClusterKeys(self, values, node1, node2):
        """Add the linked pair (*node1*, *node2*) to ``self.dictClusters``.

        *values* is the list of current cluster member lists. When neither
        node is in any of them a new cluster is opened under the next
        ``self.count``; otherwise every cluster containing one of the nodes
        receives the other.
        """
        node2 = node2.rstrip("\n")

        def is_clustered(node):
            return any(node in members for members in values if isinstance(members, list))

        if not is_clustered(node1) and not is_clustered(node2):
            self.count += 1
            cluster = self.dictClusters.setdefault(self.count, [])
            cluster.append(node1)
            cluster.append(node2)
            return

        for members in self.dictClusters.values():
            if node1 in members or node2 in members:
                for node in (node1, node2):
                    if node not in members:
                        members.append(node)

    # Second step of merging overlapping pairs of connected samples
    def ClusterKeys2(self, dictClusters):
        """Merge all overlapping clusters of ``self.dictClusters`` in place.

        Clusters sharing at least one member (directly or through a chain of
        other clusters) are united under their smallest key. Key 0 is dropped
        afterwards, and every member list is replaced by the string form of
        its sorted, de-duplicated members.

        *dictClusters* is accepted for interface compatibility and not used.
        Returns ``self.dictClusters``.
        """
        # `groups` holds pairwise disjoint (key, members) entries. Each new
        # cluster absorbs every group it overlaps, so the result is the set of
        # connected components regardless of key order.
        groups = []
        for key in sorted(self.dictClusters):
            members = set(self.dictClusters[key])
            group_key = key
            untouched = []
            for other_key, other_members in groups:
                if other_members & members:
                    members |= other_members
                    group_key = min(group_key, other_key)
                else:
                    untouched.append((other_key, other_members))
            untouched.append((group_key, members))
            groups = untouched

        merged = dict(groups)
        merged.pop(0, None)
        self.dictClusters.clear()
        for key, members in merged.items():
            # Sorted so that the textual form is reproducible.
            self.dictClusters[key] = str(sorted(members))
        return self.dictClusters

    # Retrieve common ancestors
    def CommonAncestor(self, nodes):
        """Return the names of all leaves under the common ancestor of *nodes*."""
        return [leaf.name for leaf in self.t.get_common_ancestor(nodes)]

    # Identify poly- , para-, and monophyletic pairs of variants
    def CheckMono(self, ncomb, PhyloVarRemoval, Rejects, monoFinal):
        """Classify the variants in *PhyloVarRemoval* and record the result.

        Mono- and paraphyletic results are appended to ``monoFinal[ncomb]``
        and *ncomb* is added to ``self.monoFinalRes``; any other result adds
        *ncomb* to *Rejects*.

        Returns True for a monophyletic result and None otherwise.
        """
        result = self.t.check_monophyly(
            values=PhyloVarRemoval, ignore_missing=True, target_attr="name", unrooted=True
        )
        mR = str(result[1])
        is_monophyletic = "monophyletic" in mR
        if is_monophyletic or "paraphyletic" in mR:
            monoFinal.setdefault(ncomb, []).append(mR)
            if ncomb not in self.monoFinalRes:
                self.monoFinalRes.append(ncomb)
            if is_monophyletic:
                return True
        elif ncomb not in Rejects:
            Rejects.append(ncomb)

    def _check_pair(self, n, OutlierFile, Rejects, monoFinal):
        """Evaluate one pair of variant names.

        Returns True when the pair was found monophyletic, which ends the
        search within the current shared-read group.
        """
        node1, node2 = n
        if node1 in self.nodesRemoved or node2 in self.nodesRemoved:
            return False
        node1short = node1[idStart:idLen]
        node2short = node2[idStart:idLen]
        if node1short == node2short:
            return False
        ncomb = node1short + "__" + node2short
        ncombRev = node2short + "__" + node1short
        # NOTE(review): the original `not ncomb or not ncombRev in ...` only
        # ever examined the reversed pair; that effective behaviour is kept.
        if ncombRev in self.monoFinalRes:
            return False

        if outlierFlag == "TRUE":
            PhyloVarRemoval = self.PhylyOutlierRem(n, node1, node2, OutlierFile, idStart, idLen)
        else:
            PhyloVarRemoval = self.CommonAncestor(n)

        nodeList = []
        for name in PhyloVarRemoval:
            short = name[idStart:idLen]
            if short not in nodeList:
                nodeList.append(short)

        # Outlier removal may have discarded one of the pair's own variants.
        if node1 in self.nodesRemoved or node2 in self.nodesRemoved:
            return False

        if len(nodeList) == 2:
            return bool(self.CheckMono(ncomb, PhyloVarRemoval, Rejects, monoFinal))
        if len(nodeList) < 2:
            return False

        # Other samples sit inside the clade. Samples that are neither part of
        # the pair nor listed in the intra-host file are candidates to ignore.
        foreign = [
            short
            for short in set(nodeList)
            if short not in (node1short, node2short) and short not in self.SerialNodes
        ]
        if not foreign:
            if len(PhyloVarRemoval) > 1:
                return bool(self.CheckMono(ncomb, PhyloVarRemoval, Rejects, monoFinal))
            return False

        for item in foreign:
            # `item` is already a short id; the original sliced it a second
            # time, which is only harmless when idStart is 0.
            linked = (
                item + "__" + node1short in self.dictSharedReads
                or node1short + "__" + item in self.dictSharedReads
                or node2short + "__" + item in self.dictSharedReads
                or item + "__" + node2short in self.dictSharedReads
            )
            if not linked:
                # Drop the variants of the foreign samples. A new list is
                # built because removing while iterating skipped elements.
                PhyloVarRemoval = [
                    name for name in PhyloVarRemoval if name[idStart:idLen] not in foreign
                ]
        if len(PhyloVarRemoval) > 1 and ncomb not in self.monoFinalRes:
            return bool(self.CheckMono(ncomb, PhyloVarRemoval, Rejects, monoFinal))
        return False

    ##Analysis identifies all ancestors to variants passing the required percentile thresholds
    # Following the removal of outliers, it parses through every combination of these variants to determine whether the pair is polyphyletic or not
    def variantAnalysis(self):
        """Run the full analysis and return the final clustering.

        Collects the variants passing the thresholds, classifies every pair of
        variants of each linked sample pair, optionally writes the outliers to
        "<outputPath><TreeShort>.<percentile>.<supportInput>.Outlier.txt", and
        merges the mono-/paraphyletic sample pairs into clusters.

        Prints the per-pair results and the rejected pairs. Exits with status
        1 when no sample pair could be clustered.
        """
        monoFinal = {}
        Rejects = []
        self.variantCollection()
        try:
            self.intraComb(IntraFile)
        except Exception:
            # The intra-host file is optional; without it no sample is
            # treated as part of a serial set.
            pass

        OutlierFile = None
        if outlierFlag == "TRUE":
            OutlierFile = open(outputPath + TreeShort + "." + percentile + "." + supportInput + ".Outlier.txt", "w")
        try:
            for clusters in list(self.dictSharedReads.values()):
                for pair in itertools.combinations(clusters, 2):
                    if self._check_pair(list(pair), OutlierFile, Rejects, monoFinal):
                        break
            if OutlierFile is not None:
                for name in sorted(set(self.nodesRemoved)):
                    OutlierFile.write("%s\n" % name)
        finally:
            if OutlierFile is not None:
                OutlierFile.close()

        self.dictClusters = {}
        self.count = 0
        clustered_any = False
        for combo in self.monoFinalRes:
            if "polyphyletic" in combo:
                continue
            if self.count not in self.dictClusters:
                self.dictClusters[self.count] = []
            values = list(self.dictClusters.values())
            parts = combo.split("__")
            self.ClusterKeys(values, parts[0], parts[1].split("\t")[0])
            clustered_any = True
        if not clustered_any:
            print("WARNING: Patristic Distance Data files may be empty...")
            sys.exit(1)
        FinalClustering = self.ClusterKeys2(self.dictClusters)

        for k, v in monoFinal.items():
            print(k + "\t" + str(v))
        print("Clusters that are polyphyletic: " + str(Rejects))
        return FinalClustering
# Reroot the tree given on the command line on the branch separating archaea
# from bacteria, and write it to "<input>_rerooted".
tree = Tree(sys.argv[1])
print(tree)

archaea = []  # archaea that are in the tree
bacteria = []  # every tip whose domain is not 'Archaea'
# Check the domain of each taxon in the tree.
for taxon in tree:
    taxon_domain = id_to_domain[taxon.name]
    print(taxon.name + "\t" + taxon_domain)
    if taxon_domain == 'Archaea':
        archaea.append(taxon.name)
    else:
        bacteria.append(taxon.name)

# First, check if archaea are monophyletic in the tree.
if tree.check_monophyly(values=archaea, target_attr="name")[0]:
    # Find the branch separating archaea and bacteria, and reroot on that.
    archaea_ancestor = tree.get_common_ancestor(archaea)
    tree.set_outgroup(archaea_ancestor)
elif tree.check_monophyly(values=bacteria, target_attr="name")[0]:
    bacteria_ancestor = tree.get_common_ancestor(bacteria)
    tree.set_outgroup(bacteria_ancestor)
else:
    # Neither archaea nor bacteria were monophyletic: report it and stop
    # without writing an output file.
    print(sys.argv[1] + ": neither A nor B monophyletic.")
    # sys.exit() instead of quit(): quit() is installed by the site module for
    # interactive use. The exit status is unchanged.
    sys.exit()

outfile_name = sys.argv[1] + "_rerooted"
tree.write(outfile=outfile_name)
# Read the ML tree, label every leaf with its favoured taxonomic group, and
# report for each group found in the tree whether it is monophyletic.
# Might need to alter taxonomy assignment so that we check for the presence of
# the believed groups at all levels of the taxonomy.
ml_tree = Tree(sys.argv[1])
for leaf in ml_tree:
    taxonomy = check_for_favourite_taxonomy(leaf.name)
    taxa_names.append(leaf.name)
    # This needs to label with the favoured group, or else "none" or something. TODO.
    leaf.add_feature("tax", taxonomy)
    if taxonomy != "none":
        labels[taxonomy] = 1
# For the moment the believed groups are the labels actually found in the tree.
groups = list(labels)

# For each of our favourite believed groups, ask whether all sequences from
# that group are monophyletic.
total_believed_groups = len(groups)
mono_believed_groups = 0
for label in groups:
    val = ml_tree.check_monophyly(values=[label], target_attr="tax", unrooted=True)
    print(label + "\t" + str(val[0]) + "\t" + str(val[1]))
    if val[0]:
        mono_believed_groups += 1
    else:
        # Show the leaves that break the monophyly of this group.
        for ele in val[2]:
            print(ele.get_ascii())

# Without any believed group in the tree the score is undefined; report nan
# instead of failing with a ZeroDivisionError.
if total_believed_groups:
    believed_score = float(mono_believed_groups) / float(total_believed_groups)
else:
    believed_score = float("nan")
print(sys.argv[1] + " score: " + str(believed_score))
 def CheckMonophyly(self,PDlist):
     """Collect groups of leaf names that are not polyphyletic in the tree.

     The tree is loaded from ``filePath+TreeFile`` (both names come from the
     enclosing scope, not from the arguments). Each entry of ``PDlist`` is
     stringified and parsed into a list of leaf names, which is then tested
     with ``check_monophyly`` on the unrooted tree, ignoring names missing
     from the tree.

     :param PDlist: iterable of clusters; each item is converted with
         ``str()`` and split on commas after brackets, quotes and spaces
         are stripped.
     :return: ``self.monoPairs``, extended in place with the accepted
         pairs / merged groups (each stored as a list).
     """
     t = Tree(filePath+TreeFile)
     # NOTE(review): monoShort and flag (below) are assigned but never read
     # in this method, and x is only ever incremented.
     monoShort=[]
     x=0
     for item in PDlist:
         cluster=[]
         pairL=[]
         flag = 0
         y=0
         # Turn the printed form of the item back into a flat list of names.
         clusterRaw=str(item).replace("[","").replace("]","").replace('"',"").replace(" ","").replace("'",'').split(',')
         # De-duplicate while keeping first-seen order.
         for i in clusterRaw:
             if not i in cluster:
                 cluster.append(i)
         # The whole result is stringified so the verdict word can be found
         # with a substring test.
         monoResult = str(t.check_monophyly(values=cluster, ignore_missing=True,target_attr="name",unrooted=True))
         #Identify poly- , para-, and monophyletic relationships for clusters
         if 'monophyletic' in monoResult:
             # Record every pair of members once.
             for pair in itertools.combinations(cluster,2):
                 m = list(pair)
                 if not m in self.monoPairs:
                     self.monoPairs.append(m)
                 
         elif 'paraphyletic' in monoResult:
             # Paraphyletic clusters are handled exactly like monophyletic ones.
             for pair in itertools.combinations(cluster,2):
                 m = list(pair)
                 if not m in self.monoPairs:
                     self.monoPairs.append(m)
         else:
             # Neither word matched: test the cluster pair by pair instead.
             cluster2 = []
             for pair in itertools.combinations(cluster,2):
                 n = list(pair)  
                 monoResult = str(t.check_monophyly(values=n, ignore_missing=True,target_attr="name",unrooted=True))
                 if not 'polyphyletic' in monoResult:
                     # Fold the accepted pair into cluster as a nested list
                     # and drop its two members as stand-alone entries.
                     if not n in cluster:
                         cluster.append(n)
                         for i in n:
                             if i in cluster:
                                 cluster.remove(i)
                     if not n in self.monoPairs:
                         self.monoPairs.append(n)
             #Breaksdown large clusters to identify poly- , para-, and monophyletic sub-clusters
             # Two merge passes; cluster may now hold both names and lists.
             while y < 2:
                 y+=1
                 for pair in itertools.combinations(cluster,2):
                     pairL = list(pair)
                     # cluster2 becomes the one-level-flattened union of the
                     # two entries of the pair.
                     cluster2 = []
                     if type(pairL[0]) is list:
                         if type(pairL[1]) is list:
                             # Plain concatenation: no de-duplication here.
                             cluster2 = pairL[0]+pairL[1]
                         else:
                             for i in pairL[0]:
                                 if not i in cluster2:
                                     cluster2.append(i)
                             if not pairL[1] in cluster2:
                                 cluster2.append(pairL[1])
                     elif type(pairL[1]) is list:
                         for i in pairL[1]:
                             if not i in cluster2:
                                 cluster2.append(i)
                         # Always true here: the first branch already ruled
                         # out pairL[0] being a list.
                         if not type(pairL[0]) is list:
                             if not pairL[0] in cluster2:
                                 cluster2.append(pairL[0])
                     else:
                         if not pairL[0] in cluster2:
                             cluster2.append(pairL[0])
                         if not pairL[1] in cluster2:
                             cluster2.append(pairL[1])
                     monoResult = str(t.check_monophyly(values=cluster2, ignore_missing=True,target_attr="name",unrooted=True))
                     if not 'polyphyletic' in monoResult:
                         x+=1
                         if not cluster2 in cluster:
                             cluster.append(cluster2)
                         # NOTE(review): this rebinds the outer loop variable
                         # `item`. The entries of cluster2 look like name
                         # strings, so the inner loop would walk single
                         # characters -- confirm whether removing `item`
                         # itself from cluster was intended.
                         for item in cluster2:
                             for i in item:
                                 if i in cluster:
                                     cluster.remove(i)
                         if not cluster2 in self.monoPairs:
                             self.monoPairs.append(cluster2)
     return self.monoPairs
					node.detach()

		nb_ti = 0
		nb_tj = 0
		nb_common_clade = 0
		for node in ti.traverse("postorder"):
			if(not node.is_leaf()):
				nb_ti += 1
				leaves = []

				for leaf in node:
					name = leaf.name
					leaves.append(name.lower())

				if len(leaves) != 0:
					if(tj.check_monophyly(values=leaves, target_attr="name", ignore_missing=True)[0]):
						nb_common_clade +=1
		for node in tj.traverse("postorder"):
			if( not node.is_leaf()):
				nb_tj += 1
		percent_common_clade = 100.0*nb_common_clade/min(nb_ti,nb_tj)
		print(percent_common_clade)
		local_per.append(int(round(percent_common_clade,0)))
	all_percentage.append(local_per)


# NOTE(review): [[]]*10 repeats ONE list object ten times, so all ten slots
# refer to the same list -- confirm that is intended before anything is
# appended to an individual slot.
percentages_per_method = [[]]*10	
for i in range(len(percentages_per_method)):
	# Gather entry i from every row of all_percentage (i.e. column i).
	# In the lines visible here `l` is rebuilt on each pass and not stored.
	l = []
	for j in range(len(all_percentage)):
		l.append(all_percentage[j][i])
Example #12
0
# Names of the taxa that make up the target clade, one per line of the
# already-open `clade` file.
target_clade = [line.strip("\n") for line in clade]

clade.close()

# Prune taxa that are not in this gene tree from the target clade, keeping
# the tree's own leaf order. A set makes each membership test O(1).
leaves = [leaf.name for leaf in tree]
clade_members = set(target_clade)
pruned_target_clade = [name for name in leaves if name in clade_members]

if not pruned_target_clade:
    # Nothing to test: an empty list of names has no common ancestor, so say
    # so explicitly instead of handing it to check_monophyly.
    print("No taxa from " + args.clade + " are present in " + args.tree)
else:
    # check the monophyly
    mono = tree.check_monophyly(values=pruned_target_clade, target_attr="name")

    # mono[0] is the status flag; mono[2] holds the leaves that intrude
    # into the clade. Read the flag directly rather than scanning the whole
    # result with `True in mono`.
    if mono[0]:
        print(args.clade + " is monophyletic in " + args.tree)
    else:
        print(args.clade + " is NOT monophyletic in " + args.tree)
        print("Intruders into " + args.clade + " are:")
        for leaf in mono[2]:
            print(leaf)
Example #13
0
def analyze_tree(tree, outfile, anno, monoout):
    """
    Root a gene tree, check group monophyly and render the tree to a file.

    The taxon-to-group table is read from ``group_mapping``, which is not a
    parameter and must be provided by the enclosing scope.

    :param tree: string naming the tree; it is matched against the prefixes
        "chloNOG" and "OC_" to derive the gene id and is passed to ``Tree()``
    :param outfile: destination handed to ``t.render()``
    :param anno: tab-separated file without header, read as two columns
        (gene, annotation); used for the title
    :param monoout: passed through unchanged to the module-level
        ``check_monophyly`` helper
    :return: nothing in the code visible here; the rendered tree is written
        to ``outfile``
    """
    # Two-column, headerless TSV: taxon -> group.
    m = pd.read_csv(group_mapping,
                    sep="\t",
                    header=None,
                    names=['taxon', 'group'])
    taxon_mapping = dict(zip(m['taxon'], m['group']))
    unique_groups = list(set(m['group']))

    # Invert the mapping: group -> list of its taxa.
    group_dict = {}
    for u in unique_groups:
        group_dict[u] = []
    # iteritems() is the Python 2 dict API; v is unused, the group is looked
    # up again through taxon_mapping[k].
    for k, v in taxon_mapping.iteritems():
        if taxon_mapping[k] in group_dict:
            group_dict[taxon_mapping[k]].append(k)

    # Build the figure title from the gene id embedded in the tree name,
    # falling back to "unknown gene" when the id has no annotation.
    adf = pd.read_csv(anno, sep="\t", header=None, names=['gene', 'anno'])
    anno_dict = dict(zip(adf['gene'], adf['anno']))
    title = "unknown gene"
    if tree.startswith("chloNOG"):
        # Gene id is the first three dot-separated fields.
        gene = '.'.join(tree.split(".")[:3])
        if gene in anno_dict:
            title = gene + " (" + anno_dict[gene] + ")"
    elif tree.startswith("OC_"):
        # Gene id is everything before the first dot.
        gene = tree.split(".")[0]
        if gene in anno_dict:
            title = gene + " (" + anno_dict[gene] + ")"

    t = Tree(tree)
    # NOTE(review): determine_outgroup is defined elsewhere. From its use
    # below, [0] looks like the outgroup's label, [1] a sequence of member
    # leaf names and [2] how many members were found -- confirm.
    outgroup_to_use = determine_outgroup(t, group_dict)

    print title

    if outgroup_to_use[2] == 0:
        print "None of the outgroups chosen are found in the tree. Rooted with mid-point rooting."
        r = t.get_midpoint_outgroup()
        t.set_outgroup(r)
    elif outgroup_to_use[2] == 1:
        # Single member: root directly on it.
        t.set_outgroup(outgroup_to_use[1][0])
    elif outgroup_to_use[2] >= 2:
        ## do mid-point rooting first to get around the problem of some clades that can't be re-rooted right away
        r = t.get_midpoint_outgroup()
        t.set_outgroup(r)
        x = t.check_monophyly(values=outgroup_to_use[1], target_attr="name")
        if x[0] == True:
            print outgroup_to_use[0], "members are monophyletic"
            ## now, re-root with actual outgroup
            lca = t.get_common_ancestor(outgroup_to_use[1])
            t.set_outgroup(lca)
        else:
            ## use mid-point rooting if members are not mono (already mid-point rooted in the outer scope)
            print outgroup_to_use[
                0], "members are NOT monophyletic. Can't use as an outgroup. Rooted with mid-point rooting"

    # Module-level helper (not the Tree method of the same name); defined
    # outside the lines visible here.
    check_monophyly(t, group_dict, monoout)

    # Drawing setup: support values shown, leaf names left to the layout
    # function, and node markers hidden (size 0).
    t.ladderize(direction=1)
    ts = TreeStyle()
    ns = NodeStyle()
    ts.show_branch_support = True
    ts.extra_branch_line_color = "DarkGrey"
    ts.show_leaf_name = False
    ts.layout_fn = tree_layout
    #ts.branch_vertical_margin = 0
    ns['shape'] = "square"
    ns['size'] = 0
    ts.title.add_face(TextFace(title, fsize=8), column=0)
    # The same NodeStyle instance is applied to every node.
    for n in t.traverse():
        n.set_style(ns)
    t.render(outfile, w=1500, units="px", tree_style=ts)
    print " "