def get_example_tree():
    """Build a small demo tree with four background-coloured clades.

    All branch lengths are set to zero, then the node that is the common
    ancestor of each leaf group receives its own background colour.

    Returns:
        tuple: ``(tree, tree_style)``; the style is circular (``mode = "c"``),
        hides leaf names and uses the module-level ``layout`` function.
    """
    t = Tree("((((a1,a2),a3), ((b1,b2),(b3,b4))), ((c1,c2),c3));")
    for node in t.traverse():
        node.dist = 0

    # (leaves that define the clade, background colour). The last entry is a
    # sub-clade of the second one and is therefore painted on top of it.
    clades = (
        (("a1", "a2", "a3"), "LightSteelBlue"),
        (("b1", "b2", "b3", "b4"), "Moccasin"),
        (("c1", "c2", "c3"), "DarkSeaGreen"),
        (("b3", "b4"), "Khaki"),
    )
    for leaf_names, colour in clades:
        clade_style = NodeStyle()
        clade_style["bgcolor"] = colour
        t.get_common_ancestor(*leaf_names).set_style(clade_style)

    ts = TreeStyle()
    ts.layout_fn = layout
    ts.show_leaf_name = False
    ts.mode = "c"
    ts.root_opening_factor = 1
    return t, ts
def get_example_tree():
    """Build the emoji demo tree and colour four clades.

    All branch lengths are set to zero, then the common ancestor of each
    leaf group receives its own background colour.

    NOTE(review): the leaf names used for the clades below (a1, b1, c1, ...)
    do not appear in the emoji Newick string, so the first
    ``get_common_ancestor`` call presumably fails -- confirm which emoji
    groups were meant to be highlighted.

    Returns:
        tuple: ``(tree, tree_style)``; the style is circular (``mode = "c"``),
        hides leaf names and uses the module-level ``layout`` function.
    """
    t = Tree("( 🌲,( 🥑,(( 🌷, ( 🌴, ( 🍌, ( 🍍, ( 🌽, ( 🎋, 🌾 )))))),(( 🍇, ((( 🥜, ☘️), ((( 🌹, 🍓 ), (( 🍎, 🍐 ), ( 🍑, (🌸, 🍒) ))), ( 🌰, ( 🎃, ( 🍉, ( 🥒, 🍈)))))), (( 🌺, 🥦 ), (( 🍊, 🍋 ), ( 🍁, 🥭))))),( 🌵, ( 🥝, (( 🍠, ( 🌶️, (🍆, ( 🥔, 🍅)))), ( 🥕,( 🥬, ( 🌻, 🌼)))))))));")
    for node in t.traverse():
        node.dist = 0

    # (leaves that define the clade, background colour)
    clades = (
        (("a1", "a2", "a3"), "LightSteelBlue"),
        (("b1", "b2", "b3", "b4"), "Moccasin"),
        (("c1", "c2", "c3"), "DarkSeaGreen"),
        (("b3", "b4"), "Khaki"),
    )
    for leaf_names, colour in clades:
        clade_style = NodeStyle()
        clade_style["bgcolor"] = colour
        t.get_common_ancestor(*leaf_names).set_style(clade_style)

    ts = TreeStyle()
    ts.layout_fn = layout
    ts.show_leaf_name = False
    ts.mode = "c"
    ts.root_opening_factor = 1
    return t, ts
def smart_reroot(treefile, outgroupfile, outfile, format=0):
    """
    simple function to reroot Newick format tree using ete2

    Tree reading format options see here:
    http://packages.python.org/ete2/tutorial/tutorial_trees.html#reading-newick-trees

    The outgroup file is read line by line; each non-blank line is treated as
    a name prefix. The first line whose prefix matches at least one leaf
    defines the outgroup (all matching leaves).

    Returns:
        ``outfile`` when the tree was rerooted and written, otherwise the
        unchanged ``treefile`` path (no outgroup leaf matched).
    """
    tree = Tree(treefile, format=format)
    leaves = [t.name for t in tree.get_leaves()][::-1]
    outgroup = []
    for o in must_open(outgroupfile):
        o = o.strip()
        if not o:
            # A blank line is a prefix of every leaf name; matching on it
            # would select the whole tree as "outgroup".
            continue
        outgroup = [leaf for leaf in leaves if leaf.startswith(o)]
        if outgroup:
            break

    if not outgroup:
        print("Outgroup not found. Tree {0} cannot be rerooted.".format(treefile), file=sys.stderr)
        return treefile

    try:
        tree.set_outgroup(tree.get_common_ancestor(*outgroup))
    except ValueError:
        # Rooting on the clade failed; fall back to the first matching leaf.
        tree.set_outgroup(outgroup[0])
    tree.write(outfile=outfile, format=format)

    logging.debug("Rerooted tree printed to {0}".format(outfile))
    return outfile
def _write_model_with_tree(path, header_lines, tree):
    """Write the model header lines followed by the current Newick of ``tree``."""
    with open(path, 'w') as outf:
        for l in header_lines:
            outf.write(l)
        outf.write('TREE: ' + tree.write(format=1) + '\n')


def rename_model(target, model, accelerated_genomes):
    """Iteratively rename each ancestor of accelerated_genomes, walking down the tree to each ancestor

    The last line of ``model`` must contain ``TREE: <newick>``. For every
    internal node at or below the common ancestor of the accelerated genomes,
    the node is temporarily renamed, the model is rewritten to a temp file and
    ``(node_name, new_model_path)`` is yielded; the name is reset to ``'1'``
    once the consumer asks for the next item.

    Yields:
        tuple: ``(name, path)``. Nothing is yielded when none of the
        accelerated genomes is present in the model's tree.
    """
    new_model = os.path.join(target.getGlobalTempDir(), 'region_specific_conserved_subtree.mod')
    with open(model) as inf:
        lines = inf.readlines()
    header = lines[:-1]
    t = Tree(lines[-1].split('TREE: ')[1], format=1)
    # this model may not have all of the genomes, if they were not aligned in this region
    accelerated_genomes = list(set(t.get_leaf_names()) & set(accelerated_genomes))
    if not accelerated_genomes:
        # None of the requested genomes is in this region's tree: there is
        # nothing to rename (previously this hit an IndexError below).
        return
    if len(accelerated_genomes) > 1:
        anc = t.get_common_ancestor(accelerated_genomes)
        internal_nodes = [x for x in anc.get_descendants() if not x.is_leaf()]
        for n in [anc] + internal_nodes:
            oldest_name = [x.name for x in n.get_children() if x.name != '1']
            if len(oldest_name) == 1:
                n.name = oldest_name[0] + '_Anc'
            else:
                n.name = '_'.join(oldest_name)
            _write_model_with_tree(new_model, header, t)
            yield n.name, new_model
            # restore the placeholder name before the next node is renamed
            n.name = '1'
    else:
        # only one accelerated genome here -- get common ancestor above will return root node
        _write_model_with_tree(new_model, header, t)
        yield accelerated_genomes[0], new_model
def remove_outgroups(self, ognames, remove=False):
    """reroot using outgroups and remove them

    Every Newick string in ``self.trees`` is rerooted on the outgroup (a
    single leaf, or the common ancestor of several) and written back in
    place. With ``remove=True`` the outgroup names are first dropped from
    ``self.taxa_order`` (``self.numtaxa`` is updated) and each tree is pruned
    to the remaining taxa.

    A ``ValueError`` (unknown or missing outgroup name) is reported on stdout
    and terminates the program with a non-zero exit status.
    """
    self.reroot = False
    try:
        if remove:
            for og in ognames:
                self.taxa_order.remove(og)
            self.numtaxa = len(self.taxa_order)
        for i, newick in enumerate(self.trees):
            if not ognames:
                # Route the empty case through the same error path instead of
                # failing with an uncaught IndexError on ognames[0].
                raise ValueError("no outgroup names were given")
            t = Tree(newick)
            if len(ognames) < 2:
                t.set_outgroup(ognames[0])
            else:
                ancestor = t.get_common_ancestor(ognames)
                # the root itself cannot be used as outgroup
                if not t == ancestor:
                    t.set_outgroup(ancestor)
            if remove:
                t.prune(self.taxa_order, preserve_branch_length=True)
            self.trees[i] = t.write()
    except ValueError as e:
        print(e)
        print(
            "\n Somthing is wrong with the input outgroup names \n Quiting ..."
        )
        # exit with a failure status so callers/shell scripts can detect it
        sys.exit(1)
def make_newick_tree(marker_fasta, tree_outfile, reference):
    """Align ``marker_fasta`` with muscle, build a tree with clustalw, and copy it out.

    Intermediate files are written next to ``marker_fasta`` using its name
    without the last extension: ``.aln`` (alignment), ``.ph`` (clustalw's
    default tree name) and ``.new`` (tree with negative branch lengths set to
    0.0). When ``reference`` is ``'NC_000913'`` the tree is additionally
    rooted on the Campylobacter/Nitrosomonas clade.

    Raises:
        subprocess.CalledProcessError: if muscle or clustalw exits non-zero.
    """
    # Imported locally so the block stays self-contained.
    import re
    import subprocess

    base = '.'.join(marker_fasta.split('.')[:-1])
    temp_align = base + '.aln'
    # that's what clustalw names the tree file by default
    temp_tree = base + '.ph'
    modify_tree = base + '.new'

    # Argument lists (no shell) keep paths with spaces or shell
    # metacharacters intact; check_call stops the pipeline on the first
    # failing tool instead of silently producing an empty tree.
    subprocess.check_call(["muscle", "-in", marker_fasta, "-out", temp_align])
    subprocess.check_call(["clustalw", "-infile=" + temp_align, "-tree=1"])

    print(temp_tree)
    print("modifying")
    # Replace negative branch lengths with 0.0; same pattern the former
    # `sed -e 's,:-[0-9\.]\+,:0.0,g'` call used.
    with open(temp_tree) as src:
        newick = src.read()
    with open(modify_tree, 'w') as dst:
        dst.write(re.sub(r":-[0-9\\.]+", ":0.0", newick))

    if reference == 'NC_000913':
        t = Tree(modify_tree)
        ancestor = t.get_common_ancestor("Campylobacter_jejuni_NC_002163", "Nitrosomonas_europaea_NC_004757")
        t.set_outgroup(ancestor)
        t.write(format=1, outfile=modify_tree)

    # move the created tree file to the requested location
    shutil.copy(modify_tree, tree_outfile)
def root_tree(tree_path, species_path, output_name):
    """Root the tree on the outgroup species listed in a file and write it out.

    Args:
        tree_path: Newick tree (path or string accepted by ``Tree``).
        species_path: text file with one outgroup species name per line;
            blank lines are ignored.
        output_name: path of the rooted Newick file (written with format=1).

    Raises:
        ValueError: if the species file contains no names.
    """
    with open(species_path, 'r') as f:
        species_list = [name for name in f.read().splitlines() if name.strip()]
    if not species_list:
        raise ValueError("no species names found in {0}".format(species_path))

    tree = Tree(tree_path)
    if len(species_list) == 1:
        # With a single name the common-ancestor lookup returns the root,
        # which cannot be used as outgroup: root on the leaf itself.
        outgroup = tree & species_list[0]
    else:
        outgroup = tree.get_common_ancestor(species_list)
    tree.set_outgroup(outgroup)
    tree.write(outfile=output_name, format=1)
def main(argv):
    """Render a tree with the paths from selected leaves to their common ancestor highlighted.

    Args:
        argv: ``argv[1]`` is the tree (path or Newick string), ``argv[2]`` a
            comma-separated list of leaf names (parentheses, single quotes
            and spaces are stripped, so ``"('a', 'b')"`` works), ``argv[3]``
            an integer flag: ``1`` renders to ``Tree1.png``, anything else to
            ``Tree2.png`` under ``crud/static/crud/``.

    Returns:
        bool: always ``True``.
    """
    # red, 2px horizontal branch lines; node markers hidden (size 0)
    style1 = NodeStyle()
    style1["fgcolor"] = "#0f0f0f"
    style1["size"] = 0
    style1["hz_line_color"] = "#ff0000"
    style1["hz_line_width"] = 2

    tree1 = Tree(str(argv[1]))
    save = int(argv[3])

    leafs = argv[2]
    for unwanted in ("(", ")", "'", " "):
        leafs = leafs.replace(unwanted, "")
    q = leafs.split(',')

    se = tree1.get_common_ancestor(q)
    for n in q:
        print(n)
        node = tree1 & n
        # Style every node from the leaf up to the child of the common
        # ancestor; the extra None check stops at the root if the starting
        # node is not below `se`.
        while node.up is not None and node.up != se:
            node.img_style = style1
            node = node.up
        node.img_style = style1

    ts = TreeStyle()
    ts.show_leaf_name = True

    if save == 1:
        target_png = "crud/static/crud/Tree1.png"
    else:
        target_png = "crud/static/crud/Tree2.png"
    if os.path.exists(target_png):
        os.remove(target_png)
    tree1.render(target_png, tree_style=ts)
    return True
def main(tree_path, species_path):
    """Return the first internal node that groups every species listed in a file.

    Args:
        tree_path: tree source handed to ``Tree`` (read with format=1).
        species_path: text file with one species name per line.

    Returns:
        The node returned by ``get_common_ancestor`` for all listed names.
    """
    tree = Tree(tree_path, format=1)
    with open(species_path, 'r') as handle:
        wanted = handle.read().splitlines()
    return tree.get_common_ancestor(wanted)
def add_section_annotations(tree: Tree) -> None:
    """Colour and label each taxonomic section on the tree (admittedly hacky).

    Leaves are grouped by the section of their strain's species. For every
    section the common ancestor of its leaves gets a background colour; a
    section with a single leaf styles just that leaf. A text label with the
    section name is attached to the first leaf id recorded for the section,
    so its exact position depends on what ``search_nodes()`` finds first.

    Strains whose species epithet contains "FP" are skipped: they were
    assigned to Talaromyces, which would break the section grouping.
    """
    palette = (
        "LightSteelBlue", "Moccasin", "DarkSeaGreen", "Khaki", "LightSalmon",
        "Turquoise", "Thistle",
    )

    ids_by_section = defaultdict(list)
    leaf_names = tree.get_leaf_names()
    for strain in session.query(Strain).filter(Strain.id.in_(leaf_names)):
        if "FP" not in strain.species.epithet:
            ids_by_section[strain.species.section.name].append(str(strain.id))

    for position, (section, ids) in enumerate(ids_by_section.items()):
        # colours wrap around once the palette is exhausted
        clade_style = NodeStyle()
        clade_style["bgcolor"] = palette[position % len(palette)]

        if len(ids) == 1:
            clade = tree.search_nodes(name=ids[0])[0]
        else:
            clade = tree.get_common_ancestor(*ids)
        clade.set_style(clade_style)

        # the label goes on the first leaf of the section, not on the clade
        label_host = tree.search_nodes(name=ids[0])[0]
        label_host.add_face(faces.TextFace(section, fsize=20), column=1, position="aligned")
def prune_species_tree(gene_tree, cached_species_tree=None, keep_polytomies=False):
    """Return the species tree restricted to the species present in a gene tree.

    Gene-tree leaf names are expected to look like ``speciesID_ProteinName``.
    The digits of each species id are used to find the species in the big
    species tree; the subtree under their common ancestor is copied, leaves
    of other species are deleted, polytomies are resolved unless
    ``keep_polytomies`` is set, and leaves are renamed back to the original
    species ids.

    Args:
        gene_tree: gene tree source handed to ``Tree``.
        cached_species_tree: already-loaded species tree; when falsy the tree
            is loaded from ``EGGNOGv4_SPECIES_TREE``.
        keep_polytomies: skip the binarisation step.

    Returns:
        str: Newick string (format 5) of the pruned species tree.
    """
    gene_leaves = Tree(gene_tree).get_leaf_names()
    species_prefixes = {name.split('_')[0] for name in gene_leaves}
    # digits-only key -> original species id
    species_ids = {}
    for prefix in species_prefixes:
        species_ids[''.join(filter(str.isdigit, prefix))] = prefix

    species_tree = cached_species_tree if cached_species_tree else Tree(EGGNOGv4_SPECIES_TREE)

    subtree = species_tree.get_common_ancestor(list(species_ids.keys())).copy()

    # Delete unwanted leaves one by one (subtree.prune(species_ids) is slower).
    leaf_by_name = {leaf.name: leaf for leaf in subtree.get_leaves()}
    for unwanted in leaf_by_name.keys() - species_ids.keys():
        leaf_by_name[unwanted].delete()
    assert (len(subtree.get_leaf_names()) == len(species_ids))

    if not keep_polytomies:
        subtree.resolve_polytomy(recursive=True)

    for leaf in subtree.get_leaves():
        leaf.name = species_ids[leaf.name]

    return subtree.write(format=5)
def open_tsv_population_size(tree_file, tsv_file):
    """Attach population sizes from a TSV to tree nodes and return their log-ratios.

    Each TSV row has five columns; columns 1 and 2 are leaf names and column 4
    is the population size. Identical leaf names address that leaf, different
    names address their common ancestor. The value is stored as ``pop_size``
    on the node.

    Returns:
        tuple: ``({"LogPopulationSize": [...]}, tree)`` where the list holds
        ``log(pop_size / root_pop_size)`` for every node in traversal order.
    """
    t = Tree(tree_file, format=1)
    table = pd.read_csv(tsv_file, header=None, sep='\t')
    for _, row in table.iterrows():
        leaf_1, leaf_2, _, ne, _ = row
        if leaf_1 != leaf_2:
            target = t.get_common_ancestor([leaf_1, leaf_2])
        else:
            matches = t.get_leaves_by_name(leaf_1)
            assert (len(matches) == 1)
            target = matches[0]
        target.pop_size = ne

    root_pop_size = float(t.pop_size)
    log_ratios = []
    for node in t.traverse():
        log_ratios.append(np.log(float(node.pop_size) / root_pop_size))
    return {"LogPopulationSize": log_ratios}, t
def get_dates(chain_name):
    """Collect the date rows of a few named clades for one chain.

    Reads ``<chain_name>_sample.labels`` (tree with node labels) and
    ``<chain_name>_sample.dates`` (tab-separated, first column as index). For
    every clade below, the label of the common ancestor of its two genomes is
    looked up in the dates table and the row is re-indexed with the clade name.

    Returns:
        pandas.DataFrame: one row per clade, in the order listed below.
    """
    label_tree = Tree(f"{chain_name}_sample.labels", format=1)
    dates = pd.read_csv(f"{chain_name}_sample.dates", sep='\t', index_col=0)

    # clade name -> two genomes ('|'-separated) whose MRCA defines the clade
    name2group = {
        "anammox bacteria": "GCA_001828545.1|GCA_004282745.1",
        "root": "GCA_000011385.1|GCA_003576915.1",
        "cyanobacteria": "GCA_000011385.1|GCA_000013205.1",
        "Nostocales": "GCA_000196515.1|GCA_001548455.1",
        "pleurocapsales": "GCA_000317575.1|GCA_000317025.1",
    }

    per_clade = []
    for gname, genomes in name2group.items():
        mrca = label_tree.get_common_ancestor(genomes.split('|'))
        raw_name = '%s' % mrca.name
        sub_dates = dates.loc[[raw_name], :]
        sub_dates.index = [gname]
        per_clade.append(sub_dates)
    return pd.concat(per_clade, axis=0)
def create_tree_data(treename, df):
    """Extract branch-length paths between species pairs and their common ancestor.

    For every row of ``df`` the tree is walked upwards from ``row["species"]``
    and from ``row["homology_species"]`` until the node directly below their
    common ancestor, collecting the branch length of each visited node.

    Returns:
        tuple of numpy arrays: padded branch lengths for the species side,
        padded branch lengths for the homology side, summed distance per row,
        number of steps on the species side, number of steps on the homology
        side.
    """
    t = Tree(treename)

    def _lengths_below(leaf_name, stop):
        """Branch lengths from ``leaf_name`` up to, but excluding, ``stop``'s child link."""
        collected = []
        node = t & leaf_name
        while node.up != stop:
            collected.append(node.dist)
            node = node.up
        return collected

    branch_lengths_s = []
    branch_lengths_hs = []
    dist = []
    ns = []
    nhs = []
    for _, row in progressbar.progressbar(df.iterrows()):
        x = row["species"]
        y = row["homology_species"]
        mca = t.get_common_ancestor(x, y)

        path_s = _lengths_below(x, mca)
        path_hs = _lengths_below(y, mca)

        # accumulate in visiting order (species side first) so the floating
        # point sum matches a single running total
        total = 0
        for length in path_s + path_hs:
            total += length

        ns.append(len(path_s))
        branch_lengths_s.append(path_s)
        nhs.append(len(path_hs))
        branch_lengths_hs.append(path_hs)
        dist.append(total)

    create_branch_length_padding(branch_lengths_s)
    create_branch_length_padding(branch_lengths_hs)
    return (
        np.array(branch_lengths_s),
        np.array(branch_lengths_hs),
        np.array(dist),
        np.array(ns),
        np.array(nhs),
    )
def get_sister_species(species_tree, species, anc):
    """
    Lists `species` followed by the species branching between it and the ancestor `anc`.

    Args:
        species_tree (str): tree source, passed to both ``get_species`` and ``Tree``
        species (str): name of the species
        anc (str): name of the ancestor

    Returns:
        list: `species` first, then every leaf under the common ancestor of
        `species` and the species returned by ``get_species`` for `anc` that
        is not one of those.
    """
    duplicated_sp = get_species(species_tree, anc)
    tree = Tree(species_tree, format=1)

    already_known = [species] + duplicated_sp
    lca = tree.get_common_ancestor(already_known)

    sisters = []
    for leaf in lca.get_leaves():
        if leaf.name not in already_known:
            sisters.append(leaf.name)
    return [species] + sisters
parser.add_argument(
    '--verbose', action='store_true',
    help=('Print information about the outgroup (if any) taxa to standard '
          'error'))

args = parser.parse_args()

tree = Tree(args.treeFile.read())

# Optionally re-root the tree on the taxa whose names match the regex.
if args.outgroupRegex:
    from re import compile
    regex = compile(args.outgroupRegex)
    taxa = [leaf.name for leaf in tree.iter_leaves()
            if regex.match(leaf.name)]

    if taxa:
        ca = tree.get_common_ancestor(taxa)
        if args.verbose:
            print('Taxa for outgroup:', taxa, file=sys.stderr)
            print('Common ancestor:', ca.name, file=sys.stderr)
            print('Common ancestor is tree:', tree == ca, file=sys.stderr)

        if len(taxa) == 1:
            # a single matching taxon: root directly on that leaf
            tree.set_outgroup(tree & taxa[0])
        elif ca == tree:
            # the matching taxa span the whole tree, so their ancestor is the
            # root itself: fall back to midpoint rooting
            tree.set_outgroup(tree.get_midpoint_outgroup())
        else:
            tree.set_outgroup(ca)

print(tree.get_ascii())
for line in outg: outgroups.append(line.rstrip()) outg.close() target_taxa = [] tt = open(sys.argv[2]) for line in tt: target_taxa.append(line.rstrip()) tt.close() #now read in a collection of trees, calc branch lengths over sample, summarise and print out branch_lengths = defaultdict(list) #key = taxa, value = list of brlens treefile = open(sys.argv[3]) for line in treefile: curr_tree = Tree(line.rstrip()) root_node = curr_tree.get_common_ancestor(outgroups) if curr_tree != root_node: curr_tree.set_outgroup(root_node) print curr_tree #bundle = curr_tree.check_monophyly(values=outgroups,target_attr='name') #print bundle #if bundle[0] == False: # continue #find common ancestor of the target taxa, and use this as the reference node for calculating branch lengths. This might not always be the measure you want! reference_node = curr_tree.get_common_ancestor(target_taxa) #if reference_node != curr_tree: # curr_tree.set_outgroup(reference_node) #calc distance from root to each branch of interest for taxon in target_taxa: dist = curr_tree.get_distance(taxon, reference_node) branch_lengths[taxon].append(dist)
C = RectFace(triangle=True, width=size[mode], height=size[mode], bgcolor=colourDict[metadata[display_name]['prefec']], fgcolor='#FFFFFF') else: C = RectFace(triangle=True, width=size[mode], height=size[mode], bgcolor='#939393', fgcolor='#FFFFFF') n.add_face(C, column=1, position=position) tree.render(file_name=sys.argv[1] + '_' + cluster + '.pdf', tree_style=ts, w=width) big_tree = Tree(sys.argv[1]) mode = sys.argv[2] metadata = {} metadata = get_meta_new(metadata, big_tree) colourDict = get_colours(clusters, big_tree, colours) #remove dodgy sample big_tree.search_nodes(name="'EBOV|EMLab-RT|IPDPFHGINSP_GUI_2015_5339||GIN|Conakry|?|MinION_LQ05|2015-04-08'")[0].delete(preserve_branch_length=True) #root the same as the MCC tree ancestor = big_tree.get_common_ancestor("'EBOV|EMLab|EM_079422|KR817187|GIN|Macenta|?||2014-03-27'","'EBOV|EMLab|Gueckedou-C05|KJ660348|GIN|Gueckedou|?||2014-03-19'") big_tree.set_outgroup(ancestor) big_tree.ladderize() ts = TreeStyle() ts.show_leaf_name = False #ts.show_branch_support = True ts.scale = 100000 if mode == 'small': ts.scale = 750000 #add legend for each in list(colourDict.keys()): ts.legend.add_face(CircleFace(radius=size[mode]/2, color=colourDict[each]), column=0) ts.legend.add_face(TextFace(each, ftype="Helvetica", fsize=size[mode]), column=1) ts.legend.add_face(CircleFace(radius=size[mode]/2, color='#F1F1F1'), column=0)
# | | /-L # | \--------| # ---------| \-M # | # | /-B # | /--------| # | | | /-J # | | \--------| # \--------| \-K # | # | /-E # \--------| # \-D # # Each main branch of the tree is independently rooted. node1 = t.get_common_ancestor("A", "H") node2 = t.get_common_ancestor("B", "D") node1.set_outgroup("H") node2.set_outgroup("E") print "Tree after rooting each node independently:" print t # # /-F # | # /--------| /-L # | | /--------| # | | | \-M # | \--------| # /--------| | /-A # | | \--------| # | | \-C
mytree = open('raxml.298.pruned.tre', 'r') for line in mytree: t = Tree(line.strip(), format=1) mycalibs = open('calibrations.tab.txt', 'r') for line in mycalibs: info = line.strip().split('\t') print info sp1 = info[0] sp2 = info[1] thetime = info[2] tempnode = t.get_common_ancestor(sp1, sp2) print sp1, sp2, tempnode tempnode.add_features(calibration=">" + thetime) out = open('conus.tree.calibrationsadded.tre', 'w') out.write("365 7\n") myoutput = t.write(format=9, features=["calibration"]).replace('[&&NHX:calibration=', '').replace(']', '') out.write(myoutput + '\n') out.write('//end of file') #print t.write(format=9,features=["calibration"], outfile = "conus.tree.calibrationsadded.tre") #node1 = t.get_common_ancestor("arcuata", "centurio")
# intree = './trees/iqtree/over20p_bac120.ufboot' otree = './bayesTraits_test/test.trees' intree, otree = sys.argv[1:] root_with = 'GCA_900097105.1,GCA_000020225.1,GCA_000172155.1,GCA_001318295.1,GCA_001613545.1,GCA_000019665.1,GCA_000019965.1,GCA_001746835.1' if __name__ == "__main__": if len(open(intree).read().split('\n')) == 1: t = Tree(intree, format=3) else: multiple_trees = [] for row in open(intree): row = row.strip('\n') multiple_trees.append(Tree(row)) LCA = t.get_common_ancestor(root_with.split(',')) t.set_outgroup(LCA) # TODO: finish it. new_name2old_name = {} _count = 0 for leaf in t.get_leaves(): new_name2old_name[str(_count)] = leaf.name leaf.name = str(_count) _count += 1 # for bayestraits nexus_template = """#NEXUS begin trees; translate {translate_text};
mycursor = cnx.cursor(buffered=True) PfamExtractionStatement = "SELECT UID,Species,NumberOfAssociatedSpecies FROM " + PfamTable mycursor.execute(PfamExtractionStatement) pfamResults = mycursor.fetchall() # Each pfam is then analyzed so an age can be assigned to it. for pfam in pfamResults: speciesList = pfam[1].split(',') UID = pfam[0] numberOfSpecies = pfam[2] # If more than one species exists for a particular pfam, then algorithm [1] is implemented (See: description in # the beginning of this script if numberOfSpecies != 1: # The subtree is pulled starting with the common ancestor of all species in the list as the root node subtree = t.get_common_ancestor(speciesList) # The total distance starts at zero and will be added to as each branch in the subtree is searched totalDistance = 0 # We start at the base node and proceed down a branch of the subtree for node in subtree.iter_descendants("preorder"): # So long as our location is not a leaf, we add the relative age of the node to the total and continue to search the tree if node.is_leaf() == False: totalDistance += node.dist # Once we're located on a leaf, we have searched an entire branch of our subtree and, once we add the age of the leaf, we have acquired the # total age of our subtree (minus the age of the common ancestor) else: totalDistance += node.dist break # The variable maxLength is used to find the node that has the greatest number of children in the tree maxLength = 0 # We then traverse the tree starting from the leaves and working our way toward the root node
# Groups samples into clusters from patristic-distance data and tree topology.
#
# The class is Python 2 code (dict.iteritems, print statements). It reads
# several names that are not defined inside the class -- TreeFile, Spectrum,
# percentile, PatDistOutput, idStart, idLen, supportInput, IntraFile,
# outlierFlag, outputPath, TreeShort -- presumably module-level globals filled
# from the command line; verify against the rest of the file.
#
# NOTE(review): the source reached review with its line structure collapsed;
# the nesting below is a reconstruction. Places where the nesting could not be
# determined with confidence are marked.
class ClusterIdentification(object):
    def __init__(self):
        self.PercentileThreshold = {}  # "<id>__<id>" -> threshold string, filled by Split
        self.dictSharedReads = {}  # "<idA>__<idB>" -> variant names, filled by variantCollection
        self.dictClusters = {}  # cluster number -> member ids
        self.monoFinalRes = []  # "<idA>__<idB>" pairs accepted by CheckMono
        self.count = 0  # running cluster number used by ClusterKeys
        self.SerialNodes = {}  # id -> other ids listed on the same IntraFile line
        self.t = Tree(TreeFile)
        self.nodesRemoved = []  # variant names flagged as outliers
        self.nodecheck = []

    ##The Split method identifies the percentile threshold for each sample from the results of PatDistSpectrum.py
    ##This threshold is determined from user input in the command line
    # NOTE(review): `infile` is never used; the method always opens the global
    # `Spectrum`.
    def Split(self, infile):
        # percentile (as text) -> column of the spectrum line (as text)
        Percentiles = {
            "0": "1",
            "1": "2",
            "5": "3",
            "10": "4",
            "20": "5",
            "25": "6",
            "30": "7",
            "35": "8",
            "40": "9",
            "45": "10",
            "50": "11",
            "75": "12",
            "90": "13",
            "99": "14",
            "100": "15",
        }
        with open(Spectrum, "r") as file1:
            for line in file1:
                # lines containing "Samples" are skipped
                if not "Samples" in line:
                    linerep = line.replace(" ", "")
                    if percentile in Percentiles:
                        cutoff = Percentiles[percentile]
                    else:
                        sys.stdout.write(
                            "Please specify the percentile as a number (0,1,5,10,20,25,30,35,40,50,75,90,100)"
                        )
                        sys.exit(1)
                    linesp = linerep.rstrip("\n").split("\t")
                    nodes = linesp[0]
                    nodesSp = nodes.split("__")
                    # only self-comparisons (same id on both sides) define a threshold
                    if nodesSp[0] == nodesSp[1]:
                        combNode = nodesSp[0] + "__" + nodesSp[1]
                        self.PercentileThreshold[combNode] = linesp[int(cutoff)]
        return self.PercentileThreshold

    # Identifies all variants passing the threshold defined in the Split method
    # Each PatDistOutput line is comma-separated: variant, mate, patristic
    # distance, support.
    def variantCollection(self):
        PatDistSpec = self.Split(Spectrum)
        with open(PatDistOutput, "r") as file2:
            for line in file2:
                linesp = line.split(",")
                node = linesp[0]
                nodeshort = node[idStart:idLen]
                nodedouble = nodeshort + "__" + nodeshort
                mateshort = linesp[1][idStart:idLen]
                matedouble = mateshort + "__" + mateshort
                mate = linesp[1]
                patdist = linesp[2]
                support = linesp[3].rstrip("\n")
                comb = nodeshort + "__" + mateshort
                comb2 = mateshort + "__" + nodeshort
                # Store variants that are below the respective pat dist threshold defined in PatDistSpec
                if nodeshort != mateshort:
                    # the smaller of the two per-sample thresholds is used
                    if float(PatDistSpec[nodedouble]) <= float(PatDistSpec[matedouble]):
                        target = float(PatDistSpec[nodedouble])
                    else:
                        target = float(PatDistSpec[matedouble])
                    if float(patdist) <= float(target):
                        if str(supportInput) == "PASS":
                            # file the pair under whichever key orientation already exists
                            if not comb2 in self.dictSharedReads:
                                if not comb in self.dictSharedReads:
                                    self.dictSharedReads[comb] = []
                                if not node in self.dictSharedReads[comb]:
                                    self.dictSharedReads[comb].append(node)
                                if not mate in self.dictSharedReads[comb]:
                                    self.dictSharedReads[comb].append(mate)
                            else:
                                if not node in self.dictSharedReads[comb2]:
                                    self.dictSharedReads[comb2].append(node)
                                if not mate in self.dictSharedReads[comb2]:
                                    self.dictSharedReads[comb2].append(mate)
                        elif not support == "None":
                            # same bookkeeping as above, gated by the support value
                            if float(supportInput) <= float(support):
                                if not comb2 in self.dictSharedReads:
                                    if not comb in self.dictSharedReads:
                                        self.dictSharedReads[comb] = []
                                    if not node in self.dictSharedReads[comb]:
                                        self.dictSharedReads[comb].append(node)
                                    if not mate in self.dictSharedReads[comb]:
                                        self.dictSharedReads[comb].append(mate)
                                else:
                                    if not node in self.dictSharedReads[comb2]:
                                        self.dictSharedReads[comb2].append(node)
                                    if not mate in self.dictSharedReads[comb2]:
                                        self.dictSharedReads[comb2].append(mate)

    ##Identifying potential outliers is optional (-oR flag from the command line )
    # Based on the retrieve common ancestor function, it identifies outliers as those which contain < 3 intra-variants associated with a given sample
    # Returns the names of the leaves under the common ancestor of `n`, minus
    # those flagged as outliers. `OutlierFile` is accepted but not used here.
    def PhylyOutlierRem(self, n, node1, node2, OutlierFile, idStart, idLen):
        PhyloOutliers = {}
        ancestorList = []
        ancshort = []
        node1short = node1[idStart:idLen]
        node2short = node2[idStart:idLen]
        nodecomb = node1short + "__" + node2short
        # NOTE(review): node2short is used on both sides; node1short was
        # presumably intended for the second half.
        nodecombRev = node2short + "__" + node2short
        # NOTE(review): parses as `(not nodecomb) or (nodecombRev not in ...)`;
        # nodecomb always contains "__", so only the second test has any effect.
        if not nodecomb or not nodecombRev in self.monoFinalRes:
            if not node1 in self.nodesRemoved:
                if not node2 in self.nodesRemoved:
                    # Collect all common ancestors for each pair of variants
                    ancestor = self.t.get_common_ancestor(n)
                    for i in ancestor:
                        ancestorList.append(i.name)
                        ancestorShort = i.name[idStart:idLen]
                        if not ancestorShort in PhyloOutliers:
                            PhyloOutliers[ancestorShort] = []
                        PhyloOutliers[ancestorShort].append(1)
                    # Sum the variants for each sample, if < 3, store variant as outlier
                    for k, v in PhyloOutliers.iteritems():
                        vsum = sum(v)
                        if vsum < 3:
                            # NOTE(review): ancestorList is modified while it is
                            # being iterated, so entries can be skipped.
                            for item in ancestorList:
                                if item[idStart:idLen] == k:
                                    if not item in self.nodesRemoved:
                                        if node1short in self.SerialNodes:
                                            if not node2short in self.SerialNodes[node1short]:
                                                ancestorList.remove(item)
                                                self.nodesRemoved.append(item)
                                        elif node2short in self.SerialNodes:
                                            if not node1short in self.SerialNodes[node2short]:
                                                ancestorList.remove(item)
                                                self.nodesRemoved.append(item)
                                        else:
                                            ancestorList.remove(item)
                                            self.nodesRemoved.append(item)
                    # NOTE(review): nesting level of this loop is a guess.
                    # The lookup uses `item` (left over from the loop above)
                    # rather than `i`, and `i.delete()` is called on an entry
                    # of nodesRemoved, which holds names rather than tree
                    # nodes; any failure is hidden by the bare except.
                    for i in self.nodesRemoved:
                        if not i in self.nodecheck:
                            try:
                                item = self.t.search_nodes(name=item)[0]
                                i.delete()
                                self.nodecheck.append(i)
                            except:
                                pass
        return ancestorList

    # Create all combinations of intrahost sample identifiers for each respective sequential sample set
    # These results are used to assist in PhylyOutlierRem
    # NOTE(review): `infile` is never used; the global `IntraFile` is opened.
    def intraComb(self, infile):
        with open(IntraFile) as f:
            for line in f:
                line = line.rstrip("\n")
                linesp = line.split(",")
                length = len(linesp)
                comb = int(length)
                for i in linesp:
                    # every id on the line maps to the other ids on the same line
                    self.SerialNodes[i] = []
                    for pair in itertools.combinations(linesp, 2):
                        for item in pair:
                            if i != item:
                                if not item in self.SerialNodes[i]:
                                    self.SerialNodes[i].append(item)
        return self.SerialNodes

    # First step of merging overlapping pairs of connected samples
    # Updates self.dictClusters in place; there is no return statement, so the
    # call evaluates to None.
    # NOTE(review): in the `if not X in [...] or X in values` tests below, `not`
    # applies only to the first membership test.
    def ClusterKeys(self, values, node1, node2):
        # node1 already clustered: add node2 to its cluster(s)
        if node1 in [x for v in values for x in v if type(v) == list] or node1 in values:
            if not node2 in [x for v in values for x in v if type(v) == list] or node2 in values:
                for k, v in self.dictClusters.iteritems():
                    if node1 in self.dictClusters[k]:
                        self.dictClusters[k].append(node2)
        # node2 already clustered: add node1 to its cluster(s)
        if node2 in [x for v in values for x in v if type(v) == list] or node2 in values:
            if not node1 in [x for v in values for x in v if type(v) == list] or node1 in values:
                for k, v in self.dictClusters.iteritems():
                    if node2 in self.dictClusters[k]:
                        self.dictClusters[k].append(node1)
        # both already clustered: add both to every cluster holding either
        if node1 in [x for v in values for x in v if type(v) == list] or node1 in values:
            if node2 in [x for v in values for x in v if type(v) == list] or node2 in values:
                for k, v in self.dictClusters.iteritems():
                    if node1 in self.dictClusters[k]:
                        self.dictClusters[k].append(node2)
                        self.dictClusters[k].append(node1)
                    if node2 in self.dictClusters[k]:
                        self.dictClusters[k].append(node2)
                        self.dictClusters[k].append(node1)
        # neither clustered yet: open a new cluster under the next number
        if not node1 in [x for v in values for x in v if type(v) == list] or node1 in values:
            if not node2 in [x for v in values for x in v if type(v) == list] or node2 in values:
                self.count += 1
                if not self.count in self.dictClusters:
                    self.dictClusters[self.count] = []
                self.dictClusters[self.count].append(node1)
                self.dictClusters[self.count].append(node2.rstrip("\n"))

    # Second step of merging overlapping pairs of connected samples
    # Merges clusters that share members, deletes key 0, and replaces every
    # remaining value with the str() of its de-duplicated member list.
    # NOTE(review): the `dictClusters` parameter is never used;
    # self.dictClusters is read and modified instead.
    def ClusterKeys2(self, dictClusters):
        Clustvals = {}
        sysvers = str(sys.version_info[0]) + "." + str(sys.version_info[1])
        # NOTE(review): Clustvals stays empty on any interpreter other than 2.6
        # or 2.7, and the 2.6 branch stores the lists as-is (no set()), so the
        # .intersection call below presumably fails there.
        if float(sysvers) == 2.7:  ##For python 2.7
            Clustvals = {k: set(val) for k, val in self.dictClusters.items()}
        elif float(sysvers) == 2.6:  ##For python 2.6
            Clustvals = dict((k, val) for (k, val) in self.dictClusters.items())
        merged = set()
        srt = sorted(self.dictClusters.keys())
        srt2 = srt[:]
        for key in srt:
            # NOTE(review): srt2 is modified while it is being iterated
            for k in srt2:
                if not k == key:
                    if Clustvals[k].intersection(self.dictClusters[key]) and key not in merged:
                        merged.add(k)
                        self.dictClusters[key] = list(Clustvals[k].union(self.dictClusters[key]))
                        srt2.remove(k)
        for k in merged:
            del self.dictClusters[k]
        # drop key 0 (variantAnalysis creates it as an empty list)
        try:
            if len(self.dictClusters) > 0:
                del self.dictClusters[0]
        except:
            pass
        # ValLengths and ItemNumber are filled here but never read afterwards
        ValLengths = []
        ItemNumber = []
        for k, v in self.dictClusters.iteritems():
            ValLengths.append(int(len(set(v))))
            for i in v:
                if not i in ItemNumber:
                    ItemNumber.append(i)
        ValLengths[:] = []
        for k, v in self.dictClusters.iteritems():
            vset = set(v)
            v[:] = []
            vset = list(vset)
            self.dictClusters[k] = str(vset)
        return self.dictClusters

    # Retrieve common ancestors
    # Returns the names collected by iterating over the common ancestor node of
    # `nodes`.
    def CommonAncestor(self, nodes):
        ancestors = []
        ancestor = self.t.get_common_ancestor(nodes)
        for i in ancestor:
            ancestors.append(i.name)
        return ancestors

    # Identify poly- , para-, and monophyletic pairs of variants
    # The check_monophyly result is converted to text and its second
    # comma-separated field is inspected. Returns True only for a monophyletic
    # result; every other path falls through and returns None.
    def CheckMono(self, ncomb, PhyloVarRemoval, Rejects, monoFinal):
        monoResult = str(
            self.t.check_monophyly(values=PhyloVarRemoval, ignore_missing=True, target_attr="name", unrooted=True)
        )
        monoResultSp = monoResult.split(",")
        mR = monoResultSp[1].replace("'", "").replace(")", "").replace(" ", "")
        if "monophyletic" in mR:
            if not ncomb in monoFinal:
                monoFinal[ncomb] = []
            monoFinal[ncomb].append(mR)
            if not ncomb in self.monoFinalRes:
                self.monoFinalRes.append(ncomb)
            # NOTE(review): nesting level of this return is a guess
            return True
        elif "paraphyletic" in mR:
            if not ncomb in monoFinal:
                monoFinal[ncomb] = []
            monoFinal[ncomb].append(mR)
            if not ncomb in self.monoFinalRes:
                self.monoFinalRes.append(ncomb)
        elif not ncomb in Rejects:
            Rejects.append(ncomb)

    ##Analysis identifies all ancestors to variants passing the required percentile thresholds
    # Following the removal of outliers, it parses through every combination of these variants to determine whether the pair is polyphyletic or not
    # Returns the cluster dict produced by ClusterKeys2; prints the accepted
    # pairs and the rejected (polyphyletic) pairs on the way.
    def variantAnalysis(self):
        monoFinal = {}
        self.variantCollection()
        if outlierFlag == "TRUE":
            # NOTE(review): OutlierFile is not closed anywhere in this class
            OutlierFile = open(outputPath + TreeShort + "." + percentile + "." + supportInput + ".Outlier.txt", "w")
            # NOTE(review): nesting of this try inside the `if` is a guess; any
            # error while reading IntraFile is silently ignored
            try:
                self.intraComb(IntraFile)
            except:
                pass
        Rejects = []
        for k, v in self.dictSharedReads.iteritems():
            x = 0
            count = 0
            ksp = k.split("__")
            krev = ksp[1] + "__" + ksp[0]
            FinalList = []
            clusters = self.dictSharedReads[k]
            for pair in itertools.combinations(clusters, 2):
                n = list(pair)
                node1 = n[0]
                node2 = n[1]
                if not node1 in self.nodesRemoved:
                    if not node2 in self.nodesRemoved:
                        nodeList = []
                        # the short ids are sliced out of the tuple's repr;
                        # "+ 2" skips the leading "('"
                        node1short = str(pair)[(idStart + 2) : (idLen + 2)]
                        pairSp = str(pair).split(",")
                        node2short = pairSp[1].replace(" ", "").replace("'", "")[idStart:idLen]
                        nshort = [node1short, node2short]
                        ncomb = node1short + "__" + node2short
                        ncombRev = node2short + "__" + node1short
                        if node1short != node2short:
                            # NOTE(review): `not ncomb or not ncombRev in ...`
                            # tests only ncombRev (ncomb is never empty)
                            if not ncomb or not ncombRev in self.monoFinalRes:
                                if outlierFlag == "TRUE":
                                    PhyloVarRemoval = self.PhylyOutlierRem(n, node1, node2, OutlierFile, idStart, idLen)
                                else:
                                    PhyloVarRemoval = self.CommonAncestor(n)
                                # distinct short ids found under the common ancestor
                                for i in PhyloVarRemoval:
                                    node = i[idStart:idLen]
                                    if not node in nodeList:
                                        nodeList.append(node)
                        # re-check: PhylyOutlierRem may have flagged either variant
                        if not node1 in self.nodesRemoved:
                            if not node2 in self.nodesRemoved:
                                if len(nodeList) == 2:
                                    if not ncomb or not ncombRev in self.monoFinalRes:
                                        if self.CheckMono(ncomb, PhyloVarRemoval, Rejects, monoFinal):
                                            break
                                elif len(nodeList) > 2:
                                    monoPos = 0
                                    lengthNode = len(nodeList)
                                    flag = 0
                                    nodeCheck = 0
                                    nodeRemoval = []
                                    # ids that are neither of the pair nor a key of SerialNodes
                                    for i in set(nodeList):
                                        if not i in nshort:
                                            if not i in self.SerialNodes:
                                                nodeRemoval.append(i)
                                                flag = 1
                                    if flag == 0:
                                        if len(PhyloVarRemoval) > 1:
                                            if not ncomb or not ncombRev in self.monoFinalRes:
                                                if self.CheckMono(ncomb, PhyloVarRemoval, Rejects, monoFinal):
                                                    break
                                    else:
                                        for item in nodeRemoval:
                                            nodeRemovalShort = item[idStart:idLen]
                                            # drop the foreign id's variants only when it
                                            # shares no dictSharedReads key with either
                                            # member of the pair
                                            if not nodeRemovalShort + "__" + node1short in self.dictSharedReads.keys():
                                                if (
                                                    not node1short + "__" + nodeRemovalShort
                                                    in self.dictSharedReads.keys()
                                                ):
                                                    if (
                                                        not node2short + "__" + nodeRemovalShort
                                                        in self.dictSharedReads.keys()
                                                    ):
                                                        if (
                                                            not nodeRemovalShort + "__" + node2short
                                                            in self.dictSharedReads.keys()
                                                        ):
                                                            # NOTE(review): list is modified
                                                            # while it is being iterated
                                                            for i in PhyloVarRemoval:
                                                                nodeShort = i[idStart:idLen]
                                                                if nodeShort in nodeRemoval:
                                                                    PhyloVarRemoval.remove(i)
                                        # NOTE(review): nesting of the next block and
                                        # of `flag = 0` is a guess
                                        if len(PhyloVarRemoval) > 1:
                                            if not ncomb in self.monoFinalRes:
                                                if self.CheckMono(ncomb, PhyloVarRemoval, Rejects, monoFinal):
                                                    break
                                        flag = 0
        if outlierFlag == "TRUE":
            for i in set(self.nodesRemoved):
                OutlierFile.write("%s\n" % i)
        self.dictClusters = {}
        self.count = 0
        for i in self.monoFinalRes:
            if not "polyphyletic" in i:
                if not self.count in self.dictClusters:
                    self.dictClusters[self.count] = []
                values = self.dictClusters.values()
                isp = i.split("__")
                node1 = isp[0]
                node2 = isp[1].split("\t")[0]
                # self.ClusterKeys has no return statement, so this local is None
                ClusterKeys = self.ClusterKeys(values, node1, node2)
        # If the loop above never assigned ClusterKeys, the NameError raised
        # here is caught by the bare except, which reports the files as empty.
        try:
            FinalClustering = self.ClusterKeys2(ClusterKeys)
        except:
            print "WARNING: Patristic Distance Data files may be empty..."
            sys.exit(1)
        for k, v in monoFinal.iteritems():
            print k + "\t" + str(v)
        print "Clusters that are polyphyletic: " + str(Rejects)
        return FinalClustering
def build_nj_phylip(alignment, outfile, outgroup, work_dir="."):
    """
    Build a neighbor joining tree of DNA seqs with PHYLIP in EMBOSS.

    PHYLIP manual
    http://evolution.genetics.washington.edu/phylip/doc/

    The alignment is written to ``<work_dir>/work/aln.phy``, bootstrapped
    (fseqboot, 100 replicates), turned into distance matrices (fdnadist),
    NJ trees (fneighbor) and a consensus tree (fconsense). Branch lengths are
    then fitted on the consensus topology with ``run_ffitch`` using the
    distances of the original alignment, and the node supports of the
    consensus tree are copied onto that tree.

    alignment: alignment object accepted by ``AlignIO.write``.
    outfile:   path of the final Newick tree. When rerooting succeeds, any
               ".unrooted" substring is removed from this path.
    outgroup:  outgroup file handed to ``smart_reroot``; falsy to skip
               rerooting.
    work_dir:  directory containing the "work" sub-directory.

    Returns ``(outfile, phy_file)`` when a non-empty tree file was written,
    otherwise ``None``.
    """

    phy_file = op.join(work_dir, "work", "aln.phy")
    try:
        # `open` + context manager instead of the Python-2-only `file()`
        # builtin, so the handle is flushed and closed before PHYLIP reads it.
        with open(phy_file, "w") as phy_handle:
            AlignIO.write(alignment, phy_handle, "phylip")
    except ValueError:
        print("Repeated seq name, possibly due to truncation. NJ tree not built.", file=sys.stderr)
        return None

    # Common prefix of every intermediate file ("<work_dir>/work/aln").
    prefix = phy_file.rsplit(".", 1)[0]

    seqboot_out = prefix + ".fseqboot"
    seqboot_cl = FSeqBootCommandline(FPHYLIP_BIN("fseqboot"),
                                     sequence=phy_file, outfile=seqboot_out,
                                     seqtype="d", reps=100, seed=12345)
    stdout, stderr = seqboot_cl()
    logging.debug("Resampling alignment: %s" % seqboot_cl)

    dnadist_out = prefix + ".fdnadist"
    dnadist_cl = FDNADistCommandline(FPHYLIP_BIN("fdnadist"),
                                     sequence=seqboot_out, outfile=dnadist_out,
                                     method="f")
    stdout, stderr = dnadist_cl()
    logging.debug("Calculating distance for bootstrapped alignments: %s" % dnadist_cl)

    neighbor_out = prefix + ".njtree"
    e = prefix + ".fneighbor"
    neighbor_cl = FNeighborCommandline(FPHYLIP_BIN("fneighbor"),
                                       datafile=dnadist_out, outfile=e,
                                       outtreefile=neighbor_out)
    stdout, stderr = neighbor_cl()
    logging.debug("Building Neighbor Joining tree: %s" % neighbor_cl)

    consense_out = prefix + ".consensustree.nodesupport"
    e = prefix + ".fconsense"
    consense_cl = FConsenseCommandline(FPHYLIP_BIN("fconsense"),
                                       intreefile=neighbor_out, outfile=e,
                                       outtreefile=consense_out)
    stdout, stderr = consense_cl()
    logging.debug("Building consensus tree: %s" % consense_cl)

    # distance without bootstrapping
    dnadist_out0 = prefix + ".fdnadist0"
    dnadist_cl0 = FDNADistCommandline(FPHYLIP_BIN("fdnadist"),
                                      sequence=phy_file, outfile=dnadist_out0,
                                      method="f")
    stdout, stderr = dnadist_cl0()
    logging.debug("Calculating distance for original alignment: %s" % dnadist_cl0)

    # infer branch length on consensus tree
    consensustree1 = prefix + ".consensustree.branchlength"
    run_ffitch(distfile=dnadist_out0, outtreefile=consensustree1,
               intreefile=consense_out)

    # write final tree
    ct_s = Tree(consense_out)

    if outgroup:
        t1 = consensustree1 + ".rooted"
        t2 = smart_reroot(consensustree1, outgroup, t1)
        # smart_reroot returns its output path only when rerooting happened.
        if t2 == t1:
            outfile = outfile.replace(".unrooted", "")
        ct_b = Tree(t2)
    else:
        ct_b = Tree(consensustree1)

    # The consensus tree stores the replicate count in the branch-length
    # slot; dividing by 100 (the number of replicates) gives a 0-1 support.
    nodesupport = {}
    for node in ct_s.traverse("postorder"):
        node_children = tuple(sorted([f.name for f in node]))
        if len(node_children) > 1:
            nodesupport[node_children] = node.dist / 100.

    for k, v in nodesupport.items():
        ct_b.get_common_ancestor(*k).support = v
    print(ct_b)
    ct_b.write(format=0, outfile=outfile)

    try:
        s = op.getsize(outfile)
    except OSError:
        s = 0
    if s:
        logging.debug("NJ tree printed to %s" % outfile)
        return outfile, phy_file
    else:
        logging.debug("Something was wrong. NJ tree was not built.")
        return None
if len(node) > biggest_other_node: biggest_other_node = len(node) tree.set_outgroup(node) #test the various phylogenetic criteria for LGT. print "Tree\tResult\tEuksInTree\tSupportEukMonophyly\tEuksInTargetGroup\tDistanceToClosestEukClade\tSupergroupsInTargetGroup" #euk sequence is a singleton nested within a clade of bacteria, and there is only one eukaryote sequence in the tree if len(eukaryote_seqs) == 1: #this is, I guess, an LGT candidate print sys.argv[1] + "\tSingleton\t1\tN/A\tN/A\tN/A\t1" #euk sequence is a singleton nested within a clade of bacteria, and the eukaryotes are not monophyletic in the tree #print len(eukaryote_seqs) else: try: answer = tree.check_monophyly(values=eukaryote_seqs, target_attr="name") if answer[0] == True: ca = tree.get_common_ancestor(eukaryote_seqs) target_group_sgs = {} for leaf in ca: if leaf.name in group_assignments: leaf_supergroup = group_assignments[leaf.name] if leaf_supergroup in euk_supergroups: target_group_sgs[leaf_supergroup] = 1 else: print "Warning: a sequence in this tree doesn't have a supergroup assignment: " + str(leaf.name) num_sgs = len(target_group_sgs.keys()) print sys.argv[1] + "\tEuks monophyletic\t" + str(len(eukaryote_seqs)) + "\t" + str(ca.support) + "\tN/A\tN/A\t" + str(num_sgs) elif answer[0] == False: mono_groups = [] target_group = '' for node in tree.get_monophyletic(values=['Eukaryote'], target_attr="domain"): for leaf in node:
import random
from ete3 import Tree

# Creates a normal tree
t = Tree(
    '((H:0.3,I:0.1):0.5, A:1, (B:0.4,(C:0.5,(J:1.3, (F:1.2, D:0.1):0.5):0.5):0.5):0.5);'
)
print(t)

# Let's locate some nodes using the get common ancestor method
ancestor = t.get_common_ancestor("J", "F", "C")
# the search_nodes method (I take only the first match )
A = t.search_nodes(name="A")[0]
# and using the shortcut to finding nodes by name
C = t&"C"
H = t&"H"
I = t&"I"

# Let's now add some custom features to our nodes. add_features can be
# used to add many features at the same time.
C.add_features(vowel=False, confidence=1.0)
A.add_features(vowel=True, confidence=0.5)
ancestor.add_features(nodetype="internal")
# Or, using the oneliner notation
(t&"H").add_features(vowel=False, confidence=0.2)

# But we can automatize this. (note that it will overwrite the previous
# values)
# The name is compared against the individual vowels: a substring test
# (`name in "AEIOU"`) is also true for the empty name carried by unnamed
# internal nodes, which would wrongly flag them as vowels.
for leaf in t.traverse():
    if leaf.name in ("A", "E", "I", "O", "U"):
        leaf.add_features(vowel=True, confidence=random.random())
    else:
        leaf.add_features(vowel=False, confidence=random.random())

# Now we use these information to analyze the tree.
# Single pre-formatted strings keep the output identical on Python 2 and 3.
print("This tree has %s vowel nodes" % len(t.search_nodes(vowel=True)))
print("Which are %s" % ([leaf.name for leaf in t.iter_leaves() if leaf.vowel==True],))

# But features may refer to any kind of data, not only simple
def _root_split(tree):
    '''Return one list of node names per child of the root node *tree*.

    A leaf child contributes ``[name]``; an internal child contributes the
    names of all of its descendants (internal nodes included).
    '''
    groups = []
    for child in tree.children:
        if child.is_leaf():
            groups.append([child.name])
        else:
            groups.append([i.name for i in child.get_descendants()])
    return groups


def caluclate_rootstrap(treeFile, bootFile, is_rooted, out_group):
    '''
    Compute rootstrap support for every branch of a tree and write the
    annotated tree to ``<treeFile without extension>.rootstrap`` (Nexus).

    Parameters
    ----------
    treeFile: tree in newick format (.treefile in IQ-TREE)
    bootFile: bootstrap trees in newick format, one per line
              (e.g. .ufboot file in IQ-TREE)
    is_rooted: True if the ML tree and the bootstrap trees are already
               rooted. If False, the trees are rooted with the outgroup taxa,
               trees whose ingroup is not monophyletic are discarded, and the
               outgroup is pruned away.
    out_group: a file with outgroup taxa in Nexus format; required (not None)
               when is_rooted is False

    Returns
    -------
    polyphyly: number of bootstrap trees that were discarded because their
               ingroup was not monophyletic (always 0 when is_rooted is True)

    Raises
    ------
    SystemExit: when the outgroup is missing or unreadable, or when the
                ingroup of the ML tree is not monophyletic
    '''
    boottrees = []
    trees = []
    polyphyly = 0
    N_boottrees = 0
    if not is_rooted:
        if out_group is None:
            raise SystemExit('Error: Please provide outgroup taxa in Nexus format')
        ML_tree = Tree(treeFile)
        try:
            og = Read_Nex(out_group)  # get the outgroup taxa
        except Exception:
            raise SystemExit('Error: Cannot find outgroup taxa')
        if len(og) == 1:  # if there is one outgroup taxon use it to root the tree
            ML_root = ML_tree.search_nodes(name=og[0])[0]
        else:  # if there are more than one outgroup taxon find their common ancestor
            ML_root = ML_tree.get_common_ancestor(og)
        if not ML_root.is_root():
            ML_tree.set_outgroup(ML_root)
        ingroup = [n.name for n in ML_tree.get_leaves() if n.name not in og]
        # Only the monophyly test itself is guarded: a failure while pruning
        # or writing the tree must not be reported as "not monophyletic".
        try:
            ml_ingroup_ok = ML_tree.check_monophyly(values=ingroup, target_attr="name", ignore_missing=True)[0]
        except Exception:
            ml_ingroup_ok = False
        if not ml_ingroup_ok:
            raise SystemExit('Error: ML ingroup taxa are not monophyletic')
        ML_tree.prune(ingroup)  # prune ingroup taxa only
        rootedMLtree = os.path.splitext(treeFile)[0]+'_rooted.treefile'
        ML_tree.write(outfile=rootedMLtree)  # write the rooted ML tree with ingroup taxa only to a file
        with open(bootFile, 'r') as f:
            for tree in f:
                if not tree.strip():
                    continue  # a blank line is not a bootstrap replicate
                N_boottrees += 1
                t = Tree(tree)
                ingroup = [n.name for n in t.get_leaves() if n.name not in og]
                if len(og) == 1:  # if there is one outgroup taxon use it to root the tree
                    root = t.search_nodes(name=og[0])[0]
                elif len(og) > 1:  # if there are more than one outgroup taxon find their common ancestor
                    root = t.get_common_ancestor(og)
                else:  # if there is no outgroup taxa raise an error
                    raise SystemExit('Error: Please provide outgroup taxa in Nexus format')
                if not root.is_root():
                    t.set_outgroup(root)
                try:  # check if the ingroup is monophyletic
                    if t.check_monophyly(values=ingroup, target_attr="name", ignore_missing=True)[0]:
                        trees.append(t.write(format=9))
                    else:
                        polyphyly += 1
                except Exception:
                    polyphyly += 1
        # NOTE: `ingroup` here is the ingroup of the last bootstrap tree read
        # (or of the ML tree when the bootstrap file held no trees).
        for tree in trees:
            t = Tree(tree)
            t.prune(ingroup)
            boottrees.append(t.write(format=9))
    else:  # If you are using rooted ML tree and rooted bootstrap trees (e.g. NR model)
        ML_tree = Tree(treeFile)
        with open(bootFile, 'r') as f:
            for tree in f:
                if not tree.strip():
                    continue  # a blank line is not a bootstrap replicate
                N_boottrees += 1
                t = Tree(tree)
                boottrees.append(t.write(format=9))

    # (topology, count) for each run of consecutive identical topologies;
    # non-adjacent duplicates stay separate entries, which does not change
    # the sums accumulated below.
    booted = [(g[0], len(list(g[1]))) for g in ite.groupby(boottrees)]
    boottrees = [[count, _root_split(Tree(newick))] for newick, count in booted]

    if is_rooted:
        roots = all_possible_roots(treeFile)
    else:
        roots = all_possible_roots(rootedMLtree)

    rootstrap_value = dict.fromkeys(roots.keys(), 0)
    for node, rooted in roots.items():
        y = [set(i) for i in _root_split(Tree(rooted))]
        for split in boottrees:
            z = [set(i) for i in split[1]]
            if len(y) == len(z):
                for group in y:
                    if group in z:
                        z.remove(group)
                if len(z) == 0:
                    # float() keeps this a true division on Python 2 as well
                    rootstrap_value[node] += split[0] / float(N_boottrees)

    if is_rooted:
        t = Tree(treeFile)
    else:
        t = Tree(rootedMLtree)
    # Internal nodes are labelled n1, n2, ... in traversal order; these
    # labels (and the leaf names) are the keys looked up in rootstrap_value.
    k = 1
    for n in t.traverse():
        if not n.is_root():
            if not n.is_leaf():
                n.add_features(name='n'+str(k))
                n.add_features(rootstrap=rootstrap_value[n.name]*100)
                k += 1
            else:
                n.add_features(rootstrap=rootstrap_value[n.name]*100)

    # ete writes an annotated newick to a temporary file, dendropy converts
    # it to Nexus, then the temporary file is removed.
    temp = os.path.splitext(treeFile)[0]+'.temp'
    rootstrapTree = os.path.splitext(treeFile)[0]+'.rootstrap'
    t.write(outfile=temp, features=["rootstrap"])
    annotated = dendropy.Tree.get(path=temp, schema='newick')
    annotated.write(path=rootstrapTree, schema='nexus')
    os.remove(temp)
    return polyphyly
def reconcile_trees(higher_level, input_trees, computation_method,
                    reconciliation_software, cpu_cores, keep_polytomies,
                    root_notung, infer_transfers, output_reconciliations):
    """Reconcile every input gene tree against the eggNOG species tree.

    The species tree is cut down to the clade spanning the species of
    `higher_level`, one reconciliation job is prepared per entry returned by
    `read_trees(input_trees)`, the jobs are run (as a cluster task array when
    `computation_method` is 'cluster', otherwise in a process pool when
    `cpu_cores` > 1, otherwise serially), and the results are handed to
    `write_reconciliations(output_reconciliations, ...)`.

    Results that are None are dropped (with a message on stderr); results
    whose payload is falsy are replaced by the placeholder ('S', -5.0).
    """
    reconciliation.NOTUNG_WEB = reconciliation_software

    tree_computations = read_trees(input_trees)
    level_to_species = eu.read_level_species()

    # Species tree: keep only the clade that spans the requested level.
    full_species_tree = Tree(reconciliation.EGGNOGv4_SPECIES_TREE)
    level_species = [str(x) for x in level_to_species[higher_level]]
    level_clade = full_species_tree.get_common_ancestor(level_species)
    species_tree = level_clade.detach()

    sys.stderr.write('Generating species trees reconciliation...\n')

    if cpu_cores > 1:
        prep_args = [
            (nog_id, sample_no, tree_nw, species_tree, keep_polytomies,
             root_notung)
            for nog_id, sample_no, tree_nw in tree_computations
        ]
        with Pool(cpu_cores) as pool:
            prepared = pool.imap(reconciliation.prepare_reconciliation_job,
                                 prep_args)
            jobs = list(tqdm(prepared, total=len(prep_args)))
    else:
        jobs = [
            (nog_id, sample_no, tree_nw,
             reconciliation.prune_species_tree(tree_nw, species_tree,
                                               keep_polytomies),
             root_notung)
            for nog_id, sample_no, tree_nw in tqdm(tree_computations)
        ]

    sys.stderr.write('Starting reconciliation for %d jobs...\n' % len(jobs))

    run_job = reconciliation.reconcile

    # Run the reconciliations with the requested backend.
    if computation_method == 'cluster':
        reconciliation.submit_taskArray(higher_level, jobs, keep_polytomies,
                                        root_notung, infer_transfers)
        reconciliations = reconciliation.collect_taskArray(
            higher_level, cpu_cores=cpu_cores)
    elif cpu_cores > 1:
        with Pool(cpu_cores) as pool:
            reconciliations = list(
                tqdm(pool.imap(run_job, jobs), total=len(jobs)))
    else:
        reconciliations = [run_job(job) for job in tqdm(jobs)]

    # Note the missing entries and patch the failed ones in a single pass.
    missing = []
    for position, entry in enumerate(reconciliations):
        if entry is None:
            missing.append(position)
            continue
        nog_id, result = entry
        if not result:
            # reconciliation failed
            reconciliations[position] = (nog_id, ('S', -5.0))

    # Delete from the back so the remaining indices stay valid.
    for position in sorted(missing, reverse=True):
        sys.stderr.write(
            'Deleting reconciliation entry %d.%d because empty\n' %
            (higher_level, position))
        del reconciliations[position]

    write_reconciliations(output_reconciliations, reconciliations)
# Show the tree before rerooting (`t` is defined outside this excerpt).
print t
#          /-A
#         |
#         |          /-H
#---------|---------|
#         |          \-F
#         |
#         |          /-B
#          \--------|
#                   |          /-E
#                    \--------|
#                              \-D
#
# Let's define the ancestor of E and D as the tree outgroup. Of
# course, the definition of an outgroup will depend on user criteria.
ancestor = t.get_common_ancestor("E","D")
t.set_outgroup(ancestor)
print "Tree rooteda at E and D's ancestor is more basal that the others."
print t
#
#                    /-B
#          /--------|
#         |         |          /-A
#         |          \--------|
#         |                   |          /-H
#---------|                    \--------|
#         |                              \-F
#         |
#         |          /-E
#          \--------|
#                    \-D
# Command-line options: a Newick tree, a tab-separated table and the two
# leaves whose common ancestor is used as outgroup.
# NOTE: the description string below is Japanese (roughly "reordering of a
# phylogenetic tree and a table"); it is runtime text and is left as is.
parser = argparse.ArgumentParser(description='系統樹と表の並び替え')
parser.add_argument('-n', '--newick', help='newick file')
parser.add_argument('-t', '--table', help='table file, sep = tab, first line index')
parser.add_argument('-o1', '--outgroup1', help='set outgroup1')
parser.add_argument('-o2', '--outgroup2', help='set outgroup2')
args = parser.parse_args()

NEWICK = args.newick
g_genome = args.table
OUTG1 = args.outgroup1
OUTG2 = args.outgroup2

# Load the phylogenetic tree
t = Tree(NEWICK , format= 0)
# Reroot on the common ancestor of the two outgroup arguments.
ancestor = t.get_common_ancestor(OUTG1,OUTG2)
t.set_outgroup( ancestor )
ts = TreeStyle()
ts.show_leaf_name = True
ts.show_branch_support = True
# Outputs are written next to the input: <NEWICK>.png and <NEWICK>.newick
t.render(NEWICK+".png", w=600, units="mm",tree_style=ts)
t.write(format=0, outfile=NEWICK+".newick")

# Load the genome information
info = pd.read_table(g_genome, sep='\t', index_col=0)
frame = pd.DataFrame(info)

# Reorder the table
strain_list = t.get_leaf_names()
# Build a small example tree and look up common ancestors in it.
tree = Tree( '((H:1,I:1):0.5, A:1, (B:1,(C:1,D:1):0.5):0.5);' )
print "this is the original tree:"
print tree
#                    /-H
#          /--------|
#         |          \-I
#         |
#---------|--A
#         |
#         |          /-B
#          \--------|
#                   |          /-C
#                    \--------|
#                              \-D
# Finds the first common ancestor between D and C.
ancestor = tree.get_common_ancestor("D", "C")
print "The ancestor of C and D is:"
print ancestor
#          /-C
#---------|
#          \-D
# You can use more than two nodes in the search
ancestor = tree.get_common_ancestor("B", "C", "D")
print "The ancestor of B, C and D is:"
print ancestor
#          /-B
#---------|
#         |          /-C
#          \--------|
#                    \-D
# Finds the first sister branch of the ancestor node. Because
target_leaf = leaf else: leaf.add_features(domain="Other") #print eukaryote_seqs #test the various phylogenetic criteria for LGT. #euk sequence is a singleton nested within a clade of bacteria, and there is only one eukaryote sequence in the tree if len(eukaryote_seqs) == 1: #this is, I guess, an LGT candidate print sys.argv[1] + "\tSingleton" #euk sequence is a singleton nested within a clade of bacteria, and the eukaryotes are not monophyletic in the tree #print len(eukaryote_seqs) else: try: answer = tree.check_monophyly(values=eukaryote_seqs, target_attr="name") if answer[0] == True: ca = tree.get_common_ancestor(eukaryote_seqs) print sys.argv[1] + "\tEuks monophyletic\t" + str(len(eukaryote_seqs)) + "\t" + str(ca.support) elif answer[0] == False: mono_groups = [] target_group = '' for node in tree.get_monophyletic(values=['Eukaryote'], target_attr="domain"): if target_leaf in node: target_group = node else: mono_groups.append(node) size_target_group = len(target_group) #get distance shortest_distance = 999999999999999.0 closest_other_group = '' for subtree in mono_groups: curr_distance = tree.get_distance(target_group, subtree, topology_only=True)
line = line.rstrip() (geneFamily, nodeRb, iesColumn, presence) = line.split() # nodes are in rb format # find what type of speciation event nodeRb corresponds to if float(presence) > cutoff: spe = d[geneFamily][nodeRb] if spe: homies.setdefault((geneFamily,iesColumn), []).append(spe) # load species tree t = Tree(os.path.join(basePath, 'analysis/', 'phyldogT' + asrRun, 'results', 'OutputSpeciesTree_ConsensusNumbered.tree'), format = 1) # replace species names with speciation events and add 0 to root node label for l in t.traverse(): if(l.is_leaf()): l.name = (re.sub(r'.+_.+_(\d+)', r'\1', l.name)) elif(l.is_root()): if(l.name): quit(l.name) # is root named? else: l.name = '0' for (geneFamily, iesColumn) in homies: L = homies[(geneFamily, iesColumn)] if(len(L) == 1): # if only one node print('\t'.join([geneFamily, iesColumn, L[0]])) else: ancestor = t.get_common_ancestor(L) print('\t'.join([geneFamily, iesColumn, ancestor.name]))
# load species tree
t = Tree(
    '/home/dsellis/data/IES/analysis/phyldog/results/OutputSpeciesTree_ConsensusNumbered.tree',
    format=1)

# replace species names with speciation events and add 0 to root node label
for l in t.traverse():
    if (l.is_leaf()):
        # keep only the trailing number of a "<x>_<y>_<number>" leaf label
        l.name = (re.sub(r'.+_.+_(\d+)', r'\1', l.name))
    elif (l.is_root()):
        if (l.name):
            quit(l.name)  # is root named?
        else:
            l.name = '0'

# read line by line firstIES.dat
# Each data row is: cluster, iesColumn, then one or more node labels.
# The `with` block guarantees the input file is closed once it has been read.
with open('/home/dsellis/data/IES/analysis/tables/firstIES.dat', 'r') as f:
    header = f.readline()  # first line is skipped; its content is not used here
    print('\t'.join(['cluster', 'iesColumn', 'spEvent']))
    for line in f:
        line = line.rstrip()
        L = line.split()
        cluster = L.pop(0)
        iesColumn = L.pop(0)
        if (len(L) == 1):  # if only one node
            print('\t'.join([cluster, iesColumn, L[0]]))
        else:
            ancestor = t.get_common_ancestor(L)
            print('\t'.join([cluster, iesColumn, ancestor.name]))
tree.render(file_name=sys.argv[1] + "_" + cluster + ".pdf", tree_style=ts, w=width) big_tree = Tree(sys.argv[1]) mode = sys.argv[2] metadata = {} metadata = get_meta_new(metadata, big_tree) colourDict = get_colours(clusters, big_tree, colours) # remove dodgy sample big_tree.search_nodes(name="'EBOV|EMLab-RT|IPDPFHGINSP_GUI_2015_5339||GIN|Conakry|?|MinION_LQ05|2015-04-08'")[0].delete( preserve_branch_length=True ) # root the same as the MCC tree ancestor = big_tree.get_common_ancestor( "'EBOV|EMLab|EM_079422|KR817187|GIN|Macenta|?||2014-03-27'", "'EBOV|EMLab|Gueckedou-C05|KJ660348|GIN|Gueckedou|?||2014-03-19'", ) big_tree.set_outgroup(ancestor) big_tree.ladderize() ts = TreeStyle() ts.show_leaf_name = False # ts.show_branch_support = True ts.scale = 100000 if mode == "small": ts.scale = 750000 # add legend for each in colourDict.keys(): ts.legend.add_face(CircleFace(radius=size[mode] / 2, color=colourDict[each]), column=0) ts.legend.add_face(TextFace(each, ftype="Helvetica", fsize=size[mode]), column=1)
"grnPRASc_MMETSP0941_Gene.14464-Transcript_5625_Chlorophyta_Prasinococcales"], "Clade2": ["crypGONIp_MMETSP0107_Gene.30083-Transcript_20766_Cryptophyta_Cryptomonadales", "Phenylobacterium_sp._RIFCSPHIGHO2_01_FULL_69_31_OHB27812.1"] } #or make them available in a tsv: with open('_ancestors.tsv', 'r') as f: reader = csv.reader(f, delimiter='\t') try: ancestors_d.update({r[0]: [r[1], r[2]] for r in reader}) except IndexError: #incomplete line print("Incomplete processing of ancestor lineages!") print(ancestors_d) try: ancestor = t.get_common_ancestor(ancestors_d[filename]) #print(ancestor) t.set_outgroup(ancestor) except KeyError: print("Root not selected!") print(t.get_tree_root()) quit() t.ladderize(direction=1) #select scale 0-1.0 or 0-100 for support values supportscache = t.get_cached_content(store_attr="support") supportslist = [x.support for x in supportscache] if max(supportslist) == 1: minsupport = 0.85 else: minsupport = 85
# Show the tree before rerooting (`t` is defined outside this excerpt).
print t
#          /-A
#         |
#         |          /-H
#---------|---------|
#         |          \-F
#         |
#         |          /-B
#          \--------|
#                   |          /-E
#                    \--------|
#                              \-D
#
# Let's define the ancestor of E and D as the tree outgroup. Of
# course, the definition of an outgroup will depend on user criteria.
ancestor = t.get_common_ancestor("E", "D")
t.set_outgroup(ancestor)
print "Tree rooteda at E and D's ancestor is more basal that the others."
print t
#
#                    /-B
#          /--------|
#         |         |          /-A
#         |          \--------|
#         |                   |          /-H
#---------|                    \--------|
#         |                              \-F
#         |
#         |          /-E
#          \--------|
#                    \-D
# Build a table of node ages: node names come from `args.tree`, ages (distance
# from a node to its closest leaf) come from `args.nwk`.
tree = Tree(args.tree, format=1)
nwk = Tree(args.nwk, format=1)
assert (len(tree) == len(nwk))

# Make the leaf names of `nwk` agree with those of `tree`: an unknown name is
# replaced by the first reference name sharing its first "_"-separated field
# or, failing that, its second field.
tree_leaf_names = set(tree.get_leaf_names())
for leaf in nwk:
    if leaf.name not in tree_leaf_names:
        try:
            leaf.name = next(n for n in tree_leaf_names if leaf.name.split("_")[0] == n.split("_")[0])
        except StopIteration:
            leaf.name = next(n for n in tree_leaf_names if leaf.name.split("_")[1] == n.split("_")[1])
assert (set(nwk.get_leaf_names()) == tree_leaf_names)

root_age = nwk.get_closest_leaf()[1]
df = [["Root", root_age, root_age, root_age]]
for n in nwk.iter_descendants(strategy='postorder'):
    if not n.is_leaf():
        # The matching node in `tree` is the common ancestor of the same leaves.
        name = tree.get_common_ancestor(n.get_leaf_names()).name
        if n.dist <= 0:
            print("Node " + name + " is attached to it's parent, thus it's discarded.")
            continue
        age = n.get_closest_leaf()[1]
        df += [[name, age, age, age]]

header = ["NodeName", "Age", "LowerBound", "UpperBound"]
# When the input name contains no ".nwk", str.replace returns it unchanged and
# the table would overwrite the input tree; append ".tsv" in that case.
tsv_path = args.nwk.replace(".nwk", ".tsv")
if tsv_path == args.nwk:
    tsv_path = args.nwk + ".tsv"
pd.DataFrame(df).to_csv(tsv_path, index=False, header=header, sep="\t")
# | | /-L # | \--------| #---------| \-M # | # | /-B # | /--------| # | | | /-J # | | \--------| # \--------| \-K # | # | /-E # \--------| # \-D # # Each main branch of the tree is independently rooted. node1 = t.get_common_ancestor("A", "H") node2 = t.get_common_ancestor("B", "D") node1.set_outgroup("H") node2.set_outgroup("E") print "Tree after rooting each node independently:" print t # # /-F # | # /--------| /-L # | | /--------| # | | | \-M # | \--------| # /--------| | /-A # | | \--------| # | | \-C
# Recompute the support of every internal node of the main tree as the
# percentage of bootstrap trees that contain exactly the same clade.
for main_node in main_tree.traverse(strategy="levelorder"):
    if main_node.is_leaf():
        continue
    new_support = 0
    # Leaf names of the clade below THIS node. Taking them from main_tree
    # would compare the full leaf set every time, which matches trivially
    # and gives every node a support of 100.
    clade_leaf_names = main_node.get_leaf_names()
    clade_leaf_set = set(clade_leaf_names)
    # Now check for each bs_tree if the common ancestor of these same leaves has more leaves
    for bs_tree in bootstrap_trees:
        # Get all node objects for all the leaves by name
        clade_leaf_nodes_in_bs_tree = [
            bs_tree.get_leaves_by_name(leaf_name)[0]
            for leaf_name in clade_leaf_names
        ]
        # Get common ancestor in bs_tree
        common_ancestor_node = bs_tree.get_common_ancestor(
            clade_leaf_nodes_in_bs_tree)
        # Get leafnames of the common ancestor node and check if they are the same
        bs_tree_clade_leaf_names = common_ancestor_node.get_leaf_names()
        if clade_leaf_set == set(bs_tree_clade_leaf_names):
            # Clades match!
            new_support = new_support + 1
    new_support = new_support / len(bootstrap_trees) * 100
    logger.debug("Support for internal node was {}, now is {}".format(
        main_node.support, new_support))
    main_node.support = new_support

# Output the new tree
logger.info("Writing new bootstrapped tree to STDOUT.")
print(main_tree.write())
# Reroot `tree` on the branch separating Archaea from the other taxa and
# write the result to "<argv[1]>_rerooted".
print(tree)
archaea = []  # make a list of archaea that are in the tree
bacteria = []

# check the domain of each taxon in the tree
for taxon in tree:
    print(taxon.name + "\t" + id_to_domain[taxon.name])
    if id_to_domain[taxon.name] == 'Archaea':
        archaea.append(taxon.name)
    else:
        bacteria.append(taxon.name)

# A single name handed to get_common_ancestor is resolved against the tree
# root, which yields the root itself and cannot be used as outgroup; a lone
# taxon is therefore looked up directly with `tree & name`.

# first, check if archaea are monophyletic in the tree
if tree.check_monophyly(values=archaea, target_attr="name")[0] == True:
    # find the branch separating archaea and bacteria, and reroot the tree on that
    if len(archaea) == 1:
        archaea_ancestor = tree & archaea[0]
    else:
        archaea_ancestor = tree.get_common_ancestor(archaea)
    tree.set_outgroup(archaea_ancestor)
elif tree.check_monophyly(values=bacteria, target_attr="name")[0] == True:
    if len(bacteria) == 1:
        bacteria_ancestor = tree & bacteria[0]
    else:
        bacteria_ancestor = tree.get_common_ancestor(bacteria)
    tree.set_outgroup(bacteria_ancestor)
else:
    # neither archaea nor bacteria were monophyletic, so print some error and quit
    print(sys.argv[1] + ": neither A nor B monophyletic.")
    quit()

outfile_name = sys.argv[1] + "_rerooted"
tree.write(outfile=outfile_name)