def do_alnntree(pref, ndf, fasta, refs, congen, targetids, gaps=0.9, cpus=-1):
    """Align reference and target sequences, trim the alignment and build a tree.

    The FASTA text sent to mafft is ``congen`` followed by one record per
    ``saccver`` group of ``ndf`` (sequence looked up in ``refs``) and by every
    record of the ``fasta`` shelve whose header (minus its first character) is
    in ``targetids``.  The trimmed alignment is written to ``<pref>.aln``, the
    tree to ``<pref>.tree`` and a dill pickle of it to ``<pref>.treepickle``.

    Returns:
        Tuple ``(tree, tax2)`` where ``tax2`` is whatever
        ``tree.annotate_ncbi_taxa()`` returns.
    """
    # TODO: add checkpoint to avoid repeating
    chunks = [congen]
    for name, data in ndf.groupby('saccver'):
        taxid = data.staxid.iloc[0]
        try:
            raw = refs['>%s' % name]
        except KeyError:
            # Fall back to the part of the accession before the first '|'.
            name = name.split('|')[0]
            raw = refs['>%s' % name]
        chunks.append('>%d.%s\n%s\n' % (taxid, name, raw.replace('\n', '').strip()))

    with shelve.open(fasta) as dic:
        for header, body in dic.items():
            if header.strip()[1:] not in targetids:
                print(header, 'not in')
                continue
            print(header)
            chunks.append('%s\n%s\n' % (header, body.strip().replace('\n', '')))

    to_phy = ''.join(chunks)
    mafft_cmd = ['mafft', '--thread', str(cpus), '--auto', '-']
    aln, _ = stdin_run(mafft_cmd, to_phy)
    trm = trimaln(aln.decode('utf-8'), targetids, gaps=gaps)

    fasttree_cmd = ['fasttreeMP', '-nt', '-gtr', '-gamma']
    tre, _ = stdin_run(fasttree_cmd, trm)
    # Drop the final character, turn any remaining ';' into '-' and
    # terminate the newick string with a single ';'.
    newick = tre.strip()[:-1].replace(b';', b'-').decode('utf-8') + ';'
    t = PhyloTree(newick, sp_naming_function=lambda name: name.split('.')[0])

    with open('%s.aln' % pref, 'w') as al, open('%s.treepickle' % pref, 'wb') as tp:
        al.write(trm)
        t.write(outfile='%s.tree' % pref)
        dill.dump(t, tp)

    tax2 = t.annotate_ncbi_taxa()
    fix_species(t)
    print(t)
    return t, tax2
def run(args):
    """Print in/out sequences of every evolutionary event of each source tree.

    Each newick produced by ``args.src_tree_iterator`` is consumed; it is only
    parsed and reported when ``args.orthologs`` is not None.
    """
    from ete3 import Tree, PhyloTree

    for newick in args.src_tree_iterator:
        if args.orthologs is None:
            continue
        tree = PhyloTree(newick)
        for event in tree.get_descendant_evol_events():
            print(event.in_seqs, event.out_seqs)
def LoadTrees(treeFile, dlm):
    """Reads and stores phylogenetic trees from a file

    Parameters
    ------
    treeFile: str, path to a file of newick trees, 1 per line; lines that
        start with "NA" are skipped
    dlm: str, delimiter; the part of a leaf name before the first ``dlm``
        is used as the species name

    Returns
    ------
    treelist: list, ete3 PhyloTree objects in file order
    """
    print("loading trees...")

    def species_of(node):
        return node.name.split(dlm)[0]

    treelist = []
    # The context manager closes the progress bar even if a line fails to parse.
    with tqdm(total=file_len(treeFile)) as pbar, open(treeFile, 'r') as newick:
        for line in newick:
            pbar.update(1)
            if line.startswith("NA"):
                continue
            t = PhyloTree(line)
            t.set_species_naming_function(species_of)
            treelist.append(t)
    return treelist
def reconcile_etetoolkit(protein):
    """Reconcile the gene tree of ``protein`` with its species tree and render it.

    Both trees are read in newick format 1 with the leaf name used verbatim as
    the species name.  The reconciled tree is rendered to ``phylotree.png``.
    """
    def identity(name):
        return name

    species_path = SPECIES_TREE_FILE.format(protein, 'nh')
    species_tree = PhyloTree(species_path, format=1, sp_naming_function=identity)

    gene_path = GENE_TREE_FILE.format(protein, protein, 'nh')
    gene_tree = PhyloTree(gene_path, format=1, sp_naming_function=identity)

    reconciled, _events = gene_tree.reconcile(species_tree)
    reconciled.render("phylotree.png")
def cut_stray_other(gene, species_keep, species_list):
    """Interactively remove stray gene copies from a clade tree.

    Loads ``<gene>/<gene>.3.fa.tre``, prunes it to ``species_keep`` and then
    repeatedly asks the user for genes to cut, either as a clade chosen through
    ``choose_clade`` or as a space-separated list.

    Args:
        gene: gene family name; also the directory and file prefix of the tree.
        species_keep: leaf names initially kept on the tree.
        species_list: passed through to ``view_counts``.

    Returns:
        The list of leaf names that remain after all accepted cuts.
    """
    ######Showing the tree######
    clade_tree = PhyloTree(gene + "/" + gene + ".3.fa.tre")
    clade_tree.prune(species_keep, preserve_branch_length=True)
    if len(species_keep) > 1:
        view_rooted_tree(clade_tree)
        print("\nThis is the clade tree. There are " + str(len(species_keep)) + " total gene copies.\n")
    else:
        print("\nSpecies tree only contains 1 species. Tree will not be shown.")
    cut_list = species_keep
    view_counts(cut_list, species_list)

    ######Removing stray within-clade gene copies from the clade######
    cut_question = raw_input("\nAre there stray genes to cut? (y/n)")
    # startswith() treats an empty answer as "no" instead of raising IndexError.
    while cut_question.startswith("y"):
        choice4 = raw_input("\nIf this group is a monophyletic clade, type c.\nOtherwise, type n.")
        if choice4.startswith("c"):
            cut_gene_list = choose_clade(clade_tree)
            # Keep a printable form so the error messages below never hit an
            # unbound name when the clade path was taken.
            cut_gene_str = " ".join(str(name) for name in cut_gene_list)
        else:
            cut_gene_str = raw_input("\nEnter genes to cut, separated by a space: ")
            cut_gene_list = cut_gene_str.split()

        # Work on a candidate list; it only replaces cut_list once the prune
        # succeeded, so an abandoned cut leaves the result untouched.
        remaining = [i for i in cut_list if i not in cut_gene_list]
        if set(cut_gene_list).issubset(species_keep):
            try:
                clade_tree.prune(remaining, preserve_branch_length=True)
                cut_list = remaining
                view_rooted_tree(clade_tree)
                view_counts(cut_list, species_list)
            except ValueError:
                print("\nSomething is wrong with the way the genes were entered. You entered:\n" + cut_gene_str + "\nCut abandoned.")
        else:
            print("\nAt least one gene is not found on the tree. You entered:\n" + cut_gene_str + "\nCut abandoned.")
        cut_question = raw_input("\nAre there more genes to cut? (y/n)")
    return cut_list
def define_groups(gene, cut_list, species_list, species_keep, clade_name):
    """Interactively split the genes left on a clade tree into saved groups.

    Loads ``<gene>/<gene>.3.fa.tre``, prunes it to ``cut_list`` and loops while
    the user answers "y": a candidate group is taken from the whole list, from
    ``choose_clade`` or from typed gene names, passed through
    ``check_single_group``, and each resulting sub-group is confirmed and
    saved with ``saving_group`` under a name built from ``clade_name``, the
    sub-clade name and a running counter.

    NOTE(review): the source this block came from had lost its indentation;
    the nesting below (in particular which ``if`` the final ``else`` belongs
    to, and the level of ``n=n+1`` and of the ``cut_list`` filter) is a
    reconstruction and must be confirmed against the original file.
    """
    clade_tree=PhyloTree(gene+"/"+gene+".3.fa.tre")
    clade_tree.prune(cut_list,preserve_branch_length=True)
    # Running counter appended to every saved group name.
    n=1
    ######Designating whole clade duplications######
    choice=raw_input("\nWould you like to make a group? (y/n)")
    while choice[0] == "y":
        choice4=raw_input("\nIf this group includes all the genes left on the tree, type a.\nIf this group is a monophyletic clade, type c.\nOtherwise, type n.")
        if choice4[0]=="a":
            group_list=cut_list
        elif choice4[0]=="c":
            group_list=choose_clade(clade_tree)
        else:
            group_str=raw_input("\nEnter genes for the group, separated by a space: ")
            group_list=[item for item in group_str.split()]
        ######Checking that there is only one gene per species######
        group_list2,subclade_name=check_single_group(group_list)
        ######Checking for typos######
        for i in group_list2:
            if set(i).issubset(species_keep):
                ######Allow a chance to back out, for example if user forgot to enter spaces######
                print("\nThere are "+str(len(i))+" genes in this group.\nGroup looks like:")
                print (i)
                choice3=raw_input("\nMake the group? (y/n)")
            else:
                print ("\nAt least one gene is not found on the tree. You entered:\n")
                print (i)
                choice3=raw_input("\nEnter n to abandon this list and start again.")
            if choice3[0]=="y":
                ######Saving group as file and add group to master list######
                # "ynyn" is the sub-clade value for which the name gets no
                # sub-clade component (meaning of the value not visible here).
                if subclade_name=="ynyn":
                    clade_filename="{}_{}".format(clade_name, n)
                    saving_group(gene, i, clade_filename)
                else:
                    clade_filename="{}_{}_{}".format(clade_name, subclade_name, n)
                    saving_group(gene, i, clade_filename)
                n=n+1
            else:
                print("\nGroup abandoned.")
                choice=raw_input("\nWould you like to make a group for this clade? (y/n)")
            # Remove the genes of this sub-group from the working list.
            cut_list=[j for j in cut_list if j not in i]
        ######Checking to see if the tree is empty######
        if len(cut_list) == 0:
            print("\nThe tree is now empty. We will continue with the next clade.")
            choice="n"
        ######Preparing for next group######
        else:
            choice2=raw_input("\nWould you like to view the tree with the group removed? (y/n)")
            if choice2[0] == "y":
                clade_tree.prune(cut_list,preserve_branch_length=True)
                view_rooted_tree(clade_tree)
                view_counts(cut_list, species_list)
                choice=raw_input("\nWould you like to make another group for this clade? (y/n)")
            else:
                print("\nGroup abandoned.")
                choice=raw_input("\nWould you like to make a group for this clade? (y/n)")
def get_example_tree():
    """Reconcile a RAxML gene tree with a RAxML species tree.

    Returns:
        Tuple of the reconciled tree (linked to the alignment ``alg``) and a
        fresh ``TreeStyle``.
    """
    # Performs a tree reconciliation analysis
    gene_path = '/home/issa/Documents/stage/raxml/clusters_Trimal/bestTree/RAxML_bestTree.cluster_9.fasta.aln'
    species_path = '/home/issa/Documents/stage/raxml/specie_tree_Trimal/RAxML_bestTree.specie_TREE_trimal.tree'

    gene = PhyloTree(gene_path)
    species = PhyloTree(species_path)
    reconciled, _events = gene.reconcile(species)
    # NOTE(review): `alg` is not defined in this function; presumably a
    # module-level alignment - confirm.
    reconciled.link_to_alignment(alg)
    return reconciled, TreeStyle()
def get_example_tree():
    """Reconcile a small example gene tree with its species tree.

    Returns:
        Tuple of the reconciled tree (linked to the alignment ``alg``) and a
        fresh ``TreeStyle``.
    """
    # Performs a tree reconciliation analysis
    gene_newick = '((Dme_001,Dme_002),(((Cfa_001,Mms_001),((Hsa_001,Ptr_001),Mmu_001)),(Ptr_002,(Hsa_002,Mmu_002))));'
    species_newick = "((((Hsa, Ptr), Mmu), (Mms, Cfa)), Dme);"

    gene = PhyloTree(gene_newick)
    species = PhyloTree(species_newick)
    reconciled, _events = gene.reconcile(species)
    # NOTE(review): `alg` is not defined in this function; presumably a
    # module-level alignment - confirm.
    reconciled.link_to_alignment(alg)
    return reconciled, TreeStyle()
def main():
    """Print a TSV of per-species gene copy numbers for each gene tree.

    Command-line arguments:
        --genetree      file with one gene tree per line
        --speciesorder  comma-separated species list; fixes the column order
                        and the priority used to pick the reference gene

    Every leaf must be named ``gene_species``.  For each tree a Counter of
    species is built, and its ``'gene'`` entry is set to the gene of the first
    leaf belonging to the first species of ``--speciesorder`` present in the
    tree.

    Raises:
        Exception: if a leaf name is not in ``gene_species`` format or none of
            the requested species occurs in a tree.
    """
    parser = argparse.ArgumentParser(description='Gene Copy Number Finder')
    parser.add_argument('--genetree', required=True,
                        help='GeneTree in nhx format')
    parser.add_argument('--speciesorder', required=True,
                        help='Comma-separated species list')
    args = parser.parse_args()

    species_list = [name.strip() for name in args.speciesorder.split(",")]

    table = []
    with open(args.genetree, "r") as f:
        # reads multiple gene tree line by line gene tree
        for line in f:
            # Blank lines carry no tree and would only make the parser fail.
            if not line.strip():
                continue
            # Remove empty NHX features that can be produced by TreeBest but break ete3
            line = line.replace('[&&NHX]', '')

            # reads single gene tree
            genetree = PhyloTree(line)
            leaves = genetree.get_leaf_names()
            leaves_parts = [leaf.split("_") for leaf in leaves]
            for leaf, leaf_parts in zip(leaves, leaves_parts):
                if len(leaf_parts) != 2:
                    raise Exception(
                        "Leaf node '%s' is not in gene_species format" % leaf)

            species_counter = collections.Counter(
                leaf_parts[1] for leaf_parts in leaves_parts)

            # First element of species_list which appears in a leaf node
            ref_species = next(
                (name for name in species_list if name in species_counter), None)
            if ref_species is None:
                raise Exception(
                    "None of the specified species was found in the GeneTree '%s'" % line)

            # Find the gene of the (first) leaf node for the ref_species
            for leaf_parts in leaves_parts:
                if leaf_parts[1] == ref_species:
                    species_counter['gene'] = leaf_parts[0]
                    break

            table.append(species_counter)

    colList = ["gene"] + species_list
    printTSV(table, colList)
def test_lineages(self):
    r"""
    Search trees (naming format: NumericTaxid.SequenceName)
    for nodes containing branches that separate two groups of primate genes where,
    in one side, the human gene has been lost,
    and the branch support value of the matching node is higher than 0.9.

                          /-Any primate taxid (9443 in lineage)
        support >= 0.9--|
                          \-Any primate taxid except human
    """
    t1 = PhyloTree("(9601.ENSPPYP00000022176:1,9593.ENSGGOP00000009720:1);")
    t2 = PhyloTree("(9361.ENSDNOP00000016844:1,9258.ENSOANP00000032529:1);")
    t3 = PhyloTree(
        "(((((37347.ENSTBEP00000010698:0.120098,(9361.ENSDNOP00000000113:0.0697238,(9785.ENSLAFP00000009564:0.0297499,(9371.ENSETEP00000002412:0.0588324,9813.ENSPCAP00000006440:0.026638)0.985184:0.0242194)0.99985:0.0211882)0.99706:0.0161759)0.756:0.00666819,((132908.ENSPVAP00000002358:0.0439546,59463.ENSMLUP00000004598:0.0635161)0.994843:0.00885432,(9796.ENSECAP00000009809:0.0292517,((9685.ENSFCAP00000004938:0.056779,(9615.ENSCAFP00000008559:0.039179,(9823.ENSSSCP00000024070:0.126803,(9669.ENSMPUP00000010096:0.0341928,9646.ENSAMEP00000005906:0.0189746)0.995231:0.00951966)0.915476:0.0046099)0.949664:0.00417374)0.99985:0.0133593,(9739.ENSTTRP00000009464:0.0664336,9913.ENSBTAP00000001687:0.036632)0.99985:0.0236174)0.939309:0.00508062)0.991475:0.00823937)0.99985:0.0107263)0.99985:0.0100107,((9986.ENSOCUP00000014919:0.0830612,10141.ENSCPOP00000005291:0.12195)0.99985:0.0202639,((9483.ENSCJAP00000047968:0.0446865,(9544.ENSMMUP00000007168:0.0201746,((9593.ENSGGOP00000005929:0.00916494,(9606.ENSP00000294053:1.3e-07,9598.ENSPTRP00000006940:0.0068176)0.955193:0.00220905)0.99985:0.00778854,(9601.ENSPPYP00000004174:0.00495163,61853.ENSNLEP00000020892:0.179569)0.290072:0.00153447)0.998732:0.00889714)0.99985:0.0144864)0.99985:0.0344562,(9478.ENSTSYP00000006073:0.129349,(30608.ENSMICP00000010690:0.0852248,30611.ENSOGAP00000013738:0.0467206)0.99985:0.0188861)0.232709:0.00179852)0.99985:0.00929928)0.51042:0.00516905)0.367617:0.00813494,(43179.ENSSTOP00000004287:0.0599707,(10020.ENSDORP00000000618:0.138502,(10116.ENSRNOP00000026665:0.0528487,10090.ENSMUSP00000001884:0.0307781)0.99985:0.089983)0.99985:0.018366)0.698647:0.00414256)0.995833:0.06629,(9258.ENSOANP00000012946:0.33344,(13616.ENSMODP00000032549:0.0348012,(9315.ENSMEUP00000011030:0.0138664,9305.ENSSHAP00000003293:0.0185119)0.570293:0.0137766)0.99985:0.143897)0.995833:0.06629);")
    t4 = PhyloTree("(9593.ENSGGOP00000025542:1,9601.ENSPPYP00000004907:1);")
    t5 = PhyloTree(
        "(9371.ENSETEP00000005103:0.0955875,(9785.ENSLAFP00000014743:0.0214619,(9813.ENSPCAP00000005573:0.0376639,(9796.ENSECAP00000019319:0.0196571,(37347.ENSTBEP00000012329:0.0242927,((9361.ENSDNOP00000011716:0.0676669,(9606.ENSP00000374323:9e-07,(9593.ENSGGOP00000028731:0.00246332,(61853.ENSNLEP00000002377:0.0030064,(9601.ENSPPYP00000015233:0.0112606,(9598.ENSPTRP00000026129:0.00246268,9483.ENSCJAP00000015834:0.0290829)0:1.2e-07)0:6.5e-07)0.146278:0.00614181)0.146329:0.00485474)0.991187:0.014264)0.763764:0.00352544,((10020.ENSDORP00000008692:0.0259566,(30608.ENSMICP00000002718:0.0380742,9478.ENSTSYP00000009200:0.0174548)0.197348:0.00155005)0.99985:0.0110622,((((132908.ENSPVAP00000013183:0.0099908,59463.ENSMLUP00000014424:0.0115111)0.99985:0.00655941,(10141.ENSCPOP00000003417:0.0535498,((9669.ENSMPUP00000002651:0.0156675,(9646.ENSAMEP00000014393:0.0142536,9615.ENSCAFP00000013394:0.00243184)0.930921:0.00345947)0.99985:0.015828,(9913.ENSBTAP00000053531:0.0545233,9739.ENSTTRP00000001508:0.0344514)0.985783:0.00536759)0:1.1e-07)0:1.1e-07)0.99985:0.00795592,(10090.ENSMUSP00000066734:0.0572278,(43179.ENSSTOP00000020881:0.021661,30611.ENSOGAP00000000479:0.00876016)0.955042:0.00724791)0.992776:0.0044053)0:3.4e-07,(9258.ENSOANP00000012014:0.10692,(9315.ENSMEUP00000001901:0.0451997,13616.ENSMODP00000021214:0.00830289)0.994926:0.0229072)0.99985:0.0500253)0.981032:0.00621499)0:9e-08)0.723103:0.00185076)0.580248:0.00162611)0.99985:0.0167207)0.863552:0.00574499)1:0.0955875);")
    t6 = PhyloTree(
        "((9305.ENSSHAP00000010229:0.0607855,13616.ENSMODP00000009656:0.0615237)0.99985:0.0877765,(9785.ENSLAFP00000028174:0.0885004,(((9823.ENSSSCP00000002806:0.0860827,9823.ENSSSCP00000002780:0.0111508)0.99985:0.122086,((9913.ENSBTAP00000038896:0.050358,(9685.ENSFCAP00000017257:0.0778567,(9986.ENSOCUP00000017975:0.161424,(9615.ENSCAFP00000020783:0.056902,(9646.ENSAMEP00000019763:0.0857189,9669.ENSMPUP00000019474:0.0325693)0.99985:0.0314116)0.875671:0.00690881)0.942895:0.0136375)0.798192:0.00741364)0.967573:0.0100004,(59463.ENSMLUP00000020576:0.0755216,9796.ENSECAP00000004613:0.0777605)0.799782:0.00471384)0.911021:0.00832673)0.659845:0.00664335,((43179.ENSSTOP00000021465:0.123042,9593.ENSGGOP00000020601:0.0781752)0.987812:0.0311266,(30611.ENSOGAP00000021055:0.090792,(10116.ENSRNOP00000016702:0.0112116,10090.ENSMUSP00000050705:0.0330259)0.99985:0.134681)0.972881:0.0174783)0.998643:0.0179346)0.901179:0.017737)0.99985:0.0877765);")
    t7 = PhyloTree(
        "(9258.ENSOANP00000017269:0.144169,(((10090.ENSMUSP00000089169:0.0424834,10116.ENSRNOP00000026070:0.0151696)0.99985:0.0742333,(((((132908.ENSPVAP00000008558:0.0138473,(30608.ENSMICP00000004293:1.5e-07,((9986.ENSOCUP00000020707:0.0691049,37347.ENSTBEP00000002617:0.0138881)0:1.2e-07,(9371.ENSETEP00000012957:0.0515389,(9785.ENSLAFP00000009919:0.0260641,9813.ENSPCAP00000013834:0.0329521)0.741149:0.0041225)0.998768:0.00855745)0.99985:0.0111961)0.867255:0.00524663)0:4.3e-07,(9361.ENSDNOP00000010929:0.0359312,(9739.ENSTTRP00000015818:0.0267351,9796.ENSECAP00000009501:0.0168218)0.868862:0.00355516)0:8e-08)0.99985:0.0056594,(9913.ENSBTAP00000012912:0.0231165,(9669.ENSMPUP00000002012:0.00320767,9823.ENSSSCP00000023102:0.0629927)0.99134:0.00309237)0.988361:0.00284581)0:1.5e-07,((59463.ENSMLUP00000015155:0.0360776,9615.ENSCAFP00000002053:0.00579656)0.961397:0.00553059,(9685.ENSFCAP00000023114:0.0115974,9646.ENSAMEP00000004090:0.00575272)0.959045:0.00279601)0.988458:0.00279093)0.998008:0.00284847,(30611.ENSOGAP00000001383:0.00849776,((9483.ENSCJAP00000006698:0.0114709,(9544.ENSMMUP00000006654:0.00568623,(61853.ENSNLEP00000004122:0.00566385,(9601.ENSPPYP00000021653:0.00853215,(9593.ENSGGOP00000020462:1.8e-07,(9598.ENSPTRP00000035990:1e-08,9606.ENSP00000365550:1e-08)0.99985:0.00282071)0.996162:0.00281965)0:1.7e-07)0:8e-08)0.954037:0.0027827)0.99985:0.00818313,(43179.ENSSTOP00000012068:0.0109022,(9478.ENSTSYP00000008441:0.0132658,10141.ENSCPOP00000000986:0.0564111)0.314526:0.00294575)0:7e-08)0.980721:0.00309462)0.991529:0.00280168)0:1.6e-07)0.99985:0.0483405,(9315.ENSMEUP00000015273:0.00839008,(9305.ENSSHAP00000020642:0.00542335,13616.ENSMODP00000010568:0.101485)0:2.1e-07)0.99985:0.0336521)1:0.144169);")
    t8 = PhyloTree(
        "(((9371.ENSETEP00000003671:0.0131637,(9258.ENSOANP00000006745:0.117598,(132908.ENSPVAP00000001122:0.0159907,(30611.ENSOGAP00000013217:0.0071702,(((9823.ENSSSCP00000000042:0.0144457,(9646.ENSAMEP00000009872:0.0154876,9361.ENSDNOP00000012437:0.0817179)0:1e-06)0.998538:0.00765581,(9544.ENSMMUP00000001765:1e-08,(10116.ENSRNOP00000010491:0.0292686,(9669.ENSMPUP00000016236:0.340739,9615.ENSCAFP00000001415:4e-07)0.989009:0.00985882)0:8.7e-07)0:8.7e-07)0.99736:0.00973955,(((9606.ENSP00000379704:1e-08,(9601.ENSPPYP00000013264:0.00772278,9598.ENSPTRP00000024873:1e-08)0:2.3e-07)0.996569:0.00720502,(9913.ENSBTAP00000017531:0.0145949,9739.ENSTTRP00000016448:0.00723237)0.996503:0.00710774)0:4.2e-07,((9593.ENSGGOP00000008768:0.270021,(9785.ENSLAFP00000013194:0.00881524,9478.ENSTSYP00000011482:6.1e-07)0.482225:0.00675219)0.500314:0.00675139,(((59463.ENSMLUP00000002337:0.0319341,30608.ENSMICP00000003266:6.2e-07)0.987498:0.010619,(9796.ENSECAP00000021110:0.0073991,(9986.ENSOCUP00000007142:0.0196352,37347.ENSTBEP00000000333:0.0989537)0:9.5e-07)0:1.09e-06)0.873107:0.00951386,((9685.ENSFCAP00000000826:3e-07,(43179.ENSSTOP00000011619:0.00863897,10090.ENSMUSP00000023095:1e-08)0:1e-08)0.99985:0.132958,(10020.ENSDORP00000013215:0.0339132,10141.ENSCPOP00000011894:4.1e-07)0:4.1e-07)0.524756:0.00714334)0:8.1e-07)0.99985:0.00971634)0:7e-08)0:7e-08)0.772739:0.0177399)0.992096:0.0404786)0.817723:0.0310407)0.522416:0.072068,(9305.ENSSHAP00000014579:0.246289,9315.ENSMEUP00000008760:0.0666798)0.977479:0.195421)0.99985:1.2587,((((37347.ENSTBEP00000000946:0.0956163,(9483.ENSCJAP00000024301:0.0743892,(9593.ENSGGOP00000012469:0.00721405,(9606.ENSP00000391249:1e-08,9606.ENSP00000461549:1e-08)0:1.3e-07)0.993649:0.00856538)0.99985:0.0230549)0.975176:0.0143781,(30611.ENSOGAP00000003324:0.104251,30608.ENSMICP00000007369:0.0381575)0.990656:0.0183563)0.916137:0.00581305,(9823.ENSSSCP00000018191:0.0558998,((10020.ENSDORP00000010153:0.197695,((9796.ENSECAP00000018039:0.0363101,132908.ENSPVAP00000013461:0.0941126)0.892367:0.013635,((9739.ENSTTRP00000004783:0.0138565,9913.ENSBTAP00000003415:0.0166473)0.99985:0.0326524,((9371.ENSETEP00000006140:0.107709,(9785.ENSLAFP00000006435:0.170692,9813.ENSPCAP00000005503:0.0655274)0:2.68e-06)0.99985:0.0526328,(9258.ENSOANP00000002804:0.150016,(9315.ENSMEUP00000001056:0.0197146,(13616.ENSMODP00000002021:0.0382813,9305.ENSSHAP00000007534:0.0357616)0.99985:0.0843541)0.99985:0.115238)0.99985:0.133971)0.964252:0.0135998)0.99559:0.0163904)0.732303:0.00993157)0.99985:0.0470037,(9685.ENSFCAP00000008713:0.124988,(9615.ENSCAFP00000007771:0.0225216,(9646.ENSAMEP00000014479:0.0718956,9669.ENSMPUP00000013273:0.0487162)0.99985:0.0148769)0:9.2e-07)0.99985:0.0433867)0.99277:0.027679)0.99985:0.0134312)0:4.7e-07,(43179.ENSSTOP00000019919:0.152642,((10116.ENSRNOP00000003891:0.158016,10090.ENSMUSP00000091435:0.0102936)0.99985:0.0704992,(10141.ENSCPOP00000011436:0.130601,9986.ENSOCUP00000015843:0.529405)0:5.42e-06)0.909203:0.011833)0.428577:0.0186403)0.99985:1.2587);")
    t9 = PhyloTree("(9305.ENSSHAP00000009662:1,9305.ENSSHAP00000009620:1);")
    t10 = PhyloTree("((9315.ENSMEUP00000008285:0.899711,9258.ENSOANP00000027752:0.559777)0.99985:0.11989,((9739.ENSTTRP00000010720:0.164873,9913.ENSBTAP00000003500:0.298158)0.99985:0.109903,((9685.ENSFCAP00000006440:0.239731,(9615.ENSCAFP00000042310:0.122399,(9646.ENSAMEP00000002314:0.18278,9669.ENSMPUP00000005544:0.270727)0.6117:0.0396991)0.99985:0.0702148)0.99985:0.082488,(132908.ENSPVAP00000014833:0.488081,(9796.ENSECAP00000022144:0.310699,(((9785.ENSLAFP00000009512:0.187095,9813.ENSPCAP00000004417:0.493329)0.99985:0.359095,(30611.ENSOGAP00000016876:0.334272,(9483.ENSCJAP00000021314:0.178043,(9601.ENSPPYP00000003401:0.0415077,((61853.ENSNLEP00000003253:0.196659,9544.ENSMMUP00000037769:0.326984)0.835225:0.0989423,(9593.ENSGGOP00000004740:0.101826,9606.ENSP00000182290:0.0204981)0.997196:0.020731)0.307827:0.0046059)0.99985:0.0991112)0.99985:0.162323)0.972253:0.0380139)0.70642:0.0193389,((10141.ENSCPOP00000016274:0.272126,43179.ENSSTOP00000015376:0.458416)0.996119:0.0901785,(37347.ENSTBEP00000013312:0.328061,(10020.ENSDORP00000010739:0.398341,(10116.ENSRNOP00000051746:0.0455948,10090.ENSMUSP00000009396:0.0811741)0.99985:0.269525)0.791467:0.0577236)0.536676:0.0461933)0.99985:0.0620583)0.99985:0.0788824)0.969465:0.0395994)0.635969:0.0171601)0.702925:0.0283261)0.99985:0.11989);")

    # (tree, label, whether the pattern is expected to match)
    trees = [(t1, "t1", True), (t2, "t2", False), (t3, "t3", True),
             (t4, "t4", True), (t5, "t5", True), (t6, "t6", False),
             (t7, "t7", True), (t8, "t8", True), (t9, "t9", False),
             (t10, "t10", True)]

    for tree, tree_name, has_matches in trees:
        # The species code is the taxid that precedes the first "." in a name.
        tree.set_species_naming_function(lambda n: n.name.split(".")[0] if "." in n.name else '')
        tree.annotate_ncbi_taxa()
        # Has support for two primates where at least one is not Homo sapiens
        pattern = """ ( ' 9443 in @.lineage ' , ' 9443 in @.lineage and @.name!=9606 ' )' @.support >= 0.9 '; """
        pattern = TreePattern(pattern)
        if not has_matches:
            self.assertEqual(list(pattern.find_match(tree)), [])
        else:
            # next() works on both Python 2.6+ and Python 3, unlike .next().
            match = next(pattern.find_match(tree))
            self.assertEqual(match.support >= 0.9, True)
            test_status = (9443 in match.children[0].lineage and
                           9443 in match.children[1].lineage and
                           match.children[1].name != '9606')
            # permute children and check again
            test_status2 = (9443 in match.children[1].lineage and
                            9443 in match.children[0].lineage and
                            match.children[0].name != '9606')
            self.assertEqual(test_status, True)
            self.assertEqual(test_status2, True)
def process_family_tree(fam_tree_fileName, prune_re, out_dirName):
    """Prune a family tree and write it to ``out_dirName``.

    The tree is read in newick format 1, the sequences selected by
    ``get_sequences_for_pruning`` with ``prune_re`` are handed to
    ``prune_tree``, and the result is written under the input file's base name
    without its extension.

    Returns:
        0 if ``prune_tree`` raised; None after a successful write.
    """
    fam_tree = PhyloTree(fam_tree_fileName, format=1)
    fam_tree_id = os.path.splitext(os.path.basename(fam_tree_fileName))[0]
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(fam_tree_id)

    leaf_arr = get_node_leaves(fam_tree)
    prune_seq_arr = get_sequences_for_pruning(leaf_arr, prune_re)
    try:
        prune_tree(prune_seq_arr, fam_tree)
    except Exception:
        # Best-effort: a tree that cannot be pruned is skipped, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return 0
    fam_tree.write(format=1, outfile=out_dirName + "/" + fam_tree_id)
def root_tree(self):
    """Root this family's FastTree tree on its outgroup and write it out.

    If the sequences matching the outgroup regex are monophyletic, the tree is
    rooted on their common ancestor.  Otherwise it is rooted on the first
    sequence returned by ``arrange_outgroup_sequence_ids``.  The rooted tree is
    passed to ``write_rooted_tree``.
    """
    outgrp_regex_str, species_dict, ingroup_regex_str, outgroup_id_arr = read_profile_file(
        BasePath.species_profile_filename)
    fam_tree_filename = (BasePath.outpath + "/" + self.fam_id + "/" + self.fam_id +
                         BasePath.fasttree_fileextension)
    fam_tree = PhyloTree(fam_tree_filename, format=1)
    outgrp_re = re.compile(outgrp_regex_str)

    outgroup_sequence_list = self.get_regex_matching_sequence_list_from_node(
        fam_tree, outgrp_re)
    outgroup_monophyly_check = fam_tree.check_monophyly(
        values=outgroup_sequence_list, target_attr="name")

    # Single-argument print() behaves the same on Python 2 and Python 3.
    if outgroup_monophyly_check[0]:
        print("Outgroups are monophyletic")
        root_node = fam_tree.get_common_ancestor(outgroup_sequence_list)
    else:
        print("Outgroups are not monophyletic")
        outgroup_sequence_list_from_seqlist = self.get_outgroup_sequences_from_seqlist()
        arranged_outgroup_sequence_list = self.arrange_outgroup_sequence_ids(
            outgroup_sequence_list_from_seqlist, outgroup_id_arr)
        root_node = arranged_outgroup_sequence_list[0]
        print("Rooting using sequence {0}".format(root_node))

    fam_tree.set_outgroup(root_node)
    self.write_rooted_tree(fam_tree)
def yesMake(cut_list, gene, tree_file_name):
    """Write a pruned copy of a tree as a new, renamed gene family.

    The tree in ``tree_file_name`` is pruned to ``cut_list`` and written to
    ``<sys.argv[1]>/<new>/<new>.3.fa.tre``; the new name is also appended to
    the master list file given by ``sys.argv[3]``.

    The new name is ``gene + "_10"`` unless the third character from the end
    of ``gene`` is "_", in which case the last character is read as a digit
    and incremented.

    Returns:
        The new gene family name.
    """
    tree = PhyloTree(tree_file_name)
    tree.prune(cut_list, preserve_branch_length=True)

    # The length guard keeps names shorter than three characters from
    # raising IndexError; they simply get the "_10" suffix.
    if len(gene) >= 3 and gene[-3] == "_":
        # NOTE(review): a last digit of 9 yields a three-digit suffix
        # (e.g. "_19" -> "_110"); confirm this is acceptable.
        gene1 = gene[:-1] + str(int(gene[-1]) + 1)
    else:
        gene1 = gene + "_10"

    directory = "{}/{}".format(sys.argv[1], gene1)
    new_file = "{}/{}.3.fa.tre".format(directory, gene1)
    # Create the directory without going through a shell, so paths with
    # spaces or shell metacharacters are handled; an existing one is reused.
    if not os.path.isdir(directory):
        os.makedirs(directory)
    tree.write(format=1, outfile=new_file)

    with open(sys.argv[3], "a") as master:
        master.write("{}\n".format(gene1))
    return gene1
def tax_node(name, rel_abund=0):
    """Create a node for the taxonomic tree.

    Args:
        name: value stored in the node's ``name`` attribute.
        rel_abund: value stored in the node's ``rel_abund`` attribute
            (default 0).

    Returns:
        The newly created ``PhyloTree`` node.
    """
    node = PhyloTree()
    node.name, node.rel_abund = name, rel_abund
    return node
def get_order(tree):
    """Return internal node names ordered by distance to one of their leaves.

    For every non-leaf node the distance to the first leaf returned by
    ``get_leaves()`` is recorded under the node's name.  Names are joined with
    commas, largest distance first.
    """
    parsed = PhyloTree(tree, format=1)

    depth_by_name = {}
    for node in parsed.traverse():
        if not node.is_leaf():
            depth_by_name[node.name] = node.get_distance(node.get_leaves()[0])

    # Ascending stable sort followed by a reversal, so ties keep the same
    # relative order as sorting the items and slicing with [::-1].
    ascending = sorted(depth_by_name, key=depth_by_name.get)
    return ",".join(reversed(ascending))
def get_ingroup_monoplyletic_clades(self):
    """Load this family's rooted FastTree tree and hand it to ``process_family_tree``.

    The outgroup regex and species dictionary come from the species profile
    file; the tree is read in newick format 1.
    """
    profile = read_profile_file(BasePath.species_profile_filename)
    outgrp_regex_str, species_dict, ingroup_regex_str, outgroup_id_arr = profile

    family_dir = BasePath.outpath + "/" + self.fam_id
    fam_tree_filename = family_dir + "/" + self.fam_id + BasePath.rooted_fasttree_fileextension

    outgroup_re = re.compile(outgrp_regex_str)
    fam_tree = PhyloTree(fam_tree_filename, format=1)
    self.process_family_tree(fam_tree, outgroup_re, species_dict)
def prune_main(gene, speciesList, cladeDict):
    """Run the pruning workflow for one gene family.

    Previous output files are erased and the family is classified by
    ``count_summarize``.  "small" and "single" families are handled by
    ``small_family`` and ``single_copy``.  For any other type the tree is
    shown and the user chooses between splitting the family (``pre_prune``)
    and pruning it as a single family (``make_clade_groups`` followed by
    ``make_all_lists``).
    """
    gene = str(gene)
    erase_previous_files(gene)
    copy_list = copies_in_group(gene)
    gene_type = count_summarize(gene, copy_list, speciesList, cladeDict)

    choice = "n"
    if gene_type == "small":
        small_family(gene)
    elif gene_type == "single":
        single_copy(gene, copy_list, cladeDict)
    else:
        print("\nShowing the gene tree.")
        clade_tree = PhyloTree(gene + "/" + gene + ".3.fa.tre")
        view_rooted_tree(clade_tree)
        choice2 = raw_input(
            "\nWould you like to split this gene family into multiple families? (y/n)"
        )
        # startswith() treats an empty answer as "no" instead of raising
        # IndexError.
        if choice2.startswith("y"):
            pre_prune(gene)
        else:
            choice = raw_input(
                "\nContinue with pruning as single gene family? (y/n)")

    if choice.startswith("y"):
        make_clade_groups(gene, cladeDict, copy_list, speciesList)
        make_all_lists(gene, cladeDict)
def make_other_groups(gene, species_keep, species_list):
    """Build and save the "noclade" group from genes outside any clade.

    After stray genes are removed with ``cut_stray_other``, the user is asked
    to cut more genes until the first three characters of every remaining gene
    name are unique (one gene per species).  The group is appended to
    ``<gene>/<gene>_noclade_prune.txt`` and its name to
    ``<gene>/<gene>_master_tree_list.txt``.
    """
    ######Checking if the list is empty######
    if len(species_keep) == 0:
        print("\nThere are no other genes in this gene family.")
        return

    ######Removing stray within-clade gene copies from the clade######
    cut_list = cut_stray_other(gene, species_keep, species_list)

    ######Making it a group######
    group_list = cut_list

    ######Checking that there is only one gene per species######
    check_set = {str(item[0:3]) for item in group_list}
    while len(group_list) != len(check_set):
        view_counts(cut_list, species_list)
        group_str = raw_input(
            "\nYou can only have one gene per species. Enter more genes to cut, separated by a space: "
        )
        # Compare whole names: a substring test against the raw input would
        # also drop e.g. "g1" when the user typed "g10".
        names_to_cut = set(group_str.split())
        group_list = [item for item in group_list if item not in names_to_cut]
        check_set = {str(item[0:3]) for item in group_list}

    print("\nThere are " + str(len(group_list)) +
          " genes in this group.\nGroup looks like:")
    print(group_list)
    print("\nMaking the group.")

    ######Saving gene group as a file######
    with open(gene + "/" + gene + "_noclade_prune.txt", "a") as group_file:
        for i in group_list:
            group_file.write(i + "\n")

    ######Saving name of group to a master list######
    with open(gene + "/" + gene + "_master_tree_list.txt", "a") as master:
        master.write(gene + "_noclade\n")
def main(args):
    """Score gene-pair edges for every gene of a clade and save them as TSV.

    Loads the clade's genomes and CDSs, attaches orthology gene names (and,
    when ``args.split_fp`` is set, simulated segmentation), optionally loads a
    phylogenetic tree from ``args.tree_fp``, runs ``detect_edges_all`` for each
    gene name in sorted order and writes the records to ``args.out_fp``.
    """
    clade = args.clade_name

    genome_names = load_genome_names_by_clade_name(clade)
    LOGGER.info("loaded {} {} genomes".format(len(genome_names), clade))

    cdss = load_cdss_by_genome_names(genome_names)
    LOGGER.info("loaded {} cdss".format(len(cdss)))

    clade_dir = pathlib.Path(build_clade_filepath(clade))
    ortho_fp = clade_dir.joinpath("./ortho/{}.ortho".format(clade))
    ortho_df = pd.read_csv(ortho_fp, sep='\t')
    cdss = set_gene_name_to_cdss(cdss, ortho_fp)
    LOGGER.info("loaded orthology from {}".format(ortho_fp))

    if args.split_fp:
        cdss = set_split_to_cdss(cdss, args.split_fp)
        LOGGER.info("loaded simulated segmentation from {}".format(args.split_fp))

    tree = None
    if args.tree_fp:
        tree = PhyloTree(args.tree_fp, format=1)
        LOGGER.info("loaded phylogenetic tree from {}".format(args.tree_fp))

    cdsDAO = CdsDAO(cdss)
    gene_names = sorted(set(ortho_df["gene_name"]))
    LOGGER.info("found {} genes to search".format(len(gene_names)))

    records = []
    for origin_gene_name in gene_names:
        LOGGER.info("start {}".format(origin_gene_name))
        records.extend(
            detect_edges_all(origin_gene_name, args.score_method, cdsDAO, tree))

    columns = ["x", "y", "score", "score_naive", "total", "found", "bls",
               "top_offset", "top_relationship", "top_ratio"]
    out_df = pd.DataFrame(records, columns=columns)
    out_df.to_csv(args.out_fp, sep='\t', index=False)
    LOGGER.info("saved results to {}".format(args.out_fp))
def get_ingroup_monoplyletic_clades(self):
    """Load this family's RAxML tree and hand it to ``process_family_tree``.

    The outgroup regex and species dictionary come from the species profile
    file; the tree is read in newick format 1.
    """
    profile = read_profile_file(BasePath.species_profile_filename)
    outgrp_regex_str, species_dict, ingroup_regex_str = profile

    family_dir = BasePath.outpath + "/" + self.fam_id
    fam_tree_filename = family_dir + "/" + BasePath.raxml_tree_fileprefix + self.fam_id

    outgroup_re = re.compile(outgrp_regex_str)
    fam_tree = PhyloTree(fam_tree_filename, format=1)
    self.process_family_tree(fam_tree, outgroup_re, species_dict)
def score_family_tree(self):
    """Score this family's rooted tree and write the score to a file.

    The rooted tree is read in newick format 1.  When
    ``check_if_tree_contains_outgroups`` returns 1 nothing is scored.
    Otherwise the ingroup sequences are paired, the pairs are inspected and
    the resulting value is written as ``"<fam_id> <value>"`` to the family's
    tree-score file.

    Returns:
        0 when the outgroup check returns 1; None after the score is written.
    """
    outgrp_regex_str, species_dict, ingroup_regex_str = read_profile_file(
        BasePath.species_profile_filename)
    fam_tree_filename = BasePath.rooted_famtrees_dir + "/" + self.fam_id
    fam_tree = PhyloTree(fam_tree_filename, format=1)
    outgrp_re = re.compile(outgrp_regex_str)
    ingrp_re = re.compile(ingroup_regex_str)

    flag = self.check_if_tree_contains_outgroups(fam_tree, outgrp_re)
    if flag == 1:
        return 0

    ingroup_matches_arr = self.get_ingroup_sequence_list(fam_tree, ingrp_re)
    ingroup_pair_arr = self.get_ingroup_sequence_pairs(ingroup_matches_arr)
    precision_val = self.inspect_ingroup_pairs(fam_tree, ingroup_pair_arr,
                                               outgrp_re)

    tree_score_filename = (BasePath.outpath + "/" + self.fam_id + "/" + self.fam_id +
                           BasePath.tree_score_fileextension)
    # The context manager closes the file even if the write raises.
    with open(tree_score_filename, "w") as tree_score_file:
        tree_score_file.write(self.fam_id + " " + str(precision_val) + "\n")
def __init__(self, newick, alg, taxid, tid, actions, style, predraw_fn=None):
    """Build the tree for this handler and assign per-node internal IDs.

    The newick is first parsed as a ``PhyloTree`` linked to the FASTA
    alignment ``alg``; if that raises ``NewickError`` it is parsed as a plain
    ``Tree`` in format 1.  ``predraw_fn``, when given, is called with the
    tree before the actions and style are attached.
    """
    try:
        tree = PhyloTree(newick=newick, alignment=alg, alg_format="fasta")
    except NewickError:
        tree = Tree(newick, format=1)
    self.tree = tree

    if predraw_fn:
        predraw_fn(self.tree)
    self.tree.actions = actions
    self.tree.tree_style = style

    self.taxid = taxid
    self.treeid = tid
    self.mapid = "map_" + tid
    self.imgid = "img_" + tid
    self.boxid = 'box_' + tid

    # Initialize node internal IDs in preorder.
    for nid, node in enumerate(self.tree.traverse('preorder')):
        node._nid = nid
def get_ingroup_monoplyletic_clades(self):
    """Load this family's rooted tree and hand it to ``process_family_tree``.

    Prints the configured clade species representation cutoff, reads the
    outgroup regex and species dictionary from the species profile file and
    parses the rooted family tree in newick format 1.
    """
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print('Clade species representation cutoff {0}'.format(
        self.species_representaion_cutoff))
    outgrp_regex_str, species_dict, ingroup_regex_str = read_profile_file(
        BasePath.species_profile_filename)
    fam_tree_filename = BasePath.rooted_famtrees_dir + "/" + self.fam_id
    outgroup_re = re.compile(outgrp_regex_str)
    fam_tree = PhyloTree(fam_tree_filename, format=1)
    self.process_family_tree(fam_tree, outgroup_re, species_dict)
def safe_phylo_read(filename) -> PhyloTree:
    """Read a tree, trying several newick formats in turn.

    A ``PhyloTree`` argument is returned unchanged.  Otherwise the input is
    parsed with format 3, then the parser's default format, then format 1 and
    finally format 5.

    Raises:
        NewickError: if the last attempt (format 5) fails with it; a hint is
            printed to stderr first.  Any other exception from the last
            attempt propagates unchanged.
    """
    if isinstance(filename, PhyloTree):
        return filename

    # Earlier attempts are best-effort.  Catching Exception rather than using
    # a bare except lets KeyboardInterrupt/SystemExit through.
    for kwargs in ({"format": 3}, {}, {"format": 1}):
        try:
            return PhyloTree(filename, **kwargs)
        except Exception:
            continue

    try:
        return PhyloTree(filename, format=5)
    except NewickError:
        print(f"Are you sure tree {filename} exists?", file=sys.stderr, flush=True)
        raise
def test_species(self):
    """
    tests if node.species and ncbi_query are working
    """
    # test node.species
    species_tree = PhyloTree(
        """(Felis_catus_1:1, (Homo_sapiens_1:1, Pan_troglodytes_1:1), Saccharomyces_cerevisiae_1:1);""",
        format=1)
    # The species is the second "_"-separated field of the leaf name.
    species_tree.set_species_naming_function(lambda n: n.name.split("_")[1] if "_" in n.name else '')

    pattern0 = """('', (' len(set(["sapiens","pygmaeus"]) & species(@))>0', Pan_troglodytes_1) );"""
    pattern0 = TreePattern(pattern0)

    root = species_tree.get_tree_root()
    self.assertEqual(list(pattern0.find_match(species_tree)), [root])

    # test ncbi taxonomy
    ncbi = NCBITaxa()
    taxonomy_tree = PhyloTree("((9598, 9606), 10090);", sp_naming_function=lambda name: name)
    taxonomy_tree.annotate_ncbi_taxa()
    root = taxonomy_tree.get_tree_root()

    pattern1 = """ ' @.sci_name == "Euarchontoglires" ';"""
    # The scientific name must be spelled out in full: a garbled
    # "H**o sapiens" can never equal the name stored by the NCBI annotation.
    pattern2 = """ (( '@.sci_name=="Homo sapiens"' , '9526 in @.lineage ' )' @.rank=="subfamily" and @.taxid == 207598 ') ' @.sci_name == "Euarchontoglires" and "cellular organisms" in @.named_lineage'; """

    pattern1 = TreePattern(pattern1)
    pattern2 = TreePattern(pattern2)
    match1 = pattern1.find_match(taxonomy_tree)
    match2 = pattern2.find_match(taxonomy_tree)

    self.assertEqual(list(match1), [root])
    self.assertEqual(list(match2), [root])
def open_tree(tree_file_path):
    """Open a tree file, filling node support for standard tree files.

    A path containing 'contree' is read as is.  A path containing 'treefile'
    is read in newick format 1, and every internal node below the root gets
    its ``support`` set from the second '/'-separated field of its name, or
    100.0 when that field is missing.  Any other path ends the program via
    ``sys.exit``.
    """
    if 'contree' in tree_file_path:
        return PhyloTree(tree_file_path, sp_naming_function=None)

    if 'treefile' not in tree_file_path:
        sys.exit('Error: tree format not recognised')

    # Branch supports in SH-aLRT support (%) / ultrafast bootstrap support (%)
    tree = PhyloTree(tree_file_path, sp_naming_function=None, format=1)
    for node in tree.iter_descendants():
        if node.is_leaf():
            continue
        fields = node.name.split('/')
        try:
            node.support = float(fields[1])
        except IndexError:
            # No support values when sequences were identical --> set support artificially to 100.0
            node.support = 100.0
    return tree
def test_shortcut_functions(self):
    """Exercise the n_duplications / contains_leaves / n_speciations shortcuts."""
    newick = """((((Human_1, Chimp_1), (Human_2, (Chimp_2, Chimp_3))), ((Fish_1, (Human_3, Fish_3)), Yeast_2)), Yeast_1);"""
    t = PhyloTree(newick)
    t.set_species_naming_function(lambda node: node.name.split("_")[0])
    t.get_descendant_evol_events()  # DDDSSSDDS
    root = t.get_tree_root()

    expressions = (
        # Detects two consecutive nodes with duplications
        """('n_duplications(@) > 0')'n_duplications(@) > 0 '; """,
        """( 'contains_leaves(@, ["Chimp_2", "Chimp_3"])'); """,
        """'n_speciations(@) > 3 '; """,
    )
    # All patterns are compiled before any of them is searched.
    compiled = [TreePattern(expression) for expression in expressions]
    duplication_hits, leaf_hits, speciation_hits = [
        list(tree_pattern.find_match(t, maxhits=None))
        for tree_pattern in compiled
    ]

    self.assertEqual(len(duplication_hits), 5)

    self.assertEqual(len(leaf_hits), 4)
    self.assertEqual(leaf_hits[0], root)

    self.assertEqual(len(speciation_hits), 2)
    self.assertEqual(speciation_hits[0], root)
    self.assertEqual(speciation_hits[1], root.children[0])
def extract_ortho_from_trees(filename):
    """Derive ortholog pairs and ortholog@paralog groups from pickled trees.

    Reads the ``{reference leaf: newick}`` dictionary stored under
    ``dir_step2/dict_trees/<filename>``, and for each tree asks
    ``custom_species_overlap`` for the orthologs of the reference leaf.
    Two pickles named ``filename`` are written: the ortholog@paralog strings
    under ``path_tmp`` and the encoded ortholog pairs under ``path_tmp_ortho``.
    Always returns ``[0, 0]``.
    """
    trees_by_leaf = utils.get_pickle(Path('dir_step2') / 'dict_trees' / filename)

    l_ortho = []
    l_ortho_para = []
    for ref_leaf, newick in trees_by_leaf.items():
        tree = PhyloTree(newick)
        leaf_names = {leaf.name for leaf in tree}

        # orthologs come from the last interesting node above the reference
        ref_node = tree.search_nodes(name=ref_leaf)[0]
        ortho = custom_species_overlap(ref_node)
        if not ortho:
            # no suitable node was selected: keep at least the reference leaf
            ortho.add(ref_leaf)

        para = leaf_names - ortho

        # Each sorted pair becomes one integer: len(first name), then both
        # names concatenated (int() therefore needs digit-only leaf names).
        l_ortho.extend(
            int(str(len(first)) + first + second)
            for first, second in itertools.combinations(sorted(ortho), 2)
        )

        # only record ortho@para when a paralogous group exists
        if para:
            l_ortho_para.append(' '.join(ortho) + '@' + ' '.join(para))

    utils.save_pickle(path_tmp / filename, l_ortho_para)
    utils.save_pickle(path_tmp_ortho / filename, l_ortho)
    return [0, 0]
def load_json(fp):
    """Build a taxonomy tree from the ``ubiome_bacteriacounts`` rows of a JSON file.

    Every row is normalised with ``normalise_row`` and turned into a
    ``PhyloTree`` node named after ``tax_name`` that carries all row fields as
    features.  Nodes are then attached to the node registered under their
    ``parent`` value, when one exists.  The node with the smallest ``taxon``
    key is treated as the root; it receives ``alpha`` (from ``alpha_function``
    over all normalised counts) and is returned.
    """
    payload = json.loads(clean_json(fp))

    taxonomy = {}
    counts = []
    for row in payload['ubiome_bacteriacounts']:
        normalise_row(row)
        counts.append(row['count_norm'])
        node = PhyloTree()
        node.name = row['tax_name']
        node.add_features(**row)
        taxonomy[row['taxon']] = node

    root = taxonomy[min(taxonomy)]
    count_total = root.count_norm
    root.alpha = alpha_function(counts)

    for node in taxonomy.values():
        # share of the root's normalised count, as a percentage
        node.add_feature('count_pct',
                         float(node.count_norm) / count_total * 100)
        parent_node = taxonomy.get(node.parent)
        if parent_node is not None:
            parent_node.add_child(node)

    print('loaded {} into tree depth {} diversity {:.2f}'.format(
        len(taxonomy), len(root), root.alpha))
    return root
class DrawTree(object):
    """Command-line helper that renders a phylogenetic tree to an image file.

    The constructor parses ``sys.argv``: a positional ``tree_file``, an
    optional ``-f/--img-format`` (default ``png``) and a required ``-r/--ref``
    tab-separated reference file.
    """

    def __init__(self):
        parser = argparse.ArgumentParser(description="Draw phylogenetic tree")
        parser.add_argument('tree_file', action='store', type=str)
        parser.add_argument('-f', '--img-format', dest='img_format',
                            action='store', required=False, default="png",
                            type=str)
        # ``file`` only exists as a builtin on Python 2; argparse.FileType
        # opens the path for reading on both Python 2 and Python 3.
        parser.add_argument('-r', '--ref', dest='ref_file', action='store',
                            required=True, type=argparse.FileType('r'))
        self.args = parser.parse_args()
        # Everything before the first "." of the tree path is the image stem.
        self.tree_bname = self.args.tree_file.split('.')[0]
        self.tree = PhyloTree(self.args.tree_file)
        self.img_format = self.args.img_format

    def get_ref(self):
        """Return a ``{code: name}`` dict read from the reference file.

        Only the first two tab-separated fields of each line are used; lines
        with fewer than two fields are ignored.  The mapping is also printed.
        """
        code2name = {}
        for line in self.args.ref_file.read().splitlines():
            data = tuple(line.split('\t'))[:2]
            if len(data) == 2:
                code2name[data[0]] = data[1]
        print(code2name)
        return code2name

    def draw(self, Ts):
        """Render the tree to ``<tree_bname>.<img_format>`` using style ``Ts``.

        After rendering, the image is opened with ``gpicview`` on a
        best-effort basis.
        """
        img_fname = '.'.join([self.tree_bname, self.img_format])
        self.tree.render(img_fname, tree_style=Ts, w=400, units="mm")
        try:
            subprocess.call(['gpicview', img_fname])
        except OSError:
            # The viewer is optional: a missing or unlaunchable executable is
            # ignored, but unrelated errors and Ctrl-C are no longer swallowed.
            pass
def make_species_list(path):
    """Return the leaf labels of ``<path>.3.fa.tre`` with and without digits.

    Returns a tuple ``(l, l2)``: ``l`` holds ``str(leaf)`` for every leaf with
    any leading newline and ``-`` characters stripped, and ``l2`` holds the
    same labels with every decimal digit removed.
    """
    t = PhyloTree("{}.3.fa.tre".format(path))
    # NOTE(review): str(leaf) is presumably ete3's ASCII rendering ("\n--name"),
    # which is why the leading newline/dashes are stripped - confirm.
    l = [str(leaf).lstrip("\n--") for leaf in t]
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    l2 = [re.sub(r"\d", "", i) for i in l]
    return (l, l2)
def process_family_tree(fam_tree_fileName, profile_fileName):
    """Run duplication-event detection on one family tree.

    The profile file supplies the outgroup regular expression and the species
    dictionary.  Returns 0 without further work when
    ``detect_multifurcation`` is falsy for the tree; otherwise the ingroup
    clade nodes are collected, ``get_SO_duplication_events`` is called and
    ``None`` is returned.
    """
    outgroup_pattern_text, species_by_code = read_profile_file(profile_fileName)
    outgroup_pattern = re.compile(outgroup_pattern_text)
    family_tree = PhyloTree(fam_tree_fileName, format=1)

    if detect_multifurcation(family_tree):
        clade_nodes = get_ingroup_monophyletic_clade_nodes(family_tree,
                                                           outgroup_pattern)
        get_SO_duplication_events(family_tree, clade_nodes, species_by_code,
                                  fam_tree_fileName)
        return None
    return 0
def ultrametricer(node_order, tree_file):
    """Rewrite the branch lengths of a tree from a given internal-node order.

    Parameters
    ----------
    node_order : iterable of node names; the i-th name (0-based) is assigned
        the expected distance ``i + 1``.
    tree_file : path of a file whose first line is a newick tree.

    Every leaf is assigned the expected distance ``len(leaves)`` and the
    unnamed root ``0``.  Each branch on a leaf-to-root path then gets
    ``dist = expected(node) - expected(parent)``.

    Returns the modified tree as a newick string (``format=1``).

    Raises ``KeyError`` if a node on a leaf-to-root path has a name that is
    neither in ``node_order``, a leaf name, nor the empty string.
    """
    with open(tree_file) as f:
        # next(f) works on both Python 2 and 3; f.next() is Python 2 only.
        mytree = PhyloTree(next(f).strip(), format=1)

    # First I get every single leaf
    leaves = mytree.get_leaves()

    # The total distance must be:
    v = len(leaves)

    # Now we get the expected distances
    distances = dict()
    for i, node in enumerate(node_order):
        distances[node] = i + 1
    for node in leaves:
        distances[node.name] = v

    # We add the root (that has no name)
    distances[""] = 0

    for node in leaves:
        # Now I start traversing to the root
        while (node.up):
            # The expected distance of this branch is:
            expected = distances[node.name] - distances[node.up.name]
            node.dist = expected
            node = node.up

    return mytree.write(format=1)
def test():
    """Smoke-test user-supplied pattern functions; prints the number of matches."""
    t = PhyloTree(
        "((((Human_1, Chimp_1), (Human_2, (Chimp_2, Chimp_3))), ((Fish_1, (Human_3, Fish_3)), Yeast_2)), Yeast_1);")
    t.set_species_naming_function(lambda node: node.name.split("_")[0])

    pattern = """('')' is_duplication(@) '; """
    pattern = TreePattern(pattern, format=8, quoted_node_names=True,
                          functions={'contains_species': contains_species,
                                     'is_duplication': is_duplication,
                                     # The key was 'is speciation' (with a
                                     # space), which cannot be called from a
                                     # pattern expression.
                                     'is_speciation': is_speciation})

    # should return 5 results
    print(len(list(pattern.find_match(t, None, maxhits=None))))

    pattern1 = """( 'contains(@, ("Chimp_2", "Chimp_3"))' , 'num_species(@, 2) and num_leaves(@,2)' ); """
    tp1 = TreePattern(pattern1, format=8, quoted_node_names=True,
                      functions={'contains': contains,
                                 "num_species": number_of_species,
                                 "num_leaves": number_of_leaves})

    # should return 1 result
    print(len(list(tp1.find_match(t, None))))
def test_cached_attributes(self):
    """Match patterns that rely on leaves(@), species(@) and n_leaves(@)."""
    leaf_pattern = TreePattern(""" '"Gallus_gallus_1" in leaves(@)' ;""")
    nested_pattern = TreePattern(
        """( '"Hom" in species(@) and n_leaves(@) > 2')'"Pan_troglodytes_1" in leaves(@)';""")

    tree = PhyloTree(
        "((((Anolis_carolinensis_1:1, Gallus_gallus_1:1), (Felis_catus_1:1, (Homo_sapiens_1:1, Pan_troglodytes_1:1)primates)primates), ((Danio_rerio_1:1, (Xenopus_laevis_1:1, Anolis_carolinensis_1:1)), Saccharomyces_cerevisiae_2:1)), Saccharomyces_cerevisiae_1:1);",
        format=1)
    root = tree.get_tree_root()

    leaf_hits = list(leaf_pattern.find_match(tree, maxhits=None))
    self.assertEqual(len(leaf_hits), 5)  # returns leaf itself
    self.assertEqual(leaf_hits[0], root)
    self.assertEqual(leaf_hits[4].name, "Gallus_gallus_1")

    nested_hits = list(nested_pattern.find_match(tree, maxhits=None))
    self.assertEqual(len(nested_hits), 3)
    self.assertEqual(nested_hits[0], root)
    deepest = nested_hits[2].children[1].children[1].children[0]
    self.assertEqual(deepest.name, "Homo_sapiens_1")
def LoadTrees(treeFile, dlm):
    """Reads and stores phylogenetic trees from a file

    Parameters
    ------
    treeFile: str, path of a file of newick trees, 1 per line; lines
        starting with "NA" and blank lines are skipped
    dlm: str, delimiter; the part of a node name before its first
        occurrence is used as the species name

    Returns
    ------
    treelist: list, ete3 PhyloTree objects in file order
    """
    print("loading trees...")
    treelist = []
    with open(treeFile, 'r') as newick:
        for line in newick:
            line = line.strip()
            # A blank line (typically a trailing newline at the end of the
            # file) is not a tree and used to abort loading inside PhyloTree.
            if not line or line.startswith("NA"):
                continue
            t = PhyloTree(line)
            t.set_species_naming_function(
                lambda node: node.name.split(dlm)[0])
            treelist.append(t)
    return treelist
def _answered_yes(answer):
    """Return True when ``answer`` is non-empty and starts with "y"."""
    # An empty answer (user just pressed Enter) counts as "no" instead of
    # raising IndexError on answer[0].
    return bool(answer) and answer[0] == "y"


def yes_choice(tree_file_name, gene, algae_choice):
    """Interactively split one gene tree into two and update the to-do list.

    The tree is midpoint-rooted, then the user defines (optionally) an algae
    clade and an outlier group, followed by one monophyletic family.
    ``yesMake`` is called twice: once with everything outside the family, and
    once with the leaves not in that first list, each time together with the
    algae and outlier leaves.  Finally the file named by ``sys.argv[2]`` is
    rewritten with ``gene`` removed and the two names returned by ``yesMake``
    appended.

    Parameters: ``tree_file_name`` is the tree path, ``gene`` the current gene
    name, and ``algae_choice`` the user's earlier y/n answer.
    """
    t = PhyloTree(tree_file_name)
    R = t.get_midpoint_outgroup()
    t.set_outgroup(R)
    gene_names = t.get_leaf_names()

    if _answered_yes(algae_choice):
        print("\nFirst, let's define the algae clade.")
        algae_list = clade_to_tree(t)
    else:
        algae_list = []

    outlier_choice = raw_input("\nIs there another monophyletic or non-monophyletic outlier group that is sister to all families shown? (y/n)")
    if _answered_yes(outlier_choice):
        print("\nLet's define the outlier group. \nFirst you can add any spceices that are in a monophyletic clade")
        outlier_list = clade_to_tree(t)
        other_copies = raw_input("If there are other genes in the outlier group, enter them here, separated by a space, or else enter n.")
        if other_copies != "n":
            other_list = other_copies.split(" ")
            outlier_list = outlier_list + other_list
    else:
        outlier_list = []

    print("\nSelect one monophyletic family to define. \nYou will have a later chance to split this family again if needed.")
    group_list = clade_to_tree(t)

    ###tree1
    cut_list = [i for i in gene_names if i not in group_list]
    cut_list = cut_list + algae_list + outlier_list
    gene1 = yesMake(cut_list, gene, tree_file_name)

    ###tree2
    cut_list1 = [i for i in gene_names if i not in cut_list]
    cut_list1 = cut_list1 + algae_list + outlier_list
    gene2 = yesMake(cut_list1, gene1, tree_file_name)

    # Replace the processed gene by its two descendants in the to-do file.
    with open(sys.argv[2], "r") as f:
        todo_list = [line.rstrip() for line in f]
    todo_list = [i for i in todo_list if i != gene]
    todo_list.append(gene1)
    todo_list.append(gene2)
    with open(sys.argv[2], "w") as todo:
        for i in todo_list:
            todo.write(i + "\n")
from ete3 import PhyloTree # Reads a phylogenetic tree (using default species name encoding) t = PhyloTree("(((Hsa_001,Ptr_001),(Cfa_001,Mms_001)),(Dme_001,Dme_002));") # /-Hsa_001 # /--------| # | \-Ptr_001 # /--------| # | | /-Cfa_001 # | \--------| # ---------| \-Mms_001 # | # | /-Dme_001 # \--------| # \-Dme_002 # # Prints current leaf names and species codes print "Deafult mode:" for n in t.get_leaves(): print "node:", n.name, "Species name:", n.species # node: Dme_001 Species name: Dme # node: Dme_002 Species name: Dme # node: Hsa_001 Species name: Hsa # node: Ptr_001 Species name: Ptr # node: Cfa_001 Species name: Cfa # node: Mms_001 Species name: Mms # # We can also use our own leaf name parsing function to obtain species # names. All we need to do is create a python function that takes # node's name as argument and return its corresponding species name. def get_species_name(node_name_string):
MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAH """ iphylip_txt = """ 4 76 seqA MAEIPDETIQ QFMALT---H NIAVQYLSEF GDLNEALNSY YASQTDDIKD RREEAHQFMA seqB MAEIPDATIQ QFMALTNVSH NIAVQY--EF GDLNEALNSY YAYQTDDQKD RREEAHQFMA seqC MAEIPDATIQ ---ALTNVSH NIAVQYLSEF GDLNEALNSY YASQTDDQPD RREEAHQFMA seqD MAEAPDETIQ QFMALTNVSH NIAVQYLSEF GDLNEAL--- ---------- -REEAHQ--- LTNVSHQFMA LTNVSH LTNVSH---- ------ LTNVSH---- ------ -------FMA LTNVSH """ # Load a tree and link it to an alignment. As usual, 'alignment' can # be the path to a file or data in text format. t = PhyloTree("(((seqA,seqB),seqC),seqD);", alignment=fasta_txt, alg_format="fasta") #We can now access the sequence of every leaf node print "These are the nodes and its sequences:" for leaf in t.iter_leaves(): print leaf.name, leaf.sequence #seqD MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAH #seqC MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH #seqA MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEALNSYYASQTDDIKDRREEAH #seqB MAEIPDATIQQFMALTNVSHNIAVQY--EFGDLNEALNSYYAYQTDDQKDRREEAH # # The associated alignment can be changed at any time t.link_to_alignment(alignment=iphylip_txt, alg_format="iphylip") # Let's check that sequences have changed print "These are the nodes and its re-linked sequences:" for leaf in t.iter_leaves():
# divergence from the taxonomic tree indicates important evolutionary events like duplications or losses. #load a tree and associated alignment #treefile = '/home/cactuskid/Dropbox/IIB/mergeLineages/yuyo/restricted/hapAndeff/strcutres_and_tcoffeeset_aln_struct.phy_phyml_tree.txtlabels.txt' folder = '/home/cactuskid/Dropbox/IIB/mergeLineages/yuyo/*/*labels.txt' #folder = '/home/cactuskid/Dropbox/IIB/mergeLineages/yuyo/*/*/*labels.txt' treefiles = glob.glob(folder) for treefile in treefiles: print treefile colorSepcies = False #alg = '/home/cactuskid/Dropbox/IIB/mergeLineages/phylogeny/hybrid/merged_curate_aln.fasta' t = PhyloTree( treefile, sp_naming_function=None) #, alignment=alg, alg_format="fasta") # Calculate the midpoint node R = t.get_midpoint_outgroup() # and set it as tree outgroup t.set_outgroup(R) def save_obj(obj, name ): with open( name + '.pkl', 'wb') as f: pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL) def load_obj(name ): with open( name + '.pkl', 'r') as f: return pickle.load(f) genedict = load_obj('genedict') speciescolors = load_obj('colors')
''' layout for CodemlTree ''' if hasattr(node, "collapsed"): if node.collapsed == 1: node.img_style["draw_descendants"]= False if node.is_leaf(): if hasattr (node, "sequence"): seqface = MySequenceFace(node.sequence, "nt", fsize=10, col_w=11, interactive=True) faces.add_face_to_node(seqface, node, 1, aligned=True) if __name__ == "__main__": tree = PhyloTree('(Orangutan,Human,Chimp);') tree.link_to_alignment(""" >Chimp HARWLNEKLRCELRTLKKLGLDGYKAVSQYVKGRA >Orangutan DARWINEKLRCVSRTLKKLGLDGYKGVSQYVKGRP >Human DARWHNVKLRCELRTLKKLGLVGFKAVSQFVIRRA """) nt_sequences = {"Human" : "GACGCACGGTGGCACAACGTAAAATTAAGATGTGAATTGAGAACTCTGAAAAAATTGGGACTGGTCGGCTTCAAGGCAGTAAGTCAATTCGTAATACGTCGTGCG", "Chimp" : "CACGCCCGATGGCTCAACGAAAAGTTAAGATGCGAATTGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGCAGTAAGTCAGTACGTTAAAGGTCGTGCG", "Orangutan": "GATGCACGCTGGATCAACGAAAAGTTAAGATGCGTATCGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGGAGTAAGTCAATACGTTAAAGGTCGTCCG" } for l in nt_sequences: (tree & l).nt_sequence = nt_sequences[l] tree.dist = 0
from ete3 import PhyloTree, Tree, TreeStyle from ete3 import add_face_to_node, TextFace, AttrFace, SequenceFace nw = '(0:0, 1:20);' fa = """ >0 AA.. >1 CAA. """ t = PhyloTree(nw, alignment=fa, alg_format='fasta', format=1) ts = TreeStyle() ts.show_branch_length = False ts.show_leaf_name = False ts.draw_guiding_lines = True ts.draw_aligned_faces_as_table = True ts.show_scale = False def my_layout(node): # # add names to all nodes (not just to leaf nodes) # ete3/test/test_treeview/face_rotation.py F = TextFace(node.name, tight_text=True) add_face_to_node(F, node, column=0, position="branch-right") # # add branch lengths # ete3/treeview/qt4_render.py if not node.is_root(): bl_face = AttrFace("dist", fsize=8, ftype="Arial", fgcolor="black", formatter="%0.3g")
def main(args):
    """Render the tree named by ``args.tree`` to ``args.output``.

    Optional behaviour is driven by ``args``: ``alignment`` links a FASTA
    alignment, ``highlight_new`` is read with ``read_runs``,
    ``hide_annotations`` suppresses the column headers, ``positions`` adds a
    ruler header, ``legend`` adds a prefecture colour legend and ``circular``
    switches to a half-circle layout.
    """
    tree_kwargs = {}
    if args.alignment:
        tree_kwargs = {'alignment': args.alignment, 'alg_format': 'fasta'}
    t = PhyloTree(args.tree, **tree_kwargs)

    if args.highlight_new:
        runs = read_runs(args.highlight_new)

    t.set_outgroup('EM_079422')
    t.ladderize()

    ts = TreeStyle()
    ts.show_leaf_name = False
    ts.show_branch_support = False
    ts.layout_fn = layout

    thick_hz_line = NodeStyle()
    thick_hz_line["hz_line_width"] = 8
    t.set_style(thick_hz_line)

    # Applied to the same node right after the horizontal-line style above.
    thick_vt_line = NodeStyle()
    thick_vt_line["vt_line_width"] = 4
    t.set_style(thick_vt_line)

    # header
    if not args.hide_annotations:
        header_titles = ('Sample identifier', 'Prefecture', 'Sous-prefecture',
                         'Village', 'Sample received')
        for column, title in enumerate(header_titles, start=1):
            ts.aligned_header.add_face(
                MyTextFace(title, fstyle='Bold', fsize=8, tight_text=False),
                column=column)

    if args.positions:
        positions = read_positions(args.positions)
        alg_header = RulerFace(positions,
                               col_width=11,
                               height=0,  # set to 0 if dont want to use values
                               kind="stick",
                               hlines=[0],
                               hlines_col=["white"],  # trick to hide hz line
                               )
        ts.aligned_header.add_face(alg_header, 6)

    # legend
    if args.legend:
        legend = {s['prefec']: s['prefec__colour'] for s in samples.values()}
        for prefecture in sorted(legend):
            ts.legend.add_face(CircleFace(4, legend[prefecture]), column=0)
            ts.legend.add_face(
                MyTextFace(prefecture, fsize=6, tight_text=False), column=1)
        ts.legend_position = 1

    if args.circular:
        ts.mode = "c"
        ts.arc_start = -180  # 0 degrees = 3 o'clock
        ts.arc_span = 180

    t.render(args.output, tree_style=ts, w=1024)
from ete3 import PhyloTree

# Loads a gene tree and its corresponding species tree. Note that
# species names in sptree are the first 3 letters of leaf nodes in
# genetree.
gene_tree_nw = '((Dme_001,Dme_002),(((Cfa_001,Mms_001),((Hsa_001,Ptr_001),Mmu_001)),(Ptr_002,(Hsa_002,Mmu_002))));'
species_tree_nw = "((((Hsa, Ptr), Mmu), (Mms, Cfa)), Dme);"
genetree = PhyloTree(gene_tree_nw)
sptree = PhyloTree(species_tree_nw)
# Single-argument print(...) gives the same output on Python 2 and 3; the
# former ``print genetree`` statement is a syntax error on Python 3.
print(genetree)
#                    /-Dme_001
#          /--------|
#         |          \-Dme_002
#         |
#         |                              /-Cfa_001
#         |                    /--------|
#---------|                   |          \-Mms_001
#         |          /--------|
#         |         |         |                    /-Hsa_001
#         |         |         |          /--------|
#         |         |          \--------|          \-Ptr_001
#          \--------|                   |
#                   |                    \-Mmu_001
#                   |
#                   |          /-Ptr_002
#                    \--------|
#                             |          /-Hsa_002
#                              \--------|
#                                        \-Mmu_002
#
# Let's reconcile our genetree with the species tree
""") parser.add_argument("--colorbar_save", dest="colorbar_save", type=str, help=""" save path of Colorbar for the heatmap with matplotlib """) args = parser.parse_args() infile = args.infile mode = args.mode newick = args.newick if newick: t = PhyloTree(args.newick) species2taxid = dict([ line.split()[0], line.strip().split()[1] ] for line in open(infile)) taxids = set(species2taxid.values()) else: ncbi = NCBITaxa() taxids = set([ line.strip() for line in open(infile) ]) if args.taxoncolors: taxon2color = dict([int(line.split()[0]), line.split()[1]] for line in open(args.taxoncolors)) tNCBI = ncbi.get_topology(taxids, intermediate_nodes=True) tNCBI = tNCBI.search_nodes(name="2759")[0] ncbi.annotate_tree(tNCBI, taxid_attr="name") tax2node = dict([node.taxid, node] for node in tNCBI.traverse())
def pre_prune(gene):
    """Interactively split a gene tree into monophyletic copies.

    The tree ``<gene>/<gene>.3.fa.tre`` is copied into a new directory
    ``<gene>_all100``.  The user is then repeatedly asked whether a
    monophyletic copy should be split off the tree currently shown; each split
    is written to a new ``<gene>_all<m>`` directory (m = 101, 102, ...) and
    the remainder overwrites the current tree file.  All directory names
    collected in the work list are finally appended, one per line, to the file
    named by ``sys.argv[1]``.
    """
    full_tree=PhyloTree(gene+"/"+gene+".3.fa.tre")
    gene_names=full_tree.get_leaf_names()
    m=100
    start_gene="{}_all{}".format(gene,str(m))
    os.system("mkdir {}".format(start_gene))
    full_tree.write(format=1, outfile="{}/{}.3.fa.tre".format(start_gene,start_gene))
    m=m+1
    # ``l`` is a work queue: entries appended inside the loop below are
    # visited by later iterations of this same ``for``.
    l=[start_gene]
    for item in l:
        full_tree=PhyloTree("{}/{}.3.fa.tre".format(item,item))
        view_rooted_tree(full_tree)
        print("Tree for {}".format(item))
        c=raw_input("Split off a monophyletic gene copy? (y/n)")
        if c[0] == "y":
            algae_choice = raw_input("\nIs there an algae group that is sister to all shown families? (y/n)")
            outlier_choice = raw_input("\nIs there another monophyletic or non-monophyletic outlier group that is sister to all families shown? (y/n)")
        # NOTE(review): every ``answer[0]`` test in this function raises
        # IndexError when the user just presses Enter.
        while c[0]=="y":
            if algae_choice[0] == "y":
                print("\nFirst, let's define the algae clade.")
                algae_list = clade_to_tree(full_tree)
            else:
                algae_list = []
            if outlier_choice[0] == "y":
                print("\nLet's define the outlier group. ")
                outlier_list = []
                out_choice = raw_input("\nIs there a monophyletic clade in the outlier group? (y/n)")
                # Collect any number of monophyletic clades into the outlier group.
                while out_choice[0] == "y":
                    outlier_list2 = clade_to_tree(full_tree)
                    outlier_list = outlier_list + outlier_list2
                    out_choice = raw_input("\nIs there another monopyletic clade to add to the outlier group? (y/n)")
                other_choice = raw_input("Are there additional genes in the outlier group? (y/n)")
                # Then add individually named genes, a batch at a time.
                while other_choice[0] == "y":
                    other_copies = raw_input("\nEnter genes to include, separated by a space. Enter only up to ten genes at a time.")
                    try:
                        other_list = other_copies.split(" ")
                        outlier_list = outlier_list + other_list
                    except ValueError:
                        # NOTE(review): nothing in the try body validates the
                        # names against the tree, so this branch looks
                        # unreachable - confirm.
                        other_choice = raw_input("\nAt least one gene is not found on the tree. Reenter genes? y/n")
                    other_choice = raw_input("Are there more genes to enter? (y/n)")
            else:
                outlier_list=[]
            b="{}_all{}".format(gene, str(m))
            # NOTE(review): ``b`` is queued before we know whether the split
            # succeeds; when the branch below rejects the group, no directory
            # or tree file is created for it - verify this is intended.
            l.append(b)
            tree1=PhyloTree("{}/{}.3.fa.tre".format(item,item))
            R=tree1.get_midpoint_outgroup()
            tree1.set_outgroup(R)
            print("\nFor the monophyletic gene copy:")
            group_list=clade_to_tree(tree1)
            group_list=group_list + algae_list + outlier_list
            gene_names=tree1.get_leaf_names()
            if len(group_list)==len(gene_names):
                # The selection covers the whole tree, so nothing can be split off.
                c1=raw_input("\nList includes all copies on tree.\nMake gene with all copies? (y/n)")
                if c1=="y":
                    c="n"
                else:
                    print("\nGroup crosses root. Unable to make group.\nChoose new group.")
                    c="y"
            else:
                cut_list=[i for i in gene_names if i not in group_list]
                cut_list = cut_list + algae_list + outlier_list
                os.system("mkdir {}".format(b))
                # tree2 keeps the selected group and is written to the new
                # directory; tree1 keeps the rest and overwrites the current file.
                tree2=PhyloTree("{}/{}.3.fa.tre".format(item,item))
                R=tree2.get_midpoint_outgroup()
                tree2.set_outgroup(R)
                tree2.prune(group_list,preserve_branch_length=True)
                tree2.write(format=1, outfile="{}/{}.3.fa.tre".format(b,b))
                tree1.prune(cut_list,preserve_branch_length=True)
                tree1.write(format=1, outfile="{}/{}.3.fa.tre".format(item,item))
                m=m+1
                print ("\nTree now looks like this.")
                view_rooted_tree(tree1)
                c=raw_input("Split off a monophyletic clade? (y/n)")
                if c[0] == "y":
                    algae_choice = raw_input("\nIs there an algae group that is sister to all shown families? (y/n)")
                    outlier_choice = raw_input("\nIs there another monophyletic or non-monophyletic outlier group that is sister to all families shown? (y/n)")
    # Record every queued directory name in the list file.
    with open(sys.argv[1], "a") as p:
        for i in l:
            p.write(i+"\n")
from ete3 import PhyloTree

# Creates a gene phylogeny with several duplication events at
# different levels. Note that we are using the default method for
# detecting the species code of leaves (the first three letters in the
# node name are considered the species code).
nw = """
((Dme_001,Dme_002),(((Cfa_001,Mms_001),((((Hsa_001,Hsa_003),Ptr_001)
,Mmu_001),((Hsa_004,Ptr_004),Mmu_004))),(Ptr_002,(Hsa_002,Mmu_002))));
"""
t = PhyloTree(nw)
# One single-argument print(...) call replaces the Python 2 statements
# ``print "Original tree:",`` / ``print t`` (a syntax error on Python 3) and
# produces the same text on both versions: the label, one space, the tree.
print("Original tree: " + str(t))
#
#                    /-Dme_001
#          /--------|
#         |          \-Dme_002
#         |
#         |                              /-Cfa_001
#         |                    /--------|
#         |                   |          \-Mms_001
#         |                   |
#       --|                   |                                        /-Hsa_001
#         |                   |                              /--------|
#         |          /--------|                    /--------|          \-Hsa_003
#         |         |         |                   |         |
#         |         |         |          /--------|          \-Ptr_001
#         |         |         |         |         |
#         |         |         |         |          \-Mmu_001
#         |         |          \--------|
#          \--------|                   |                    /-Hsa_004
from ete3 import PhyloTree

# Loads an example tree
nw = """
((Dme_001,Dme_002),(((Cfa_001,Mms_001),((Hsa_001,Ptr_001),Mmu_001)),
(Ptr_002,(Hsa_002,Mmu_002))));
"""
t = PhyloTree(nw)
# Single-argument print(...) gives the same output on Python 2 and 3; the
# former ``print t`` statement is a syntax error on Python 3.
print(t)
#                    /-Dme_001
#          /--------|
#         |          \-Dme_002
#         |
#         |                              /-Cfa_001
#         |                    /--------|
#---------|                   |          \-Mms_001
#         |          /--------|
#         |         |         |                    /-Hsa_001
#         |         |         |          /--------|
#         |         |          \--------|          \-Ptr_001
#          \--------|                   |
#                   |                    \-Mmu_001
#                   |
#                   |          /-Ptr_002
#                    \--------|
#                             |          /-Hsa_002
#                              \--------|
#                                        \-Mmu_002
#
# To obtain all the evolutionary events involving a given leaf node we
# use get_my_evol_events method
matches = t.search_nodes(name="Hsa_001")