def refine(query_fasta, start_fasta, deep_paralog_cutoff, num_cores): gene_name = get_filename_from_path(query_fasta)[1].split(".")[0] outdir, fasta = get_filename_from_path(start_fasta) #print outdir,fasta deep_paralog_cutoff = float(deep_paralog_cutoff) query_ids = [s.name for s in seq.read_fasta_file(query_fasta)] new_fasta = [] # list of output refined fasta files print outdir, fasta # make a tree from the start_fasta tree = fasta_to_tree.fasta_to_tree(outdir, fasta, num_cores, "aa") if tree == None: return [] with open(tree, "r") as infile: intree = newick3.parse(infile.readline()) root = trim_tips.trim(intree, relative_cutoff=deep_paralog_cutoff, absolute_cutoff=deep_paralog_cutoff * 2) if os.path.exists(outdir + fasta + ".pasta.aln-cln"): clnfile = outdir + fasta + ".pasta.aln-cln" else: clnfile = outdir + fasta + ".mafft.aln-cln" root = mask_tips_by_taxonID_transcripts.mask(root,\ clnfile=clnfile,\ para="y", ignore=GENOMES) if root != None: with open(tree + ".tt.mm", "w") as outfile: outfile.write(newick3.tostring(root) + "\n") subtrees = cut_long_internal_branches.cut_long_internal_branches( root, cutoff=deep_paralog_cutoff) count = 0 base_name = fasta.split(".")[0] seqDICT = {} # key is seqid, value is seq for s in seq.read_fasta_file(start_fasta): seqDICT[s.name] = s.seq for tree in subtrees: if tree == None: continue label_set = set(tree_utils.get_front_labels(tree)) if len(label_set) > 4 and len(label_set & set(query_ids)) > 0: count += 1 with open(outdir + base_name + "_" + str(count) + ".subtree", "w") as outfile: outfile.write(newick3.tostring(tree) + ";\n") with open(outdir + base_name + "_" + str(count) + ".fa", "w") as outfile: for seqid in tree_utils.get_front_labels(tree): try: outfile.write(">" + seqid + "\n" + seqDICT[seqid] + "\n") except: print seqid, "not found in fasta file" new_fasta.append(outdir + base_name + "_" + str(count) + ".fa") return new_fasta
def taxon_name_subst(original,table=sys.path[0]+"/reverse_taxon_table"): DICT = {} # key is seq acronym, value is full taxon name, separated by tab with open(table, "r") as infile: for line in infile: spls = line.strip().split("\t") if len(spls) > 1: DICT[spls[0].replace("|","_")] = spls[1] with open(original,"r") as infile: line = infile.readline() is_fasta = True if line[0] == ">" else False if is_fasta: # for fasta files infile = open(original,"r") outfile = open(original+".name","w") for line in infile: if line[0] == ">": outfile.write('>'+get_long_id(line.strip()[1:],DICT)+"\n") else: outfile.write(line) infile.close() outfile.close() else: # tree file with open(original,"r") as infile: intree = newick3.parse(infile.readline()) for i in intree.leaves(): print i.label, i.label = get_long_id(i.label,DICT) print i.label with open(original+".name","w") as outfile: outfile.write(newick3.tostring(intree)+";\n")
def main(treDIR, clnDIR, para, intree_file_ending=INTREE_FILE_ENDING, ignore=GENOMES): if treDIR[-1] != "/": treDIR += "/" if clnDIR[-1] != "/": clnDIR += "/" assert para == "y" or para == "n", "mask paraphyletic tips? (y/n)" mask_para = True if para == "y" else False filecount = 0 filematch = {} #key is clusterID, value is the .aln-cln file for i in os.listdir(clnDIR): if i.endswith(".aln-cln"): clusterID = get_clusterID(i) assert clusterID not in filematch, \ "The clusterID "+clusterID+" repeats in "+clnDIR filematch[clusterID] = i for i in os.listdir(treDIR): if i.endswith(intree_file_ending): with open(treDIR + i, "r") as infile: intree = newick3.parse(infile.readline()) print i clusterID = get_clusterID(i) filecount += 1 curroot = mask(intree, clnDIR + filematch[clusterID], para=mask_para, ignore=GENOMES) with open(treDIR + i + ".mm", "w") as outfile: outfile.write(newick3.tostring(curroot) + ";\n") assert filecount > 0, \ "No file ends with "+intree_file_ending+" found in "+treDIR
def main(treDIR, clnDIR, para, intree_file_ending=INTREE_FILE_ENDING): if treDIR[-1] != "/": treDIR += "/" if clnDIR[-1] != "/": clnDIR += "/" assert para == "y" or para == "n", "mask paraphyletic tips? (y/n)" mask_para = True if para == "y" else False filecount = 0 filematch = {} #key is clusterID, value is the .aln-cln file for i in os.listdir(clnDIR): if i.endswith(".aln-cln"): clusterID = get_clusterID(i) assert clusterID not in filematch, \ "The clusterID "+clusterID+" repeats in "+clnDIR filematch[clusterID] = i for i in os.listdir(treDIR): if i.endswith(intree_file_ending): with open(treDIR + i, "r") as infile: intree = newick3.parse(infile.readline()) print i clusterID = get_clusterID(i) filecount += 1 chrDICT = {} #key is seqid, value is number of unambiguous chrs for s in read_fasta_file(clnDIR + filematch[clusterID]): for ch in ['-', 'X', "x", "?", "*"]: s.seq = s.seq.replace(ch, "") #ignore gaps, xs and Xs chrDICT[s.name] = len(s.seq) curroot = mask_monophyletic_tips(intree, chrDICT) if mask_para: curroot = mask_paraphyletic_tips(curroot, chrDICT) with open(treDIR + i + ".mm", "w") as outfile: outfile.write(newick3.tostring(curroot) + ";\n") assert filecount > 0, \ "No file ends with "+intree_file_ending+" found in "+treDIR
def main(inDIR, file_ending, branch_len_cutoff, min_taxa, outDIR): """cut long branches and output subtrees as .subtre files if uncut and nothing changed betwee .tre and .subtree copy the original .tre file to the outdir""" if inDIR[-1] != "/": inDIR += "/" if outDIR[-1] != "/": outDIR += "/" min_taxa = int(min_taxa) filecount = 0 cutoff = float(branch_len_cutoff) print "cutting branches longer than", cutoff for i in os.listdir(inDIR): if not i.endswith(file_ending): continue print i filecount += 1 with open(inDIR + i, "r") as infile: #only 1 tree in each file intree = newick3.parse(infile.readline()) try: with open(inDIR + i[:i.find(".tre")] + ".tre", "r") as infile: #the original .tre raw_tree_size = len( get_front_labels(newick3.parse(infile.readline()))) except: # did not refine this round. Use the .tre.tt.mm tree raw_tree_size = len(get_front_labels(intree)) num_taxa = count_taxa(intree) if num_taxa < min_taxa: print "Tree has", num_taxa, "less than", min_taxa, "taxa" else: print ".tre:", raw_tree_size, "tips; " + file_ending + ": " + str( len(get_front_labels(intree))) + " tips" subtrees = cut_long_internal_branches(intree, cutoff) if len(subtrees) == 0: print "No tree with at least", min_taxa, "taxa" #elif raw_tree_size == len(subtrees[0].leaves()): #copy(inDIR+i,outDIR+i) #print "written to out directory unchanged" else: count = 0 outsizes = "" for subtree in subtrees: if count_taxa(subtree) >= min_taxa: if subtree.nchildren == 2: #fix bifurcating roots from cutting temp, subtree = remove_kink(subtree, subtree) count += 1 with open( outDIR + i.split(".")[0] + "_" + str(count) + ".subtree", "w") as outfile: outfile.write(newick3.tostring(subtree) + ";\n") outsizes += str(len(subtree.leaves())) + ", " print count, "tree(s) wirtten. Sizes:", outsizes assert filecount > 0, "No file end with " + file_ending + " in " + inDIR
def main(DIR, tree_file_ending, relative_cut, absolute_cut): if DIR[-1] != "/": DIR += "/" filecount = 0 for i in os.listdir(DIR): if i.endswith(tree_file_ending): print i filecount += 1 with open(DIR + i, "r") as infile: intree = newick3.parse(infile.readline()) outtree = trim(intree, float(relative_cut), float(absolute_cut)) if outtree != None: with open(DIR + i + ".tt", "w") as outfile: outfile.write(newick3.tostring(outtree) + ";\n") assert filecount > 0, \ "No file end with "+tree_file_ending+" found in "+DIR
def main(DIR,tree_file_ending,relative_cut,absolute_cut1,absolute_cut2): if DIR[-1] != "/": DIR += "/" filecount = 0 for i in os.listdir(DIR): if i.endswith(tree_file_ending): print i filecount += 1 with open(DIR+i,"r") as infile: intree = newick3.parse(infile.readline()) outtree = trim(intree,float(relative_cut),float(absolute_cut1),float(absolute_cut2)) if outtree != None: with open(DIR+i+".tt","w") as outfile: outfile.write(newick3.tostring(outtree)+";\n") assert filecount > 0, \ "No file end with "+tree_file_ending+" found in "+DIR
import phylo3,newick3,os,sys from tree_utils import * if __name__ == "__main__": if len(sys.argv) != 4: print "python prune_paralogs_from_rooted_trees.py homoTreeDIR tree_file_ending minimal_taxa outDIR" sys.exit(0) inDIR = sys.argv[1]+"/" tree_file_ending = sys.argv[2] MIN_TAXA = int(sys.argv[3]) outDIR = sys.argv[4]+"/" for i in os.listdir(inDIR): if not i.endswith(tree_file_ending) continue print i outID = outDIR+get_clusterID(i) with open(inDIR+i,"r") as infile: intree = newick3.parse(infile.readline()) orthologs = get_ortho_from_rooted_inclade(intree) count = 1 for ortho in orthologs: if len(set(get_front_names(ortho))) >= MIN_TAXA: with open(outID+".ortho"+str(count)+".tre","w") as outfile: outstring = newick3.tostring(ortho) #outstring = outstring.replace(":0",":1") outfile.write(outstring+";\n") count += 1
treDIR = sys.argv[1]+"/" clnDIR = sys.argv[2]+"/" outDIR = sys.argv[3]+"/" if sys.argv[4] == "y": mask_para = True elif sys.argv[4] == "n": mask_para = False else: print "mask_para? y/n" sys.exit() filecount = 0 for i in os.listdir(treDIR): if i[-3:] == ".tt" and i[-3:] != ".mm": with open(treDIR+i,"r") as infile: intree = newick3.parse(infile.readline()) print i clusterID = i.split("_")[0] filecount += 1 unamb_chrDICT = {} #key is seqid, value is number of unambiguous chrs with open(clnDIR+clusterID+ALIGNMENT_FILE_ENDING) as handle: for record in SeqIO.parse(handle,"fasta"): seqid,seq = str(record.id), str(record.seq) for ch in ['-','X',"x","?","*"]: seq = seq.replace(ch,"") #ignore gaps, xs and Xs unamb_chrDICT[seqid] = len(seq) curroot = monophyly_masking(intree,unamb_chrDICT) if mask_para: curroot = paraphyly_masking(curroot,unamb_chrDICT) with open(outDIR+i+".mm","w") as outfile: outfile.write(newick3.tostring(curroot)+";\n") if filecount == 0: print "No file name with 'best' or 'tt' or 'fasttree' found in the treDIR"
nc = n while (not nc.istip) and len(nc.children) == 0: print "pruning an empty tip" np = nc.parent nc.prune() if np: nc = np else: break # if not n.istip: # if len(n.children) == 0: # nodes_to_remove.insert(0,n) # else: # empty = True # for c in n.children: # if c not in nodes_to_remove: # empty = False # break # if empty: # nodes_to_remove.insert(0,n) #print "" #for dud in nodes_to_remove: # print "removing an empty tip!" # dud.parent.remove_child(dud) outfile = open(treefname.rsplit(".tre",1)[0] + ".renamed.tre","w") outfile.write(newick3.tostring(tree) + ";") outfile.close()
tip.prune() # compress knuckle if there is one # if len(parent.children) == 1: # child = parent.children[0] # if child.label != None: # rightlabel = child.label # else: # rightlabel = ", ".join([leaf.label for leaf in child.leaves()]) # print "compressing a knuckle in the tree: " + leftlabel + " | " + rightlabel # pp = parent.parent # pp.remove_child(parent) # pp.add_child(child) #nodes_to_remove = [] for n in tree.descendants(): nc = n while (not nc.istip) and len(nc.children) == 0: print "pruning an empty tip" np = nc.parent nc.prune() if np: nc = np else: break outfile = open(treefname.rsplit(".tre",1)[0] + ".pruned.tre","w") outfile.write(newick3.tostring(tree) + ";") outfile.close()
if seqid != "": seqDICT[seqid] = seq seqid,seq = line[1:].replace("-","_"),"" else: seq += line.strip() seqDICT[seqid] = seq #add the last record infile.close() #cut by the first cutoff print i,"Cutting branches longer than",cutoff1 outfile = open(DIR+ccID+".cut1.trees","w") newtrees = [] #store trees in need of cutting for the next round count = 0 while True: for tree in trees: if find_longest_internal_branch_length(tree) < cutoff1: outfile.write(newick3.tostring(tree) +";\n") #can be the original tree or cut tree. no need to cut tip here else: subtrees = cut_long_branches(tree,cutoff1) for subtree in subtrees: if count_ingroups(subtree) < MIN_INGROUP_TAXA: continue count += 1 newname = DIR+ccID+".cut1-"+str(count) with open(newname+".cutbranch","w") as outfile1: #record the cut branch outfile1.write(newick3.tostring(subtree)+";\n") with open(newname+".fa","w") as outfile2: #output fasta for label in get_leaf_labels(subtree.leaves()): outfile2.write(">"+label+"\n"+seqDICT[label]+"\n") newaln = mafft_align(newname+".fa") newcln = phyutility_clean_alignment(newaln) newtreefile = fasttree(newcln)
#check taxonIDs ingroup_names = [] outgroup_names = [] for name in all_names: if name in INGROUPS: ingroup_names.append(name) elif name in OUTGROUPS: outgroup_names.append(name) else: print name,"not in ingroups or outgroups" sys.exit() if len(set(ingroup_names)) < MIN_INGROUP_TAXA: print "not enough ingroup taxa in tree" continue if len(outgroup_names) == 0: print "No outgroup in tree" continue inclades = extract_rooted_ingroup_clades(curroot,INGROUPS,OUTGROUPS,MIN_INGROUP_TAXA) inclade_count = 0 for inclade in inclades: inclade_count += 1 with open(outDIR+treefile+"."+str(inclade_count),"w") as outfile: outfile.write(newick3.tostring(inclade)+";\n") for node in inclade.iternodes(): if node.istip: node.label = get_name(node.label) # output multi-labeled tree for phyparts with open(phypartsDIR+treefile+"."+str(inclade_count),"w") as outfile: outfile.write(newick3.tostring(inclade)+";\n") print inclade_count,"clades extracted"
def mask_paraphyletic_tips(curroot,ignore=[]): going = True while going and curroot != None and len(curroot.leaves()) >= 4: going = False for node in curroot.iternodes(): #walk through nodes if not node.istip: continue #only look at tips name = get_name(node.label).split("_")[1] parent = node.parent if node == curroot or parent == curroot or parent == None: continue #no paraphyletic tips for the root for para in parent.get_sisters(): if para.istip and name==get_name(para.label).split("_")[1]: # mask node = para.prune() if len(curroot.leaves()) >= 4: if (node==curroot and node.nchildren==2) or (node!=curroot and node.nchildren==1): node,curroot = remove_kink(node,curroot) going = True break return curroot if __name__ == "__main__": if len(sys.argv) != 3: print "usage: python "+sys.argv[0]+" treefile para(y/n)" sys.exit() intree = newick3.parse(open(sys.argv[1],"r").readline()) masked = mask_monophyletic_tips(intree,ignore=[]) if sys.argv[2] == "y": masked = mask_paraphyletic_tips(masked,ignore=[]) print newick3.tostring(masked)+";\n"
def RT(homoDIR, tree_file_eneding, outDIR, min_ingroup_taxa, taxon_code_file_file): if homoDIR[-1] != "/": homoDIR += "/" if outDIR[-1] != "/": outDIR += "/" min_ingroup_taxa = int(min_ingroup_taxa) INGROUPS = [] OUTGROUPS = [] with open(taxon_code_file_file, "r") as infile: for line in infile: if len(line) < 3: continue spls = line.strip().split("\t") if spls[0] == "IN": INGROUPS.append(spls[1]) elif spls[0] == "OUT": OUTGROUPS.append(spls[1]) else: print "Check taxon_code_file file format" sys.exit() if len(set(INGROUPS) & set(OUTGROUPS)) > 0: print "Taxon ID", set(INGROUPS) & set( OUTGROUPS), "in both ingroups and outgroups" sys.exit(0) print len(INGROUPS), "ingroup taxa and", len( OUTGROUPS), "outgroup taxa read" print "Ingroups:", INGROUPS print "Outgroups:", OUTGROUPS for treefile in os.listdir(homoDIR): if not treefile.endswith(tree_file_eneding): continue with open(homoDIR + treefile, "r") as infile: intree = newick3.parse(infile.readline()) curroot = intree all_names = tree_utils.get_front_names(curroot) num_tips = len(all_names) num_taxa = len(set(all_names)) print treefile #check taxonIDs ingroup_names = [] outgroup_names = [] for name in all_names: if name in INGROUPS: ingroup_names.append(name) elif name in OUTGROUPS: outgroup_names.append(name) else: print name, "not in ingroups or outgroups" sys.exit() if len(set(ingroup_names)) < min_ingroup_taxa: print "not enough ingroup taxa in tree" continue outID = outDIR + tree_utils.get_clusterID(treefile) if len(outgroup_names ) > 0: #at least one outgroup present, root and cut inclades inclades = tree_utils.extract_rooted_ingroup_clades(curroot,\ INGROUPS,OUTGROUPS,min_ingroup_taxa) inclade_count = 0 for inclade in inclades: inclade_count += 1 inclade_name = outID + ".inclade" + str(inclade_count) with open(inclade_name, "w") as outfile: outfile.write(newick3.tostring(inclade) + ";\n") orthologs = tree_utils.get_ortho_from_rooted_inclade(inclade) ortho_count = 0 for ortho in orthologs: if len(tree_utils.get_front_labels( ortho)) >= min_ingroup_taxa: ortho_count += 1 with open( inclade_name + ".ortho" + str(ortho_count) + ".tre", "w") as outfile: outfile.write(newick3.tostring(ortho) + ";\n") elif len(all_names) == num_taxa: #only output ortho tree when there is no taxon repeats with open(outID + ".unrooted-ortho.tre", "w") as outfile: outfile.write(newick3.tostring(curroot) + ";\n") else: #do not attempt to infer direction of gene duplication without outgroup info print "duplicated taxa in unrooted tree"
print "python trim_tips.py treDIR tree_file_ending outDIR relative_cutoff absolute_cutoff1 absolute_cutoff2" sys.exit(0) treDIR = sys.argv[1] + "/" file_ending = sys.argv[2] outDIR = sys.argv[3] + "/" relative_cutoff = float(sys.argv[4]) absolute_cutoff1 = float(sys.argv[5]) absolute_cutoff2 = float(sys.argv[6]) done = [] #record clusterIDs that are done for i in os.listdir(treDIR): if i[-3:] == ".tt": done.append(i.split(".")[0]) print done filecount = 0 l = len(file_ending) for i in os.listdir(treDIR): if file_ending in i: clusterID = i.split(".")[0] if clusterID in done: continue print i filecount += 1 with open(treDIR + i, "r") as infile: intree = newick3.parse(infile.readline()) with open(outDIR + i + ".tt", "w") as outfile: outfile.write(newick3.tostring(cut_long_tips(intree)) + ";\n") if filecount == 0: print "No file name with", file_ending, "found in the treDIR"
if sister.istip and get_name(node.label)==get_name(sister.label): if node.length > sister.length: node = node.prune() else: node = sister.prune() if len(curroot.leaves()) >= 4: if node.nchildren==1 or (node==curroot and node.nchildren==2): node,curroot = remove_kink(node,curroot) #no kink if the original node had more than 2 children going = True break return curroot if __name__ == "__main__": if len(sys.argv) != 2: print "python mask_tips_by_taxonID_genomes.py DIR" sys.exit() DIR = sys.argv[1]+"/" filecount = 0 for i in os.listdir(DIR): if i[-3:] == ".tt": #only mask trees that have tips trimmed with open(DIR+i,"r") as infile: intree = newick3.parse(infile.readline()) print i filecount += 1 with open(DIR+i+".mm","w") as outfile: outfile.write(newick3.tostring(monophyly_masking_by_bl(intree))+";\n") if filecount == 0: print "No file name with 'best' or 'tt' or 'fasttree' found in the treDIR"
#check to make sure that the ingroup and outgroup names were set correctly for name in names: if name not in INGROUPS and name not in OUTGROUPS: print "check name",name sys.exit() outgroup_names = get_front_outgroup_names(curroot) #if no outgroup at all, do not attempt to resolve gene duplication if len(outgroup_names) == 0: print "duplicated taxa in unrooted tree" #skip the homolog if there are duplicated outgroup taxa elif len(outgroup_names) > len(set(outgroup_names)): print "outgroup contains taxon repeats" else: #at least one outgroup present and there's no outgroup duplication if curroot.nchildren == 2: #need to reroot temp,curroot = remove_kink(curroot,curroot) curroot = reroot_with_monophyletic_outgroups(curroot) #only return one tree after prunning if curroot != None: with open(outID+".reroot","w") as outfile: outfile.write(newick3.tostring(curroot)+";\n") ortho = prune_paralogs_from_rerooted_homotree(curroot) if len(set(get_front_names(curroot))) >= MIN_TAXA: with open(outID+".ortho.tre","w") as outfile: outfile.write(newick3.tostring(ortho)+";\n") else: print "not enough taxa after pruning" else: print "outgroup non-monophyletic"
filecount = 0 for i in os.listdir(inDIR): if "best" not in i and i[-3:] != ".mm": continue filecount += 1 if "best" in i: clusterID = i.split(".")[1] else: clusterID = i.split(".")[0] print clusterID with open(inDIR+i,"r") as infile: #only 1 tree in each file intree = newick3.parse(infile.readline()) curroot = intree if count_ingroup_taxa(curroot) < min_ingroup_taxa: continue subtrees = cut_long_internal_branches(curroot,branch_len_cutoff) if len(subtrees) > 0: count = 1 for subtree in subtrees: if count_ingroup_taxa(subtree)>=min_ingroup_taxa and count_outgroup_taxa(subtree)>=min_outgroup_taxa: #fix bifurcating roots from cutting if subtree.nchildren == 2: subtree,subtree = remove_kink(subtree,subtree) with open(outDIR+clusterID+"_subtree"+str(count)+".tre","w") as outfile: outfile.write(newick3.tostring(subtree)+";\n") count += 1 if filecount == 0: print "No file end with",file_ending,"found"
print "python trim_tips.py treDIR tree_file_ending outDIR relative_cutoff absolute_cutoff1 absolute_cutoff2" sys.exit(0) treDIR = sys.argv[1]+"/" file_ending = sys.argv[2] outDIR = sys.argv[3]+"/" relative_cutoff = float(sys.argv[4]) absolute_cutoff1 = float(sys.argv[5]) absolute_cutoff2 = float(sys.argv[6]) done = [] #record clusterIDs that are done for i in os.listdir(treDIR): if i[-3:] == ".tt": done.append(i.split(".")[0]) print done filecount = 0 l = len(file_ending) for i in os.listdir(treDIR): if file_ending in i: clusterID = i.split(".")[0] if clusterID in done: continue print i filecount += 1 with open(treDIR+i,"r") as infile: intree = newick3.parse(infile.readline()) with open(outDIR+i+".tt","w") as outfile: outfile.write(newick3.tostring(cut_long_tips(intree))+";\n") if filecount == 0: print "No file name with",file_ending,"found in the treDIR"