def get_front_score(node):
    """Score the front side of ``node`` by its taxon count.

    The labels come from ``tree_utils.get_front_labels(node)`` and each is
    mapped through ``tree_utils.get_name``.

    Returns:
        The number of labels when every label maps to a different name,
        otherwise -1 (at least one name is repeated).
    """
    labels = tree_utils.get_front_labels(node)
    distinct_names = set(tree_utils.get_name(label) for label in labels)
    # Equal sizes mean no name occurs twice among the labels.
    if len(distinct_names) != len(labels):
        return -1
    return len(distinct_names)
def get_back_score(node,root):
    """Score the back side of ``node`` (relative to ``root``) by taxon count.

    The labels come from ``tree_utils.get_back_labels(node, root)`` and each
    is mapped through ``tree_utils.get_name``.

    Returns:
        The number of labels when every label maps to a different name,
        otherwise -1 (at least one name is repeated).
    """
    labels = tree_utils.get_back_labels(node, root)
    distinct_names = set(tree_utils.get_name(label) for label in labels)
    # Equal sizes mean no name occurs twice among the labels.
    if len(distinct_names) != len(labels):
        return -1
    return len(distinct_names)
def mask_monophyletic_tips(curroot,ignore=[]):
    """Prune sister tips that share the same code until none remain.

    Two tips are treated as duplicates when the second "_"-separated field
    of ``get_name(label)`` is equal for both.  Whenever a tip has such a
    sister tip, the sister is pruned.  Scanning restarts after any pass that
    pruned something and stops once a pass prunes nothing, the root becomes
    None, or fewer than four leaves are left.

    Args:
        curroot: root node of the tree (may be None).
        ignore: accepted for interface compatibility; not read here.

    Returns:
        The (possibly replaced) root node.
    """
    def code_of(tip):
        return get_name(tip.label).split("_")[1]

    pruned_something = True
    while pruned_something and curroot != None and len(curroot.leaves()) >= 4:
        pruned_something = False
        for tip in curroot.iternodes():
            if not tip.istip:
                continue  # only tips can be masked
            tip_code = code_of(tip)
            for sib in tip.get_sisters():
                if not (sib.istip and tip_code == code_of(sib)):
                    continue
                # prune() hands back a node (presumably the parent left
                # behind -- TODO confirm) that may now be a kink.
                remnant = sib.prune()
                if len(curroot.leaves()) >= 4:
                    # A kink is a root with two children or an internal
                    # node with a single child.
                    kink_children = 2 if remnant == curroot else 1
                    if remnant.nchildren == kink_children:
                        remnant, curroot = remove_kink(remnant, curroot)
                pruned_something = True
                break  # stop checking the other sisters of this tip
    return curroot
def ortho_to_aln(alndir, tredir, outdir, ortho_tree_file_ending=".tre"):
    """Write one alignment file per ortholog tree.

    For every file in ``tredir`` ending with ``ortho_tree_file_ending``:
    the matching homolog alignment ``<prefix>.fa.mafft.aln`` (prefix = text
    before the first "." of the tree file name) is read from ``alndir``,
    the tree is parsed from the first line of the tree file, and the
    sequences of its tips are written to ``outdir`` as ``<tree name with
    the ending replaced by ".aln">``.  Sequence ids are shortened with
    ``tree_utils.get_name``.

    Raises:
        AssertionError: if no file in ``tredir`` has the requested ending.
    """
    if alndir[-1] != "/":
        alndir += "/"
    if tredir[-1] != "/":
        tredir += "/"
    if outdir[-1] != "/":
        outdir += "/"

    matched = 0
    for tree_name in os.listdir(tredir):
        if not tree_name.endswith(ortho_tree_file_ending):
            continue
        matched += 1
        print(tree_name)

        # Alignment of the source homolog: sequence id -> sequence.
        aln_path = alndir + tree_name.split(".")[0] + ".fa.mafft.aln"
        seq_by_id = dict(
            (rec.name, rec.seq) for rec in read_fasta_file(aln_path))

        # Tip labels of the ortholog tree decide which sequences are kept.
        with open(tredir + tree_name, "r") as tree_handle:
            tree = newick3.parse(tree_handle.readline())
        tip_labels = tree_utils.get_front_labels(tree)

        out_path = outdir + tree_name.replace(ortho_tree_file_ending, ".aln")
        with open(out_path, "w") as out_handle:
            for label in tip_labels:
                out_handle.write(
                    ">" + tree_utils.get_name(label) + "\n"
                    + seq_by_id[label] + "\n")

    assert matched > 0,\
        "No file ends with "+ortho_tree_file_ending+" was found in "+tredir
def mask_paraphyletic_tips(curroot,ignore=[]):
    """Prune tips that duplicate a tip one level up the tree.

    For each tip, the sisters of its parent are inspected; a sister of the
    parent that is itself a tip with the same code (second "_"-separated
    field of ``get_name(label)``) is pruned.  Tips that are the root, hang
    directly off the root, or have no parent are skipped.  Scanning
    restarts after any pass that pruned something and stops once a pass
    prunes nothing, the root becomes None, or fewer than four leaves are
    left.

    Args:
        curroot: root node of the tree (may be None).
        ignore: accepted for interface compatibility; not read here.

    Returns:
        The (possibly replaced) root node.
    """
    def code_of(tip):
        return get_name(tip.label).split("_")[1]

    pruned_something = True
    while pruned_something and curroot != None and len(curroot.leaves()) >= 4:
        pruned_something = False
        for tip in curroot.iternodes():
            if not tip.istip:
                continue  # only tips can be masked
            tip_code = code_of(tip)
            up = tip.parent
            if tip == curroot or up == curroot or up == None:
                continue  # no paraphyletic tips for the root
            for aunt in up.get_sisters():
                if not (aunt.istip and tip_code == code_of(aunt)):
                    continue
                # prune() hands back a node (presumably the parent left
                # behind -- TODO confirm) that may now be a kink.
                remnant = aunt.prune()
                if len(curroot.leaves()) >= 4:
                    # A kink is a root with two children or an internal
                    # node with a single child.
                    kink_children = 2 if remnant == curroot else 1
                    if remnant.nchildren == kink_children:
                        remnant, curroot = remove_kink(remnant, curroot)
                pruned_something = True
                break  # stop checking the parent's other sisters
    return curroot
def mask_monophyletic_tips(curroot, unamb_chrDICT):
    """Keep one tip out of each pair of same-named sister tips.

    Two sister tips are duplicates when ``get_name`` of their labels is
    equal.  The tip whose ``unamb_chrDICT`` value is strictly larger is
    kept; on a tie the sister is kept and the current tip is pruned.
    Scanning restarts after any pass that pruned something and stops once
    a pass prunes nothing or fewer than four leaves are left.

    Args:
        curroot: root node of the tree.
        unamb_chrDICT: mapping from tip label to a comparable score
            (presumably the count of unambiguous characters -- TODO
            confirm against the caller).

    Returns:
        The (possibly replaced) root node.
    """
    pruned_something = True
    while pruned_something and len(curroot.leaves()) >= 4:
        pruned_something = False
        for tip in curroot.iternodes():
            if not tip.istip:
                continue  # only tips can be masked
            for sib in tip.get_sisters():
                if not (sib.istip
                        and get_name(tip.label) == get_name(sib.label)):
                    continue
                # Drop whichever of the two has the lower score.
                if unamb_chrDICT[tip.label] > unamb_chrDICT[sib.label]:
                    loser = sib
                else:
                    loser = tip
                remnant = loser.prune()
                if len(curroot.leaves()) >= 4:
                    # A kink is a root with two children or an internal
                    # node with a single child.
                    kink_children = 2 if remnant == curroot else 1
                    if remnant.nchildren == kink_children:
                        remnant, curroot = remove_kink(remnant, curroot)
                pruned_something = True
                break  # stop checking the other sisters of this tip
    return curroot
def mask_paraphyletic_tips(curroot, unamb_chrDICT):
    """Keep one tip out of each pair of same-named paraphyletic tips.

    For each tip, the sisters of its parent are inspected.  When one of
    them is a tip with the same ``get_name`` value, the one whose
    ``unamb_chrDICT`` value is strictly larger is kept; on a tie the
    parent's sister is kept and the current tip is pruned.  Tips that are
    the root or hang directly off the root are skipped.  Scanning restarts
    after any pass that pruned something and stops once a pass prunes
    nothing or fewer than four leaves are left.

    Args:
        curroot: root node of the tree.
        unamb_chrDICT: mapping from tip label to a comparable score
            (presumably the count of unambiguous characters -- TODO
            confirm against the caller).

    Returns:
        The (possibly replaced) root node.
    """
    pruned_something = True
    while pruned_something and len(curroot.leaves()) >= 4:
        pruned_something = False
        for tip in curroot.iternodes():
            if not tip.istip:
                continue  # only tips can be masked
            up = tip.parent
            if tip == curroot or up == curroot:
                continue  # no paraphyletic tips for the root
            for aunt in up.get_sisters():
                if not (aunt.istip
                        and get_name(tip.label) == get_name(aunt.label)):
                    continue
                # Drop whichever of the two has the lower score.
                if unamb_chrDICT[tip.label] > unamb_chrDICT[aunt.label]:
                    loser = aunt
                else:
                    loser = tip
                remnant = loser.prune()
                if len(curroot.leaves()) >= 4:
                    # A kink is a root with two children or an internal
                    # node with a single child.
                    kink_children = 2 if remnant == curroot else 1
                    if remnant.nchildren == kink_children:
                        remnant, curroot = remove_kink(remnant, curroot)
                pruned_something = True
                break  # stop checking the parent's other sisters
    return curroot
def mcl_to_fasta(all_fasta,mcl_outfile,minimal_taxa,outdir):
    """Split a fasta file into one fasta file per sufficiently large cluster.

    Each non-trivial line of ``mcl_outfile`` is one cluster: tab-separated
    sequence ids.  A cluster is kept when the number of distinct
    ``get_name(seq_id)`` values in it is at least ``minimal_taxa``; kept
    clusters are numbered 1, 2, ... in file order.  Every sequence of
    ``all_fasta`` that belongs to a kept cluster is appended to
    ``outdir + "cluster" + <number> + ".fa"``.

    Args:
        all_fasta: path of the fasta file holding all sequences.
        mcl_outfile: path of the cluster file, one cluster per line.
        minimal_taxa: minimum number of distinct names per cluster
            (anything ``int()`` accepts).
        outdir: output prefix; it is concatenated as-is, so it needs its
            own trailing "/" to denote a directory.

    Raises:
        IOError/OSError: if a cluster file cannot be opened or written.
            (Earlier versions silently swallowed these along with every
            other exception.)
    """
    minimal_taxa = int(minimal_taxa)

    # Map every sequence id of a kept cluster to that cluster's number.
    clusterDICT = {}  # key is seqID, value is clusterID
    count = 0
    with open(mcl_outfile, "r") as infile:
        for line in infile:
            if len(line) < 3:
                continue  # ignore empty lines
            spls = line.strip().split('\t')
            if len(set(get_name(i) for i in spls)) >= minimal_taxa:
                count += 1
                for seqID in spls:
                    clusterDICT[seqID] = str(count)

    for s in read_fasta_file(all_fasta):
        clusterID = clusterDICT.get(s.name)
        if clusterID is None:
            continue  # seq did not go in a cluster with enough taxa
        # Append mode: sequences of one cluster arrive interleaved with
        # those of other clusters.
        with open(outdir + "cluster" + clusterID + ".fa", "a") as outfile:
            outfile.write(">" + s.name + "\n" + s.seq + "\n")
def concatenate(clnDIR,numofsitesFilter,numoftaxaFilter,outname):
    """Filter cleaned alignments and concatenate them with pxcat.

    Every ``*.aln-cln`` file in ``clnDIR`` with at least
    ``numoftaxaFilter`` sequences and at least ``numofsitesFilter``
    columns (length of its first sequence) is selected.  Per-taxon
    occupancy statistics are written to ``outname + "_taxon_occupancy_stats"``,
    the selected files are concatenated with ``pxcat`` into
    ``outname + ".fa"`` / ``outname + ".model"``, and the result is
    converted with ``pxs2phy`` and ``pxs2nex``.

    Raises:
        AssertionError: if the .phy, .nex or .fa output is missing after
            the external tools ran.
    """
    if clnDIR[-1] != "/":
        clnDIR += "/"
    min_sites = int(numofsitesFilter)
    min_seqs = int(numoftaxaFilter)

    print("Filtering ortholog matrixes")
    selected = []  # alignment file names that pass the filters
    for fname in os.listdir(clnDIR):
        if not fname.endswith(".aln-cln"):
            continue
        records = read_fasta_file(clnDIR + fname)
        n_seqs = len(records)
        n_cols = len(records[0].seq)
        if n_seqs >= min_seqs and n_cols >= min_sites:
            selected.append(fname)
    print(str(len(selected)) + " matrices passed the filter")

    print("Getting matrix occupancy stats")
    # taxon name -> [times present in a matrix, total ungapped length]
    occupancy = {}
    total_aligned_len = 0  # width of the final concatenated matrix
    input_args = []
    for fname in selected:
        input_args.append(clnDIR + fname + " ")
        records = read_fasta_file(clnDIR + fname)
        total_aligned_len += len(records[0].seq)
        for rec in records:
            entry = occupancy.setdefault(get_name(rec.name), [0, 0])
            entry[0] += 1
            entry[1] += len(rec.seq.replace("-", "").replace("?", ""))
    cmd = ("pxcat -o " + outname + ".fa -p " + outname + ".model -s "
           + "".join(input_args) + "\n")

    total_ortho = len(selected)
    stats_path = outname + "_taxon_occupancy_stats"
    with open(stats_path, "w") as stats:
        stats.write("taxon\t#orthologs\t#total_charactors\tperc_orthologs\tperc_charactors\n")
        sum_char = 0
        for taxon, (times, chars) in occupancy.items():
            sum_char += chars
            fields = [
                taxon,
                str(times),
                str(chars),
                str(times / float(total_ortho)),
                str(chars / float(total_aligned_len)),
            ]
            stats.write("\t".join(fields) + "\n")
        total_taxa = len(occupancy)
        summary = "\nSupermatrix dimension " + str(total_taxa) + " taxa, "
        summary += str(total_ortho) + " loci and " + str(total_aligned_len) + " aligned columns\n"
        summary += "Overall matrix occupancy " + str(sum_char / float(total_taxa * total_aligned_len)) + "\n"
        stats.write(summary)
    print("Supermatrix taxon occupancy stats written to" + " " + stats_path)

    print("Waiting for concatenation to finish. This may take several minutes...")
    # The pxcat command line can be very long, so it is run from a script.
    script_path = outname + ".temp.sh"
    with open(script_path, "w") as script:
        script.write(cmd)
    os.system("bash " + script_path)

    # writes phy file
    phy_cmd = " ".join(["pxs2phy", "-o", outname + ".phy", "-s", outname + ".fa"])
    print(phy_cmd)
    os.system(phy_cmd)

    # writes nex file
    nex_cmd = " ".join(["pxs2nex", "-o", outname + ".nex", "-s", outname + ".fa"])
    print(nex_cmd)
    os.system(nex_cmd)

    assert os.path.exists(outname+".phy") and os.path.exists(outname+".nex") and os.path.exists(outname+".fa"), "error concatenate"
    os.system("rm " + script_path)
    print(" ".join(["outfiles written", outname + ".phy", outname + ".model"]))
def concatenate(clnDIR, numofsitesFilter, numoftaxaFilter, seqtype, outname):
    """Filter cleaned alignments and concatenate them with phyutility.

    Every ``*.aln-cln`` file in ``clnDIR`` with at least
    ``numoftaxaFilter`` sequences and at least ``numofsitesFilter``
    columns (length of its first sequence) is selected.  Per-taxon
    occupancy statistics are written to ``outname + "_taxon_occupancy_stats"``.
    The selected files are concatenated by ``phyutility -concat`` into
    ``outname + ".nex"``, which is then rewritten as ``outname + ".phy"``
    plus a partition file ``outname + ".model"``; the intermediate .nex
    file is removed at the end.

    Args:
        clnDIR: directory holding the cleaned alignments.
        numofsitesFilter: minimum number of alignment columns.
        numoftaxaFilter: minimum number of sequences.
        seqtype: "aa" or "dna"; selects the phyutility mode and the
            partition model prefix ("AUTO" for aa, "DNA" for dna).
        outname: prefix of every output file.

    Raises:
        AssertionError: on an unknown ``seqtype`` or when the .phy file
            is missing after conversion.
    """
    if clnDIR[-1] != "/":
        clnDIR += "/"
    sites_filter = int(numofsitesFilter)
    taxa_filter = int(numoftaxaFilter)
    assert seqtype == "aa" or seqtype == "dna", "Input data type: dna or aa"
    model = "AUTO" if seqtype == "aa" else "DNA"

    print("Filtering ortholog matrixes")
    selected = []  # list of alignment file names that pass the filters
    for i in os.listdir(clnDIR):
        if i.endswith(".aln-cln"):
            seqlist = read_fasta_file(clnDIR + i)
            num_seq = len(seqlist)
            num_col = len(seqlist[0].seq)
            if num_seq >= taxa_filter and num_col >= sites_filter:
                selected.append(i)
    print(str(len(selected)) + " matrices passed the filter")

    print("Getting matrix occupancy stats")
    # key is taxon name, value is [times present in a matrix,
    # total length for this taxon]
    taxon_occupancy = {}
    total_aligned_len = 0  # record how long the final concatenated matrix is
    if seqtype == "aa":
        cmd = "phyutility -concat -aa -out " + outname + ".nex -in "
    else:
        cmd = "phyutility -concat -out " + outname + ".nex -in "
    for i in selected:
        cmd += clnDIR + i + " "
        seqlist = read_fasta_file(clnDIR + i)
        total_aligned_len += len(seqlist[0].seq)
        for s in seqlist:
            taxonid = get_name(s.name)
            if taxonid not in taxon_occupancy:
                taxon_occupancy[taxonid] = [0, 0]
            taxon_occupancy[taxonid][0] += 1
            taxon_occupancy[taxonid][1] += len(
                (s.seq.replace("-", "")).replace("?", ""))
    cmd += "\n"

    total_ortho = len(selected)
    with open(outname + "_taxon_occupancy_stats", "w") as outfile:
        outfile.write(
            "taxon\t#orthologs\t#total_charactors\tperc_orthologs\tperc_charactors\n"
        )
        sum_char = 0
        for taxon in taxon_occupancy:
            times, chars = taxon_occupancy[taxon][0], taxon_occupancy[taxon][1]
            sum_char += chars
            out = taxon + "\t" + str(times) + "\t" + str(chars) + "\t"
            out += str(times / float(total_ortho)) + "\t" + str(
                chars / float(total_aligned_len)) + "\n"
            outfile.write(out)
        total_taxa = len(taxon_occupancy)
        out = "\nSupermatrix dimension " + str(total_taxa) + " taxa, "
        out += str(total_ortho) + " loci and " + str(
            total_aligned_len) + " aligned columns\n"
        out += "Overall matrix occupancy " + str(
            sum_char / float(total_taxa * total_aligned_len)) + "\n"
        outfile.write(out)
    print("Supermatrix taxon occupancy stats written to" + " "
          + outname + "_taxon_occupancy_stats")

    print("Waiting for concatenation to finish. This may take several minutes...")
    # The command is also saved to a script file, but it is executed
    # directly from the string.
    with open(outname + ".temp.sh", "w") as f:
        f.write(cmd)
    os.system(cmd)

    # convert the .nex file to .phy and .model files for raxml
    # Both handles are managed by "with" so they are closed even when a
    # malformed line makes the parsing below raise.
    with open(outname + ".nex", "r") as infile, \
            open(outname + ".phy", "w") as outfile:
        for line in infile:
            line = line.strip()
            if len(line) < 10:
                continue
            if line[0] == "#" or line[:5] == "BEGIN" or line[:6] == "MATRIX" \
                    or line == "END;" or line[:6] == "FORMAT":
                continue
            if line[0] == "[":
                # Partition comment: turn it into one
                # "<model>,<name>=<range>" line per partition.
                line = line.replace("[", model + ",")
                line = line.replace(" ]", "")
                line = line.replace(" cluster", "\n" + model + ",cluster")
                line = line.replace(" ", "=")
                with open(outname + ".model", "w") as outfile2:
                    # make sure that wc -l will get how many partitions
                    outfile2.write(line.strip() + "\n")
            elif line[:10] == "DIMENSIONS":
                # phylip header: "<ntax> <nchar>"
                ntax = (line.split("NTAX=")[1]).split(" ")[0]
                nchar = (line.split("NCHAR=")[1]).replace(";", "")
                outfile.write(ntax + " " + nchar + "\n")
            else:
                # matrix row: "<taxon>\t<sequence>"
                spls = line.split("\t")
                outfile.write(spls[0] + " " + spls[1] + "\n")

    assert os.path.exists(outname + ".phy"), "error concatenate"
    os.system("rm " + outname + ".temp.sh")
    print(" ".join(["outfiles written", outname + ".phy", outname + ".model"]))
    os.system("rm " + outname + ".nex")  # remove intermediate .nex file
def _remove_quietly(path):
    """Delete ``path``; a missing or undeletable file is not an error."""
    try:
        os.remove(path)
    except OSError:
        pass  # no need to worry about extra intermediate files


def _run_raxml(cmd):
    """Echo and run a raxml command line, asserting a zero exit status."""
    print(" ".join(cmd))
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    out = p.communicate()
    assert p.returncode == 0, "Error raxml" + out[0]


def main():
    """Run the whole homology-inference pipeline from the command line.

    Stages, in order: read the input ``.cds.fa`` files, reduce redundancy,
    cluster, two rounds of tree refinement, bootstrap the homologs, extract
    orthologs ("121" or "RT"), write and trim ortholog alignments,
    concatenate them, infer the species tree with raxml, and map jackknife
    support onto it.

    Most stages are guarded by a ``<stage>_ok`` marker file in the output
    directory so that an interrupted run can be resumed; the markers are
    deleted once the run completes.

    Raises:
        AssertionError: on invalid ``--ortho`` / ``--test`` values, a
            missing ``--inout`` file, or a failing raxml run.
    """
    args = get_args()
    print(args)
    indir, outdir, num_cores = args.indir, args.outdir, args.threads
    relative_cut, absolute_cut, branch_len_cutoff = \
        args.reltip, args.abstip, args.deep
    assert args.ortho == "RT" or args.ortho == "121", \
        "--ortho has to be 121 or RT"
    assert os.path.exists(args.inout), \
        "cannot find the file specified by --inout"
    assert args.test == "y" or args.test == "n", \
        "test has to be either y or n"
    test = args.test == "y"
    if outdir[-1] != "/":
        outdir += "/"
    if indir[-1] != "/":
        indir += "/"
    logfile = outdir + "homology_inference.log"
    check_dependencies()

    # get initial fasta
    fasta_files = gether_fasta_files(
        path=indir, file_ending=".cds.fa", logfile=logfile)
    print(str(len(fasta_files)) + " data sets read")
    taxa = set([tree_utils.get_name(i.split(".")[0]) for i in fasta_files])
    print(str(len(taxa)) + " taxa found:")
    print(taxa)
    min_taxa = get_min_taxa(len(taxa), args.max_mis_taxa)
    print("Minimal number of taxa: " + " " + str(min_taxa))

    outdir1 = reduce_redundancy(indir, fasta_files, num_cores, outdir, logfile)
    outdir2, outdir3 = clustering(
        outdir1, fasta_files, num_cores, outdir, min_taxa, logfile)

    # tree inference and cleaning round 1.
    if not os.path.exists(outdir + "3_clusters_ok"):
        refine(curdir=outdir3, nextdir=outdir + "4_refine/",
               prefix="3_clusters",
               num_cores=num_cores,
               relative_cut=relative_cut,
               absolute_cut=absolute_cut,
               branch_len_cutoff=branch_len_cutoff,
               min_taxa=min_taxa,
               mask_para="n",
               logfile=logfile,
               test=test)
        os.system("touch " + outdir + "3_clusters_ok")

    # round 2
    if not os.path.exists(outdir + "4_refine_fasta_ok"):
        write_fasta_files_from_trees.main(fasta=outdir2 + "all.fa.cut",
                                          treDIR=outdir + "4_refine/",
                                          tree_file_ending=".subtree",
                                          outDIR=outdir + "4_refine/")
        os.system("touch " + outdir + "4_refine_fasta_ok")
    homodir = outdir + "5_homolog/"
    if not os.path.exists(outdir + "4_refine_ok"):
        # NOTE(review): unlike round 1 this call does not pass test=test,
        # so refine's own default applies -- confirm that is intended.
        refine(curdir=outdir + "4_refine/", nextdir=homodir,
               prefix="5_homolog",
               num_cores=num_cores,
               relative_cut=relative_cut,
               absolute_cut=absolute_cut,
               branch_len_cutoff=branch_len_cutoff,
               min_taxa=min_taxa,
               mask_para="y",
               logfile=logfile)
        os.system("touch " + outdir + "4_refine_ok")

    # bootstrap the homologs
    if not os.path.exists(outdir + "5_homolog_fasta_ok"):
        # tree files that ends with ".subtree"
        # (the equivalent pass over tree files ending with ".mm" is disabled)
        write_fasta_files_from_trees.main(fasta=outdir2 + "all.fa.cut",
                                          treDIR=homodir,
                                          tree_file_ending=".subtree",
                                          outDIR=homodir)
        os.system("touch " + outdir + "5_homolog_fasta_ok")
    if not os.path.exists(outdir + "5_homolog_bootstrap_ok"):
        for i in os.listdir(homodir):
            if i.endswith(".fa"):
                fasta_to_tree.fasta_to_bs_tree(DIR=homodir,
                                               fasta=i,
                                               num_cores=num_cores,
                                               seqtype="dna")
        os.system("touch " + outdir + "5_homolog_bootstrap_ok")
    print("homologs with bootstrap values (.tre) written to " + homodir)

    # get orthologs
    orthodir = outdir + "6_ortho/"
    if not os.path.exists(orthodir):
        os.mkdir(orthodir)

    # Count the ingroup taxa up front: the number is needed both for RT
    # pruning and for the concatenation filter further down, and the
    # latter also runs when the RT stage is skipped on a resumed run.
    ingroups = 0
    if args.ortho != "121":
        with open(args.inout) as infile:
            for line in infile:
                if line.startswith("IN\t"):
                    ingroups += 1

    if not os.path.exists(outdir + "ortho_tre_ok"):
        if args.ortho == "121":
            get_121(indir=homodir,
                    tree_file_ending=".tre",
                    min_taxa=min_taxa,
                    outdir=orthodir,
                    min_bootstrap=80.0)
            os.system("touch " + outdir + "ortho_tre_ok")
            with open(logfile, "a") as f:
                f.write(
                    "Filter one-to-one orthologs by minimal bootstrap of 80\n")
        else:
            RT(homoDIR=homodir,
               tree_file_eneding=".tre",
               outDIR=orthodir,
               MIN_INGROUP_TAXA=get_min_taxa(ingroups, args.max_mis_taxa),
               taxon_code_file=args.inout)
            os.system("touch " + outdir + "ortho_tre_ok")
            with open(logfile, "a") as f:
                f.write("RT orthologs\n")

    if not os.path.exists(outdir + "ortho_aln_ok"):
        ortho_to_aln(alndir=homodir,
                     tredir=orthodir,
                     outdir=orthodir,
                     ortho_tree_file_ending=".tre")
        os.system("touch " + outdir + "ortho_aln_ok")
        with open(logfile, "a") as f:
            f.write("Write ortholog alignments from homologs\n")

    min_col_occup = 0.5 if min_taxa < 10 else 0.3
    if not os.path.exists(outdir + "ortho_cln_ok"):
        phyutility_wrapper.main(DIR=orthodir,
                                min_col_occup=min_col_occup,
                                seqtype="dna")
        os.system("touch " + outdir + "ortho_cln_ok")
        with open(logfile, "a") as f:
            f.write("Trim ortholog alignments by " + str(min_col_occup) + "\n")

    # tree inference and jackknife
    # concatenate
    if args.ortho == "121":
        taxa_filter = min_taxa
    else:
        taxa_filter = get_min_taxa(ingroups, args.max_mis_taxa)
    outname = "filter300-" + str(taxa_filter)
    # NOTE(review): the existence check looks under outdir, while outname
    # (and the raxml inputs below) are relative to the working directory
    # -- confirm which location is intended.
    if not os.path.exists(outdir + outname + ".phy"):
        concatenate(clnDIR=orthodir,
                    numofsitesFilter=300,
                    numoftaxaFilter=taxa_filter,
                    seqtype="dna",
                    outname=outname)

    # tree
    if not os.path.exists("RAxML_bestTree." + outname):
        _run_raxml(["raxml",
                    "-T", str(num_cores),
                    "-p", "1234",
                    "-m", "GTRCAT",
                    "-q", outname + ".model",
                    "-s", outname + ".phy",
                    "-n", outname])
        # Each intermediate file is removed on its own so that one missing
        # file does not leave the others behind.
        for prefix in ("RAxML_info.", "RAxML_log.",
                       "RAxML_parsimonyTree.", "RAxML_result."):
            _remove_quietly(prefix + outname)

    # jackknife
    if not os.path.exists("JK5_trees"):
        jk(indir="./",
           num_core=num_cores,
           resample_num=5,
           seqtype="dna",
           replicates=200)

    # map jackknife support to species tree
    _run_raxml(["raxml", "-f", "b",
                "-t", "RAxML_bestTree." + outname,
                "-z", "JK5_trees", "-T", str(num_cores),
                "-m", "GTRCAT",
                "-n", outname + "_JK5"])
    for prefix in ("RAxML_info.", "RAxML_log.", "RAxML_parsimonyTree.",
                   "RAxML_result.", "RAxML_bipartitionsBranchLabels."):
        _remove_quietly(prefix + outname + "_JK5")

    # remove intermediate files
    os.system("rm " + outdir + "*_ok")
    # Best effort: a missing log must not fail an otherwise finished run.
    _remove_quietly(outdir + "phyutility.log")