Example #1
0
def get_front_score(node):
    """Score the labels that tree_utils.get_front_labels returns for node.

    Every label is converted to a name with tree_utils.get_name.

    Returns the number of distinct names when each label yields a different
    name, and -1 when at least two labels share a name.
    """
    labels = tree_utils.get_front_labels(node)
    distinct_names = set(tree_utils.get_name(label) for label in labels)
    if len(distinct_names) != len(labels):
        return -1  # some name occurs more than once
    return len(distinct_names)
Example #2
0
def get_back_score(node,root):
    """Score the labels that tree_utils.get_back_labels returns for node.

    Every label is converted to a name with tree_utils.get_name.

    Returns the number of distinct names when each label yields a different
    name, and -1 when at least two labels share a name.
    """
    labels = tree_utils.get_back_labels(node, root)
    distinct_names = set(tree_utils.get_name(label) for label in labels)
    if len(distinct_names) != len(labels):
        return -1  # some name occurs more than once
    return len(distinct_names)
def mask_monophyletic_tips(curroot,ignore=None):
    """Prune sister tips that carry the same name field as a tip.

    Two tips are compared on the second "_"-separated field of
    get_name(label). Whenever a tip has a sister that is also a tip with the
    same field, the sister is pruned. Passes over the tree are repeated
    until a pass prunes nothing, the tree has fewer than 4 leaves, or
    curroot is None.

    Parameters:
        curroot: root node of the tree to mask.
        ignore: not used by this function; kept so existing callers that
            pass it keep working. Defaults to None rather than a shared
            mutable list.

    Returns:
        The (possibly replaced) root node.
    """
    going = True
    while going and curroot is not None and len(curroot.leaves()) >= 4:
        going = False
        for node in curroot.iternodes():  # walk through nodes
            if not node.istip:
                continue  # only look at tips
            name = get_name(node.label).split("_")[1]
            for sister in node.get_sisters():
                if sister.istip and name == get_name(sister.label).split("_")[1]:
                    # mask: drop the sister and keep whatever prune() hands
                    # back (presumably the node left behind -- confirm
                    # against the tree class)
                    node = sister.prune()
                    if len(curroot.leaves()) >= 4:
                        # a root with 2 children or an internal node with a
                        # single child is collapsed by remove_kink
                        if (node == curroot and node.nchildren == 2) or (node != curroot and node.nchildren == 1):
                            node, curroot = remove_kink(node, curroot)
                    going = True
                    break
    return curroot
def ortho_to_aln(alndir, tredir, outdir, ortho_tree_file_ending=".tre"):
    """Write one alignment file for every ortholog tree in tredir.

    For each file in tredir whose name ends with ortho_tree_file_ending:

    * sequences are read from alndir + <prefix> + ".fa.mafft.aln", where
      <prefix> is the tree file name up to its first ".";
    * the tree is parsed from the first line of the tree file;
    * one FASTA record per label from tree_utils.get_front_labels is
      written to outdir, into a file named like the tree file with the
      ending replaced by ".aln". Headers are shortened with
      tree_utils.get_name.

    Raises AssertionError when no file in tredir has the expected ending.
    """
    if alndir[-1] != "/":
        alndir = alndir + "/"
    if tredir[-1] != "/":
        tredir = tredir + "/"
    if outdir[-1] != "/":
        outdir = outdir + "/"

    filecount = 0
    for tree_name in os.listdir(tredir):
        if not tree_name.endswith(ortho_tree_file_ending):
            continue
        filecount += 1
        print(tree_name)

        # load the source alignment: sequence ID -> sequence
        aln_path = alndir + tree_name.split(".")[0] + ".fa.mafft.aln"
        seq_by_id = dict(
            (record.name, record.seq) for record in read_fasta_file(aln_path))

        # only the first line of the tree file is parsed
        with open(tredir + tree_name, "r") as tree_handle:
            tree = newick3.parse(tree_handle.readline())
        tip_labels = tree_utils.get_front_labels(tree)

        # write the tips of this tree as an alignment
        aln_name = tree_name.replace(ortho_tree_file_ending, ".aln")
        with open(outdir + aln_name, "w") as aln_handle:
            for label in tip_labels:
                header = ">" + tree_utils.get_name(label) + "\n"
                aln_handle.write(header + seq_by_id[label] + "\n")

    assert filecount > 0,\
     "No file ends with "+ortho_tree_file_ending+" was found in "+tredir
def mask_paraphyletic_tips(curroot,ignore=None):
    """Prune tips that sit next to a tip's parent and share its name field.

    Two tips are compared on the second "_"-separated field of
    get_name(label). For every tip whose parent is neither None nor the
    root, each sister of the parent that is itself a tip with the same field
    is pruned. Passes over the tree are repeated until a pass prunes
    nothing, the tree has fewer than 4 leaves, or curroot is None.

    Parameters:
        curroot: root node of the tree to mask.
        ignore: not used by this function; kept so existing callers that
            pass it keep working. Defaults to None rather than a shared
            mutable list.

    Returns:
        The (possibly replaced) root node.
    """
    going = True
    while going and curroot is not None and len(curroot.leaves()) >= 4:
        going = False
        for node in curroot.iternodes():  # walk through nodes
            if not node.istip:
                continue  # only look at tips
            name = get_name(node.label).split("_")[1]
            parent = node.parent
            if node == curroot or parent == curroot or parent is None:
                continue  # no paraphyletic tips for the root
            for para in parent.get_sisters():
                if para.istip and name == get_name(para.label).split("_")[1]:
                    # mask: drop the matching tip and keep whatever prune()
                    # hands back (presumably the node left behind -- confirm
                    # against the tree class)
                    node = para.prune()
                    if len(curroot.leaves()) >= 4:
                        # a root with 2 children or an internal node with a
                        # single child is collapsed by remove_kink
                        if (node == curroot and node.nchildren == 2) or (node != curroot and node.nchildren == 1):
                            node, curroot = remove_kink(node, curroot)
                    going = True
                    break
    return curroot
def mask_monophyletic_tips(curroot, unamb_chrDICT):
    """Keep one tip out of each pair of same-named sister tips.

    Two tips match when get_name() of their labels is equal. Of a matching
    pair, the tip with the larger unamb_chrDICT[label] value is kept; when
    the values are equal or the sister's is larger, the tip being visited is
    the one pruned. Passes over the tree are repeated until a pass prunes
    nothing or the tree has fewer than 4 leaves.

    Parameters:
        curroot: root node of the tree to mask.
        unamb_chrDICT: mapping from tip label to a comparable value
            (presumably a count of unambiguous characters -- confirm with
            the caller).

    Returns:
        The (possibly replaced) root node.
    """
    pruned_something = True
    while pruned_something and len(curroot.leaves()) >= 4:
        pruned_something = False
        for node in curroot.iternodes():
            # only tips can be masked
            if not node.istip:
                continue
            for sib in node.get_sisters():
                same_taxon = sib.istip and get_name(node.label) == get_name(
                    sib.label)
                if not same_taxon:
                    continue
                # drop whichever of the two tips has the smaller value
                if unamb_chrDICT[node.label] > unamb_chrDICT[sib.label]:
                    loser = sib
                else:
                    loser = node
                node = loser.prune()
                if len(curroot.leaves()) >= 4:
                    root_kink = node == curroot and node.nchildren == 2
                    inner_kink = node != curroot and node.nchildren == 1
                    if root_kink or inner_kink:
                        node, curroot = remove_kink(node, curroot)
                pruned_something = True
                break
    return curroot
def mask_paraphyletic_tips(curroot, unamb_chrDICT):
    """Keep one tip out of each pair of same-named paraphyletic tips.

    For every tip that is not the root and whose parent is not the root,
    each sister of the parent that is a tip with the same get_name() is a
    match. Of a matching pair, the tip with the larger unamb_chrDICT[label]
    value is kept; when the values are equal or the other tip's is larger,
    the tip being visited is the one pruned. Passes over the tree are
    repeated until a pass prunes nothing or the tree has fewer than 4
    leaves.

    Parameters:
        curroot: root node of the tree to mask.
        unamb_chrDICT: mapping from tip label to a comparable value
            (presumably a count of unambiguous characters -- confirm with
            the caller).

    Returns:
        The (possibly replaced) root node.
    """
    pruned_something = True
    while pruned_something and len(curroot.leaves()) >= 4:
        pruned_something = False
        for node in curroot.iternodes():
            # only tips can be masked
            if not node.istip:
                continue
            parent = node.parent
            # the root has no paraphyletic tips
            if node == curroot or parent == curroot:
                continue
            for aunt in parent.get_sisters():
                same_taxon = aunt.istip and get_name(node.label) == get_name(
                    aunt.label)
                if not same_taxon:
                    continue
                # drop whichever of the two tips has the smaller value
                if unamb_chrDICT[node.label] > unamb_chrDICT[aunt.label]:
                    loser = aunt
                else:
                    loser = node
                node = loser.prune()
                if len(curroot.leaves()) >= 4:
                    root_kink = node == curroot and node.nchildren == 2
                    inner_kink = node != curroot and node.nchildren == 1
                    if root_kink or inner_kink:
                        node, curroot = remove_kink(node, curroot)
                pruned_something = True
                break
    return curroot
def mcl_to_fasta(all_fasta,mcl_outfile,minimal_taxa,outdir):
    """Split a FASTA file into one file per sufficiently large mcl cluster.

    Each line of mcl_outfile with at least 3 characters is read as one
    cluster of tab-separated sequence IDs. A cluster is kept when its IDs
    map to at least minimal_taxa distinct names under get_name(); kept
    clusters are numbered 1, 2, ... in file order. Every sequence of
    all_fasta that belongs to a kept cluster is appended to
    outdir + "cluster" + <number> + ".fa".

    Parameters:
        all_fasta: path of the FASTA file holding all sequences.
        mcl_outfile: path of the mcl output file.
        minimal_taxa: minimal number of distinct names per cluster;
            converted with int().
        outdir: prefix for the output paths. It is used as given, so it
            must already end with a path separator to denote a directory.

    Output files are opened in append mode, so files left from an earlier
    run are extended rather than replaced.
    """
    #print "Reading mcl output file"
    clusterDICT = {}  # key is seqID, value is clusterID
    minimal_taxa = int(minimal_taxa)
    count = 0
    with open(mcl_outfile, "r") as infile:
        for line in infile:
            if len(line) < 3:
                continue  # ignore empty lines
            spls = line.strip().split('\t')
            if len(set(get_name(i) for i in spls)) >= minimal_taxa:
                count += 1
                for seqID in spls:
                    clusterDICT[seqID] = str(count)

    #print "Reading the fasta file",all_fasta
    for s in read_fasta_file(all_fasta):
        clusterID = clusterDICT.get(s.name)
        if clusterID is None:
            continue  # Seqs that did not go in a cluster with enough taxa
        # Only the lookup above may be skipped silently; a failure to open
        # or write the cluster file is a real error and must surface.
        with open(outdir + "cluster" + clusterID + ".fa", "a") as outfile:
            outfile.write(">" + s.name + "\n" + s.seq + "\n")
def concatenate(clnDIR,numofsitesFilter,numoftaxaFilter,outname):
	"""filter cleaned alignments and concatenate"""
	if clnDIR[-1] != "/": clnDIR += "/"
	sites_filter = int(numofsitesFilter)
	taxa_filter = int(numoftaxaFilter)
	
	print "Filtering ortholog matrixes"
	selected = [] # list of alignment file names that pass the filters
	for i in os.listdir(clnDIR):
		if i.endswith(".aln-cln"):
			seqlist = read_fasta_file(clnDIR+i)
			num_seq = len(seqlist)
			num_col = len(seqlist[0].seq)
			if num_seq >= taxa_filter and num_col >= sites_filter:
				selected.append(i)
	print len(selected),"matrices passed the filter"

	print "Getting matrix occupancy stats"
	taxon_occupancy = {}
	#key is taxon name, value is [times present in a matrix,total length for this taxon]
	total_aligned_len = 0 #record how long the final concatenated matrix is
	
	cmd = "pxcat"+" -o "+outname+".fa"+" -p "+outname+".model"+" -s "
	for i in selected:
		cmd += clnDIR+i+" "
		seqlist = read_fasta_file(clnDIR+i)
		total_aligned_len += len(seqlist[0].seq)
		for s in seqlist:
			taxonid = get_name(s.name)
			if taxonid not in taxon_occupancy:
				taxon_occupancy[taxonid] = [0,0]
			taxon_occupancy[taxonid][0] += 1
			taxon_occupancy[taxonid][1] += len((s.seq.replace("-","")).replace("?",""))
	cmd += "\n"
	
	total_ortho = len(selected)
	with open(outname+"_taxon_occupancy_stats","w") as outfile:
		outfile.write("taxon\t#orthologs\t#total_charactors\tperc_orthologs\tperc_charactors\n")
		sum_char = 0
		for taxon in taxon_occupancy:
			times,chars = taxon_occupancy[taxon][0],taxon_occupancy[taxon][1]
			sum_char += chars
			out = taxon+"\t"+str(times)+"\t"+str(chars)+"\t"
			out += str(times/float(total_ortho))+"\t"+str(chars/float(total_aligned_len))+"\n"
			outfile.write(out)
		total_taxa = len(taxon_occupancy)
		out = "\nSupermatrix dimension "+str(total_taxa)+" taxa, "
		out += str(total_ortho)+" loci and "+str(total_aligned_len)+" aligned columns\n"
		out += "Overall matrix occupancy "+str(sum_char/float(total_taxa*total_aligned_len))+"\n"
		outfile.write(out)

	print "Supermatrix taxon occupancy stats written to",outname+"_taxon_occupancy_stats"
	print "Waiting for concatenation to finish. This may take several minutes..."
	with open(outname+".temp.sh","w") as f: f.write(cmd)
	os.system("bash "+outname+".temp.sh")	
	
	#writes phy file
	cmd_pxs2phy = ["pxs2phy","-o",outname+".phy","-s",outname+".fa"]
	print (" ".join(cmd_pxs2phy))
	os.system(" ".join(cmd_pxs2phy))
	
	#writes nex file
	cmd_pxs2nex = ["pxs2nex","-o",outname+".nex","-s",outname+".fa"]
	print (" ".join(cmd_pxs2nex))
	os.system(" ".join(cmd_pxs2nex))

	
	assert os.path.exists(outname+".phy") and os.path.exists(outname+".nex") and os.path.exists(outname+".fa"),  "error concatenate"
	os.system("rm "+outname+".temp.sh")
	print "outfiles written",outname+".phy",outname+".model"
Example #10
0
def concatenate(clnDIR, numofsitesFilter, numoftaxaFilter, seqtype, outname):
    """filter cleaned alignments and concatenate"""
    if clnDIR[-1] != "/": clnDIR += "/"
    sites_filter = int(numofsitesFilter)
    taxa_filter = int(numoftaxaFilter)
    assert seqtype == "aa" or seqtype == "dna", "Input data type: dna or aa"
    model = "AUTO" if seqtype == "aa" else "DNA"

    print "Filtering ortholog matrixes"
    selected = []  # list of alignment file names that pass the filters
    for i in os.listdir(clnDIR):
        if i.endswith(".aln-cln"):
            seqlist = read_fasta_file(clnDIR + i)
            num_seq = len(seqlist)
            num_col = len(seqlist[0].seq)
            if num_seq >= taxa_filter and num_col >= sites_filter:
                selected.append(i)
    print len(selected), "matrices passed the filter"

    print "Getting matrix occupancy stats"
    taxon_occupancy = {}
    #key is taxon name, value is [times present in a matrix,total length for this taxon]
    total_aligned_len = 0  #record how long the final concatenated matrix is
    if seqtype == "aa":
        cmd = "phyutility -concat -aa -out " + outname + ".nex -in "
    else:
        cmd = "phyutility -concat -out " + outname + ".nex -in "
    for i in selected:
        cmd += clnDIR + i + " "
        seqlist = read_fasta_file(clnDIR + i)
        total_aligned_len += len(seqlist[0].seq)
        for s in seqlist:
            taxonid = get_name(s.name)
            if taxonid not in taxon_occupancy:
                taxon_occupancy[taxonid] = [0, 0]
            taxon_occupancy[taxonid][0] += 1
            taxon_occupancy[taxonid][1] += len(
                (s.seq.replace("-", "")).replace("?", ""))
    cmd += "\n"

    total_ortho = len(selected)
    with open(outname + "_taxon_occupancy_stats", "w") as outfile:
        outfile.write(
            "taxon\t#orthologs\t#total_charactors\tperc_orthologs\tperc_charactors\n"
        )
        sum_char = 0
        for taxon in taxon_occupancy:
            times, chars = taxon_occupancy[taxon][0], taxon_occupancy[taxon][1]
            sum_char += chars
            out = taxon + "\t" + str(times) + "\t" + str(chars) + "\t"
            out += str(times / float(total_ortho)) + "\t" + str(
                chars / float(total_aligned_len)) + "\n"
            outfile.write(out)
        total_taxa = len(taxon_occupancy)
        out = "\nSupermatrix dimension " + str(total_taxa) + " taxa, "
        out += str(total_ortho) + " loci and " + str(
            total_aligned_len) + " aligned columns\n"
        out += "Overall matrix occupancy " + str(
            sum_char / float(total_taxa * total_aligned_len)) + "\n"
        outfile.write(out)

    print "Supermatrix taxon occupancy stats written to", outname + "_taxon_occupancy_stats"
    print "Waiting for concatenation to finish. This may take several minutes..."
    with open(outname + ".temp.sh", "w") as f:
        f.write(cmd)
    os.system(cmd)

    #convert the .nex file to .fasta and .model files for raxml
    infile = open(outname + ".nex", "r")
    outfile = open(outname + ".phy", "w")
    for line in infile:
        line = line.strip()
        if len(line) < 10: continue
        if line[0] == "#" or line[:
                                  5] == "BEGIN" or line[:
                                                        6] == "MATRIX" or line == "END;" or line[:
                                                                                                 6] == "FORMAT":
            continue
        if line[0] == "[":
            line = line.replace("[", model + ",")
            line = line.replace(" ]", "")
            line = line.replace(" cluster", "\n" + model + ",cluster")
            #line = line.replace(" homolog","\n"+model+",homolog")
            line = line.replace(" ", "=")
            with open(outname + ".model", "w") as outfile2:
                outfile2.write(line.strip() + "\n")
                #make sure that wc -l will get how many partitions
        elif line[:10] == "DIMENSIONS":
            ntax = (line.split("NTAX=")[1]).split(" ")[0]
            nchar = (line.split("NCHAR=")[1]).replace(";", "")
            outfile.write(ntax + " " + nchar + "\n")
        else:
            spls = line.split("\t")
            outfile.write(spls[0] + " " + spls[1] + "\n")
    infile.close()
    outfile.close()
    assert os.path.exists(outname + ".phy"), "error concatenate"
    os.system("rm " + outname + ".temp.sh")
    print "outfiles written", outname + ".phy", outname + ".model"
    os.system("rm " + outname + ".nex")  #remove intermediate .nex file
Example #11
0
def main():
    args = get_args()
    print args
    indir, outdir, num_cores = args.indir, args.outdir, args.threads
    relative_cut, absolute_cut, branch_len_cutoff = args.reltip, args.abstip, args.deep
    assert args.ortho == "RT" or args.ortho == "121", "--ortho has to be 121 or RT"
    assert os.path.exists(
        args.inout), "cannot find the file specified by --inout"
    assert args.test == "y" or args.test == "n", "test has to be either y or n"
    test = True if args.test == "y" else False
    if outdir[-1] != "/": outdir += "/"
    if indir[-1] != "/": indir += "/"

    logfile = outdir + "homology_inference.log"
    check_dependencies()

    # get initial fasta
    fasta_files = gether_fasta_files(path=indir,
                                     file_ending=".cds.fa",
                                     logfile=logfile)
    print len(fasta_files), "data sets read"
    taxa = set([tree_utils.get_name(i.split(".")[0]) for i in fasta_files])
    print len(taxa), "taxa found:"
    print taxa
    min_taxa = get_min_taxa(len(taxa), args.max_mis_taxa)
    print "Minimal number of taxa: ", min_taxa
    outdir1 = reduce_redundancy(indir, fasta_files, num_cores, outdir, logfile)
    outdir2, outdir3 = clustering(outdir1, fasta_files, num_cores, outdir,
                                  min_taxa, logfile)

    # tree inference and clearning round 1.
    if not os.path.exists(outdir + "3_clusters_ok"):
        refine(curdir=outdir3,nextdir=outdir+"4_refine/",\
          prefix="3_clusters",\
          num_cores=num_cores,\
          relative_cut=relative_cut,\
          absolute_cut=absolute_cut,\
          branch_len_cutoff=branch_len_cutoff,\
          min_taxa=min_taxa,\
          mask_para="n",\
          logfile=logfile,\
          test=test)
        os.system("touch " + outdir + "3_clusters_ok")

    # round 2
    if not os.path.exists(outdir + "4_refine_fasta_ok"):
        write_fasta_files_from_trees.main(fasta=outdir2+"all.fa.cut",\
         treDIR=outdir+"4_refine/",\
         tree_file_ending=".subtree",\
         outDIR=outdir+"4_refine/")
        os.system("touch " + outdir + "4_refine_fasta_ok")
    homodir = outdir + "5_homolog/"
    if not os.path.exists(outdir + "4_refine_ok"):
        refine(curdir=outdir+"4_refine/",nextdir=homodir,\
          prefix="5_homolog",\
          num_cores=num_cores,\
          relative_cut=relative_cut,\
          absolute_cut=absolute_cut,\
          branch_len_cutoff=branch_len_cutoff,\
          min_taxa=min_taxa,\
          mask_para="y",\
          logfile=logfile)
        os.system("touch " + outdir + "4_refine_ok")

    # bootstrap the homologs
    if not os.path.exists(outdir + "5_homolog_fasta_ok"):
        # tree files that ends with ".subtree"
        write_fasta_files_from_trees.main(fasta=outdir2+"all.fa.cut",\
         treDIR=homodir,\
         tree_file_ending=".subtree",\
         outDIR=homodir)
        """
		# tree files that ends with ".mm"
		write_fasta_files_from_trees.main(fasta=outdir2+"all.fa.cut",\
			treDIR=homodir,\
			tree_file_ending=".mm",\
			outDIR=homodir)"""
        os.system("touch " + outdir + "5_homolog_fasta_ok")
    if not os.path.exists(outdir + "5_homolog_bootstrap_ok"):
        for i in os.listdir(homodir):
            if i.endswith(".fa"):
                fasta_to_tree.fasta_to_bs_tree(DIR=homodir,\
                  fasta=i,\
                  num_cores=num_cores,\
                  seqtype="dna")
        os.system("touch " + outdir + "5_homolog_bootstrap_ok")
    print "homologs with bootstrap values (.tre) written to " + homodir

    # get orthologs
    orthodir = outdir + "6_ortho/"
    try:
        os.stat(orthodir)
    except:
        os.mkdir(orthodir)
    if not os.path.exists(outdir + "ortho_tre_ok"):
        if args.ortho == "121":
            get_121(indir=homodir,\
             tree_file_ending=".tre",\
             min_taxa=min_taxa,\
             outdir=orthodir,\
             min_bootstrap=80.0)
            os.system("touch " + outdir + "ortho_tre_ok")
            with open(logfile, "a") as f:
                f.write(
                    "Filter one-to-one orthologs by minimal bootstrap of 80\n")
        else:
            ingroups = 0
            with open(args.inout) as infile:
                for line in infile:
                    if line.startswith("IN\t"): ingroups += 1
            RT(homoDIR=homodir,\
             tree_file_eneding =".tre",\
             outDIR=orthodir,\
             MIN_INGROUP_TAXA=get_min_taxa(ingroups,args.max_mis_taxa),\
             taxon_code_file=args.inout)
            os.system("touch " + outdir + "ortho_tre_ok")
            with open(logfile, "a") as f:
                f.write("RT orthologs\n")
    if not os.path.exists(outdir + "ortho_aln_ok"):
        ortho_to_aln(alndir=homodir,\
         tredir=orthodir,\
         outdir=orthodir,\
         ortho_tree_file_ending=".tre")
        os.system("touch " + outdir + "ortho_aln_ok")
        with open(logfile, "a") as f:
            f.write("Write ortholog alignments from homologs\n")
    min_col_occup = 0.5 if min_taxa < 10 else 0.3
    if not os.path.exists(outdir + "ortho_cln_ok"):
        phyutility_wrapper.main(DIR=orthodir,\
         min_col_occup=min_col_occup,\
         seqtype="dna")
        os.system("touch " + outdir + "ortho_cln_ok")
        with open(logfile, "a") as f:
            f.write("Trim ortholog alignments by " + str(min_col_occup) + "\n")

    # tree inference and jackknife
    # concatenate
    if args.ortho == "121":
        outname = "filter300-" + str(min_taxa)
        if not os.path.exists(outdir + outname + ".phy"):
            concatenate(clnDIR=orthodir,\
             numofsitesFilter=300,\
             numoftaxaFilter=min_taxa,\
             seqtype="dna",\
             outname=outname)
    else:
        min_ingroup_taxa = get_min_taxa(ingroups, args.max_mis_taxa)
        outname = "filter300-" + str(min_ingroup_taxa)
        if not os.path.exists(outdir + outname + ".phy"):
            concatenate(clnDIR=orthodir,\
             numofsitesFilter=300,\
             numoftaxaFilter=min_ingroup_taxa,\
             seqtype="dna",\
             outname=outname)
    # tree
    if not os.path.exists("RAxML_bestTree." + outname):
        cmd = ["raxml",\
            "-T",str(num_cores),\
            "-p","1234",\
            "-m","GTRCAT",\
            "-q",outname+".model",\
            "-s",outname+".phy",\
            "-n",outname]
        print " ".join(cmd)
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        out = p.communicate()
        assert p.returncode == 0, "Error raxml" + out[0]
        try:
            os.remove("RAxML_info." + outname)
            os.remove("RAxML_log." + outname)
            os.remove("RAxML_parsimonyTree." + outname)
            os.remove("RAxML_result." + outname)
        except:
            pass  # no need to worry about extra intermediate files

    # jackknife
    if not os.path.exists("JK5_trees"):
        jk(indir="./",\
           num_core=num_cores,\
           resample_num=5,\
           seqtype="dna",\
           replicates=200)

    # map jackknife support to species tree
    cmd = ["raxml", "-f","b",\
        "-t", "RAxML_bestTree."+outname,\
        "-z", "JK5_trees",
        "-T", str(num_cores),\
        "-m", "GTRCAT",\
        "-n", outname+"_JK5"]
    print " ".join(cmd)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    out = p.communicate()
    assert p.returncode == 0, "Error raxml" + out[0]
    try:
        os.remove("RAxML_info." + outname + "_JK5")
        os.remove("RAxML_log." + outname + "_JK5")
        os.remove("RAxML_parsimonyTree." + outname + "_JK5")
        os.remove("RAxML_result." + outname + "_JK5")
        os.remove("RAxML_bipartitionsBranchLabels." + outname + "_JK5")
    except:
        pass  # no need to worry about extra intermediate files

    # remove intermediate files
    os.system("rm " + outdir + "*_ok")
    os.remove(outdir + "phyutility.log")