def calc(args):
    """
    %prog calc [prot.fasta] cds.fasta > out.ks

    Protein file is optional. If only one file is given, it is assumed to
    be CDS sequences with correct frame (frame 0). Results will be written to
    stdout. Both protein file and nucleotide file are assumed to be Fasta format,
    with adjacent records as the pairs to compare.

    Author: Haibao Tang <*****@*****.**>, Brad Chapman, Jingping Li
    Calculate synonymous mutation rates for gene pairs

    This does the following:
        1. Fetches a protein pair.
        2. Aligns the protein pair with clustalw (default) or muscle.
        3. Convert the output to Fasta format.
        4. Use this alignment info to align gene sequences using PAL2NAL
        5. Run PAML yn00 to calculate synonymous mutation rates.
    """
    # NOTE: the docstring above is passed to OptionParser and doubles as the
    # command-line usage text, so maintainer notes are kept in comments.
    #
    # args: list of command-line tokens (options plus one or two FASTA paths).
    # Returns None. One comma-separated line per successfully analysed pair
    # is written to the handle opened for opts.outfile.
    import shutil

    from jcvi.formats.fasta import translate

    p = OptionParser(calc.__doc__)
    p.add_option("--longest", action="store_true",
                 help="Get longest ORF, only works if no pep file, "
                      "e.g. ESTs [default: %default]")
    p.add_option("--msa", default="clustalw", choices=("clustalw", "muscle"),
                 help="software used to align the proteins [default: %default]")
    p.add_option("--workdir", default=os.getcwd(), help="Work directory")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        print("Incorrect arguments", file=sys.stderr)
        sys.exit(not p.print_help())

    output_h = must_open(opts.outfile, "w")
    print(fields, file=output_h)
    work_dir = op.join(opts.workdir, "syn_analysis")
    mkdir(work_dir)

    if not protein_file:
        # No protein file supplied: derive one by translating the CDS file.
        protein_file = dna_file + ".pep"
        translate_args = [dna_file, "--outfile=" + protein_file]
        if opts.longest:
            translate_args += ["--longest"]
        dna_file, protein_file = translate(translate_args)

    # The parsers are consumed lazily inside the loop, so the handles must
    # stay open for the whole loop and are closed right after it.
    with open(protein_file) as prot_handle, open(dna_file) as dna_handle:
        prot_iterator = SeqIO.parse(prot_handle, "fasta")
        dna_iterator = SeqIO.parse(dna_handle, "fasta")
        # Passing each iterator twice to zip() pulls adjacent records as pairs.
        for p_rec_1, p_rec_2, n_rec_1, n_rec_2 in \
                zip(prot_iterator, prot_iterator, dna_iterator, dna_iterator):

            print("--------", p_rec_1.name, p_rec_2.name, file=sys.stderr)
            if opts.msa == "clustalw":
                align_fasta = clustal_align_protein((p_rec_1, p_rec_2), work_dir)
            elif opts.msa == "muscle":
                align_fasta = muscle_align_protein((p_rec_1, p_rec_2), work_dir)
            mrtrans_fasta = run_mrtrans(align_fasta, (n_rec_1, n_rec_2), work_dir)
            if mrtrans_fasta:
                ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng = \
                    find_synonymous(mrtrans_fasta, work_dir)
                if ds_subs_yn is not None:
                    pair_name = "%s;%s" % (p_rec_1.name, p_rec_2.name)
                    output_h.write("%s\n" % (",".join(
                        str(x) for x in (pair_name, ds_subs_yn, dn_subs_yn,
                                         ds_subs_ng, dn_subs_ng))))
                    # Flush per pair so partial results survive a later crash.
                    output_h.flush()

    # Clean-up: scratch files left in the current directory ...
    sh("rm -rf 2YN.t 2YN.dN 2YN.dS rst rub rst1")
    # ... and the scratch directory itself, which lives under --workdir and
    # is therefore not necessarily ./syn_analysis.
    shutil.rmtree(work_dir, ignore_errors=True)
def build(args):
    """
    %prog build [prot.fasta] cds.fasta [options] --outdir=outdir

    This function wraps on the following steps:
    1. msa using ClustalW2 or MUSCLE(default)
    2. (optional) alignment editing using Gblocks
    3. build NJ tree using PHYLIP in EMBOSS package
       seq names should be unique by first 10 chars (restriction of PHYLIP)
    4. build ML tree using RAxML(default) or PHYML, use keywords raxml or phyml,
       *WARNING* maybe slow with large dataset

    If an outgroup file is provided, the result tree will be rooted on the
    outgroup according to order in the file, i.e. the name in row1 will be
    tried first. If not found, row2 will be used, etc.
    Tail truncated names can be provided so long as it is unique among the seqs.
    If not uniq, the first occurrence will be used. For example, if you have
    two moss sequences in your input, then the tree will be rooted on the
    first moss sequence encountered by the program, unless they are monophylic,
     in which case the root will be their common ancestor.

    --stree and --smap are required if --treefix is set.

    Trees can be edited again using an editor such as Dendroscope. This
    is the recommended way to get highly customized trees.

    Newick format trees will be deposited into outdir (. by default).
    """
    # NOTE: the docstring above is passed to OptionParser and doubles as the
    # command-line usage text, so maintainer notes are kept in comments.
    #
    # args: list of command-line tokens (options plus one or two FASTA paths).
    # Returns:
    #   - None when pal2nal produced no codon alignment,
    #   - the codon alignment path when no tree building was requested,
    #   - otherwise the path of the last tree that was built successfully
    #     (None if every requested tree build failed).
    # Raises ValueError when the alignment has three or fewer sequences and
    # AssertionError when --treefix is set without --stree/--smap.
    from jcvi.formats.fasta import translate

    p = OptionParser(build.__doc__)
    p.add_option("--longest", action="store_true",
                 help="Get longest ORF, only works if no pep file, "
                      "e.g. ESTs [default: %default]")
    p.add_option("--nogblocks", action="store_true",
                 help="don't use Gblocks to edit alignment [default: %default]")
    p.add_option("--synonymous", action="store_true",
                 help="extract synonymous sites of the alignment [default: %default]")
    p.add_option("--fourfold", action="store_true",
                 help="extract fourfold degenerate sites of the alignment [default: %default]")
    p.add_option("--msa", default="muscle", choices=("clustalw", "muscle"),
                 help="software used to align the proteins [default: %default]")
    p.add_option("--noneighbor", action="store_true",
                 help="don't build NJ tree [default: %default]")
    p.add_option("--ml", default=None, choices=("raxml", "phyml"),
                 help="software used to build ML tree [default: %default]")
    p.add_option("--outgroup",
                 help="path to file containing outgroup orders [default: %default]")
    p.add_option("--SH", help="path to reference Newick tree [default: %default]")
    p.add_option("--shout", default="SH_out.txt",
                 help="SH output file name [default: %default]")
    p.add_option("--treefix", action="store_true",
                 help="use TreeFix to rearrange ML tree [default: %default]")
    p.add_option("--stree", help="path to species Newick tree [default: %default]")
    p.add_option("--smap", help="path to smap file: "
                 "gene_name_pattern<tab>species_name [default: %default]")
    p.set_outdir()

    opts, args = p.parse_args(args)
    gblocks = not opts.nogblocks
    synonymous = opts.synonymous
    fourfold = opts.fourfold
    neighbor = not opts.noneighbor
    outgroup = opts.outgroup
    outdir = opts.outdir

    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        print("Incorrect arguments", file=sys.stderr)
        sys.exit(not p.print_help())

    stree, smap = opts.stree, opts.smap
    if opts.treefix:
        # Explicit raise instead of `assert` so the check survives `python -O`;
        # the exception type is kept for backward compatibility.
        if not (stree and smap):
            raise AssertionError("TreeFix requires stree and smap files.")
        # TreeFix is always run on a RAxML tree here.
        opts.ml = "raxml"

    treedir = op.join(outdir, "tree")
    mkdir(treedir)

    if not protein_file:
        # No protein file supplied: derive one by translating the CDS file.
        protein_file = dna_file + ".pep"
        translate_args = [dna_file, "--outfile=" + protein_file]
        if opts.longest:
            translate_args += ["--longest"]
        dna_file, protein_file = translate(translate_args)

    work_dir = op.join(outdir, "alignment")
    mkdir(work_dir)
    with open(protein_file) as prot_handle:
        p_recs = list(SeqIO.parse(prot_handle, "fasta"))
    if opts.msa == "clustalw":
        align_fasta = clustal_align_protein(p_recs, work_dir)
    elif opts.msa == "muscle":
        align_fasta = muscle_align_protein(p_recs, work_dir)

    with open(dna_file) as dna_handle:
        n_recs = list(SeqIO.parse(dna_handle, "fasta"))
    mrtrans_fasta = run_mrtrans(align_fasta, n_recs, work_dir, outfmt="fasta")

    if not mrtrans_fasta:
        logging.debug("pal2nal aborted. "
                      "Cannot reliably build tree for {0}".format(dna_file))
        return

    codon_aln_fasta = mrtrans_fasta
    if gblocks:
        gb_fasta = run_gblocks(mrtrans_fasta)
        # Fall back to the unedited alignment when Gblocks yields nothing.
        codon_aln_fasta = gb_fasta if gb_fasta else codon_aln_fasta
    else:
        if synonymous:
            codon_aln_fasta = subalignment(mrtrans_fasta, "synonymous")
        if fourfold:
            codon_aln_fasta = subalignment(mrtrans_fasta, "fourfold")

    if not neighbor and not opts.ml:
        return codon_aln_fasta

    alignment = AlignIO.read(codon_aln_fasta, "fasta")
    if len(alignment) <= 3:
        raise ValueError("Too few seqs to build tree.")

    mkdir(op.join(treedir, "work"))

    # Pre-bind the results so a failed build cannot cause a NameError later.
    outfile = phy_file = None
    stem = op.basename(dna_file).rsplit(".", 1)[0]

    if neighbor:
        out_file = op.join(treedir, stem + ".NJ.unrooted.dnd")
        try:
            outfile, phy_file = build_nj_phylip(
                alignment, outfile=out_file, outgroup=outgroup,
                work_dir=treedir)
        except Exception as exc:
            print("NJ tree cannot be built for {0}".format(dna_file))
            logging.error("NJ tree build failed: %s", exc)
        else:
            # The SH test needs the tree and phylip file produced just above.
            if opts.SH:
                SH_raxml(opts.SH, outfile, phy_file, shout=opts.shout)

    if opts.ml:
        out_file = op.join(treedir, stem + ".ML.unrooted.dnd")
        ml_built = False
        try:
            if opts.ml == "phyml":
                outfile, phy_file = build_ml_phyml(
                    alignment, outfile=out_file, work_dir=treedir)
                ml_built = True
            elif opts.ml == "raxml":
                outfile, phy_file = build_ml_raxml(
                    alignment, outfile=out_file, work_dir=treedir)
                ml_built = True
        except Exception as exc:
            print("ML tree cannot be built for {0}".format(dna_file))
            logging.error("ML tree build failed: %s", exc)

        # Every step below consumes the ML tree, so skip them if it is missing.
        if ml_built:
            if outgroup:
                new_out_file = out_file.replace(".unrooted", "")
                t = smart_reroot(treefile=out_file, outgroupfile=outgroup,
                                 outfile=new_out_file)
                if t == new_out_file:
                    # Rooted tree written; drop the unrooted one.
                    sh("rm %s" % out_file)
                    outfile = new_out_file

            if opts.SH:
                SH_raxml(opts.SH, outfile, phy_file, shout=opts.shout)

            if opts.treefix:
                treefix_dir = op.join(treedir, "treefix")
                # Called outside of `assert` so the directory is still
                # (re)created when assertions are stripped by `python -O`.
                if not mkdir(treefix_dir, overwrite=True):
                    raise AssertionError(
                        "Cannot create {0}".format(treefix_dir))

                sh("cp {0} {1}/".format(outfile, treefix_dir))
                tree_copy = op.join(treefix_dir, op.basename(outfile))
                # The alignment is written next to the tree copy with a
                # .fasta extension, matching a_ext passed to run_treefix.
                aln_file = tree_copy.rsplit(".", 1)[0] + ".fasta"
                SeqIO.write(alignment, aln_file, "fasta")

                outfile = run_treefix(input=tree_copy, stree_file=stree,
                                      smap_file=smap, a_ext=".fasta",
                                      o_ext=".dnd", n_ext=".treefix.dnd")

    return outfile
def build(args):
    """
    %prog build [prot.fasta] cds.fasta [options] --outdir=outdir

    This function wraps on the following steps:
    1. msa using ClustalW2 or MUSCLE(default)
    2. (optional) alignment editing using Gblocks
    3. build NJ tree using PHYLIP in EMBOSS package
       seq names should be unique by first 10 chars (restriction of PHYLIP)
    4. build ML tree using RAxML(default) or PHYML, use keywords raxml or phyml,
       *WARNING* maybe slow with large dataset

    If an outgroup file is provided, the result tree will be rooted on the
    outgroup according to order in the file, i.e. the name in row1 will be
    tried first. If not found, row2 will be used, etc.
    Tail truncated names can be provided so long as it is unique among the seqs.
    If not uniq, the first occurrence will be used. For example, if you have
    two moss sequences in your input, then the tree will be rooted on the
    first moss sequence encountered by the program, unless they are monophylic,
     in which case the root will be their common ancestor.

    --stree and --smap are required if --treefix is set.

    Trees can be edited again using an editor such as Dendroscope. This
    is the recommended way to get highly customized trees.

    Newick format trees will be deposited into outdir (. by default).
    """
    # NOTE: the docstring above is passed to OptionParser and doubles as the
    # command-line usage text, so maintainer notes are kept in comments.
    #
    # args: list of command-line tokens (options plus one or two FASTA paths).
    # Returns:
    #   - None when pal2nal produced no codon alignment,
    #   - the codon alignment path when no tree building was requested,
    #   - otherwise the path of the last tree that was built successfully
    #     (None if every requested tree build failed).
    # Raises ValueError when the alignment has three or fewer sequences and
    # AssertionError when --treefix is set without --stree/--smap.
    from jcvi.formats.fasta import translate

    p = OptionParser(build.__doc__)
    p.add_option("--longest", action="store_true",
                 help="Get longest ORF, only works if no pep file, "
                      "e.g. ESTs [default: %default]")
    p.add_option("--nogblocks", action="store_true",
                 help="don't use Gblocks to edit alignment [default: %default]")
    p.add_option("--synonymous", action="store_true",
                 help="extract synonymous sites of the alignment [default: %default]")
    p.add_option("--fourfold", action="store_true",
                 help="extract fourfold degenerate sites of the alignment [default: %default]")
    p.add_option("--msa", default="muscle", choices=("clustalw", "muscle"),
                 help="software used to align the proteins [default: %default]")
    p.add_option("--noneighbor", action="store_true",
                 help="don't build NJ tree [default: %default]")
    p.add_option("--ml", default=None, choices=("raxml", "phyml"),
                 help="software used to build ML tree [default: %default]")
    p.add_option("--outgroup",
                 help="path to file containing outgroup orders [default: %default]")
    p.add_option("--SH", help="path to reference Newick tree [default: %default]")
    p.add_option("--shout", default="SH_out.txt",
                 help="SH output file name [default: %default]")
    p.add_option("--treefix", action="store_true",
                 help="use TreeFix to rearrange ML tree [default: %default]")
    p.add_option("--stree", help="path to species Newick tree [default: %default]")
    p.add_option("--smap", help="path to smap file: "
                 "gene_name_pattern<tab>species_name [default: %default]")
    p.add_option("--outdir", type="string", default=".",
                 help="path to output dir. New dir is made if not existing [default: %default]")

    opts, args = p.parse_args(args)
    gblocks = not opts.nogblocks
    synonymous = opts.synonymous
    fourfold = opts.fourfold
    neighbor = not opts.noneighbor
    outgroup = opts.outgroup
    outdir = opts.outdir

    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        # print() function form, matching the other definitions in this file.
        print("Incorrect arguments", file=sys.stderr)
        sys.exit(not p.print_help())

    stree, smap = opts.stree, opts.smap
    if opts.treefix:
        # Explicit raise instead of `assert` so the check survives `python -O`;
        # the exception type is kept for backward compatibility.
        if not (stree and smap):
            raise AssertionError("TreeFix requires stree and smap files.")
        # TreeFix is always run on a RAxML tree here.
        opts.ml = "raxml"

    treedir = op.join(outdir, "tree")
    mkdir(treedir)

    if not protein_file:
        # No protein file supplied: derive one by translating the CDS file.
        protein_file = dna_file + ".pep"
        translate_args = [dna_file, "--outfile=" + protein_file]
        if opts.longest:
            translate_args += ["--longest"]
        dna_file, protein_file = translate(translate_args)

    work_dir = op.join(outdir, "alignment")
    mkdir(work_dir)
    with open(protein_file) as prot_handle:
        p_recs = list(SeqIO.parse(prot_handle, "fasta"))
    if opts.msa == "clustalw":
        align_fasta = clustal_align_protein(p_recs, work_dir)
    elif opts.msa == "muscle":
        align_fasta = muscle_align_protein(p_recs, work_dir)

    with open(dna_file) as dna_handle:
        n_recs = list(SeqIO.parse(dna_handle, "fasta"))
    mrtrans_fasta = run_mrtrans(align_fasta, n_recs, work_dir, outfmt="fasta")

    if not mrtrans_fasta:
        logging.debug("pal2nal aborted. "
                      "Cannot reliably build tree for {0}".format(dna_file))
        return

    codon_aln_fasta = mrtrans_fasta
    if gblocks:
        gb_fasta = run_gblocks(mrtrans_fasta)
        # Fall back to the unedited alignment when Gblocks yields nothing.
        codon_aln_fasta = gb_fasta if gb_fasta else codon_aln_fasta
    else:
        if synonymous:
            codon_aln_fasta = subalignment(mrtrans_fasta, "synonymous")
        if fourfold:
            codon_aln_fasta = subalignment(mrtrans_fasta, "fourfold")

    if not neighbor and not opts.ml:
        return codon_aln_fasta

    alignment = AlignIO.read(codon_aln_fasta, "fasta")
    if len(alignment) <= 3:
        raise ValueError("Too few seqs to build tree.")

    mkdir(op.join(treedir, "work"))

    # Pre-bind the results so a failed build cannot cause a NameError later.
    outfile = phy_file = None
    stem = op.basename(dna_file).rsplit(".", 1)[0]

    if neighbor:
        out_file = op.join(treedir, stem + ".NJ.unrooted.dnd")
        try:
            outfile, phy_file = build_nj_phylip(
                alignment, outfile=out_file, outgroup=outgroup,
                work_dir=treedir)
        except Exception as exc:
            print("NJ tree cannot be built for {0}".format(dna_file))
            logging.error("NJ tree build failed: %s", exc)
        else:
            # The SH test needs the tree and phylip file produced just above.
            if opts.SH:
                SH_raxml(opts.SH, outfile, phy_file, shout=opts.shout)

    if opts.ml:
        out_file = op.join(treedir, stem + ".ML.unrooted.dnd")
        ml_built = False
        try:
            if opts.ml == "phyml":
                outfile, phy_file = build_ml_phyml(
                    alignment, outfile=out_file, work_dir=treedir)
                ml_built = True
            elif opts.ml == "raxml":
                outfile, phy_file = build_ml_raxml(
                    alignment, outfile=out_file, work_dir=treedir)
                ml_built = True
        except Exception as exc:
            print("ML tree cannot be built for {0}".format(dna_file))
            logging.error("ML tree build failed: %s", exc)

        # Every step below consumes the ML tree, so skip them if it is missing.
        if ml_built:
            if outgroup:
                new_out_file = out_file.replace(".unrooted", "")
                t = smart_reroot(treefile=out_file, outgroupfile=outgroup,
                                 outfile=new_out_file)
                if t == new_out_file:
                    # Rooted tree written; drop the unrooted one.
                    sh("rm %s" % out_file)
                    outfile = new_out_file

            if opts.SH:
                SH_raxml(opts.SH, outfile, phy_file, shout=opts.shout)

            if opts.treefix:
                treefix_dir = op.join(treedir, "treefix")
                # Called outside of `assert` so the directory is still
                # (re)created when assertions are stripped by `python -O`.
                if not mkdir(treefix_dir, overwrite=True):
                    raise AssertionError(
                        "Cannot create {0}".format(treefix_dir))

                sh("cp {0} {1}/".format(outfile, treefix_dir))
                tree_copy = op.join(treefix_dir, op.basename(outfile))
                # The alignment is written next to the tree copy with a
                # .fasta extension, matching a_ext passed to run_treefix.
                aln_file = tree_copy.rsplit(".", 1)[0] + ".fasta"
                SeqIO.write(alignment, aln_file, "fasta")

                outfile = run_treefix(input=tree_copy, stree_file=stree,
                                      smap_file=smap, a_ext=".fasta",
                                      o_ext=".dnd", n_ext=".treefix.dnd")

    return outfile
def calc(args):
    """
    %prog calc [prot.fasta] cds.fasta > out.ks

    Protein file is optional. If only one file is given, it is assumed to
    be CDS sequences with correct frame (frame 0). Results will be written to
    stdout. Both protein file and nucleotide file are assumed to be Fasta format,
    with adjacent records as the pairs to compare.

    Author: Haibao Tang <*****@*****.**>, Brad Chapman, Jingping Li
    Calculate synonymous mutation rates for gene pairs

    This does the following:
        1. Fetches a protein pair.
        2. Aligns the protein pair with clustalw (default) or muscle.
        3. Convert the output to Fasta format.
        4. Use this alignment info to align gene sequences using PAL2NAL
        5. Run PAML yn00 to calculate synonymous mutation rates.
    """
    # NOTE: the docstring above is passed to OptionParser and doubles as the
    # command-line usage text, so maintainer notes are kept in comments.
    #
    # args: list of command-line tokens (options plus one or two FASTA paths).
    # Returns None. One comma-separated line per successfully analysed pair
    # is written to the handle opened for opts.outfile.
    import shutil

    from jcvi.formats.fasta import translate

    p = OptionParser(calc.__doc__)
    p.add_option("--longest", action="store_true",
                 help="Get longest ORF, only works if no pep file, "
                      "e.g. ESTs [default: %default]")
    p.add_option("--msa", default="clustalw", choices=("clustalw", "muscle"),
                 help="software used to align the proteins [default: %default]")
    p.add_option("--workdir", default=os.getcwd(), help="Work directory")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        # print() function form, matching the other definitions in this file.
        print("Incorrect arguments", file=sys.stderr)
        sys.exit(not p.print_help())

    output_h = must_open(opts.outfile, "w")
    print(header, file=output_h)
    work_dir = op.join(opts.workdir, "syn_analysis")
    mkdir(work_dir)

    if not protein_file:
        # No protein file supplied: derive one by translating the CDS file.
        protein_file = dna_file + ".pep"
        translate_args = [dna_file, "--outfile=" + protein_file]
        if opts.longest:
            translate_args += ["--longest"]
        dna_file, protein_file = translate(translate_args)

    # The parsers are consumed lazily inside the loop, so the handles must
    # stay open for the whole loop and are closed right after it.
    with open(protein_file) as prot_handle, open(dna_file) as dna_handle:
        prot_iterator = SeqIO.parse(prot_handle, "fasta")
        dna_iterator = SeqIO.parse(dna_handle, "fasta")
        # Passing each iterator twice to zip() pulls adjacent records as pairs.
        for p_rec_1, p_rec_2, n_rec_1, n_rec_2 in \
                zip(prot_iterator, prot_iterator, dna_iterator, dna_iterator):

            print("--------", p_rec_1.name, p_rec_2.name, file=sys.stderr)
            if opts.msa == "clustalw":
                align_fasta = clustal_align_protein((p_rec_1, p_rec_2), work_dir)
            elif opts.msa == "muscle":
                align_fasta = muscle_align_protein((p_rec_1, p_rec_2), work_dir)
            mrtrans_fasta = run_mrtrans(align_fasta, (n_rec_1, n_rec_2), work_dir)
            if mrtrans_fasta:
                ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng = \
                    find_synonymous(mrtrans_fasta, work_dir)
                if ds_subs_yn is not None:
                    pair_name = "%s;%s" % (p_rec_1.name, p_rec_2.name)
                    output_h.write("%s\n" % (",".join(
                        str(x) for x in (pair_name, ds_subs_yn, dn_subs_yn,
                                         ds_subs_ng, dn_subs_ng))))
                    # Flush per pair so partial results survive a later crash.
                    output_h.flush()

    # Clean-up: scratch files left in the current directory ...
    sh("rm -rf 2YN.t 2YN.dN 2YN.dS rst rub rst1")
    # ... and the scratch directory itself, which lives under --workdir and
    # is therefore not necessarily ./syn_analysis.
    shutil.rmtree(work_dir, ignore_errors=True)