Example #1
0
def calc(args):
    """
    %prog calc [prot.fasta] cds.fasta > out.ks

    Protein file is optional. If only one file is given, it is assumed to
    be CDS sequences with correct frame (frame 0). Results will be written to
    stdout. Both protein file and nucleotide file are assumed to be Fasta format,
    with adjacent records as the pairs to compare.

    Author: Haibao Tang <*****@*****.**>, Brad Chapman, Jingping Li
    Calculate synonymous mutation rates for gene pairs

    This does the following:
        1. Fetches a protein pair.
        2. Aligns the protein pair with clustalw (default) or muscle.
        3. Convert the output to Fasta format.
        4. Use this alignment info to align gene sequences using PAL2NAL
        5. Run PAML yn00 to calculate synonymous mutation rates.
    """
    # NOTE: the docstring above doubles as the command-line usage text (it is
    # handed to OptionParser below), so it is kept verbatim.
    #
    # args: list of command-line tokens (options plus one or two FASTA paths).
    # Returns None; one comma-separated row per gene pair is written to the
    # handle selected by the outfile option. Exits via sys.exit() when the
    # number of positional arguments is not 1 or 2.
    from jcvi.formats.fasta import translate

    p = OptionParser(calc.__doc__)
    p.add_option("--longest", action="store_true",
                 help="Get longest ORF, only works if no pep file, "
                      "e.g. ESTs [default: %default]")
    p.add_option("--msa", default="clustalw", choices=("clustalw", "muscle"),
                 help="software used to align the proteins [default: %default]")
    p.add_option("--workdir", default=os.getcwd(), help="Work directory")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        print("Incorrect arguments", file=sys.stderr)
        sys.exit(not p.print_help())

    output_h = must_open(opts.outfile, "w")
    print(fields, file=output_h)
    work_dir = op.join(opts.workdir, "syn_analysis")
    mkdir(work_dir)

    if not protein_file:
        # No peptide file supplied: derive one by translating the CDS file.
        protein_file = dna_file + ".pep"
        translate_args = [dna_file, "--outfile=" + protein_file]
        if opts.longest:
            translate_args.append("--longest")
        dna_file, protein_file = translate(translate_args)

    # The aligner does not change between pairs, so pick it once. The option
    # parser restricts --msa to exactly these two choices.
    if opts.msa == "clustalw":
        align_protein = clustal_align_protein
    else:
        align_protein = muscle_align_protein

    # Open both FASTA files in a context manager so the handles are closed
    # even if alignment or PAML fails part-way through the input.
    with open(protein_file) as prot_h, open(dna_file) as dna_h:
        prot_iterator = SeqIO.parse(prot_h, "fasta")
        dna_iterator = SeqIO.parse(dna_h, "fasta")
        # Passing each iterator twice to zip() consumes adjacent records as
        # one pair: (prot1, prot2, cds1, cds2), (prot3, prot4, cds3, cds4), ...
        record_pairs = zip(prot_iterator, prot_iterator,
                           dna_iterator, dna_iterator)
        for p_rec_1, p_rec_2, n_rec_1, n_rec_2 in record_pairs:
            print("--------", p_rec_1.name, p_rec_2.name, file=sys.stderr)

            align_fasta = align_protein((p_rec_1, p_rec_2), work_dir)
            mrtrans_fasta = run_mrtrans(align_fasta, (n_rec_1, n_rec_2),
                                        work_dir)
            if not mrtrans_fasta:
                # Codon alignment could not be produced for this pair.
                continue

            ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng = \
                find_synonymous(mrtrans_fasta, work_dir)
            if ds_subs_yn is None:
                continue

            pair_name = "%s;%s" % (p_rec_1.name, p_rec_2.name)
            row = (pair_name, ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng)
            output_h.write("%s\n" % ",".join(str(x) for x in row))
            # Flush per row so partial results survive a later failure.
            output_h.flush()

    # Clean-up
    # NOTE(review): these paths are relative to the current directory, so
    # `syn_analysis` is only the work dir when --workdir is left at its
    # default -- confirm whether a custom --workdir should be removed instead.
    sh("rm -rf 2YN.t 2YN.dN 2YN.dS rst rub rst1 syn_analysis")
Example #2
0
def build(args):
    """
    %prog build [prot.fasta] cds.fasta [options] --outdir=outdir

    This function wraps on the following steps:
    1. msa using ClustalW2 or MUSCLE(default)
    2. (optional) alignment editing using Gblocks
    3. build NJ tree using PHYLIP in EMBOSS package
       seq names should be unique by first 10 chars (restriction of PHYLIP)
    4. build ML tree using RAxML(default) or PHYML, use keywords raxml or phyml,
       *WARNING* maybe slow with large dataset

    If an outgroup file is provided, the result tree will be rooted on the
    outgroup according to order in the file, i.e. the name in row1 will be
    tried first. If not found, row2 will be used, etc.
    Tail truncated names can be provided so long as it is unique among the seqs.
    If not uniq, the first occurrence will be used. For example, if you have
    two moss sequences in your input, then the tree will be rooted on the
    first moss sequence encountered by the program, unless they are monophylic,
     in which case the root will be their common ancestor.

    --stree and --smap are required if --treefix is set.

    Trees can be edited again using an editor such as Dendroscope. This
    is the recommended way to get highly customized trees.

    Newick format trees will be deposited into outdir (. by default).
    """
    # NOTE: the docstring above doubles as the command-line usage text (it is
    # handed to OptionParser below), so it is kept verbatim.
    #
    # args: list of command-line tokens (options plus one or two FASTA paths).
    # Returns:
    #   * None when the codon alignment (pal2nal step) could not be produced;
    #   * the codon alignment path when neither an NJ nor an ML tree is
    #     requested;
    #   * otherwise the path of the last tree produced, or None when every
    #     requested tree build failed.
    # Raises AssertionError when --treefix lacks --stree/--smap or the TreeFix
    # directory cannot be prepared, and ValueError for alignments with three
    # or fewer sequences.
    from jcvi.formats.fasta import translate

    p = OptionParser(build.__doc__)
    p.add_option("--longest", action="store_true",
                 help="Get longest ORF, only works if no pep file, "
                      "e.g. ESTs [default: %default]")
    p.add_option("--nogblocks", action="store_true",
                 help="don't use Gblocks to edit alignment [default: %default]")
    p.add_option("--synonymous", action="store_true",
                 help="extract synonymous sites of the alignment [default: %default]")
    p.add_option("--fourfold", action="store_true",
                 help="extract fourfold degenerate sites of the alignment [default: %default]")
    p.add_option("--msa", default="muscle", choices=("clustalw", "muscle"),
                 help="software used to align the proteins [default: %default]")
    p.add_option("--noneighbor", action="store_true",
                 help="don't build NJ tree [default: %default]")
    p.add_option("--ml", default=None, choices=("raxml", "phyml"),
                 help="software used to build ML tree [default: %default]")
    p.add_option("--outgroup",
                 help="path to file containing outgroup orders [default: %default]")
    p.add_option("--SH", help="path to reference Newick tree [default: %default]")
    p.add_option("--shout", default="SH_out.txt",
                 help="SH output file name [default: %default]")
    p.add_option("--treefix", action="store_true",
                 help="use TreeFix to rearrange ML tree [default: %default]")
    p.add_option("--stree", help="path to species Newick tree [default: %default]")
    p.add_option("--smap", help="path to smap file: "
                    "gene_name_pattern<tab>species_name [default: %default]")
    p.set_outdir()

    opts, args = p.parse_args(args)
    gblocks = not opts.nogblocks
    synonymous = opts.synonymous
    fourfold = opts.fourfold
    neighbor = not opts.noneighbor
    outgroup = opts.outgroup
    outdir = opts.outdir

    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        print("Incorrect arguments", file=sys.stderr)
        sys.exit(not p.print_help())

    if opts.treefix:
        stree = opts.stree
        smap = opts.smap
        # Raised explicitly instead of via `assert` so the check still runs
        # under `python -O`; the exception type is unchanged for callers.
        if not (stree and smap):
            raise AssertionError("TreeFix requires stree and smap files.")
        # TreeFix is run on the ML tree, which is forced to RAxML here.
        opts.ml = "raxml"

    treedir = op.join(outdir, "tree")
    mkdir(treedir)

    if not protein_file:
        # No peptide file supplied: derive one by translating the CDS file.
        protein_file = dna_file + ".pep"
        translate_args = [dna_file, "--outfile=" + protein_file]
        if opts.longest:
            translate_args.append("--longest")
        dna_file, protein_file = translate(translate_args)

    work_dir = op.join(outdir, "alignment")
    mkdir(work_dir)
    # Read the records inside `with` blocks so the file handles are closed.
    with open(protein_file) as prot_h:
        p_recs = list(SeqIO.parse(prot_h, "fasta"))
    if opts.msa == "clustalw":
        align_fasta = clustal_align_protein(p_recs, work_dir)
    elif opts.msa == "muscle":
        align_fasta = muscle_align_protein(p_recs, work_dir)

    with open(dna_file) as dna_h:
        n_recs = list(SeqIO.parse(dna_h, "fasta"))
    mrtrans_fasta = run_mrtrans(align_fasta, n_recs, work_dir, outfmt="fasta")

    if not mrtrans_fasta:
        logging.debug("pal2nal aborted. "
            "Cannot reliably build tree for {0}".format(dna_file))
        return

    codon_aln_fasta = mrtrans_fasta
    if gblocks:
        gb_fasta = run_gblocks(mrtrans_fasta)
        # Fall back to the unedited alignment when Gblocks yields nothing.
        codon_aln_fasta = gb_fasta if gb_fasta else codon_aln_fasta

    else:
        # Site extraction is only applied when Gblocks editing is disabled;
        # --fourfold takes precedence when both flags are given.
        if synonymous:
            codon_aln_fasta = subalignment(mrtrans_fasta, "synonymous")

        if fourfold:
            codon_aln_fasta = subalignment(mrtrans_fasta, "fourfold")

    if not neighbor and not opts.ml:
        return codon_aln_fasta

    alignment = AlignIO.read(codon_aln_fasta, "fasta")
    if len(alignment) <= 3:
        raise ValueError("Too few seqs to build tree.")

    mkdir(op.join(treedir, "work"))
    tree_prefix = op.basename(dna_file).rsplit(".", 1)[0]

    # Pre-bind the results so that a failed tree build leads to a clean
    # `None` instead of a NameError further down.
    outfile = phy_file = None

    if neighbor:
        out_file = op.join(treedir, tree_prefix + ".NJ.unrooted.dnd")
        try:
            outfile, phy_file = build_nj_phylip(alignment,
                outfile=out_file, outgroup=outgroup, work_dir=treedir)
        except Exception:
            # Best effort: report the failure and carry on to the ML tree.
            print("NJ tree cannot be built for {0}".format(dna_file))
            logging.debug("NJ tree building failed for {0}".format(dna_file),
                          exc_info=True)

        # The SH test needs the tree that was just built; skip it on failure.
        if opts.SH and outfile:
            SH_raxml(opts.SH, outfile, phy_file, shout=opts.shout)

    if opts.ml:
        out_file = op.join(treedir, tree_prefix + ".ML.unrooted.dnd")

        # --ml is restricted to "raxml"/"phyml" by the option parser.
        if opts.ml == "phyml":
            ml_builder = build_ml_phyml
        else:
            ml_builder = build_ml_raxml

        try:
            outfile, phy_file = ml_builder(alignment, outfile=out_file,
                                           work_dir=treedir)
        except Exception:
            print("ML tree cannot be built for {0}".format(dna_file))
            logging.debug("ML tree building failed for {0}".format(dna_file),
                          exc_info=True)
            # Rerooting, the SH test and TreeFix all need the ML tree, so
            # stop here and hand back the NJ tree (or None) built so far.
            return outfile

        if outgroup:
            new_out_file = out_file.replace(".unrooted", "")
            t = smart_reroot(treefile=out_file, outgroupfile=outgroup,
                outfile=new_out_file)
            if t == new_out_file:
                # The rooted tree was written, so drop the unrooted one.
                sh("rm %s" % out_file)
                outfile = new_out_file

        if opts.SH:
            SH_raxml(opts.SH, outfile, phy_file, shout=opts.shout)

        if opts.treefix:
            treefix_dir = op.join(treedir, "treefix")
            # Call mkdir outside of `assert`: with `python -O` the assert
            # (and the directory creation inside it) would be skipped.
            if not mkdir(treefix_dir, overwrite=True):
                raise AssertionError(
                    "Cannot create directory {0}".format(treefix_dir))

            sh("cp {0} {1}/".format(outfile, treefix_dir))
            treefix_input = op.join(treefix_dir, op.basename(outfile))
            aln_file = treefix_input.rsplit(".", 1)[0] + ".fasta"
            SeqIO.write(alignment, aln_file, "fasta")

            outfile = run_treefix(input=treefix_input, stree_file=stree,
                        smap_file=smap, a_ext=".fasta", o_ext=".dnd",
                        n_ext=".treefix.dnd")

    return outfile
Example #3
0
def build(args):
    """
    %prog build [prot.fasta] cds.fasta [options] --outdir=outdir

    This function wraps on the following steps:
    1. msa using ClustalW2 or MUSCLE(default)
    2. (optional) alignment editing using Gblocks
    3. build NJ tree using PHYLIP in EMBOSS package
       seq names should be unique by first 10 chars (restriction of PHYLIP)
    4. build ML tree using RAxML(default) or PHYML, use keywords raxml or phyml,
       *WARNING* maybe slow with large dataset

    If an outgroup file is provided, the result tree will be rooted on the
    outgroup according to order in the file, i.e. the name in row1 will be
    tried first. If not found, row2 will be used, etc.
    Tail truncated names can be provided so long as it is unique among the seqs.
    If not uniq, the first occurrence will be used. For example, if you have
    two moss sequences in your input, then the tree will be rooted on the
    first moss sequence encountered by the program, unless they are monophylic,
     in which case the root will be their common ancestor.

    --stree and --smap are required if --treefix is set.

    Trees can be edited again using an editor such as Dendroscope. This
    is the recommended way to get highly customized trees.

    Newick format trees will be deposited into outdir (. by default).
    """
    # NOTE: the docstring above doubles as the command-line usage text (it is
    # handed to OptionParser below), so it is kept verbatim.
    #
    # args: list of command-line tokens (options plus one or two FASTA paths).
    # Returns:
    #   * None when the codon alignment (pal2nal step) could not be produced;
    #   * the codon alignment path when neither an NJ nor an ML tree is
    #     requested;
    #   * otherwise the path of the last tree produced, or None when every
    #     requested tree build failed.
    # Raises AssertionError when --treefix lacks --stree/--smap or the TreeFix
    # directory cannot be prepared, and ValueError for alignments with three
    # or fewer sequences.
    #
    # Messages are emitted with calls that parse under both Python 2 and 3.
    from jcvi.formats.fasta import translate

    p = OptionParser(build.__doc__)
    p.add_option("--longest", action="store_true",
                 help="Get longest ORF, only works if no pep file, "
                      "e.g. ESTs [default: %default]")
    p.add_option("--nogblocks", action="store_true",
                 help="don't use Gblocks to edit alignment [default: %default]")
    p.add_option("--synonymous", action="store_true",
                 help="extract synonymous sites of the alignment [default: %default]")
    p.add_option("--fourfold", action="store_true",
                 help="extract fourfold degenerate sites of the alignment [default: %default]")
    p.add_option("--msa", default="muscle", choices=("clustalw", "muscle"),
                 help="software used to align the proteins [default: %default]")
    p.add_option("--noneighbor", action="store_true",
                 help="don't build NJ tree [default: %default]")
    p.add_option("--ml", default=None, choices=("raxml", "phyml"),
                 help="software used to build ML tree [default: %default]")
    p.add_option("--outgroup",
                 help="path to file containing outgroup orders [default: %default]")
    p.add_option("--SH",
                 help="path to reference Newick tree [default: %default]")
    p.add_option("--shout", default="SH_out.txt",
                 help="SH output file name [default: %default]")
    p.add_option("--treefix", action="store_true",
                 help="use TreeFix to rearrange ML tree [default: %default]")
    p.add_option("--stree",
                 help="path to species Newick tree [default: %default]")
    p.add_option("--smap", help="path to smap file: "
                    "gene_name_pattern<tab>species_name [default: %default]")
    p.add_option("--outdir", type="string", default=".",
                 help="path to output dir. New dir is made if not existing [default: %default]")

    opts, args = p.parse_args(args)
    gblocks = not opts.nogblocks
    synonymous = opts.synonymous
    fourfold = opts.fourfold
    neighbor = not opts.noneighbor
    outgroup = opts.outgroup
    outdir = opts.outdir

    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        sys.stderr.write("Incorrect arguments\n")
        sys.exit(not p.print_help())

    if opts.treefix:
        stree = opts.stree
        smap = opts.smap
        # Raised explicitly instead of via `assert` so the check still runs
        # under `python -O`; the exception type is unchanged for callers.
        if not (stree and smap):
            raise AssertionError("TreeFix requires stree and smap files.")
        # TreeFix is run on the ML tree, which is forced to RAxML here.
        opts.ml = "raxml"

    treedir = op.join(outdir, "tree")
    mkdir(treedir)

    if not protein_file:
        # No peptide file supplied: derive one by translating the CDS file.
        protein_file = dna_file + ".pep"
        translate_args = [dna_file, "--outfile=" + protein_file]
        if opts.longest:
            translate_args.append("--longest")
        dna_file, protein_file = translate(translate_args)

    work_dir = op.join(outdir, "alignment")
    mkdir(work_dir)
    # Read the records inside `with` blocks so the file handles are closed.
    with open(protein_file) as prot_h:
        p_recs = list(SeqIO.parse(prot_h, "fasta"))
    if opts.msa == "clustalw":
        align_fasta = clustal_align_protein(p_recs, work_dir)
    elif opts.msa == "muscle":
        align_fasta = muscle_align_protein(p_recs, work_dir)

    with open(dna_file) as dna_h:
        n_recs = list(SeqIO.parse(dna_h, "fasta"))
    mrtrans_fasta = run_mrtrans(align_fasta, n_recs, work_dir, outfmt="fasta")

    if not mrtrans_fasta:
        logging.debug("pal2nal aborted. "
            "Cannot reliably build tree for {0}".format(dna_file))
        return

    codon_aln_fasta = mrtrans_fasta
    if gblocks:
        gb_fasta = run_gblocks(mrtrans_fasta)
        # Fall back to the unedited alignment when Gblocks yields nothing.
        codon_aln_fasta = gb_fasta if gb_fasta else codon_aln_fasta

    else:
        # Site extraction is only applied when Gblocks editing is disabled;
        # --fourfold takes precedence when both flags are given.
        if synonymous:
            codon_aln_fasta = subalignment(mrtrans_fasta, "synonymous")

        if fourfold:
            codon_aln_fasta = subalignment(mrtrans_fasta, "fourfold")

    if not neighbor and not opts.ml:
        return codon_aln_fasta

    alignment = AlignIO.read(codon_aln_fasta, "fasta")
    if len(alignment) <= 3:
        raise ValueError("Too few seqs to build tree.")

    mkdir(op.join(treedir, "work"))
    tree_prefix = op.basename(dna_file).rsplit(".", 1)[0]

    # Pre-bind the results so that a failed tree build leads to a clean
    # `None` instead of a NameError further down.
    outfile = phy_file = None

    if neighbor:
        out_file = op.join(treedir, tree_prefix + ".NJ.unrooted.dnd")
        try:
            outfile, phy_file = build_nj_phylip(alignment,
                outfile=out_file, outgroup=outgroup, work_dir=treedir)
        except Exception:
            # Best effort: report the failure and carry on to the ML tree.
            print("NJ tree cannot be built for {0}".format(dna_file))
            logging.debug("NJ tree building failed for {0}".format(dna_file),
                          exc_info=True)

        # The SH test needs the tree that was just built; skip it on failure.
        if opts.SH and outfile:
            SH_raxml(opts.SH, outfile, phy_file, shout=opts.shout)

    if opts.ml:
        out_file = op.join(treedir, tree_prefix + ".ML.unrooted.dnd")

        # --ml is restricted to "raxml"/"phyml" by the option parser.
        if opts.ml == "phyml":
            ml_builder = build_ml_phyml
        else:
            ml_builder = build_ml_raxml

        try:
            outfile, phy_file = ml_builder(alignment, outfile=out_file,
                                           work_dir=treedir)
        except Exception:
            print("ML tree cannot be built for {0}".format(dna_file))
            logging.debug("ML tree building failed for {0}".format(dna_file),
                          exc_info=True)
            # Rerooting, the SH test and TreeFix all need the ML tree, so
            # stop here and hand back the NJ tree (or None) built so far.
            return outfile

        if outgroup:
            new_out_file = out_file.replace(".unrooted", "")
            t = smart_reroot(treefile=out_file, outgroupfile=outgroup,
                outfile=new_out_file)
            if t == new_out_file:
                # The rooted tree was written, so drop the unrooted one.
                sh("rm %s" % out_file)
                outfile = new_out_file

        if opts.SH:
            SH_raxml(opts.SH, outfile, phy_file, shout=opts.shout)

        if opts.treefix:
            treefix_dir = op.join(treedir, "treefix")
            # Call mkdir outside of `assert`: with `python -O` the assert
            # (and the directory creation inside it) would be skipped.
            if not mkdir(treefix_dir, overwrite=True):
                raise AssertionError(
                    "Cannot create directory {0}".format(treefix_dir))

            sh("cp {0} {1}/".format(outfile, treefix_dir))
            treefix_input = op.join(treefix_dir, op.basename(outfile))
            aln_file = treefix_input.rsplit(".", 1)[0] + ".fasta"
            SeqIO.write(alignment, aln_file, "fasta")

            outfile = run_treefix(input=treefix_input, stree_file=stree,
                        smap_file=smap, a_ext=".fasta", o_ext=".dnd",
                        n_ext=".treefix.dnd")

    return outfile
Example #4
0
File: ks.py Project: ascendo/jcvi
def calc(args):
    """
    %prog calc [prot.fasta] cds.fasta > out.ks

    Protein file is optional. If only one file is given, it is assumed to
    be CDS sequences with correct frame (frame 0). Results will be written to
    stdout. Both protein file and nucleotide file are assumed to be Fasta format,
    with adjacent records as the pairs to compare.

    Author: Haibao Tang <*****@*****.**>, Brad Chapman, Jingping Li
    Calculate synonymous mutation rates for gene pairs

    This does the following:
        1. Fetches a protein pair.
        2. Aligns the protein pair with clustalw (default) or muscle.
        3. Convert the output to Fasta format.
        4. Use this alignment info to align gene sequences using PAL2NAL
        5. Run PAML yn00 to calculate synonymous mutation rates.
    """
    # NOTE: the docstring above doubles as the command-line usage text (it is
    # handed to OptionParser below), so it is kept verbatim.
    #
    # args: list of command-line tokens (options plus one or two FASTA paths).
    # Returns None; one comma-separated row per gene pair is written to the
    # handle selected by the outfile option. Exits via sys.exit() when the
    # number of positional arguments is not 1 or 2.
    #
    # Messages are emitted with .write() calls so the function parses under
    # both Python 2 and Python 3.
    from jcvi.formats.fasta import translate

    p = OptionParser(calc.__doc__)
    p.add_option("--longest", action="store_true",
                 help="Get longest ORF, only works if no pep file, "
                      "e.g. ESTs [default: %default]")
    p.add_option("--msa", default="clustalw", choices=("clustalw", "muscle"),
                 help="software used to align the proteins [default: %default]")
    p.add_option("--workdir", default=os.getcwd(), help="Work directory")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        sys.stderr.write("Incorrect arguments\n")
        sys.exit(not p.print_help())

    output_h = must_open(opts.outfile, "w")
    output_h.write("{0}\n".format(header))
    work_dir = op.join(opts.workdir, "syn_analysis")
    mkdir(work_dir)

    if not protein_file:
        # No peptide file supplied: derive one by translating the CDS file.
        protein_file = dna_file + ".pep"
        translate_args = [dna_file, "--outfile=" + protein_file]
        if opts.longest:
            translate_args.append("--longest")
        dna_file, protein_file = translate(translate_args)

    # The aligner does not change between pairs, so pick it once. The option
    # parser restricts --msa to exactly these two choices.
    if opts.msa == "clustalw":
        align_protein = clustal_align_protein
    else:
        align_protein = muscle_align_protein

    # Open both FASTA files in a context manager so the handles are closed
    # even if alignment or PAML fails part-way through the input.
    with open(protein_file) as prot_h, open(dna_file) as dna_h:
        prot_iterator = SeqIO.parse(prot_h, "fasta")
        dna_iterator = SeqIO.parse(dna_h, "fasta")
        # Passing each iterator twice to zip() consumes adjacent records as
        # one pair: (prot1, prot2, cds1, cds2), (prot3, prot4, cds3, cds4), ...
        record_pairs = zip(prot_iterator, prot_iterator,
                           dna_iterator, dna_iterator)
        for p_rec_1, p_rec_2, n_rec_1, n_rec_2 in record_pairs:
            sys.stderr.write("-------- {0} {1}\n".format(p_rec_1.name,
                                                         p_rec_2.name))

            align_fasta = align_protein((p_rec_1, p_rec_2), work_dir)
            mrtrans_fasta = run_mrtrans(align_fasta, (n_rec_1, n_rec_2),
                                        work_dir)
            if not mrtrans_fasta:
                # Codon alignment could not be produced for this pair.
                continue

            ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng = \
                find_synonymous(mrtrans_fasta, work_dir)
            if ds_subs_yn is None:
                continue

            pair_name = "%s;%s" % (p_rec_1.name, p_rec_2.name)
            row = (pair_name, ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng)
            output_h.write("%s\n" % ",".join(str(x) for x in row))
            # Flush per row so partial results survive a later failure.
            output_h.flush()

    # Clean-up
    # NOTE(review): these paths are relative to the current directory, so
    # `syn_analysis` is only the work dir when --workdir is left at its
    # default -- confirm whether a custom --workdir should be removed instead.
    sh("rm -rf 2YN.t 2YN.dN 2YN.dS rst rub rst1 syn_analysis")