Beispiel #1
0
def cluster(args):
    """
    %prog cluster prefix fastqfiles

    Use `vsearch` to remove duplicate reads. This routine is heavily influenced
    by PyRAD: <https://github.com/dereneaton/pyrad>.
    """
    p = OptionParser(cluster.__doc__)
    add_consensus_options(p)
    p.set_align(pctid=95)
    p.set_outdir()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    prefix = args[0]
    fastqfiles = args[1:]
    cpus = opts.cpus
    pctid = opts.pctid
    mindepth = opts.mindepth
    minlength = opts.minlength
    fastafile, qualfile = fasta(fastqfiles + [
        "--seqtk",
        "--outdir={0}".format(opts.outdir),
        "--outfile={0}".format(prefix + ".fasta"),
    ])

    prefix = op.join(opts.outdir, prefix)
    pf = prefix + ".P{0}".format(pctid)
    derepfile = prefix + ".derep"
    if need_update(fastafile, derepfile):
        derep(fastafile, derepfile, minlength, cpus)

    userfile = pf + ".u"
    notmatchedfile = pf + ".notmatched"
    if need_update(derepfile, userfile):
        cluster_smallmem(derepfile, userfile, notmatchedfile, minlength, pctid,
                         cpus)

    clustfile = pf + ".clust"
    if need_update((derepfile, userfile, notmatchedfile), clustfile):
        makeclust(derepfile,
                  userfile,
                  notmatchedfile,
                  clustfile,
                  mindepth=mindepth)

    clustSfile = pf + ".clustS"
    if need_update(clustfile, clustSfile):
        parallel_musclewrap(clustfile, cpus)

    statsfile = pf + ".stats"
    if need_update(clustSfile, statsfile):
        makestats(clustSfile, statsfile, mindepth=mindepth)
Beispiel #2
0
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-est` to remove duplicate sequences.
    """
    p = OptionParser(deduplicate.__doc__)
    p.set_align(pctid=96, pctcov=0)
    p.add_option("--fast", default=False, action="store_true",
                 help="Place sequence in the first cluster")
    p.add_option("--consensus", default=False, action="store_true",
                 help="Compute consensus sequences")
    p.add_option("--reads", default=False, action="store_true",
                 help="Use `cd-hit-454` to deduplicate [default: %default]")
    p.add_option("--samestrand", default=False, action="store_true",
                 help="Enforce same strand alignment")
    p.set_home("cdhit")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    identity = opts.pctid / 100.
    fastafile, qualfile = fasta([fastafile, "--seqtk"])

    ocmd = "cd-hit-454" if opts.reads else "cd-hit-est"
    cmd = op.join(opts.cdhit_home, ocmd)
    cmd += " -c {0}".format(identity)
    if ocmd == "cd-hit-est":
        cmd += " -d 0"  # include complete defline
        if opts.samestrand:
            cmd += " -r 0"
    if not opts.fast:
        cmd += " -g 1"
    if opts.pctcov != 0:
        cmd += " -aL {0} -aS {0}".format(opts.pctcov / 100.)

    dd = fastafile + ".P{0}.cdhit".format(opts.pctid)
    clstr = dd + ".clstr"

    cmd += " -M 0 -T {0} -i {1} -o {2}".format(opts.cpus, fastafile, dd)
    if need_update(fastafile, (dd, clstr)):
        sh(cmd)

    if opts.consensus:
        cons = dd + ".consensus"
        cmd = op.join(opts.cdhit_home, "cdhit-cluster-consensus")
        cmd += " clustfile={0} fastafile={1} output={2} maxlen=1".\
                    format(clstr, fastafile, cons)
        if need_update((clstr, fastafile), cons):
            sh(cmd)

    return dd
Beispiel #3
0
def cluster(args):
    """
    %prog cluster prefix fastqfiles

    Use `vsearch` to remove duplicate reads. This routine is heavily influenced
    by PyRAD: <https://github.com/dereneaton/pyrad>.
    """
    p = OptionParser(cluster.__doc__)
    add_consensus_options(p)
    p.set_align(pctid=95)
    p.set_outdir()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    prefix = args[0]
    fastqfiles = args[1:]
    cpus = opts.cpus
    pctid = opts.pctid
    mindepth = opts.mindepth
    minlength = opts.minlength
    fastafile, qualfile = fasta(fastqfiles + ["--seqtk",
                                "--outdir={0}".format(opts.outdir),
                                "--outfile={0}".format(prefix + ".fasta")])

    prefix = op.join(opts.outdir, prefix)
    pf = prefix + ".P{0}".format(pctid)
    derepfile = prefix + ".derep"
    if need_update(fastafile, derepfile):
        derep(fastafile, derepfile, minlength, cpus)

    userfile = pf + ".u"
    notmatchedfile = pf + ".notmatched"
    if need_update(derepfile, userfile):
        cluster_smallmem(derepfile, userfile, notmatchedfile,
                         minlength, pctid, cpus)

    clustfile = pf + ".clust"
    if need_update((derepfile, userfile, notmatchedfile), clustfile):
        makeclust(derepfile, userfile, notmatchedfile, clustfile,
                  mindepth=mindepth)

    clustSfile = pf + ".clustS"
    if need_update(clustfile, clustSfile):
        parallel_musclewrap(clustfile, cpus)

    statsfile = pf + ".stats"
    if need_update(clustSfile, statsfile):
        makestats(clustSfile, statsfile, mindepth=mindepth)
Beispiel #4
0
def histogram(args):
    """
    %prog histogram [reads.fasta|reads.fastq]

    Plot read length distribution for reads. The plot would be similar to the
    one generated by SMRT-portal, for example:

    http://blog.pacificbiosciences.com/2013/10/data-release-long-read-shotgun.html

    Plot has two axes - corresponding to pdf and cdf, respectively.  Also adding
    number of reads, average/median, N50, and total length.
    """
    from jcvi.utils.cbook import human_size, thousands, SUFFIXES
    from jcvi.formats.fastq import fasta
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.graphics.base import plt, markup, human_formatter, \
                human_base_formatter, savefig, set2, set_ticklabels_helvetica

    p = OptionParser(histogram.__doc__)
    p.set_histogram(vmax=50000, bins=100, xlabel="Read length",
                    title="Read length distribution")
    p.add_option("--ylabel1", default="Counts",
                 help="Label of y-axis on the left")
    p.add_option("--color", default='0', choices=[str(x) for x in range(8)],
                 help="Color of bars, which is an index 0-7 in brewer set2")
    opts, args, iopts = p.set_image_options(args, figsize="6x6", style="dark")

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    fastafile, qualfile = fasta([fastafile, "--seqtk"])
    sizes = Sizes(fastafile)
    all_sizes = sorted(sizes.sizes)
    xmin, xmax, bins = opts.vmin, opts.vmax, opts.bins
    left, height = stem_leaf_plot(all_sizes, xmin, xmax, bins)

    plt.figure(1, (iopts.w, iopts.h))
    ax1 = plt.gca()

    width = (xmax - xmin) * .5 / bins
    color = set2[int(opts.color)]
    ax1.bar(left, height, width=width, linewidth=0, fc=color, align="center")
    ax1.set_xlabel(markup(opts.xlabel))
    ax1.set_ylabel(opts.ylabel1)

    ax2 = ax1.twinx()
    cur_size = 0
    total_size, l50, n50 = sizes.summary
    cdf = {}
    hsize = human_size(total_size)
    tag = hsize[-2:]
    unit = 1000 ** SUFFIXES[1000].index(tag)

    for x in all_sizes:
        if x not in cdf:
            cdf[x] = (total_size - cur_size) * 1. / unit
        cur_size += x
    x, y = zip(*sorted(cdf.items()))
    ax2.plot(x, y, '-', color="darkslategray")
    ylabel2 = "{0} above read length".format(tag)
    ax2.set_ylabel(ylabel2)

    for ax in (ax1, ax2):
        set_ticklabels_helvetica(ax)
        ax.set_xlim((xmin - width / 2, xmax + width / 2))

    tc = "gray"
    axt = ax1.transAxes
    xx, yy = .95, .95
    ma = "Total bases: {0}".format(hsize)
    mb = "Total reads: {0}".format(thousands(len(sizes)))
    mc = "Average read length: {0}bp".format(thousands(np.mean(all_sizes)))
    md = "Median read length: {0}bp".format(thousands(np.median(all_sizes)))
    me = "N50 read length: {0}bp".format(thousands(l50))
    for t in (ma, mb, mc, md, me):
        print >> sys.stderr, t
        ax1.text(xx, yy, t, color=tc, transform=axt, ha="right")
        yy -= .05

    ax1.set_title(markup(opts.title))
    # Seaborn removes ticks for all styles except 'ticks'. Now add them back:
    ax1.tick_params(axis="x", direction="out", length=3,
                    left=False, right=False, top=False, bottom=True)
    ax1.xaxis.set_major_formatter(human_base_formatter)
    ax1.yaxis.set_major_formatter(human_formatter)
    figname = sizes.filename + ".pdf"
    savefig(figname)
Beispiel #5
0
def histogram(args):
    """
    %prog histogram [reads.fasta|reads.fastq]

    Plot read length distribution for reads. The plot would be similar to the
    one generated by SMRT-portal, for example:

    http://blog.pacificbiosciences.com/2013/10/data-release-long-read-shotgun.html

    Plot has two axes - corresponding to pdf and cdf, respectively.  Also adding
    number of reads, average/median, N50, and total length.
    """
    from jcvi.utils.cbook import human_size, thousands, SUFFIXES
    from jcvi.formats.fastq import fasta
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.graphics.base import (
        plt,
        markup,
        human_formatter,
        human_base_formatter,
        savefig,
        set2,
        set_ticklabels_helvetica,
    )

    p = OptionParser(histogram.__doc__)
    p.set_histogram(vmax=50000,
                    bins=100,
                    xlabel="Read length",
                    title="Read length distribution")
    p.add_option("--ylabel1",
                 default="Counts",
                 help="Label of y-axis on the left")
    p.add_option(
        "--color",
        default="0",
        choices=[str(x) for x in range(8)],
        help="Color of bars, which is an index 0-7 in brewer set2",
    )
    opts, args, iopts = p.set_image_options(args, figsize="6x6", style="dark")

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile, ) = args
    fastafile, qualfile = fasta([fastafile, "--seqtk"])
    sizes = Sizes(fastafile)
    all_sizes = sorted(sizes.sizes)
    xmin, xmax, bins = opts.vmin, opts.vmax, opts.bins
    left, height = stem_leaf_plot(all_sizes, xmin, xmax, bins)

    plt.figure(1, (iopts.w, iopts.h))
    ax1 = plt.gca()

    width = (xmax - xmin) * 0.5 / bins
    color = set2[int(opts.color)]
    ax1.bar(left, height, width=width, linewidth=0, fc=color, align="center")
    ax1.set_xlabel(markup(opts.xlabel))
    ax1.set_ylabel(opts.ylabel1)

    ax2 = ax1.twinx()
    cur_size = 0
    total_size, l50, n50 = sizes.summary
    cdf = {}
    hsize = human_size(total_size)
    tag = hsize[-2:]
    unit = 1000**SUFFIXES[1000].index(tag)

    for x in all_sizes:
        if x not in cdf:
            cdf[x] = (total_size - cur_size) * 1.0 / unit
        cur_size += x
    x, y = zip(*sorted(cdf.items()))
    ax2.plot(x, y, "-", color="darkslategray")
    ylabel2 = "{0} above read length".format(tag)
    ax2.set_ylabel(ylabel2)

    for ax in (ax1, ax2):
        set_ticklabels_helvetica(ax)
        ax.set_xlim((xmin - width / 2, xmax + width / 2))

    tc = "gray"
    axt = ax1.transAxes
    xx, yy = 0.95, 0.95
    ma = "Total bases: {0}".format(hsize)
    mb = "Total reads: {0}".format(thousands(len(sizes)))
    mc = "Average read length: {0}bp".format(thousands(np.mean(all_sizes)))
    md = "Median read length: {0}bp".format(thousands(np.median(all_sizes)))
    me = "N50 read length: {0}bp".format(thousands(l50))
    for t in (ma, mb, mc, md, me):
        print(t, file=sys.stderr)
        ax1.text(xx, yy, t, color=tc, transform=axt, ha="right")
        yy -= 0.05

    ax1.set_title(markup(opts.title))
    # Seaborn removes ticks for all styles except 'ticks'. Now add them back:
    ax1.tick_params(
        axis="x",
        direction="out",
        length=3,
        left=False,
        right=False,
        top=False,
        bottom=True,
    )
    ax1.xaxis.set_major_formatter(human_base_formatter)
    ax1.yaxis.set_major_formatter(human_formatter)
    figname = sizes.filename + ".pdf"
    savefig(figname)
Beispiel #6
0
def expand(args):
    """
    %prog expand bes.fasta reads.fastq

    Expand sequences using short reads. Useful, for example for getting BAC-end
    sequences. The template to use, in `bes.fasta` may just contain the junction
    sequences, then align the reads to get the 'flanks' for such sequences.
    """
    import math

    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.fastq import readlen, first, fasta
    from jcvi.formats.blast import Blast
    from jcvi.formats.base import FileShredder
    from jcvi.apps.bowtie import align, get_samfile
    from jcvi.apps.align import blast

    p = OptionParser(expand.__doc__)
    p.set_depth(depth=200)
    p.set_firstN()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bes, reads = args
    size = Fasta(bes).totalsize
    rl = readlen([reads])
    expected_size = size + 2 * rl
    nreads = expected_size * opts.depth / rl
    nreads = int(math.ceil(nreads / 1000.)) * 1000

    # Attract reads
    samfile, logfile = align([bes, reads, "--reorder", "--mapped",
           "--firstN={0}".format(opts.firstN)])

    samfile, mapped, _ = get_samfile(reads, bes, bowtie=True, mapped=True)
    logging.debug("Extract first {0} reads from `{1}`.".format(nreads, mapped))

    pf = mapped.split(".")[0]
    pf = pf.split("-")[0]
    bespf = bes.split(".")[0]
    reads = pf + ".expand.fastq"
    first([str(nreads), mapped, "-o", reads])

    # Perform mini-assembly
    fastafile = reads.rsplit(".", 1)[0] + ".fasta"
    qualfile = ""
    if need_update(reads, fastafile):
        fastafile, qualfile = fasta([reads])

    contigs = op.join(pf, "454LargeContigs.fna")
    if need_update(fastafile, contigs):
        cmd = "runAssembly -o {0} -cpu 8 {1}".format(pf, fastafile)
        sh(cmd)
    assert op.exists(contigs)

    # Annotate contigs
    blastfile = blast([bes, contigs])
    mapping = {}
    for query, b in Blast(blastfile).iter_best_hit():
        mapping[query] = b

    f = Fasta(contigs, lazy=True)
    annotatedfasta = ".".join((pf, bespf, "fasta"))
    fw = open(annotatedfasta, "w")
    keys = list(Fasta(bes).iterkeys_ordered())  # keep an ordered list
    recs = []
    for key, v in f.iteritems_ordered():
        vid = v.id
        if vid not in mapping:
            continue
        b = mapping[vid]
        subject = b.subject
        rec = v.reverse_complement() if b.orientation == '-' else v
        rec.id = rid = "_".join((pf, vid, subject))
        rec.description = ""
        recs.append((keys.index(subject), rid, rec))

    recs = [x[-1] for x in sorted(recs)]
    SeqIO.write(recs, fw, "fasta")
    fw.close()

    FileShredder([samfile, logfile, mapped, reads, fastafile, qualfile, blastfile, pf])
    logging.debug("Annotated seqs (n={0}) written to `{1}`.".\
                    format(len(recs), annotatedfasta))

    return annotatedfasta
Beispiel #7
0
def expand(args):
    """
    %prog expand bes.fasta reads.fastq

    Expand sequences using short reads. Useful, for example for getting BAC-end
    sequences. The template to use, in `bes.fasta` may just contain the junction
    sequences, then align the reads to get the 'flanks' for such sequences.
    """
    import math

    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.fastq import readlen, first, fasta
    from jcvi.formats.blast import Blast
    from jcvi.formats.base import FileShredder
    from jcvi.apps.bowtie import align, get_samfile
    from jcvi.apps.align import blast

    p = OptionParser(expand.__doc__)
    p.set_depth(depth=200)
    p.set_firstN()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bes, reads = args
    size = Fasta(bes).totalsize
    rl = readlen([reads])
    expected_size = size + 2 * rl
    nreads = expected_size * opts.depth / rl
    nreads = int(math.ceil(nreads / 1000.)) * 1000

    # Attract reads
    samfile, logfile = align([bes, reads, "--reorder", "--mapped",
           "--firstN={0}".format(opts.firstN)])

    samfile, mapped, _ = get_samfile(reads, bes, bowtie=True, mapped=True)
    logging.debug("Extract first {0} reads from `{1}`.".format(nreads, mapped))

    pf = mapped.split(".")[0]
    pf = pf.split("-")[0]
    bespf = bes.split(".")[0]
    reads = pf + ".expand.fastq"
    first([str(nreads), mapped, "-o", reads])

    # Perform mini-assembly
    fastafile = reads.rsplit(".", 1)[0] + ".fasta"
    qualfile = ""
    if need_update(reads, fastafile):
        fastafile, qualfile = fasta([reads])

    contigs = op.join(pf, "454LargeContigs.fna")
    if need_update(fastafile, contigs):
        cmd = "runAssembly -o {0} -cpu 8 {1}".format(pf, fastafile)
        sh(cmd)
    assert op.exists(contigs)

    # Annotate contigs
    blastfile = blast([bes, contigs])
    mapping = {}
    for query, b in Blast(blastfile).iter_best_hit():
        mapping[query] = b

    f = Fasta(contigs, lazy=True)
    annotatedfasta = ".".join((pf, bespf, "fasta"))
    fw = open(annotatedfasta, "w")
    keys = list(Fasta(bes).iterkeys_ordered())  # keep an ordered list
    recs = []
    for key, v in f.iteritems_ordered():
        vid = v.id
        if vid not in mapping:
            continue
        b = mapping[vid]
        subject = b.subject
        rec = v.reverse_complement() if b.orientation == '-' else v
        rec.id = rid = "_".join((pf, vid, subject))
        rec.description = ""
        recs.append((keys.index(subject), rid, rec))

    recs = [x[-1] for x in sorted(recs)]
    SeqIO.write(recs, fw, "fasta")
    fw.close()

    FileShredder([samfile, logfile, mapped, reads, fastafile, qualfile, blastfile, pf])
    logging.debug("Annotated seqs (n={0}) written to `{1}`.".\
                    format(len(recs), annotatedfasta))

    return annotatedfasta
Beispiel #8
0
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-est` to remove duplicate sequences.
    """
    p = OptionParser(deduplicate.__doc__)
    p.set_align(pctid=96, pctcov=0)
    p.add_option(
        "--fast",
        default=False,
        action="store_true",
        help="Place sequence in the first cluster",
    )
    p.add_option(
        "--consensus",
        default=False,
        action="store_true",
        help="Compute consensus sequences",
    )
    p.add_option(
        "--reads",
        default=False,
        action="store_true",
        help="Use `cd-hit-454` to deduplicate",
    )
    p.add_option(
        "--samestrand",
        default=False,
        action="store_true",
        help="Enforce same strand alignment",
    )
    p.set_home("cdhit")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile, ) = args
    identity = opts.pctid / 100.0
    fastafile, qualfile = fasta([fastafile, "--seqtk"])

    ocmd = "cd-hit-454" if opts.reads else "cd-hit-est"
    cmd = op.join(opts.cdhit_home, ocmd)
    cmd += " -c {0}".format(identity)
    if ocmd == "cd-hit-est":
        cmd += " -d 0"  # include complete defline
        if opts.samestrand:
            cmd += " -r 0"
    if not opts.fast:
        cmd += " -g 1"
    if opts.pctcov != 0:
        cmd += " -aL {0} -aS {0}".format(opts.pctcov / 100.0)

    dd = fastafile + ".P{0}.cdhit".format(opts.pctid)
    clstr = dd + ".clstr"

    cmd += " -M 0 -T {0} -i {1} -o {2}".format(opts.cpus, fastafile, dd)
    if need_update(fastafile, (dd, clstr)):
        sh(cmd)

    if opts.consensus:
        cons = dd + ".consensus"
        cmd = op.join(opts.cdhit_home, "cdhit-cluster-consensus")
        cmd += " clustfile={0} fastafile={1} output={2} maxlen=1".format(
            clstr, fastafile, cons)
        if need_update((clstr, fastafile), cons):
            sh(cmd)

    return dd