Example #1
def wgsim(args):
    """
    %prog wgsim fastafile

    Run dwgsim on fastafile.
    """
    p = OptionParser(wgsim.__doc__)
    p.add_option("--erate", default=.02, type="float",
                 help="Base error rate of the read [default: %default]")
    p.add_option("--distance", default=500, type="int",
                 help="Outer distance between the two ends [default: %default]")
    p.add_option("--genomesize", type="int",
                 help="Genome size in Mb [default: estimate from data]")
    p.add_option("--readlen", default=100, type="int",
                 help="Length of the read [default: %default]")
    p.add_option("--noerrors", default=False, action="store_true",
                 help="Simulate reads with no errors [default: %default]")
    p.set_depth(depth=10)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    pf = fastafile.split(".")[0]

    genomesize = opts.genomesize
    size = genomesize * 1000000 if genomesize else Fasta(fastafile).totalsize
    depth = opts.depth
    readlen = opts.readlen
    readnum = size * depth / (2 * readlen)

    distance = opts.distance
    stdev = distance / 5

    outpf = "{0}.{1}bp.{2}x".format(pf, distance, depth)
    distance -= 2 * readlen  # Outer distance => Inner distance
    assert distance >= 0, "Outer distance must be >= 2 * readlen"

    logging.debug("Total genome size: {0} bp".format(size))
    logging.debug("Target depth: {0}x".format(depth))
    logging.debug("Number of read pairs (2x{0}): {1}".format(readlen, readnum))

    if opts.noerrors:
        opts.erate = 0

    cmd = "dwgsim -e {0} -E {0}".format(opts.erate)
    if opts.noerrors:
        cmd += " -r 0 -R 0 -X 0 -y 0"

    cmd += " -d {0} -s {1}".format(distance, stdev)
    cmd += " -N {0} -1 {1} -2 {1}".format(readnum, readlen)
    cmd += " {0} {1}".format(fastafile, outpf)
    sh(cmd)
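
The read-pair count above follows directly from the coverage identity: each pair contributes 2 * readlen bases, so readnum = size * depth / (2 * readlen). A quick sanity check of that arithmetic, with illustrative numbers not taken from the source:

size = 10000000                          # hypothetical 10 Mb genome
depth = 10                               # default from p.set_depth(depth=10)
readlen = 100                            # default --readlen
readnum = size * depth // (2 * readlen)
print(readnum)                           # 500000 pairs * 200 bp = 10x coverage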
Example #2
File: ca.py Project: arvin580/jcvi
def shred(args):
    """
    %prog shred fastafile

    Similar to the `shredContig` method in the runCA script. The contigs are
    shredded into pseudo-reads of a given length and depth.
    """
    p = OptionParser(shred.__doc__)
    p.set_depth(depth=2)
    p.add_option("--readlen", default=1000, type="int", help="Desired length of the reads [default: %default]")
    p.add_option("--minctglen", default=0, type="int", help="Ignore contig sequence less than [default: %default]")
    p.add_option("--shift", default=50, type="int", help="Overlap between reads must be at least [default: %default]")
    p.add_option(
        "--fasta",
        default=False,
        action="store_true",
        help="Output shredded reads as FASTA sequences [default: %default]",
    )
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    libID = fastafile.split(".")[0]
    depth = opts.depth
    readlen = opts.readlen
    shift = opts.shift

    outfile = libID + ".depth{0}".format(depth)
    if opts.fasta:
        outfile += ".fasta"
    else:
        outfile += ".frg"
    f = Fasta(fastafile, lazy=True)

    fw = must_open(outfile, "w", checkexists=True)
    if not opts.fasta:
        print >> fw, headerTemplate.format(libID=libID)

    """
    Taken from runCA:

                    |*********|
                    |###################|
    |--------------------------------------------------|
     ---------------1---------------
               ---------------2---------------
                         ---------------3---------------
    *** - center_increments
    ### - center_range_width
    """
    for ctgID, (name, rec) in enumerate(f.iteritems_ordered()):
        seq = rec.seq
        seqlen = len(seq)
        if seqlen < opts.minctglen:
            continue

        shredlen = min(seqlen - shift, readlen)
        numreads = max(seqlen * depth / shredlen, 1)
        center_range_width = seqlen - shredlen

        ranges = []
        if depth == 1:
            if seqlen < readlen:
                ranges.append((0, seqlen))
            else:
                for begin in xrange(0, seqlen, readlen - shift):
                    end = min(seqlen, begin + readlen)
                    ranges.append((begin, end))
        else:
            if numreads == 1:
                ranges.append((0, shredlen))
            else:
                prev_begin = -1
                center_increments = center_range_width * 1.0 / (numreads - 1)
                for i in xrange(numreads):
                    begin = center_increments * i
                    end = begin + shredlen
                    begin, end = int(begin), int(end)

                    if begin == prev_begin:
                        continue

                    ranges.append((begin, end))
                    prev_begin = begin

        for shredID, (begin, end) in enumerate(ranges):
            shredded_seq = seq[begin:end]
            fragID = "{0}.{1}.frag{2}.{3}-{4}".format(libID, ctgID, shredID, begin, end)
            emitFragment(fw, fragID, libID, shredded_seq, fasta=opts.fasta)

    fw.close()
    logging.debug("Shredded reads are written to `{0}`.".format(outfile))
    return outfile
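
The heart of shred is the range computation in the depth > 1 branch: read starts are spaced evenly across center_range_width = seqlen - shredlen, so every base is covered roughly depth times. A minimal standalone sketch of that logic (shred_ranges is a hypothetical helper that mirrors the loop above):

def shred_ranges(seqlen, readlen=1000, depth=2, shift=50):
    shredlen = min(seqlen - shift, readlen)
    numreads = max(seqlen * depth // shredlen, 1)
    if numreads == 1:
        return [(0, shredlen)]
    # Evenly spaced starts across the center range, skipping repeats
    step = (seqlen - shredlen) * 1.0 / (numreads - 1)
    ranges, prev_begin = [], -1
    for i in range(numreads):
        begin = int(step * i)
        if begin == prev_begin:
            continue
        ranges.append((begin, begin + shredlen))
        prev_begin = begin
    return ranges

print(shred_ranges(3000))
# [(0, 1000), (400, 1400), (800, 1800), (1200, 2200), (1600, 2600), (2000, 3000)]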
Example #3
File: tgbs.py Project: fw1121/jcvi
def novo(args):
    """
    %prog novo reads.fastq

    Reference-free tGBS pipeline.
    """
    from jcvi.assembly.kmer import jellyfish, histogram
    from jcvi.assembly.preprocess import diginorm
    from jcvi.formats.fasta import filter as fasta_filter, format
    from jcvi.apps.cdhit import filter as cdhit_filter

    p = OptionParser(novo.__doc__)
    p.add_option("--technology", choices=("illumina", "454", "iontorrent"),
                 default="iontorrent", help="Sequencing platform")
    p.add_option("--dedup", choices=("uclust", "cdhit"),
                 default="cdhit", help="Dedup algorithm")
    p.set_depth(depth=50)
    p.set_align(pctid=96)
    p.set_home("cdhit", default="/usr/local/bin/")
    p.set_home("fiona", default="/usr/local/bin/")
    p.set_home("jellyfish", default="/usr/local/bin/")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    cpus = opts.cpus
    depth = opts.depth
    pf, sf = fastqfile.rsplit(".", 1)

    diginormfile = pf + ".diginorm." + sf
    if need_update(fastqfile, diginormfile):
        diginorm([fastqfile, "--single", "--depth={0}".format(depth)])
        keepabund = fastqfile + ".keep.abundfilt"
        sh("cp -s {0} {1}".format(keepabund, diginormfile))

    jf = pf + "-K23.histogram"
    if need_update(diginormfile, jf):
        jellyfish([diginormfile, "--prefix={0}".format(pf),
                    "--cpus={0}".format(cpus),
                    "--jellyfish_home={0}".format(opts.jellyfish_home)])

    genomesize = histogram([jf, pf, "23"])
    fiona = pf + ".fiona.fa"
    if need_update(diginormfile, fiona):
        cmd = op.join(opts.fiona_home, "fiona")
        cmd += " -g {0} -nt {1} --sequencing-technology {2}".\
                    format(genomesize, cpus, opts.technology)
        cmd += " -vv {0} {1}".format(diginormfile, fiona)
        logfile = pf + ".fiona.log"
        sh(cmd, outfile=logfile, errfile=logfile)

    dedup = opts.dedup
    pctid = opts.pctid
    cons = fiona + ".P{0}.{1}.consensus.fasta".format(pctid, dedup)
    if need_update(fiona, cons):
        if dedup == "cdhit":
            deduplicate([fiona, "--consensus", "--reads",
                         "--pctid={0}".format(pctid),
                         "--cdhit_home={0}".format(opts.cdhit_home)])
        else:
            uclust([fiona, "--pctid={0}".format(pctid)])

    filteredfile = pf + ".filtered.fasta"
    if need_update(cons, filteredfile):
        covfile = pf + ".cov.fasta"
        cdhit_filter([cons, "--outfile={0}".format(covfile),
                      "--minsize={0}".format(depth / 5)])
        fasta_filter([covfile, "50", "--outfile={0}".format(filteredfile)])

    finalfile = pf + ".final.fasta"
    if need_update(filteredfile, finalfile):
        format([filteredfile, finalfile, "--sequential=replace",
                    "--prefix={0}_".format(pf)])
Example #4
def diginorm(args):
    """
    %prog diginorm fastqfile

    Run K-mer based normalization. Based on tutorial:
    <http://ged.msu.edu/angus/diginorm-2012/tutorial.html>

    Assumes the input is either an interleaved pairs file or two separate files.

    To set up khmer:
    $ git clone git://github.com/ged-lab/screed.git
    $ git clone git://github.com/ged-lab/khmer.git
    $ cd screed
    $ python setup.py install
    $ cd ../khmer
    $ make test
    $ export PYTHONPATH=~/export/khmer
    """
    from jcvi.formats.fastq import shuffle, pairinplace, split
    from jcvi.apps.base import getfilesize

    p = OptionParser(diginorm.__doc__)
    p.add_option("--single", default=False, action="store_true",
                 help="Single end reads")
    p.add_option("--tablesize", help="Memory size")
    p.add_option("--npass", default="1", choices=("1", "2"),
                 help="How many passes of normalization")
    p.set_depth(depth=50)
    p.set_home("khmer", default="/usr/local/bin/")
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    if len(args) == 2:
        fastq = shuffle(args + ["--tag"])
    else:
        fastq, = args

    kh = opts.khmer_home
    depth = opts.depth
    PE = not opts.single
    sys.path.insert(0, op.join(kh, "python"))

    pf = fastq.rsplit(".", 1)[0]
    keepfile = fastq + ".keep"
    hashfile = pf + ".kh"
    mints = 10000000
    ts = opts.tablesize or ((getfilesize(fastq) / 16 / mints + 1) * mints)

    norm_cmd = op.join(kh, "normalize-by-median.py")
    filt_cmd = op.join(kh, "filter-abund.py")
    if need_update(fastq, (hashfile, keepfile)):
        cmd = norm_cmd
        cmd += " -C {0} -k 20 -N 4 -x {1}".format(depth, ts)
        if PE:
            cmd += " -p"
        cmd += " -s {0} {1}".format(hashfile, fastq)
        sh(cmd)

    abundfiltfile = keepfile + ".abundfilt"
    if need_update((hashfile, keepfile), abundfiltfile):
        cmd = filt_cmd
        cmd += " {0} {1}".format(hashfile, keepfile)
        sh(cmd)

    if opts.npass == "1":
        seckeepfile = abundfiltfile
    else:
        seckeepfile = abundfiltfile + ".keep"
        if need_update(abundfiltfile, seckeepfile):
            cmd = norm_cmd
            cmd += " -C {0} -k 20 -N 4 -x {1}".format(depth - 10, ts / 2)
            cmd += " {0}".format(abundfiltfile)
            sh(cmd)

    if PE:
        pairsfile = pairinplace([seckeepfile,
                                "--base={0}".format(pf + "_norm"), "--rclip=2"])
        split([pairsfile])
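
When --tablesize is not given, the -x table size is derived from the input: the file size is divided by 16 and rounded up to the next multiple of mints = 10 Mb. A worked example of that heuristic with a hypothetical input size:

mints = 10000000                        # 10 Mb granularity, as above
filesize = 3200000000                   # hypothetical ~3.2 GB FASTQ
ts = (filesize // 16 // mints + 1) * mints
print(ts)                               # 210000000, passed to -x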
Example #5
def expand(args):
    """
    %prog expand bes.fasta reads.fastq

    Expand sequences using short reads. Useful, for example, for getting BAC-end
    sequences. The template in `bes.fasta` may contain just the junction
    sequences; the reads are then aligned to recover the flanks.
    """
    import math

    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.fastq import readlen, first, fasta
    from jcvi.formats.blast import Blast
    from jcvi.formats.base import FileShredder
    from jcvi.apps.bowtie import align, get_samfile
    from jcvi.apps.align import blast

    p = OptionParser(expand.__doc__)
    p.set_depth(depth=200)
    p.set_firstN()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bes, reads = args
    size = Fasta(bes).totalsize
    rl = readlen([reads])
    expected_size = size + 2 * rl
    nreads = expected_size * opts.depth / rl
    nreads = int(math.ceil(nreads / 1000.)) * 1000

    # Attract reads
    samfile, logfile = align([bes, reads, "--reorder", "--mapped",
           "--firstN={0}".format(opts.firstN)])

    samfile, mapped, _ = get_samfile(reads, bes, bowtie=True, mapped=True)
    logging.debug("Extract first {0} reads from `{1}`.".format(nreads, mapped))

    pf = mapped.split(".")[0]
    pf = pf.split("-")[0]
    bespf = bes.split(".")[0]
    reads = pf + ".expand.fastq"
    first([str(nreads), mapped, "-o", reads])

    # Perform mini-assembly
    fastafile = reads.rsplit(".", 1)[0] + ".fasta"
    qualfile = ""
    if need_update(reads, fastafile):
        fastafile, qualfile = fasta([reads])

    contigs = op.join(pf, "454LargeContigs.fna")
    if need_update(fastafile, contigs):
        cmd = "runAssembly -o {0} -cpu 8 {1}".format(pf, fastafile)
        sh(cmd)
    assert op.exists(contigs)

    # Annotate contigs
    blastfile = blast([bes, contigs])
    mapping = {}
    for query, b in Blast(blastfile).iter_best_hit():
        mapping[query] = b

    f = Fasta(contigs, lazy=True)
    annotatedfasta = ".".join((pf, bespf, "fasta"))
    fw = open(annotatedfasta, "w")
    keys = list(Fasta(bes).iterkeys_ordered())  # keep an ordered list
    recs = []
    for key, v in f.iteritems_ordered():
        vid = v.id
        if vid not in mapping:
            continue
        b = mapping[vid]
        subject = b.subject
        rec = v.reverse_complement() if b.orientation == '-' else v
        rec.id = rid = "_".join((pf, vid, subject))
        rec.description = ""
        recs.append((keys.index(subject), rid, rec))

    recs = [x[-1] for x in sorted(recs)]
    SeqIO.write(recs, fw, "fasta")
    fw.close()

    FileShredder([samfile, logfile, mapped, reads, fastafile, qualfile, blastfile, pf])
    logging.debug("Annotated seqs (n={0}) written to `{1}`.".\
                    format(len(recs), annotatedfasta))

    return annotatedfasta
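
The number of reads pulled from the alignment is sized so the mini-assembly reaches opts.depth over the expanded template (the template plus one read length on each flank), then rounded up to the nearest thousand. With illustrative numbers:

import math

size = 50000                  # total bp in bes.fasta (hypothetical)
rl = 100                      # read length from readlen([reads])
depth = 200                   # default from p.set_depth(depth=200)
expected_size = size + 2 * rl
nreads = expected_size * depth / float(rl)      # 100400.0
nreads = int(math.ceil(nreads / 1000.)) * 1000
print(nreads)                 # 101000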
Example #6
def diginorm(args):
    """
    %prog diginorm fastqfile

    Run K-mer based normalization. Based on tutorial:
    <http://ged.msu.edu/angus/diginorm-2012/tutorial.html>

    Assumes the input is either an interleaved pairs file or two separate files.

    To set up khmer:
    $ git clone git://github.com/ged-lab/screed.git
    $ git clone git://github.com/ged-lab/khmer.git
    $ cd screed
    $ python setup.py install
    $ cd ../khmer
    $ make test
    $ export PYTHONPATH=/root/khmer/python
    """
    from jcvi.formats.fastq import shuffle, pairinplace, split

    p = OptionParser(diginorm.__doc__)
    p.set_depth()
    p.set_home("khmer")
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    if len(args) == 2:
        fastq = shuffle(args + ["--tag"])
    else:
        fastq, = args

    kh = opts.khmer_home
    depth = opts.depth
    sys.path.insert(0, op.join(kh, "python"))

    pf = fastq.rsplit(".", 1)[0]
    hashfile = pf + ".kh"
    keepfile = fastq + ".keep"
    norm_cmd = op.join(kh, "scripts/normalize-by-median.py")
    filt_cmd = op.join(kh, "scripts/filter-abund.py")
    if need_update(fastq, (hashfile, keepfile)):
        cmd = norm_cmd
        cmd += " -C {0} -k 20 -N 4 -x 2.5e8 -p".format(depth)
        cmd += " --savehash {0} {1}".format(hashfile, fastq)
        sh(cmd)

    abundfiltfile = keepfile + ".abundfilt"
    if need_update((hashfile, keepfile), abundfiltfile):
        cmd = filt_cmd
        cmd += " {0} {1}".format(hashfile, keepfile)
        sh(cmd)

    seckeepfile = abundfiltfile + ".keep"
    if need_update(abundfiltfile, seckeepfile):
        cmd = norm_cmd
        cmd += " -C {0} -k 20 -N 4 -x 1e8".format(depth - 5)
        cmd += " {0}".format(abundfiltfile)
        sh(cmd)

    pairsfile = pairinplace([seckeepfile,
                            "--base={0}".format(pf + "_norm"), "--rclip=2"])
    split([pairsfile])
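
In both diginorm variants the intermediate files chain by suffix, which is what the need_update calls key on. For a hypothetical input reads.fastq:

fastq = "reads.fastq"                      # hypothetical input
pf = fastq.rsplit(".", 1)[0]               # "reads"
hashfile = pf + ".kh"                      # reads.kh
keepfile = fastq + ".keep"                 # reads.fastq.keep
abundfiltfile = keepfile + ".abundfilt"    # reads.fastq.keep.abundfilt
seckeepfile = abundfiltfile + ".keep"      # reads.fastq.keep.abundfilt.keep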
Example #7
def novo(args):
    """
    %prog novo reads.fastq

    Reference-free tGBS pipeline.
    """
    from jcvi.assembly.kmer import jellyfish, histogram
    from jcvi.assembly.preprocess import diginorm
    from jcvi.formats.fasta import filter as fasta_filter, format
    from jcvi.apps.cdhit import filter as cdhit_filter

    p = OptionParser(novo.__doc__)
    p.add_option("--technology",
                 choices=("illumina", "454", "iontorrent"),
                 default="iontorrent",
                 help="Sequencing platform")
    p.add_option("--dedup",
                 choices=("uclust", "cdhit"),
                 default="cdhit",
                 help="Dedup algorithm")
    p.set_depth(depth=50)
    p.set_align(pctid=96)
    p.set_home("cdhit")
    p.set_home("fiona")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    cpus = opts.cpus
    depth = opts.depth
    pf, sf = fastqfile.rsplit(".", 1)

    diginormfile = pf + ".diginorm." + sf
    if need_update(fastqfile, diginormfile):
        diginorm([fastqfile, "--single", "--depth={0}".format(depth)])
        keepabund = fastqfile + ".keep.abundfilt"
        sh("cp -s {0} {1}".format(keepabund, diginormfile))

    jf = pf + "-K23.histogram"
    if need_update(diginormfile, jf):
        jellyfish([
            diginormfile, "--prefix={0}".format(pf), "--cpus={0}".format(cpus)
        ])

    genomesize = histogram([jf, pf, "23"])
    fiona = pf + ".fiona.fa"
    if need_update(diginormfile, fiona):
        cmd = op.join(opts.fiona_home, "bin/fiona")
        cmd += " -g {0} -nt {1} --sequencing-technology {2}".\
                    format(genomesize, cpus, opts.technology)
        cmd += " -vv {0} {1}".format(diginormfile, fiona)
        logfile = pf + ".fiona.log"
        sh(cmd, outfile=logfile, errfile=logfile)

    dedup = opts.dedup
    pctid = opts.pctid
    cons = fiona + ".P{0}.{1}.consensus.fasta".format(pctid, dedup)
    if need_update(fiona, cons):
        if dedup == "cdhit":
            deduplicate([
                fiona, "--consensus", "--reads", "--pctid={0}".format(pctid),
                "--cdhit_home={0}".format(opts.cdhit_home)
            ])
        else:
            uclust([fiona, "--pctid={0}".format(pctid)])

    filteredfile = pf + ".filtered.fasta"
    if need_update(cons, filteredfile):
        covfile = pf + ".cov.fasta"
        cdhit_filter([
            cons, "--outfile={0}".format(covfile),
            "--minsize={0}".format(depth / 5)
        ])
        fasta_filter([covfile, "50", "--outfile={0}".format(filteredfile)])

    finalfile = pf + ".final.fasta"
    if need_update(filteredfile, finalfile):
        format([
            filteredfile, finalfile, "--sequential=replace",
            "--prefix={0}_".format(pf)
        ])
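
The need_update chain in this variant is driven purely by derived file names. With a hypothetical input reads.fastq and the defaults above (pctid=96, dedup=cdhit), the stages produce:

pf, sf = "reads.fastq".rsplit(".", 1)           # hypothetical input
diginormfile = pf + ".diginorm." + sf           # reads.diginorm.fastq
jf = pf + "-K23.histogram"                      # reads-K23.histogram
fiona = pf + ".fiona.fa"                        # reads.fiona.fa
cons = fiona + ".P{0}.{1}.consensus.fasta".format(96, "cdhit")
print(cons)                                     # reads.fiona.fa.P96.cdhit.consensus.fasta
filteredfile = pf + ".filtered.fasta"           # reads.filtered.fasta
finalfile = pf + ".final.fasta"                 # reads.final.fasta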
Example #8
def shred(args):
    """
    %prog shred fastafile

    Similar to the `shredContig` method in the runCA script. The contigs are
    shredded into pseudo-reads of a given length and depth.
    """
    p = OptionParser(shred.__doc__)
    p.set_depth(depth=2)
    p.add_option("--readlen",
                 default=1000,
                 type="int",
                 help="Desired length of the reads [default: %default]")
    p.add_option("--minctglen",
                 default=0,
                 type="int",
                 help="Ignore contig sequence less than [default: %default]")
    p.add_option(
        "--shift",
        default=50,
        type="int",
        help="Overlap between reads must be at least [default: %default]")
    p.add_option(
        "--fasta",
        default=False,
        action="store_true",
        help="Output shredded reads as FASTA sequences [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    libID = fastafile.split(".")[0]
    depth = opts.depth
    readlen = opts.readlen
    shift = opts.shift

    outfile = libID + ".depth{0}".format(depth)
    if opts.fasta:
        outfile += ".fasta"
    else:
        outfile += ".frg"
    f = Fasta(fastafile, lazy=True)

    fw = must_open(outfile, "w", checkexists=True)
    if not opts.fasta:
        print >> fw, headerTemplate.format(libID=libID)
    """
    Taken from runCA:

                    |*********|
                    |###################|
    |--------------------------------------------------|
     ---------------1---------------
               ---------------2---------------
                         ---------------3---------------
    *** - center_increments
    ### - center_range_width
    """
    for ctgID, (name, rec) in enumerate(f.iteritems_ordered()):
        seq = rec.seq
        seqlen = len(seq)
        if seqlen < opts.minctglen:
            continue

        shredlen = min(seqlen - shift, readlen)
        numreads = max(seqlen * depth / shredlen, 1)
        center_range_width = seqlen - shredlen

        ranges = []
        if depth == 1:
            if seqlen < readlen:
                ranges.append((0, seqlen))
            else:
                for begin in xrange(0, seqlen, readlen - shift):
                    end = min(seqlen, begin + readlen)
                    ranges.append((begin, end))
        else:
            if numreads == 1:
                ranges.append((0, shredlen))
            else:
                prev_begin = -1
                center_increments = center_range_width * 1. / (numreads - 1)
                for i in xrange(numreads):
                    begin = center_increments * i
                    end = begin + shredlen
                    begin, end = int(begin), int(end)

                    if begin == prev_begin:
                        continue

                    ranges.append((begin, end))
                    prev_begin = begin

        for shredID, (begin, end) in enumerate(ranges):
            shredded_seq = seq[begin:end]
            fragID = "{0}.{1}.frag{2}.{3}-{4}".format(libID, ctgID, shredID,
                                                      begin, end)
            emitFragment(fw, fragID, libID, shredded_seq, fasta=opts.fasta)

    fw.close()
    logging.debug("Shredded reads are written to `{0}`.".format(outfile))
    return outfile
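
For completeness, the depth == 1 branch simply tiles the contig with a step of readlen - shift, so adjacent reads overlap by shift bases. An illustrative run of that loop:

seqlen, readlen, shift = 3000, 1000, 50    # hypothetical contig
ranges = []
for begin in range(0, seqlen, readlen - shift):
    ranges.append((begin, min(seqlen, begin + readlen)))
print(ranges)
# [(0, 1000), (950, 1950), (1900, 2900), (2850, 3000)]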