Beispiel #1
0
def diginorm(args):
    """
    %prog diginorm fastqfile

    Run K-mer based normalization. Based on tutorial:
    <http://ged.msu.edu/angus/diginorm-2012/tutorial.html>

    Assume input is either an interleaved pairs file, or two separate files.

    To set up khmer:
    $ git clone git://github.com/ged-lab/screed.git
    $ git clone git://github.com/ged-lab/khmer.git
    $ cd screed
    $ python setup.py install
    $ cd ../khmer
    $ make test
    $ export PYTHONPATH=/root/khmer/python
    """
    from jcvi.formats.fastq import shuffle, pairinplace, split

    p = OptionParser(diginorm.__doc__)
    p.set_depth()
    p.set_home("khmer")
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    if len(args) == 2:
        fastq = shuffle(args + ["--tag"])
    else:
        fastq, = args

    kh = opts.khmer_home
    depth = opts.depth
    sys.path.insert(0, op.join(kh, "python"))

    pf = fastq.rsplit(".", 1)[0]
    hashfile = pf + ".kh"
    keepfile = fastq + ".keep"
    norm_cmd = op.join(kh, "scripts/normalize-by-median.py")
    filt_cmd = op.join(kh, "scripts/filter-abund.py")
    if need_update(fastq, (hashfile, keepfile)):
        cmd = norm_cmd
        cmd += " -C {0} -k 20 -N 4 -x 2.5e8 -p".format(depth)
        cmd += " --savehash {0} {1}".format(hashfile, fastq)
        sh(cmd)

    abundfiltfile = keepfile + ".abundfilt"
    if need_update((hashfile, keepfile), abundfiltfile):
        cmd = filt_cmd
        cmd += " {0} {1}".format(hashfile, keepfile)
        sh(cmd)

    seckeepfile = abundfiltfile + ".keep"
    if need_update(abundfiltfile, seckeepfile):
        cmd = norm_cmd
        cmd += " -C {0} -k 20 -N 4 -x 1e8".format(depth - 5)
        cmd += " {0}".format(abundfiltfile)
        sh(cmd)

    pairsfile = pairinplace([seckeepfile,
                            "--base={0}".format(pf + "_norm"), "--rclip=2"])
    split([pairsfile])
Beispiel #2
0
def diginorm(args):
    """
    %prog diginorm fastqfile

    Run K-mer based normalization. Based on tutorial:
    <http://ged.msu.edu/angus/diginorm-2012/tutorial.html>

    Assume input is either an interleaved pairs file, or two separate files.

    To set up khmer:
    $ git clone git://github.com/ged-lab/screed.git
    $ git clone git://github.com/ged-lab/khmer.git
    $ cd screed
    $ python setup.py install
    $ cd ../khmer
    $ make test
    $ export PYTHONPATH=~/export/khmer
    """
    from jcvi.formats.fastq import shuffle, pairinplace, split
    from jcvi.apps.base import getfilesize

    p = OptionParser(diginorm.__doc__)
    p.add_option("--single", default=False, action="store_true",
                 help="Single end reads")
    p.add_option("--tablesize", help="Memory size")
    p.add_option("--npass", default="1", choices=("1", "2"),
                 help="How many passes of normalization")
    p.set_depth(depth=50)
    p.set_home("khmer", default="/usr/local/bin/")
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    if len(args) == 2:
        fastq = shuffle(args + ["--tag"])
    else:
        fastq, = args

    kh = opts.khmer_home
    depth = opts.depth
    PE = not opts.single
    sys.path.insert(0, op.join(kh, "python"))

    pf = fastq.rsplit(".", 1)[0]
    keepfile = fastq + ".keep"
    hashfile = pf + ".kh"
    mints = 10000000
    ts = opts.tablesize or ((getfilesize(fastq) / 16 / mints + 1) * mints)

    norm_cmd = op.join(kh, "normalize-by-median.py")
    filt_cmd = op.join(kh, "filter-abund.py")
    if need_update(fastq, (hashfile, keepfile)):
        cmd = norm_cmd
        cmd += " -C {0} -k 20 -N 4 -x {1}".format(depth, ts)
        if PE:
            cmd += " -p"
        cmd += " -s {0} {1}".format(hashfile, fastq)
        sh(cmd)

    abundfiltfile = keepfile + ".abundfilt"
    if need_update((hashfile, keepfile), abundfiltfile):
        cmd = filt_cmd
        cmd += " {0} {1}".format(hashfile, keepfile)
        sh(cmd)

    if opts.npass == "1":
        seckeepfile = abundfiltfile
    else:
        seckeepfile = abundfiltfile + ".keep"
        if need_update(abundfiltfile, seckeepfile):
            cmd = norm_cmd
            cmd += " -C {0} -k 20 -N 4 -x {1}".format(depth - 10, ts / 2)
            cmd += " {0}".format(abundfiltfile)
            sh(cmd)

    if PE:
        pairsfile = pairinplace([seckeepfile,
                                "--base={0}".format(pf + "_norm"), "--rclip=2"])
        split([pairsfile])
Beispiel #3
0
def diginorm(args):
    """
    %prog diginorm fastqfile

    Run K-mer based normalization. Based on tutorial:
    <http://ged.msu.edu/angus/diginorm-2012/tutorial.html>

    Assume input is either an interleaved pairs file, or two separate files.

    To set up khmer:
    $ git clone git://github.com/ged-lab/screed.git
    $ git clone git://github.com/ged-lab/khmer.git
    $ cd screed
    $ python setup.py install
    $ cd ../khmer
    $ make test
    $ export PYTHONPATH=~/export/khmer
    """
    from jcvi.formats.fastq import shuffle, pairinplace, split
    from jcvi.apps.base import getfilesize

    p = OptionParser(diginorm.__doc__)
    p.add_option("--single", default=False, action="store_true",
                 help="Single end reads")
    p.add_option("--tablesize", help="Memory size")
    p.add_option("--npass", default="1", choices=("1", "2"),
                 help="How many passes of normalization")
    p.set_depth(depth=50)
    p.set_home("khmer", default="/usr/local/bin/")
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    if len(args) == 2:
        fastq = shuffle(args + ["--tag"])
    else:
        fastq, = args

    kh = opts.khmer_home
    depth = opts.depth
    PE = not opts.single
    sys.path.insert(0, op.join(kh, "python"))

    pf = fastq.rsplit(".", 1)[0]
    keepfile = fastq + ".keep"
    hashfile = pf + ".kh"
    mints = 10000000
    ts = opts.tablesize or ((getfilesize(fastq) / 16 / mints + 1) * mints)

    norm_cmd = op.join(kh, "normalize-by-median.py")
    filt_cmd = op.join(kh, "filter-abund.py")
    if need_update(fastq, (hashfile, keepfile)):
        cmd = norm_cmd
        cmd += " -C {0} -k 20 -N 4 -x {1}".format(depth, ts)
        if PE:
            cmd += " -p"
        cmd += " -s {0} {1}".format(hashfile, fastq)
        sh(cmd)

    abundfiltfile = keepfile + ".abundfilt"
    if need_update((hashfile, keepfile), abundfiltfile):
        cmd = filt_cmd
        cmd += " {0} {1}".format(hashfile, keepfile)
        sh(cmd)

    if opts.npass == "1":
        seckeepfile = abundfiltfile
    else:
        seckeepfile = abundfiltfile + ".keep"
        if need_update(abundfiltfile, seckeepfile):
            cmd = norm_cmd
            cmd += " -C {0} -k 20 -N 4 -x {1}".format(depth - 10, ts / 2)
            cmd += " {0}".format(abundfiltfile)
            sh(cmd)

    if PE:
        pairsfile = pairinplace([seckeepfile,
                                "--base={0}".format(pf + "_norm"), "--rclip=2"])
        split([pairsfile])