Exemple #1
0
def checkShuffleSizes(p1, p2, pairsfastq, extra=0):
    """
    Check that the size of `pairsfastq` equals the sizes of `p1` and `p2`
    added together, plus `extra`.

    Sizes are whatever `jcvi.apps.base.getfilesize` reports for each path.

    Args:
        p1: path of the first input file.
        p2: path of the second input file.
        pairsfastq: path of the combined file.
        extra: additional size expected in the combined file (default 0).

    Raises:
        AssertionError: if the sizes do not add up.
    """
    from jcvi.apps.base import getfilesize

    pairssize = getfilesize(pairsfastq)
    p1size = getfilesize(p1)
    p2size = getfilesize(p2)
    # Raise explicitly instead of using `assert`: assert statements are
    # removed when Python runs with -O, which would silently skip this check.
    if pairssize != p1size + p2size + extra:
        raise AssertionError(
            "The sizes do not add up: {0} + {1} + {2} != {3}".format(
                p1size, p2size, extra, pairssize
            )
        )
Exemple #2
0
def checkShuffleSizes(p1, p2, pairsfastq, extra=0):
    """
    Verify that size(pairsfastq) == size(p1) + size(p2) + extra.

    All sizes come from `jcvi.apps.base.getfilesize`.

    Args:
        p1: path of the first input file.
        p2: path of the second input file.
        pairsfastq: path of the file expected to hold both inputs.
        extra: size expected on top of the two inputs (default 0).

    Raises:
        AssertionError: when the sizes do not add up.
    """
    from jcvi.apps.base import getfilesize

    pairssize = getfilesize(pairsfastq)
    p1size = getfilesize(p1)
    p2size = getfilesize(p2)
    expected = p1size + p2size + extra
    # An explicit raise keeps the check alive under `python -O`, where plain
    # `assert` statements are compiled away.
    if pairssize != expected:
        raise AssertionError(
            "The sizes do not add up: {0} + {1} + {2} != {3}".format(
                p1size, p2size, extra, pairssize
            )
        )
Exemple #3
0
def jellyfish(args):
    """
    %prog jellyfish [*.fastq|*.fasta]

    Run jellyfish to dump histogram to be used in kmer.histogram().
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes are kept in comments instead.
    from jcvi.apps.base import getfilesize
    from jcvi.utils.cbook import human_size
    p = OptionParser(jellyfish.__doc__)
    p.add_option("-K", default=23, type="int",
                 help="K-mer size [default: %default]")
    p.add_option("--coverage", default=40, type="int",
                 help="Expected sequence coverage [default: %default]")
    p.add_option("--prefix", default="jf",
                 help="Database prefix [default: %default]")
    p.add_option("--nohist", default=False, action="store_true",
                 help="Do not print histogram [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = args
    K = opts.K
    coverage = opts.coverage

    totalfilesize = sum(getfilesize(x) for x in fastqfiles)
    pf = opts.prefix
    # Only the first input is inspected to decide whether to pipe via gzip.
    gzipped = fastqfiles[0].endswith(".gz")

    # Floor division: `/` yields a float on Python 3, and that float would be
    # formatted straight into the `-s` argument of `jellyfish count`.
    hashsize = totalfilesize // coverage
    logging.debug("Total file size: {0}, hashsize (-s): {1}".format(
        human_size(totalfilesize, a_kilobyte_is_1024_bytes=True), hashsize))

    jfpf = "{0}-K{1}".format(pf, K)
    jfdb = jfpf
    inputs = " ".join(fastqfiles)

    cmd = "jellyfish count -t {0} -C -o {1}".format(opts.cpus, jfpf)
    cmd += " -s {0} -m {1}".format(hashsize, K)
    if gzipped:
        # Decompress all inputs into one stream and have jellyfish read it.
        cmd = "gzip -dc {0} | ".format(inputs) + cmd + " /dev/fd/0"
    else:
        cmd += " " + inputs

    # Hand need_update the individual paths, not the space-joined string:
    # with more than one input the joined string names no existing file.
    if need_update(tuple(fastqfiles), jfdb):
        sh(cmd)

    if opts.nohist:
        return

    jfhisto = jfpf + ".histogram"
    # histo is always invoked with -t 64, independent of opts.cpus.
    cmd = "jellyfish histo -t 64 {0} -o {1}".format(jfdb, jfhisto)

    if need_update(jfdb, jfhisto):
        sh(cmd)
Exemple #4
0
def jellyfish(args):
    """
    %prog jellyfish [*.fastq|*.fasta]

    Run jellyfish to dump histogram to be used in kmer.histogram().
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes are kept in comments instead.
    from jcvi.apps.base import getfilesize
    from jcvi.utils.cbook import human_size

    p = OptionParser(jellyfish.__doc__)
    p.add_option("-K", default=23, type="int", help="K-mer size")
    p.add_option(
        "--coverage",
        default=40,
        type="int",
        help="Expected sequence coverage",
    )
    p.add_option("--prefix", default="jf", help="Database prefix")
    p.add_option(
        "--nohist",
        default=False,
        action="store_true",
        help="Do not print histogram",
    )
    p.set_home("jellyfish")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = args
    K = opts.K
    coverage = opts.coverage

    totalfilesize = sum(getfilesize(x) for x in fastqfiles)
    pf = opts.prefix
    # Only the first input is inspected to decide whether to pipe via gzip.
    gzipped = fastqfiles[0].endswith(".gz")

    # Floor division: `/` yields a float on Python 3, and that float would be
    # formatted straight into the `-s` argument of `jellyfish count`.
    hashsize = totalfilesize // coverage
    logging.debug("Total file size: {0}, hashsize (-s): {1}".format(
        human_size(totalfilesize, a_kilobyte_is_1024_bytes=True), hashsize))

    jfpf = "{0}-K{1}".format(pf, K)
    jfdb = jfpf
    inputs = " ".join(fastqfiles)

    # Executable path is built from the directory stored in opts.jellyfish_home.
    jfcmd = op.join(opts.jellyfish_home, "jellyfish")
    cmd = jfcmd
    cmd += " count -t {0} -C -o {1}".format(opts.cpus, jfpf)
    cmd += " -s {0} -m {1}".format(hashsize, K)
    if gzipped:
        # Decompress all inputs into one stream and have jellyfish read it.
        cmd = "gzip -dc {0} | ".format(inputs) + cmd + " /dev/fd/0"
    else:
        cmd += " " + inputs

    # Hand need_update the individual paths, not the space-joined string:
    # with more than one input the joined string names no existing file.
    if need_update(tuple(fastqfiles), jfdb):
        sh(cmd)

    if opts.nohist:
        return

    jfhisto = jfpf + ".histogram"
    # histo is always invoked with -t 64, independent of opts.cpus.
    cmd = jfcmd + " histo -t 64 {0} -o {1}".format(jfdb, jfhisto)

    if need_update(jfdb, jfhisto):
        sh(cmd)
Exemple #5
0
def is_matching_gz(origfile, gzfile):
    """
    Return True when both `origfile` and `gzfile` exist and getfilesize()
    reports the same size for each; return False otherwise.
    """
    # NOTE(review): presumably getfilesize() reports the uncompressed size
    # for a .gz path -- confirm against its definition.
    both_present = op.exists(origfile) and op.exists(gzfile)
    return both_present and getfilesize(origfile) == getfilesize(gzfile)
Exemple #6
0
def is_matching_gz(origfile, gzfile):
    """
    Compare the sizes of `origfile` and `gzfile` as given by getfilesize().

    Returns False if either path does not exist, otherwise the result of the
    size equality test.
    """
    # Existence is checked in the same order as the arguments.
    for path in (origfile, gzfile):
        if not op.exists(path):
            return False
    # NOTE(review): presumably getfilesize() accounts for gzip compression;
    # verify in its definition.
    return getfilesize(origfile) == getfilesize(gzfile)
Exemple #7
0
def diginorm(args):
    """
    %prog diginorm fastqfile

    Run K-mer based normalization. Based on tutorial:
    <http://ged.msu.edu/angus/diginorm-2012/tutorial.html>

    Assume input is either an interleaved pairs file, or two separate files.

    To set up khmer:
    $ git clone git://github.com/ged-lab/screed.git
    $ git clone git://github.com/ged-lab/khmer.git
    $ cd screed
    $ python setup.py install
    $ cd ../khmer
    $ make test
    $ export PYTHONPATH=~/export/khmer
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes are kept in comments instead.
    from jcvi.formats.fastq import shuffle, pairinplace, split
    from jcvi.apps.base import getfilesize

    p = OptionParser(diginorm.__doc__)
    p.add_option("--single", default=False, action="store_true",
                 help="Single end reads")
    p.add_option("--tablesize", help="Memory size")
    p.add_option("--npass", default="1", choices=("1", "2"),
                 help="How many passes of normalization")
    p.set_depth(depth=50)
    p.set_home("khmer", default="/usr/local/bin/")
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    if len(args) == 2:
        # Two inputs: shuffle() is called to obtain a single file name
        # (presumably the interleaved pairs file -- confirm in jcvi.formats.fastq).
        fastq = shuffle(args + ["--tag"])
    else:
        fastq, = args

    kh = opts.khmer_home
    depth = opts.depth
    PE = not opts.single
    sys.path.insert(0, op.join(kh, "python"))

    pf = fastq.rsplit(".", 1)[0]
    keepfile = fastq + ".keep"
    hashfile = pf + ".kh"
    mints = 10000000
    if opts.tablesize:
        # --tablesize has no `type=`, so it arrives as a string; convert it
        # once so the second pass can halve it. float() first so that values
        # written like "2e9" are accepted as well.
        ts = int(float(opts.tablesize))
    else:
        # Default: file size / 16, rounded up to the next multiple of `mints`.
        # Floor division keeps this an integer on Python 3.
        ts = (getfilesize(fastq) // 16 // mints + 1) * mints

    norm_cmd = op.join(kh, "normalize-by-median.py")
    filt_cmd = op.join(kh, "filter-abund.py")
    # Pass 1: normalize to `depth`, saving the table to `hashfile`.
    if need_update(fastq, (hashfile, keepfile)):
        cmd = norm_cmd
        cmd += " -C {0} -k 20 -N 4 -x {1}".format(depth, ts)
        if PE:
            cmd += " -p"
        cmd += " -s {0} {1}".format(hashfile, fastq)
        sh(cmd)

    # Abundance filtering of the kept reads using the saved table.
    abundfiltfile = keepfile + ".abundfilt"
    if need_update((hashfile, keepfile), abundfiltfile):
        cmd = filt_cmd
        cmd += " {0} {1}".format(hashfile, keepfile)
        sh(cmd)

    if opts.npass == "1":
        seckeepfile = abundfiltfile
    else:
        # Optional pass 2: normalize again at depth - 10 with half the table size.
        seckeepfile = abundfiltfile + ".keep"
        if need_update(abundfiltfile, seckeepfile):
            cmd = norm_cmd
            cmd += " -C {0} -k 20 -N 4 -x {1}".format(depth - 10, ts // 2)
            cmd += " {0}".format(abundfiltfile)
            sh(cmd)

    if PE:
        # Re-pair the surviving reads, then split the resulting pairs file.
        pairsfile = pairinplace([seckeepfile,
                                "--base={0}".format(pf + "_norm"), "--rclip=2"])
        split([pairsfile])
Exemple #8
0
def diginorm(args):
    """
    %prog diginorm fastqfile

    Run K-mer based normalization. Based on tutorial:
    <http://ged.msu.edu/angus/diginorm-2012/tutorial.html>

    Assume input is either an interleaved pairs file, or two separate files.

    To set up khmer:
    $ git clone git://github.com/ged-lab/screed.git
    $ git clone git://github.com/ged-lab/khmer.git
    $ cd screed
    $ python setup.py install
    $ cd ../khmer
    $ make test
    $ export PYTHONPATH=~/export/khmer
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes are kept in comments instead.
    from jcvi.formats.fastq import shuffle, pairinplace, split
    from jcvi.apps.base import getfilesize

    p = OptionParser(diginorm.__doc__)
    p.add_option("--single", default=False, action="store_true",
                 help="Single end reads")
    p.add_option("--tablesize", help="Memory size")
    p.add_option("--npass", default="1", choices=("1", "2"),
                 help="How many passes of normalization")
    p.set_depth(depth=50)
    p.set_home("khmer", default="/usr/local/bin/")
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    if len(args) == 2:
        # Two inputs: shuffle() is called to obtain a single file name
        # (presumably the interleaved pairs file -- confirm in jcvi.formats.fastq).
        fastq = shuffle(args + ["--tag"])
    else:
        fastq, = args

    kh = opts.khmer_home
    depth = opts.depth
    PE = not opts.single
    sys.path.insert(0, op.join(kh, "python"))

    pf = fastq.rsplit(".", 1)[0]
    keepfile = fastq + ".keep"
    hashfile = pf + ".kh"
    mints = 10000000
    if opts.tablesize:
        # --tablesize has no `type=`, so it arrives as a string; convert it
        # once so the second pass can halve it. float() first so that values
        # written like "2e9" are accepted as well.
        ts = int(float(opts.tablesize))
    else:
        # Default: file size / 16, rounded up to the next multiple of `mints`.
        # Floor division keeps this an integer on Python 3.
        ts = (getfilesize(fastq) // 16 // mints + 1) * mints

    norm_cmd = op.join(kh, "normalize-by-median.py")
    filt_cmd = op.join(kh, "filter-abund.py")
    # Pass 1: normalize to `depth`, saving the table to `hashfile`.
    if need_update(fastq, (hashfile, keepfile)):
        cmd = norm_cmd
        cmd += " -C {0} -k 20 -N 4 -x {1}".format(depth, ts)
        if PE:
            cmd += " -p"
        cmd += " -s {0} {1}".format(hashfile, fastq)
        sh(cmd)

    # Abundance filtering of the kept reads using the saved table.
    abundfiltfile = keepfile + ".abundfilt"
    if need_update((hashfile, keepfile), abundfiltfile):
        cmd = filt_cmd
        cmd += " {0} {1}".format(hashfile, keepfile)
        sh(cmd)

    if opts.npass == "1":
        seckeepfile = abundfiltfile
    else:
        # Optional pass 2: normalize again at depth - 10 with half the table size.
        seckeepfile = abundfiltfile + ".keep"
        if need_update(abundfiltfile, seckeepfile):
            cmd = norm_cmd
            cmd += " -C {0} -k 20 -N 4 -x {1}".format(depth - 10, ts // 2)
            cmd += " {0}".format(abundfiltfile)
            sh(cmd)

    if PE:
        # Re-pair the surviving reads, then split the resulting pairs file.
        pairsfile = pairinplace([seckeepfile,
                                "--base={0}".format(pf + "_norm"), "--rclip=2"])
        split([pairsfile])