Ejemplo n.º 1
0
def bwasw(args):
    """
    %prog bwasw database.fasta long_read.fastq

    Wrapper for `bwa bwasw`. Output will be long_read.sam.
    """
    # The docstring above is handed to OptionParser as the usage text, so it
    # is part of the command-line output.
    p = OptionParser(bwasw.__doc__)
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None; negating it makes the process exit with
        # status 1 on a usage error instead of reporting success (0).
        sys.exit(not p.print_help())

    extra = opts.extra
    grid = opts.grid

    dbfile, readfile = args
    # Both helpers are invoked only for their side effects; their return
    # values are not needed to build the bwasw command line.
    # NOTE(review): presumably they create the bwa index / alignment files
    # when missing -- confirm against check_index() and check_aln().
    check_index(dbfile, grid=grid)
    check_aln(dbfile, readfile, grid=grid)

    # Output name: the read file name with its last extension swapped for .sam
    samfile = readfile.rsplit(".", 1)[0] + ".sam"
    if op.exists(samfile):
        # Never overwrite an existing result; report and return None.
        logging.error("`{0}` exists. `bwa bwasw` already run.".format(samfile))
        return

    cmd = "bwa bwasw -t 32 {0} {1} ".format(dbfile, readfile)
    cmd += "{0}".format(extra)
    sh(cmd, grid=grid, outfile=samfile)
Ejemplo n.º 2
0
def txt(args):
    """
    %prog txt casfile

    convert binary CAS file to tabular output using CLC assembly_table
    """
    # The docstring above doubles as the usage text shown by print_help().
    p = OptionParser(txt.__doc__)
    p.add_option("-m",
                 dest="multi",
                 default=False,
                 action="store_true",
                 help="report multi-matches [default: %default]")
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        # print_help() returns None; `not None` gives exit status 1 so a
        # usage error is not reported as success.
        sys.exit(not p.print_help())

    grid = opts.grid

    casfile, = args
    assert op.exists(casfile)

    # Replace only a trailing ".cas". str.replace() would also rewrite ".cas"
    # occurring elsewhere in the path, and would leave the name unchanged when
    # the extension is absent, making the output file clobber the input.
    if casfile.endswith(".cas"):
        txtfile = casfile[:-len(".cas")] + ".txt"
    else:
        txtfile = casfile + ".txt"

    cmd = "assembly_table -n -s -p "
    if opts.multi:
        cmd += "-m "
    cmd += casfile
    sh(cmd, grid=grid, outfile=txtfile)

    # Return the name of the tabular file so callers can chain on it.
    return txtfile
Ejemplo n.º 3
0
def aln(args):
    """
    %prog aln database.fasta *.fastq

    Wrapper for `bwa aln` except this will run over a set of files.
    """
    # The docstring above doubles as the usage text shown by print_help().
    p = OptionParser(aln.__doc__)
    # type="int" keeps a user-supplied value the same type as the default;
    # without it the default is an int but a command-line value is a str.
    p.add_option("--cpus",
                 default=32,
                 type="int",
                 help="Number of cpus to use [default: %default]")
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) < 2:
        # print_help() returns None; `not None` gives exit status 1 so a
        # usage error is not reported as success.
        sys.exit(not p.print_help())

    grid = opts.grid
    # NOTE(review): opts.extra is accepted by the parser but is not forwarded
    # to check_aln() here -- confirm whether that is intended.

    dbfile, readfiles = args[0], args[1:]
    # Helpers are called for their side effects only; results are unused.
    check_index(dbfile, grid=grid)
    for readfile in readfiles:
        check_aln(dbfile, readfile, grid=grid, cpus=opts.cpus)
Ejemplo n.º 4
0
def split(args):
    """
    %prog split casfile 1 10

    split the binary casfile by using CLCbio `sub_assembly` program, the two
    numbers are starting and ending index for the `reference`; useful to split
    one big assembly per contig
    """
    # The docstring above doubles as the usage text shown by print_help().
    p = OptionParser(split.__doc__)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 3:
        # print_help() returns None; `not None` gives exit status 1 so a
        # usage error is not reported as success.
        sys.exit(not p.print_help())

    casfile, start, end = args
    # Non-numeric indices raise ValueError here, as before.
    start = int(start)
    end = int(end)

    split_cmd = ("sub_assembly -a {casfile} -o sa.{i}.cas -s {i} "
                 "-e sa.{i}.pairs.fasta -f sa.{i}.fragments.fasta "
                 "-g sa.{i}.ref.fasta")

    # One sub_assembly run per index; `end` is inclusive.
    for i in range(start, end + 1):
        cmd = split_cmd.format(casfile=casfile, i=i)
        sh(cmd, grid=opts.grid)
Ejemplo n.º 5
0
def bwasw(args):
    """
    %prog bwasw database.fasta long_read.fastq

    Wrapper for `bwa bwasw`. Output will be long_read.sam.
    """
    # The docstring above is handed to OptionParser as the usage text, so it
    # is part of the command-line output.
    p = OptionParser(bwasw.__doc__)
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None; negating it makes the process exit with
        # status 1 on a usage error instead of reporting success (0).
        sys.exit(not p.print_help())

    extra = opts.extra
    grid = opts.grid

    dbfile, readfile = args
    # Both helpers are invoked only for their side effects; their return
    # values are not needed to build the bwasw command line.
    # NOTE(review): presumably they create the bwa index / alignment files
    # when missing -- confirm against check_index() and check_aln().
    check_index(dbfile, grid=grid)
    check_aln(dbfile, readfile, grid=grid)

    # Output name: the read file name with its last extension swapped for .sam
    samfile = readfile.rsplit(".", 1)[0] + ".sam"
    if op.exists(samfile):
        # Never overwrite an existing result; report and return None.
        logging.error("`{0}` exists. `bwa bwasw` already run.".format(samfile))
        return

    cmd = "bwa bwasw -t 32 {0} {1} ".format(dbfile, readfile)
    cmd += "{0}".format(extra)
    sh(cmd, grid=grid, outfile=samfile)
Ejemplo n.º 6
0
Archivo: cas.py Proyecto: bennyyu/jcvi
def txt(args):
    """
    %prog txt casfile

    convert binary CAS file to tabular output using CLC assembly_table
    """
    # The docstring above doubles as the usage text shown by print_help().
    p = OptionParser(txt.__doc__)
    p.add_option("-m", dest="multi", default=False, action="store_true",
        help="report multi-matches [default: %default]")
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        # print_help() returns None; `not None` gives exit status 1 so a
        # usage error is not reported as success.
        sys.exit(not p.print_help())

    grid = opts.grid

    casfile, = args
    assert op.exists(casfile)

    # Replace only a trailing ".cas". str.replace() would also rewrite ".cas"
    # occurring elsewhere in the path, and would leave the name unchanged when
    # the extension is absent, making the output file clobber the input.
    if casfile.endswith(".cas"):
        txtfile = casfile[:-len(".cas")] + ".txt"
    else:
        txtfile = casfile + ".txt"

    cmd = "assembly_table -n -s -p "
    if opts.multi:
        cmd += "-m "
    cmd += casfile
    sh(cmd, grid=grid, outfile=txtfile)

    # Return the name of the tabular file so callers can chain on it.
    return txtfile
Ejemplo n.º 7
0
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-454` to remove duplicate reads.
    """
    # The docstring above is the usage text given to OptionParser.
    parser = OptionParser(deduplicate.__doc__)
    parser.add_option(
        "--identity", type="float", default=.98,
        help="Sequence identity threshold [default: %default]")
    parser.add_option(
        "--cpus", type="int", default=0,
        help="Number of CPUs to use, 0=unlimited [default: %default]")
    set_grid(parser)

    opts, args = parser.parse_args(args)

    # Exactly one positional argument is required; otherwise show the help
    # text and exit with a non-zero status.
    if len(args) != 1:
        sys.exit(not parser.print_help())

    fastafile = args[0]

    from jcvi.apps.command import CDPATH

    # Assemble the cd-hit-454 command line in one expression; the clustered
    # output is written next to the input as <fastafile>.cdhit.
    command = (
        CDPATH("cd-hit-454")
        + " -c {0}".format(opts.identity)
        + " -M 0 -T {0} -i {1} -o {1}.cdhit".format(opts.cpus, fastafile)
    )
    sh(command, grid=opts.grid)
Ejemplo n.º 8
0
Archivo: cas.py Proyecto: bennyyu/jcvi
def split(args):
    """
    %prog split casfile 1 10

    split the binary casfile by using CLCbio `sub_assembly` program, the two
    numbers are starting and ending index for the `reference`; useful to split
    one big assembly per contig
    """
    # The docstring above doubles as the usage text shown by print_help().
    p = OptionParser(split.__doc__)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 3:
        # print_help() returns None; `not None` gives exit status 1 so a
        # usage error is not reported as success.
        sys.exit(not p.print_help())

    casfile, start, end = args
    # Non-numeric indices raise ValueError here, as before.
    start = int(start)
    end = int(end)

    split_cmd = ("sub_assembly -a {casfile} -o sa.{i}.cas -s {i} "
                 "-e sa.{i}.pairs.fasta -f sa.{i}.fragments.fasta "
                 "-g sa.{i}.ref.fasta")

    # One sub_assembly run per index; `end` is inclusive.
    for i in range(start, end + 1):
        cmd = split_cmd.format(casfile=casfile, i=i)
        sh(cmd, grid=opts.grid)
Ejemplo n.º 9
0
def trim(args):
    """
    %prog trim fastqfile

    Wraps `fastx_trimmer` to trim from begin or end of reads.
    """
    # The docstring above is the usage text given to OptionParser.
    parser = OptionParser(trim.__doc__)
    set_grid(parser)

    parser.add_option("-f", dest="first", type="int", default=0,
                      help="First base to keep. Default is 1.")
    parser.add_option("-l", dest="last", type="int", default=0,
                      help="Last base to keep. Default is entire read.")

    opts, args = parser.parse_args(args)
    if len(args) != 1:
        sys.exit(not parser.print_help())

    fastqfile = args[0]

    # Output goes to the current directory, named after the part of the input
    # basename that precedes its first dot.
    stem = op.basename(fastqfile).split(".")[0]
    trimmed = stem + ".ntrimmed.fastq"

    # A value of 0 means "not set": the corresponding flag is left out.
    flags = []
    if opts.first:
        flags.append("-f {0.first} ".format(opts))
    if opts.last:
        flags.append("-l {0.last} ".format(opts))

    command = "fastx_trimmer -Q33 " + "".join(flags)
    sh(command, grid=opts.grid, infile=fastqfile, outfile=trimmed)
Ejemplo n.º 10
0
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-454` to remove duplicate reads.
    """
    # The docstring above is the usage text given to OptionParser.
    parser = OptionParser(deduplicate.__doc__)
    parser.add_option(
        "--identity", type="float", default=.98,
        help="Sequence identity threshold [default: %default]")
    parser.add_option(
        "--cpus", type="int", default=0,
        help="Number of CPUs to use, 0=unlimited [default: %default]")
    set_grid(parser)

    opts, args = parser.parse_args(args)

    # Exactly one positional argument is required; otherwise show the help
    # text and exit with a non-zero status.
    if len(args) != 1:
        sys.exit(not parser.print_help())

    fastafile = args[0]

    from jcvi.apps.command import CDPATH

    # Assemble the cd-hit-454 command line in one expression; the clustered
    # output is written next to the input as <fastafile>.cdhit.
    command = (
        CDPATH("cd-hit-454")
        + " -c {0}".format(opts.identity)
        + " -M 0 -T {0} -i {1} -o {1}.cdhit".format(opts.cpus, fastafile)
    )
    sh(command, grid=opts.grid)
Ejemplo n.º 11
0
Archivo: clc.py Proyecto: bennyyu/jcvi
def map(args):
    """
    %prog map reference fastqfiles

    Use `clc_ref_assemble` to map the read files to a reference. Use a non-zero
    -s option to turn on paired end mode.
    """
    # The docstring above doubles as the usage text shown by print_help().
    p = OptionParser(map.__doc__)
    p.add_option("-o", dest="outfile", default=None,
            help="Output prefix.cas file [default: %default]")
    p.add_option("-s", dest="size", default=0, type="int",
            help="Use paired end mapping with insert [default: %default]")
    p.add_option("--short", default=False, action="store_true",
            help="Use `clc_ref_assemble_short` as the mapper [default: %default]")
    p.add_option("--orientations", default="fb",
            help="The reads have the orientations [default: %default]")
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)
    if len(args) < 2:
        sys.exit(not p.print_help())

    # Copy the license file from the home directory into the working directory
    # when it is not already there. (Local renamed so it no longer shadows the
    # `license` builtin.)
    license_file = "license.properties"
    if not op.exists(license_file):
        sh("cp ~/{0} .".format(license_file))

    ref = args[0]
    assert op.exists(ref)
    fastqfiles = args[1:]
    size = opts.size
    orientations = opts.orientations
    assert orientations in ("fb", "bf", "ff", "bb")

    cmd = "clc_ref_assemble_short" if opts.short else "clc_ref_assemble_long"

    # Default output name is <reads>.<reference>.cas, each prefix being the
    # basename up to its first dot.
    readprefix = op.basename(fastqfiles[0]).split(".", 1)[0]
    refprefix = op.basename(ref).split(".", 1)[0]
    outfile = opts.outfile or "{0}.{1}".format(readprefix, refprefix)
    if not outfile.endswith(".cas"):
        outfile += ".cas"

    cmd += " --cpus 16"
    cmd += " -d {0} -o {1} -q ".format(ref, outfile)
    fastqs = " ".join(fastqfiles)
    if size == 0:
        # Unpaired mode: just list the read files.
        cmd += fastqs
    else:
        assert len(fastqfiles) == 2
        # Floor division keeps the insert-size bounds integral regardless of
        # whether `/` means integer or true division in the running
        # interpreter; `size` is an int (option type="int").
        stddev = size // 4
        lb, ub = size - stddev, size + stddev
        cmd += " -p {0} ss {1} {2} -i {3} ".format(orientations, lb, ub, fastqs)

    if opts.extra:
        cmd += " " + opts.extra

    if not opts.short:
        cmd += " -l 0.8 -s 0.98"

    sh(cmd, grid=opts.grid)
Ejemplo n.º 12
0
def map(args):
    """
    %prog map reference fastqfiles

    Use `clc_ref_assemble` to map the read files to a reference. Use a non-zero
    -s option to turn on paired end mode.
    """
    # The docstring above doubles as the usage text shown by print_help().
    p = OptionParser(map.__doc__)
    p.add_option("-o", dest="outfile", default=None,
            help="Output prefix.cas file [default: %default]")
    p.add_option("-s", dest="size", default=0, type="int",
            help="Use paired end mapping with insert [default: %default]")
    p.add_option("--short", default=False, action="store_true",
            help="Use `clc_ref_assemble_short` as the mapper [default: %default]")
    p.add_option("--orientations", default="fb",
            help="The reads have the orientations [default: %default]")
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)
    if len(args) < 2:
        sys.exit(not p.print_help())

    # Write the license contents into the working directory on every run.
    write_file("license.properties", CLCLICENSE)

    ref = args[0]
    assert op.exists(ref)
    fastqfiles = args[1:]
    size = opts.size
    orientations = opts.orientations
    assert orientations in ("fb", "bf", "ff", "bb")

    cmd = "clc_ref_assemble_short" if opts.short else "clc_ref_assemble_long"

    # Default output name is <reads>.<reference>.cas, each prefix being the
    # basename up to its first dot.
    readprefix = op.basename(fastqfiles[0]).split(".", 1)[0]
    refprefix = op.basename(ref).split(".", 1)[0]
    outfile = opts.outfile or "{0}.{1}".format(readprefix, refprefix)
    if not outfile.endswith(".cas"):
        outfile += ".cas"

    cmd += " --cpus 16"
    cmd += " -d {0} -o {1} -q ".format(ref, outfile)
    fastqs = " ".join(fastqfiles)
    if size == 0:
        # Unpaired mode: just list the read files.
        cmd += fastqs
    else:
        assert len(fastqfiles) == 2
        # Floor division keeps the insert-size bounds integral regardless of
        # whether `/` means integer or true division in the running
        # interpreter; `size` is an int (option type="int").
        stddev = size // 4
        lb, ub = size - stddev, size + stddev
        cmd += " -p {0} ss {1} {2} -i {3} ".format(orientations, lb, ub, fastqs)

    if opts.extra:
        cmd += " " + opts.extra

    if not opts.short:
        cmd += " -l 0.8 -s 0.98"

    sh(cmd, grid=opts.grid)
Ejemplo n.º 13
0
Archivo: ca.py Proyecto: bennyyu/jcvi
def sff(args):
    """
    %prog sff sffiles

    Convert reads formatted as 454 SFF file, and convert to CA frg file.
    Turn --nodedup on if another deduplication mechanism is used (e.g.
    CD-HIT-454). See assembly.sff.deduplicate().
    """
    # The docstring above doubles as the usage text shown by print_help().
    p = OptionParser(sff.__doc__)
    p.add_option("--prefix", dest="prefix", default=None,
            help="Output frg filename prefix")
    p.add_option("--nodedup", default=False, action="store_true",
            help="Do not remove duplicates [default: %default]")
    set_grid(p)
    add_size_option(p)

    opts, args = p.parse_args(args)

    if len(args) < 1:
        # print_help() returns None; `not None` gives exit status 1 so a
        # usage error is not reported as success.
        sys.exit(not p.print_help())

    grid = opts.grid

    sffiles = args
    # Plate id: last "_"-separated field of the name before its first dot.
    plates = [x.split(".")[0].split("_")[-1] for x in sffiles]

    # A non-zero size switches on mate-pair mode.
    mated = (opts.size != 0)
    mean, sv = get_mean_sv(opts.size)

    if len(plates) > 1:
        # Several inputs: use the first plate id with its last character
        # replaced by 'X'.
        plate = plates[0][:-1] + 'X'
    else:
        plate = "_".join(plates)

    if mated:
        # Floor division keeps the Kb figure in the library name integral
        # whether `/` means integer or true division.
        # NOTE(review): assumes opts.size is an int -- confirm in
        # add_size_option().
        libname = "Titan{0}Kb-".format(opts.size // 1000) + plate
    else:
        libname = "TitanFrags-" + plate

    # An explicit --prefix overrides the derived library name.
    if opts.prefix:
        libname = opts.prefix

    cmd = CAPATH("sffToCA")
    cmd += " -libraryname {0} -output {0} ".format(libname)
    cmd += " -clear 454 -trim chop "
    if mated:
        cmd += " -linker titanium -insertsize {0} {1} ".format(mean, sv)
    if opts.nodedup:
        cmd += " -nodedup "

    cmd += " ".join(sffiles)

    sh(cmd, grid=grid)
Ejemplo n.º 14
0
def sff(args):
    """
    %prog sff sffiles

    Convert reads formatted as 454 SFF file, and convert to CA frg file.
    Turn --nodedup on if another deduplication mechanism is used (e.g.
    CD-HIT-454). See assembly.sff.deduplicate().
    """
    # The docstring above doubles as the usage text shown by print_help().
    p = OptionParser(sff.__doc__)
    p.add_option("--prefix", dest="prefix", default=None,
            help="Output frg filename prefix")
    p.add_option("--nodedup", default=False, action="store_true",
            help="Do not remove duplicates [default: %default]")
    set_grid(p)
    add_size_option(p)

    opts, args = p.parse_args(args)

    if len(args) < 1:
        # print_help() returns None; `not None` gives exit status 1 so a
        # usage error is not reported as success.
        sys.exit(not p.print_help())

    grid = opts.grid

    sffiles = args
    # Plate id: last "_"-separated field of the name before its first dot.
    plates = [x.split(".")[0].split("_")[-1] for x in sffiles]

    # A non-zero size switches on mate-pair mode.
    mated = (opts.size != 0)
    mean, sv = get_mean_sv(opts.size)

    if len(plates) > 1:
        # Several inputs: use the first plate id with its last character
        # replaced by 'X'.
        plate = plates[0][:-1] + 'X'
    else:
        plate = "_".join(plates)

    if mated:
        # Floor division keeps the Kb figure in the library name integral
        # whether `/` means integer or true division.
        # NOTE(review): assumes opts.size is an int -- confirm in
        # add_size_option().
        libname = "Titan{0}Kb-".format(opts.size // 1000) + plate
    else:
        libname = "TitanFrags-" + plate

    # An explicit --prefix overrides the derived library name.
    if opts.prefix:
        libname = opts.prefix

    cmd = CAPATH("sffToCA")
    cmd += " -libraryname {0} -output {0} ".format(libname)
    cmd += " -clear 454 -trim chop "
    if mated:
        cmd += " -linker titanium -insertsize {0} {1} ".format(mean, sv)
    if opts.nodedup:
        cmd += " -nodedup "

    cmd += " ".join(sffiles)

    sh(cmd, grid=grid)
Ejemplo n.º 15
0
Archivo: ca.py Proyecto: bennyyu/jcvi
def fasta(args):
    """
    %prog fasta fastafile

    Convert reads formatted as FASTA file, and convert to CA frg file. If .qual
    file is found, then use it, otherwise just make a fake qual file. Mates are
    assumed as adjacent sequence records (i.e. /1, /2, /1, /2 ...) unless a
    matefile is given.
    """
    # The docstring above doubles as the usage text shown by print_help().
    p = OptionParser(fasta.__doc__)
    p.add_option("-m", dest="matefile", default=None,
            help="matepairs file")
    set_grid(p)
    add_size_option(p)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        # print_help() returns None; `not None` gives exit status 1 so a
        # usage error is not reported as success.
        sys.exit(not p.print_help())

    grid = opts.grid

    fastafile, = args
    # Plate id: basename of the input up to its first dot.
    plate = op.basename(fastafile).split(".")[0]

    # A non-zero size switches on mate-pair mode.
    mated = (opts.size != 0)
    mean, sv = get_mean_sv(opts.size)

    if mated:
        # Floor division keeps the Kb figure in the library name integral
        # whether `/` means integer or true division.
        # NOTE(review): assumes opts.size is an int -- confirm in
        # add_size_option().
        libname = "Sanger{0}Kb-".format(opts.size // 1000) + plate
    else:
        libname = "SangerFrags-" + plate

    frgfile = libname + ".frg"

    qualfile = make_qual(fastafile)
    if mated:
        # Use the supplied mate file when given, otherwise derive one.
        if opts.matefile:
            matefile = opts.matefile
            assert op.exists(matefile)
        else:
            matefile = make_matepairs(fastafile)

    cmd = CAPATH("convert-fasta-to-v2.pl")
    cmd += " -l {0} -s {1} -q {2} ".format(libname, fastafile, qualfile)
    if mated:
        cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile)

    sh(cmd, grid=grid, outfile=frgfile)
Ejemplo n.º 16
0
def fasta(args):
    """
    %prog fasta fastafile

    Convert reads formatted as FASTA file, and convert to CA frg file. If .qual
    file is found, then use it, otherwise just make a fake qual file. Mates are
    assumed as adjacent sequence records (i.e. /1, /2, /1, /2 ...) unless a
    matefile is given.
    """
    # The docstring above doubles as the usage text shown by print_help().
    p = OptionParser(fasta.__doc__)
    p.add_option("-m", dest="matefile", default=None,
            help="matepairs file")
    set_grid(p)
    add_size_option(p)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        # print_help() returns None; `not None` gives exit status 1 so a
        # usage error is not reported as success.
        sys.exit(not p.print_help())

    grid = opts.grid

    fastafile, = args
    # Plate id: basename of the input up to its first dot.
    plate = op.basename(fastafile).split(".")[0]

    # A non-zero size switches on mate-pair mode.
    mated = (opts.size != 0)
    mean, sv = get_mean_sv(opts.size)

    if mated:
        # Floor division keeps the Kb figure in the library name integral
        # whether `/` means integer or true division.
        # NOTE(review): assumes opts.size is an int -- confirm in
        # add_size_option().
        libname = "Sanger{0}Kb-".format(opts.size // 1000) + plate
    else:
        libname = "SangerFrags-" + plate

    frgfile = libname + ".frg"

    qualfile = make_qual(fastafile)
    if mated:
        # Use the supplied mate file when given, otherwise derive one.
        if opts.matefile:
            matefile = opts.matefile
            assert op.exists(matefile)
        else:
            matefile = make_matepairs(fastafile)

    cmd = CAPATH("convert-fasta-to-v2.pl")
    cmd += " -l {0} -s {1} -q {2} ".format(libname, fastafile, qualfile)
    if mated:
        cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile)

    sh(cmd, grid=grid, outfile=frgfile)
Ejemplo n.º 17
0
def trim(args):
    """
    %prog trim fastqfiles

    Use `quality_trim` to trim fastq files. If there are two fastqfiles
    inputted, it is assumed as pairs of fastqs.
    """
    # The docstring above doubles as the usage text shown by print_help().
    p = OptionParser(trim.__doc__)

    # There are many more options from `quality_trim`, but most useful twos are
    # quality cutoff (-c) and length cutoff (-m)
    p.add_option("-c", "--cutoff", dest="cutoff", type="int", default=20,
            help="Set the minimum quality for a good nucleotide. " +\
                 "[default: %default]")
    p.add_option("-m", "--minlength", dest="minlength", type="int", default=30,
            help="Set the minimum length of output reads. " +\
                 "[default: %default]")
    p.add_option("--offset", dest="offset", type="int", default=64,
            help="Set the ascii offset value in fastq [default: %default]")
    p.add_option("--fasta", dest="fasta", default=False, action="store_true",
            help="Output fasta sequence? [default: fastq]")
    set_grid(p)

    opts, args = p.parse_args(args)

    largs = len(args)
    if largs not in (1, 2):
        # print_help() returns None; `not None` gives exit status 1 so a
        # usage error is not reported as success.
        sys.exit(not p.print_help())

    paired = (largs == 2)
    fastqfile1 = args[0]
    assert op.exists(fastqfile1)

    suffix = "fasta" if opts.fasta else "fastq"

    if paired:
        fastqfile2 = args[1]
        assert op.exists(fastqfile2)

    # Cut at the first dot of the *basename* only. Splitting the whole path
    # breaks on dots in the directory part ("./reads.fastq" would produce an
    # empty prefix). Plain file names and dot-free directories give the same
    # prefix as before.
    prefix = op.join(op.dirname(fastqfile1),
                     op.basename(fastqfile1).split('.')[0])
    cmd = "quality_trim -c {0.cutoff} -m {0.minlength} -f {0.offset} ".format(opts)
    if paired:
        cmd += "-r -i {0} {1} ".format(fastqfile1, fastqfile2)
        cmd += "-p {0}.pairs.{1} ".format(prefix, suffix)
    else:
        cmd += "-r {0} ".format(fastqfile1)

    cmd += "-o {0}.fragments.{1}".format(prefix, suffix)
    sh(cmd, grid=opts.grid)
Ejemplo n.º 18
0
def split(args):
    """
    %prog split pairs.fastq

    Split shuffled pairs into `.1.fastq` and `.2.fastq`, using `sed`. Can work
    on gzipped file.

    <http://seqanswers.com/forums/showthread.php?t=13776>
    """
    from jcvi.apps.grid import Jobs

    # The docstring above is the usage text given to OptionParser.
    parser = OptionParser(split.__doc__)
    set_grid(parser)

    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    pairsfastq = args[0]
    gzipped = pairsfastq.endswith(".gz")

    # Output stem: input name without ".gz" and without its last extension.
    stem = pairsfastq.replace(".gz", "").rsplit(".", 1)[0]

    # Gzipped input is read with zcat and the outputs are re-compressed.
    reader = "zcat" if gzipped else "cat"
    recompress = " | gzip" if gzipped else ""
    gzext = ".gz" if gzipped else ""

    out1 = stem + ".1.fastq" + gzext
    out2 = stem + ".2.fastq" + gzext

    # sed prints four lines starting at line 1 (first mate) or line 5 (second
    # mate) of every eight-line group.
    sed_part = " {0} | sed -ne '{1}~8{{N;N;N;p}}'"
    cmd1 = reader + sed_part.format(pairsfastq, 1) + recompress + " > " + out1
    cmd2 = reader + sed_part.format(pairsfastq, 5) + recompress + " > " + out2

    if opts.grid:
        # Submitted to the grid; no size check is performed in this mode.
        sh(cmd1, grid=True)
        sh(cmd2, grid=True)
        return

    # Local mode: run both commands through Jobs, then check the results.
    jobs = Jobs(target=sh, args=[(cmd1, ), (cmd2, )])
    jobs.run()

    checkShuffleSizes(out1, out2, pairsfastq)
Ejemplo n.º 19
0
def sampe(args):
    """
    %prog sampe database.fasta read1.fq read2.fq

    Wrapper for `bwa sampe`. Output will be read1.sam.
    """
    # The docstring above doubles as the usage text shown by print_help().
    p = OptionParser(sampe.__doc__)
    p.add_option("--bam",
                 default=False,
                 action="store_true",
                 help="write to bam file [default: %default]")
    # type="int" keeps a user-supplied value the same type as the default;
    # without it the default is an int but a command-line value is a str.
    p.add_option("--cpus",
                 default=32,
                 type="int",
                 help="Number of cpus to use [default: %default]")
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 3:
        # print_help() returns None; `not None` gives exit status 1 so a
        # usage error is not reported as success.
        sys.exit(not p.print_help())

    extra = opts.extra
    grid = opts.grid

    dbfile, read1file, read2file = args
    # check_index() is called for its side effect only; the values returned
    # by check_aln() are passed to `bwa sampe` below.
    check_index(dbfile, grid=grid)
    sai1file = check_aln(dbfile, read1file, grid=grid, cpus=opts.cpus)
    sai2file = check_aln(dbfile, read2file, grid=grid, cpus=opts.cpus)

    # Output is named after the first read file, .bam when --bam is given.
    prefix = read1file.rsplit(".", 1)[0]
    samfile = (prefix + ".bam") if opts.bam else (prefix + ".sam")
    if op.exists(samfile):
        # Message corrected: this wrapper runs `bwa sampe`, not `bwa samse`.
        logging.error("`{0}` exists. `bwa sampe` already run.".format(samfile))
        return

    cmd = "bwa sampe {0} {1} {2} {3} {4} ".format(dbfile, sai1file, sai2file,
                                                  read1file, read2file)
    cmd += "{0}".format(extra)
    if opts.bam:
        # Pipe through samtools to emit BAM, filtering with -F 4.
        cmd += " | samtools view -bS -F 4 - "
    sh(cmd, grid=grid, outfile=samfile)
Ejemplo n.º 20
0
def index(args):
    """
    %prog index database.fasta

    Wrapper for `bwa index`. Same interface, only adds grid submission.
    """
    # The docstring above doubles as the usage text shown by print_help().
    p = OptionParser(index.__doc__)
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        # print_help() returns None; `not None` gives exit status 1 so a
        # usage error is not reported as success.
        sys.exit(not p.print_help())

    # NOTE(review): opts.extra is accepted by the parser but is not forwarded
    # to check_index() -- confirm whether that is intended.
    dbfile, = args
    # All the work is delegated to check_index(); its result is not used.
    check_index(dbfile, grid=opts.grid)
Ejemplo n.º 21
0
def index(args):
    """
    %prog index database.fasta

    Wrapper for `bwa index`. Same interface, only adds grid submission.
    """
    # The docstring above doubles as the usage text shown by print_help().
    p = OptionParser(index.__doc__)
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        # print_help() returns None; `not None` gives exit status 1 so a
        # usage error is not reported as success.
        sys.exit(not p.print_help())

    # NOTE(review): opts.extra is accepted by the parser but is not forwarded
    # to check_index() -- confirm whether that is intended.
    dbfile, = args
    # All the work is delegated to check_index(); its result is not used.
    check_index(dbfile, grid=opts.grid)
Ejemplo n.º 22
0
def convert(args):
    """
    %prog convert in.fastq out.fastq

    illumina fastq quality encoding uses offset 64, and sanger uses 33. This
    script creates a new file with the correct encoding
    """
    supported_qvs = ("illumina", "sanger")
    qv_names = "|".join(supported_qvs)

    # The docstring above is the usage text given to OptionParser.
    parser = OptionParser(convert.__doc__)
    parser.add_option(
        "-Q", dest="infastq", default="illumina", choices=supported_qvs,
        help="input qv, one of {0} [default: %default]".format(qv_names))
    parser.add_option(
        "-q", dest="outfastq", default="sanger", choices=supported_qvs,
        help="output qv, one of {0} [default: %default]".format(qv_names))
    set_grid(parser)

    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    infastq, outfastq = args

    from jcvi.apps.command import EMBOSSPATH

    seqret = EMBOSSPATH("seqret")
    if infastq.endswith(".gz"):
        # Gzipped input: stream through zcat, convert on stdin/stdout, and
        # gzip the result into the output file.
        conversion = " fastq-{0}::stdin fastq-{1}::stdout".format(
            opts.infastq, opts.outfastq)
        cmd = ("zcat {0} | ".format(infastq) + seqret + conversion
               + " | gzip > {0}".format(outfastq))
    else:
        # Plain input: seqret reads and writes the files directly.
        conversion = " fastq-{0}::{1} fastq-{2}::{3}".format(
            opts.infastq, infastq, opts.outfastq, outfastq)
        cmd = seqret + conversion

    sh(cmd, grid=opts.grid)

    # Hand back the output file name for callers that chain on it.
    return outfastq
Ejemplo n.º 23
0
def sampe(args):
    """
    %prog sampe database.fasta read1.fq read2.fq

    Wrapper for `bwa sampe`. Output will be read1.sam.
    """
    # The docstring above doubles as the usage text shown by print_help().
    p = OptionParser(sampe.__doc__)
    p.add_option("--bam", default=False, action="store_true",
                 help="write to bam file [default: %default]")
    # type="int" keeps a user-supplied value the same type as the default;
    # without it the default is an int but a command-line value is a str.
    p.add_option("--cpus", default=32, type="int",
                 help="Number of cpus to use [default: %default]")
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 3:
        # print_help() returns None; `not None` gives exit status 1 so a
        # usage error is not reported as success.
        sys.exit(not p.print_help())

    extra = opts.extra
    grid = opts.grid

    dbfile, read1file, read2file = args
    # check_index() is called for its side effect only; the values returned
    # by check_aln() are passed to `bwa sampe` below.
    check_index(dbfile, grid=grid)
    sai1file = check_aln(dbfile, read1file, grid=grid, cpus=opts.cpus)
    sai2file = check_aln(dbfile, read2file, grid=grid, cpus=opts.cpus)

    # Output is named after the first read file, .bam when --bam is given.
    prefix = read1file.rsplit(".", 1)[0]
    samfile = (prefix + ".bam") if opts.bam else (prefix + ".sam")
    if op.exists(samfile):
        # Message corrected: this wrapper runs `bwa sampe`, not `bwa samse`.
        logging.error("`{0}` exists. `bwa sampe` already run.".format(samfile))
        return

    cmd = "bwa sampe {0} {1} {2} {3} {4} ".format(dbfile, sai1file, sai2file,
            read1file, read2file)
    cmd += "{0}".format(extra)
    if opts.bam:
        # Pipe through samtools to emit BAM, filtering with -F 4.
        cmd += " | samtools view -bS -F 4 - "
    sh(cmd, grid=grid, outfile=samfile)
Ejemplo n.º 24
0
def aln(args):
    """
    %prog aln database.fasta *.fastq

    Wrapper for `bwa aln` except this will run over a set of files.
    """
    # The docstring above doubles as the usage text shown by print_help().
    p = OptionParser(aln.__doc__)
    # type="int" keeps a user-supplied value the same type as the default;
    # without it the default is an int but a command-line value is a str.
    p.add_option("--cpus", default=32, type="int",
                 help="Number of cpus to use [default: %default]")
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) < 2:
        # print_help() returns None; `not None` gives exit status 1 so a
        # usage error is not reported as success.
        sys.exit(not p.print_help())

    grid = opts.grid
    # NOTE(review): opts.extra is accepted by the parser but is not forwarded
    # to check_aln() here -- confirm whether that is intended.

    dbfile, readfiles = args[0], args[1:]
    # Helpers are called for their side effects only; results are unused.
    check_index(dbfile, grid=grid)
    for readfile in readfiles:
        check_aln(dbfile, readfile, grid=grid, cpus=opts.cpus)
Ejemplo n.º 25
0
Archivo: cas.py Proyecto: bennyyu/jcvi
def info(args):
    """
    %prog info casfile <fastafile>

    Wraps around `assembly_info` and get the following block.

    General info:
    Read info:
    Coverage info:

    In particular, the read info will be reorganized so that it shows the
    percentage of unmapped, mapped, unique and multi-hit reads.

    When --coverage is used, the program expects a second fastafile to replace
    the contig IDs with real ones.

    RPKM = 10^9 x C / NL, which is really just simply C/N

    C = the number of mappable reads that felt onto the gene's exons
    N = total number of mappable reads in the experiment
    L = the sum of the exons in base pairs.
    """
    from jcvi.utils.cbook import percentage

    # The docstring above doubles as the usage text shown by print_help().
    p = OptionParser(info.__doc__)
    p.add_option("--coverage", default=False, action="store_true",
            help="Generate coverage output, replacing IDs [default: %default]")
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    casfile = args[0]
    pf = casfile.rsplit(".", 1)[0]

    # Coverage output handle; stays None unless --coverage was requested.
    fw = None
    if opts.coverage:
        assert len(args) == 2, "You need a fastafile when using --coverage"
        coveragefile = pf + ".coverage"
        fw = open(coveragefile, "w")

    # try/finally + `with` make sure both file handles are closed (and the
    # coverage file flushed) even when parsing fails part-way through.
    try:
        # Reuse an existing .info file; only run assembly_info when missing.
        infofile = pf + ".info"
        cmd = "assembly_info {0}".format(casfile)
        if not op.exists(infofile):
            sh(cmd, outfile=infofile, grid=opts.grid)

        inreadblock = False
        incontigblock = False

        with open(infofile) as fp:
            row = fp.readline()
            while row:
                if row.startswith("Read info:"):
                    inreadblock = True
                elif row.startswith("Contig info:"):
                    incontigblock = True

                # Following looks like a hack, but to keep compatible between
                # CLC 3.20 and CLC 4.0 beta
                if inreadblock:
                    atoms = row.split('s')

                    # First token after the last 's' in the row, or "0" when
                    # the row contains no 's' at all.
                    last = atoms[-1].split()[0] if len(atoms) > 1 else "0"
                    srow = row.strip()

                    if srow.startswith("Reads"):
                        reads = int(last)
                    if srow.startswith("Unmapped") or \
                            srow.startswith("Unassembled"):
                        unmapped = int(last)
                    if srow.startswith("Mapped") or \
                            srow.startswith("Assembled"):
                        mapped = int(last)
                    if srow.startswith("Multi"):
                        multihits = int(last)

                    if row.startswith("Coverage info:"):
                        # Print the Read info: block. Single-argument
                        # print(...) calls produce the same output under both
                        # the Python 2 print statement and the Python 3
                        # function.
                        print("Read info:")
                        assert mapped + unmapped == reads

                        unique = mapped - multihits
                        print("")
                        print("Total reads: {0}".format(reads))
                        print("Unmapped reads: {0}".format(
                            percentage(unmapped, reads, False)))
                        print("Mapped reads: {0}".format(
                            percentage(mapped, reads, False)))
                        print("Unique reads: {0}".format(
                            percentage(unique, reads, False)))
                        print("Multi hit reads: {0}".format(
                            percentage(multihits, reads, False)))
                        print("")
                        inreadblock = False

                if incontigblock and opts.coverage:

                    fastafile = args[1]
                    s = Sizes(fastafile)
                    # Consume the remainder of the file as contig rows.
                    while row:
                        atoms = row.split()
                        # Skip the header row, whose first field starts
                        # with "C" ("Contig").
                        if len(atoms) == 4 and atoms[0][0] != "C":  # Contig
                            # Contig       Sites       Reads     Coverage
                            contig, sites, reads, coverage = atoms
                            # Contig numbers in the info file are 1-based.
                            contig = int(contig) - 1
                            size = s.sizes[contig]
                            contig = s.ctgs[contig]
                            assert size == int(sites)

                            # See formula above
                            rpkm = 1e9 * int(reads) / (size * mapped)
                            fw.write("\t".join((contig, sites, reads,
                                "{0:.1f}".format(rpkm))) + "\n")

                        row = fp.readline()

                row = fp.readline()
    finally:
        if fw is not None:
            fw.close()
Ejemplo n.º 26
0
def trim(args):
    """
    %prog trim fastqfiles

    Trim reads using TRIMMOMATIC. If two fastqfiles are given, then it invokes
    the paired reads mode. See manual:

    <http://www.usadellab.org/cms/index.php?page=trimmomatic>
    """
    version = "0.20"
    jarname = "trimmomatic-{0}.jar".format(version)
    phdchoices = ("33", "64")

    # The docstring above is the usage text given to OptionParser.
    parser = OptionParser(trim.__doc__)
    parser.add_option("--path", default=op.join("~/bin", jarname),
                      help="Path to trimmomatic [default: %default]")
    parser.add_option("--phred", default=None, choices=phdchoices,
                      help="Phred score offset {0} [default: guess]".format(phdchoices))
    parser.add_option("--nofrags", default=False, action="store_true",
                      help="Discard frags file in PE mode [default: %default]")
    parser.add_option("--minqv", default=10, type="int",
                      help="Average qv after trimming [default: %default]")
    parser.add_option("--minlen", default=30, type="int",
                      help="Minimum length after trimming [default: %default]")
    parser.add_option("--nogz", default=False, action="store_true",
                      help="Do not write to gzipped files [default: %default]")
    set_grid(parser)

    opts, args = parser.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not parser.print_help())

    # Locate the jar; when it is not at the given path, download the release
    # zip, unpack it if needed, delete the zip and use the jar inside.
    path = op.expanduser(opts.path)
    if not op.exists(path):
        url = ("http://www.usadellab.org/cms/uploads/supplementary/"
               "Trimmomatic/Trimmomatic-{0}.zip").format(version)
        path = download(url)
        unzipped_dir = "Trimmomatic-" + version
        if not op.exists(unzipped_dir):
            sh("unzip " + path)
        os.remove(path)
        path = op.join(unzipped_dir, jarname)

    assert op.exists(path)

    # Write the adapter sequences to the working directory when absent.
    adaptersfile = "adapters.fasta"
    if not op.exists(adaptersfile):
        write_file(adaptersfile, Adapters)

    assert op.exists(adaptersfile), \
        "Please place the illumina adapter sequence in `{0}`".format(adaptersfile)

    # Quality offset: taken from --phred, otherwise guessed from the first
    # input file.
    offset = guessoffset([args[0]]) if opts.phred is None else int(opts.phred)
    phredflag = " -phred{0}".format(offset)

    cmd = JAVAPATH("java-1.6.0")
    cmd += " -Xmx4g -cp {0} org.usadellab.trimmomatic".format(path)

    # Outputs are gzipped unless --nogz is given.
    gzext = "" if opts.nogz else ".gz"
    frags = ".frags.fastq" + gzext
    pairs = ".pairs.fastq" + gzext

    def get_prefix(name):
        # Basename without ".gz" and without its last extension.
        return op.basename(name).replace(".gz", "").rsplit(".", 1)[0]

    if len(args) == 1:
        # Single-end mode: one input, one output.
        mode = ".TrimmomaticSE"
        fastqfile, = args
        files = (fastqfile, get_prefix(fastqfile) + frags)
    else:
        # Paired-end mode: two inputs, pairs + frags output for each.
        mode = ".TrimmomaticPE"
        fastqfile1, fastqfile2 = args
        prefix1 = get_prefix(fastqfile1)
        prefix2 = get_prefix(fastqfile2)
        if opts.nofrags:
            frags1 = frags2 = "/dev/null"
        else:
            frags1 = prefix1 + frags
            frags2 = prefix2 + frags
        files = (fastqfile1, fastqfile2,
                 prefix1 + pairs, frags1, prefix2 + pairs, frags2)

    cmd += mode
    cmd += phredflag
    cmd += " {0}".format(" ".join(files))

    # Trimming steps, in the order they appear on the command line.
    cmd += " ILLUMINACLIP:{0}:2:40:12".format(adaptersfile)
    cmd += " LEADING:3 TRAILING:3"
    cmd += " SLIDINGWINDOW:4:{0} MINLEN:{1}".format(opts.minqv, opts.minlen)
    if offset != 33:
        cmd += " TOPHRED33"
    sh(cmd, grid=opts.grid)
Ejemplo n.º 27
0
def main():
    """
    %prog database.fa query.fa [options]

    Run LASTZ similar to the BLAST interface, and generates -m8 tabular format
    """
    # The docstring above is handed to OptionParser as the usage text, so it is
    # kept verbatim; implementation notes are written as comments instead.
    #
    # Overview:
    #   * any format other than "BLASTN-": convert the database with
    #     faToTwoBit and build one lastz_2bit task per key returned by
    #     Fasta(...).iterkeys_ordered(), each writing to its own file under
    #     `outdir`;
    #   * "BLASTN-": start `cpus` lastz workers that share `out_fh` and a Lock.
    p = OptionParser(main.__doc__)

    supported_formats = tuple(x.strip() for x in \
        "lav, lav+text, axt, axt+, maf, maf+, maf-, sam, softsam, "\
        "sam-, softsam-, cigar, BLASTN, BLASTN-, differences, rdotplot, text".split(','))

    p.add_option("-a", "-A", dest="cpus", default=1, type="int",
            help="parallelize job to multiple cpus [default: %default]")
    p.add_option("--format", default="BLASTN-", choices=supported_formats,
            help="output format, one of {0} [default: %default]".\
                 format("|".join(supported_formats)))
    p.add_option("--path", dest="lastz_path", default=None,
            help="specify LASTZ path")
    p.add_option("--mask", dest="mask", default=False, action="store_true",
            help="treat lower-case letters as mask info [default: %default]")

    set_params(p)
    set_outfile(p)
    set_grid(p)

    opts, args = p.parse_args()

    if len(args) != 2:
        # print_help() returns None, so `not ...` is True and the process
        # exits with status 1 instead of reporting success on a usage error.
        sys.exit(not p.print_help())

    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    grid = opts.grid
    if grid:
        sys.stderr.write("Running jobs on JCVI grid\n")

    extra = opts.extra
    lastz_bin = opts.lastz_path or "lastz"
    assert lastz_bin.endswith("lastz"), "You need to include lastz in your path"

    mask = opts.mask
    cpus = opts.cpus
    logging.debug("Dispatch job to %d cpus" % cpus)
    # Named `fmt` / `seqid` below to avoid shadowing the builtins
    # `format` and `id`.
    fmt = opts.format
    blastline = (fmt == "BLASTN-")

    # The axt, maf, etc. format can only be run on splitted database (i.e. one
    # FASTA record per file). The splitted files are then parallelized for the
    # computation, as opposed to splitting queries through "subsample".
    outdir = "outdir"
    if not blastline:
        from jcvi.formats.fasta import Fasta
        from jcvi.formats.chain import faToTwoBit

        mkdir(outdir)

        bfasta_2bit = faToTwoBit(bfasta_fn)
        bids = list(Fasta(bfasta_fn, lazy=True).iterkeys_ordered())

        apf = op.basename(afasta_fn).split(".")[0]
        # Task tuple layout:
        # bfasta, afasta_fn, outfile, lastz_bin, extra, mask, format, grid
        tasks = []
        for seqid in bids:
            bfasta = "/".join((bfasta_2bit, seqid))
            outfile = op.join(outdir, "{0}.{1}.{2}".format(apf, seqid, fmt))
            tasks.append((bfasta, afasta_fn, outfile,
                          lastz_bin, extra, mask, fmt, grid))

        if grid:
            cmds = [lastz_2bit(x) for x in tasks]
            g = Grid(cmds)
            g.run()
            g.writestatus()

        # NOTE(review): the pool is started even when `grid` is set, exactly
        # as before -- confirm whether that fall-through is intended.
        pool = Pool(cpus)
        pool.map(lastz_2bit, tasks)

        return

    lock = Lock()

    if grid:
        cmds = [lastz(k + 1, cpus, bfasta_fn, afasta_fn, out_fh,
                      lock, lastz_bin, extra, mask, grid)
                for k in range(cpus)]
        mkdir(outdir)
        outfiles = [op.join(outdir, "out.{0}.lastz").format(i)
                    for i in range(len(cmds))]
        g = Grid(cmds, outfiles=outfiles)
        g.run()
        g.writestatus()

    else:
        # One argument tuple per worker; the first element is the 1-based
        # worker index and the second the total number of workers.
        jobargs = [(k + 1, cpus, bfasta_fn, afasta_fn, out_fh,
                    lock, lastz_bin, extra, mask) for k in range(cpus)]
        g = Jobs(target=lastz, args=jobargs)
        g.run()
Ejemplo n.º 28
0
def trim(args):
    """
    %prog trim fastqfiles

    Trim reads using TRIMMOMATIC. If two fastqfiles are given, then it invokes
    the paired reads mode. See manual:

    <http://www.usadellab.org/cms/index.php?page=trimmomatic>
    """
    # The docstring is used as the optparse usage text and is left untouched.
    tv = "0.20"
    TrimJar = "trimmomatic-{0}.jar".format(tv)
    phdchoices = ("33", "64")

    p = OptionParser(trim.__doc__)
    p.add_option("--path", default=op.join("~/bin", TrimJar),
                 help="Path to trimmomatic [default: %default]")
    p.add_option("--phred", default=None, choices=phdchoices,
                 help="Phred score offset {0} [default: guess]".format(phdchoices))
    p.add_option("--nofrags", default=False, action="store_true",
                 help="Discard frags file in PE mode [default: %default]")
    p.add_option("--minqv", default=10, type="int",
                 help="Average qv after trimming [default: %default]")
    p.add_option("--minlen", default=30, type="int",
                 help="Minimum length after trimming [default: %default]")
    p.add_option("--nogz", default=False, action="store_true",
                 help="Do not write to gzipped files [default: %default]")
    set_grid(p)

    opts, args = p.parse_args(args)

    nfiles = len(args)
    if nfiles != 1 and nfiles != 2:
        sys.exit(not p.print_help())

    # Locate the Trimmomatic jar; when it is missing at --path, download the
    # release zip into the working directory, unpack it and use that copy.
    jar = op.expanduser(opts.path)
    if not op.exists(jar):
        url = ("http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-{0}.zip"
               .format(tv))
        archive = download(url)
        unzipped = "Trimmomatic-" + tv
        if not op.exists(unzipped):
            sh("unzip " + archive)
        os.remove(archive)
        jar = op.join(unzipped, TrimJar)

    assert op.exists(jar)

    # Write the bundled adapter sequences unless a file is already present.
    adaptersfile = "adapters.fasta"
    if not op.exists(adaptersfile):
        write_file(adaptersfile, Adapters)

    assert op.exists(adaptersfile), \
        "Please place the illumina adapter sequence in `{0}`".format(adaptersfile)

    # Quality offset: taken from --phred, otherwise guessed from the first file.
    offset = guessoffset([args[0]]) if opts.phred is None else int(opts.phred)
    phredflag = " -phred{0}".format(offset)

    # Output suffixes; gzip extension is appended unless --nogz was given.
    gz = "" if opts.nogz else ".gz"
    frags = ".frags.fastq" + gz
    pairs = ".pairs.fastq" + gz

    def get_prefix(fastq):
        # Basename with every ".gz" occurrence removed and the last
        # extension dropped.
        return op.basename(fastq).replace(".gz", "").rsplit(".", 1)[0]

    if nfiles == 1:
        # Single-end mode: one input, one output.
        fastqfile, = args
        mode = ".TrimmomaticSE"
        files = (fastqfile, get_prefix(fastqfile) + frags)
    else:
        # Paired-end mode: two inputs, paired and unpaired output per input.
        fastqfile1, fastqfile2 = args
        prefix1 = get_prefix(fastqfile1)
        prefix2 = get_prefix(fastqfile2)
        if opts.nofrags:
            frags1 = frags2 = "/dev/null"
        else:
            frags1 = prefix1 + frags
            frags2 = prefix2 + frags
        mode = ".TrimmomaticPE"
        files = (fastqfile1, fastqfile2,
                 prefix1 + pairs, frags1, prefix2 + pairs, frags2)

    cmd = JAVAPATH("java-1.6.0")
    cmd += " -Xmx4g -cp {0} org.usadellab.trimmomatic".format(jar)
    cmd += mode + phredflag
    cmd += " " + " ".join(files)

    # Trimming steps, in the order they are passed to Trimmomatic.
    steps = [
        " ILLUMINACLIP:{0}:2:40:12".format(adaptersfile),
        " LEADING:3 TRAILING:3",
        " SLIDINGWINDOW:4:{0} MINLEN:{1}".format(opts.minqv, opts.minlen),
    ]
    if offset != 33:
        steps.append(" TOPHRED33")
    cmd += "".join(steps)

    sh(cmd, grid=opts.grid)
Ejemplo n.º 29
0
def info(args):
    """
    %prog info casfile <fastafile>

    Wraps around `assembly_info` and get the following block.

    General info:
    Read info:
    Coverage info:

    In particular, the read info will be reorganized so that it shows the
    percentage of unmapped, mapped, unique and multi-hit reads.

    When --coverage is used, the program expects a second fastafile to replace
    the contig IDs with real ones.

    RPKM = 10^9 x C / NL, which is really just simply C/N

    C = the number of mappable reads that felt onto the gene's exons
    N = total number of mappable reads in the experiment
    L = the sum of the exons in base pairs.
    """
    # The docstring above is the optparse usage text and is kept verbatim.
    #
    # Steps: run `assembly_info` into <prefix>.info unless that file already
    # exists, print a summary of its "Read info:" block to stdout, and with
    # --coverage write one tab-separated line per contig to <prefix>.coverage.
    # Both file handles are now closed deterministically.
    from jcvi.utils.cbook import percentage

    p = OptionParser(info.__doc__)
    p.add_option(
        "--coverage",
        default=False,
        action="store_true",
        help="Generate coverage output, replacing IDs [default: %default]")
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    casfile = args[0]
    pf = casfile.rsplit(".", 1)[0]

    fw = None
    if opts.coverage:
        assert len(args) == 2, "You need a fastafile when using --coverage"
        coveragefile = pf + ".coverage"
        fw = open(coveragefile, "w")

    try:
        infofile = pf + ".info"
        cmd = "assembly_info {0}".format(casfile)
        if not op.exists(infofile):
            sh(cmd, outfile=infofile, grid=opts.grid)

        inreadblock = False
        incontigblock = False

        # readline() is used instead of iterating the handle because the
        # nested contig loop below keeps reading from the same file.
        with open(infofile) as fp:
            row = fp.readline()
            while row:
                if row.startswith("Read info:"):
                    inreadblock = True
                elif row.startswith("Contig info:"):
                    incontigblock = True

                # Following looks like a hack, but to keep compatible between
                # CLC 3.20 and CLC 4.0 beta
                if inreadblock:
                    # The count is taken as the first token after the last
                    # letter 's' on the line; "0" when the line has no 's'.
                    atoms = row.split('s')

                    last = atoms[-1].split()[0] if len(atoms) > 1 else "0"
                    srow = row.strip()

                    if srow.startswith("Reads"):
                        reads = int(last)
                    if srow.startswith("Unmapped") or \
                            srow.startswith("Unassembled"):
                        unmapped = int(last)
                    if srow.startswith("Mapped") or \
                            srow.startswith("Assembled"):
                        mapped = int(last)
                    if srow.startswith("Multi"):
                        multihits = int(last)

                    if row.startswith("Coverage info:"):
                        # The read block is over: print its summary.
                        print("Read info:")
                        assert mapped + unmapped == reads

                        unique = mapped - multihits
                        print("")
                        print("Total reads: {0}".format(reads))
                        print("Unmapped reads: {0}".format(
                            percentage(unmapped, reads, False)))
                        print("Mapped reads: {0}".format(
                            percentage(mapped, reads, False)))
                        print("Unique reads: {0}".format(
                            percentage(unique, reads, False)))
                        print("Multi hit reads: {0}".format(
                            percentage(multihits, reads, False)))
                        print("")
                        inreadblock = False

                if incontigblock and opts.coverage:

                    fastafile = args[1]
                    s = Sizes(fastafile)
                    # Consume the remainder of the file here; the outer loop
                    # ends once this inner loop hits EOF.
                    while row:
                        atoms = row.split()
                        # Four columns whose first does not start with "C"
                        # (this skips the header row below).
                        if len(atoms) == 4 and atoms[0][0] != "C":  # Contig
                            # Contig       Sites       Reads     Coverage
                            contig, sites, nreads, coverage = atoms
                            # Contig numbers in the file are 1-based.
                            contig = int(contig) - 1
                            size = s.sizes[contig]
                            contig = s.ctgs[contig]
                            assert size == int(sites)

                            # See formula above
                            rpkm = 1e9 * int(nreads) / (size * mapped)
                            fw.write("\t".join(
                                (contig, sites, nreads,
                                 "{0:.1f}".format(rpkm))) + "\n")

                        row = fp.readline()

                row = fp.readline()
    finally:
        if fw is not None:
            fw.close()
Ejemplo n.º 30
0
def main():
    """
    %prog database.fa query.fa [options]

    Run LASTZ similar to the BLAST interface, and generates -m8 tabular format
    """
    # The docstring above is handed to OptionParser as the usage text, so it is
    # kept verbatim; implementation notes are written as comments instead.
    #
    # Overview:
    #   * any format other than "BLASTN-": convert the database with
    #     faToTwoBit and build one lastz_2bit task per key returned by
    #     Fasta(...).iterkeys_ordered(), each writing to its own file under
    #     `outdir`;
    #   * "BLASTN-": start `cpus` lastz workers that share `out_fh` and a Lock.
    p = OptionParser(main.__doc__)

    supported_formats = tuple(x.strip() for x in \
        "lav, lav+text, axt, axt+, maf, maf+, maf-, sam, softsam, "\
        "sam-, softsam-, cigar, BLASTN, BLASTN-, differences, rdotplot, text".split(','))

    p.add_option("-a",
                 "-A",
                 dest="cpus",
                 default=1,
                 type="int",
                 help="parallelize job to multiple cpus [default: %default]")
    p.add_option("--format", default="BLASTN-", choices=supported_formats,
            help="output format, one of {0} [default: %default]".\
                 format("|".join(supported_formats)))
    p.add_option("--path",
                 dest="lastz_path",
                 default=None,
                 help="specify LASTZ path")
    p.add_option(
        "--mask",
        dest="mask",
        default=False,
        action="store_true",
        help="treat lower-case letters as mask info [default: %default]")
    p.add_option(
        "--similar",
        default=False,
        action="store_true",
        help="Use options tuned for close comparison [default: %default]")

    set_params(p)
    set_outfile(p)
    set_grid(p)

    opts, args = p.parse_args()

    if len(args) != 2:
        # print_help() returns None, so `not ...` is True and the process
        # exits with status 1 instead of reporting success on a usage error.
        sys.exit(not p.print_help())

    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    grid = opts.grid
    if grid:
        sys.stderr.write("Running jobs on JCVI grid\n")

    extra = opts.extra
    if opts.similar:
        # --similar appends the module-level `similarOptions` to the extra
        # command-line parameters.
        extra += similarOptions

    lastz_bin = opts.lastz_path or "lastz"
    assert lastz_bin.endswith(
        "lastz"), "You need to include lastz in your path"

    mask = opts.mask
    cpus = opts.cpus
    logging.debug("Dispatch job to %d cpus" % cpus)
    # Named `fmt` / `seqid` below to avoid shadowing the builtins
    # `format` and `id`.
    fmt = opts.format
    blastline = (fmt == "BLASTN-")

    # The axt, maf, etc. format can only be run on splitted database (i.e. one
    # FASTA record per file). The splitted files are then parallelized for the
    # computation, as opposed to splitting queries through "subsample".
    outdir = "outdir"
    if not blastline:
        from jcvi.formats.fasta import Fasta
        from jcvi.formats.chain import faToTwoBit

        mkdir(outdir)

        bfasta_2bit = faToTwoBit(bfasta_fn)
        bids = list(Fasta(bfasta_fn, lazy=True).iterkeys_ordered())

        apf = op.basename(afasta_fn).split(".")[0]
        # Task tuple layout:
        # bfasta, afasta_fn, outfile, lastz_bin, extra, mask, format, grid
        tasks = []
        for seqid in bids:
            bfasta = "/".join((bfasta_2bit, seqid))
            outfile = op.join(outdir, "{0}.{1}.{2}".format(apf, seqid, fmt))
            tasks.append((bfasta, afasta_fn, outfile,
                          lastz_bin, extra, mask, fmt, grid))

        if grid:
            cmds = [lastz_2bit(x) for x in tasks]
            g = Grid(cmds)
            g.run()
            g.writestatus()

        # NOTE(review): the pool is started even when `grid` is set, exactly
        # as before -- confirm whether that fall-through is intended.
        pool = Pool(cpus)
        pool.map(lastz_2bit, tasks)

        return

    lock = Lock()

    if grid:
        cmds = [lastz(k + 1, cpus, bfasta_fn, afasta_fn, out_fh,
                      lock, lastz_bin, extra, mask, grid)
                for k in range(cpus)]
        mkdir(outdir)
        outfiles = [op.join(outdir, "out.{0}.lastz").format(i)
                    for i in range(len(cmds))]
        g = Grid(cmds, outfiles=outfiles)
        g.run()
        g.writestatus()

    else:
        # One argument tuple per worker; the first element is the 1-based
        # worker index and the second the total number of workers.
        jobargs = [(k + 1, cpus, bfasta_fn, afasta_fn, out_fh, lock,
                    lastz_bin, extra, mask) for k in range(cpus)]
        g = Jobs(target=lastz, args=jobargs)
        g.run()
Ejemplo n.º 31
0
Archivo: cap3.py Proyecto: bennyyu/jcvi
def assemble(args):
    """
    Run `cap3` on a single multi FASTA file containing reads or a folder containing several
    multi FASTA files. Allows for tweaking of `cap3` parameters max_gap_len, ovl_pct_id, etc.
    """
    # The docstring above is the optparse usage text and is kept verbatim.
    #
    # Input is chosen from, in priority order: --input_file_list,
    # --input_folder, --input_file. One `cap3` command is issued per existing
    # input file, with its output captured in "<file>.<prefix>.log".
    p = OptionParser(assemble.__doc__)
    g1 = OptionGroup(p, "Input file options (required)",
            "Note: Please choose from and provide values for one of the following parameters")
    g1.add_option("--input_file", default=None,
            help="input file of reads [default: %default]")
    g1.add_option("--input_folder", default=None,
            help="input folder containing multi FASTA files of reads [default: %default]")
    g1.add_option("--input_file_list", default=None,
            help="list file containing paths to multi FASTA files of reads [default: %default]")
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Optional parameters",
            "Note: If not specified, `cap3` defaults will be used")
    g2.add_option("-f", "--max_gap_len", default=20, type="int",
            help="maximum gap length in any overlap [default: %default]\n" +\
                 "Same as cap3 `-f` parameter.")
    g2.add_option("-p", "--ovl_pct_id", default=90, type="int",
            help="overlap percent identity cutoff [default: %default]\n" +\
                 "Same as cap3 `-p` parameter.")
    g2.add_option("-s", "--ovl_sim_score", default=900, type="int",
            help="overlap similarity score cutoff [default: %default]\n" +\
                 "Same as cap3 `-s` parameter.")
    g2.add_option("-x", "--prefix", dest="prefix", default="cap3",
            help="prefix string for output file name [default: %default]")
    p.add_option_group(g2)

    set_grid(p)
    set_params(p)

    opts, args = p.parse_args(args)

    # These options are always ints (typed, with int defaults), so compare
    # them directly: a truthiness guard would let an explicit 0 slip through.
    # Only the first violated bound is reported. Failures exit non-zero.
    if opts.max_gap_len <= 1:
        logging.error("--max_gap_len should be > 1")
        sys.exit(1)
    elif opts.ovl_pct_id <= 65:
        logging.error("--ovl_pct_id should be > 65")
        sys.exit(1)
    elif opts.ovl_sim_score <= 250:
        logging.error("--ovl_sim_score should be > 250")
        sys.exit(1)

    file_list = []
    if opts.input_file_list:
        if not op.isfile(opts.input_file_list):
            logging.error("Input file list {0} does not exist".format(opts.input_file_list))
            sys.exit(1)
        with open(opts.input_file_list, 'r') as f:
            # Skip blank lines so they are not treated as (missing) files.
            file_list = [line for line in f.read().splitlines()
                         if line.strip()]
    elif opts.input_folder:
        if not op.isdir(opts.input_folder):
            logging.error("Input folder {0} does not exist".format(opts.input_folder))
            sys.exit(1)

        # Keep *.fa / *.fasta entries (case-insensitive), prefixed with the
        # folder path stripped of trailing slashes.
        folder = opts.input_folder.rstrip('/')
        file_list = [folder + "/" + name
                     for name in os.listdir(opts.input_folder)
                     if name.lower().endswith(('.fa', '.fasta'))]
    elif opts.input_file:
        file_list.append(opts.input_file)
    else:
        logging.error("Please specify one of the options for input files")
        sys.exit(not p.print_help())

    if len(file_list) == 0:
        logging.warning("List of files to process is empty. Please check your input!")
        sys.exit()

    for fastafile in file_list:
        if not op.isfile(fastafile):
            # Missing inputs are reported and skipped; the rest still run.
            logging.warning("Input file {0} does not exist".format(fastafile))
            continue

        cmd = "cap3 {0} -f {1} -p {2} -s {3} -x {4}".format(
            fastafile, opts.max_gap_len, opts.ovl_pct_id,
            opts.ovl_sim_score, opts.prefix)
        if opts.extra:
            cmd += " {0}".format(opts.extra)
        logfile = "{0}.{1}.log".format(fastafile, opts.prefix)

        sh(cmd, outfile=logfile, grid=opts.grid)