Beispiel #1
0
def aln(args):
    """
    %prog aln database.fasta *.fastq

    Wrapper for `bwa aln` except this will run over a set of files.
    """
    # The docstring above doubles as the optparse usage text; keep it verbatim.
    p = OptionParser(aln.__doc__)
    # type="int" keeps a value given on the command line the same type as the
    # integer default (optparse would otherwise hand back a str).
    p.add_option("--cpus",
                 default=32,
                 type="int",
                 help="Number of cpus to use [default: %default]")
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) < 2:
        # print_help() returns None, so `not ...` is True and the process
        # exits with a non-zero status on a usage error.
        sys.exit(not p.print_help())

    # NOTE(review): opts.extra (registered by set_params) is not forwarded to
    # check_aln here -- confirm whether extra parameters should apply.
    grid = opts.grid

    # First positional is the database, everything after it is a read file.
    dbfile, readfiles = args[0], args[1:]
    # Return values are not needed here; the helpers are called for whatever
    # work they perform (presumably indexing/aligning on demand -- confirm).
    check_index(dbfile, grid=grid)
    for readfile in readfiles:
        check_aln(dbfile, readfile, grid=grid, cpus=opts.cpus)
Beispiel #2
0
def bwasw(args):
    """
    %prog bwasw database.fasta long_read.fastq

    Wrapper for `bwa bwasw`. Output will be long_read.sam.
    """
    # The docstring above doubles as the optparse usage text; keep it verbatim.
    p = OptionParser(bwasw.__doc__)
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None, so `not ...` gives a non-zero exit status
        # for a usage error.
        sys.exit(not p.print_help())

    extra = opts.extra
    grid = opts.grid

    dbfile, readfile = args
    # Return values are unused; both helpers are called for their effects.
    check_index(dbfile, grid=grid)
    # NOTE(review): check_aln is also what the `bwa aln` wrappers call; confirm
    # that `bwa bwasw` really needs this step.
    check_aln(dbfile, readfile, grid=grid)

    # Output name: the read file with its last extension replaced by .sam.
    samfile = readfile.rsplit(".", 1)[0] + ".sam"
    if op.exists(samfile):
        logging.error("`{0}` exists. `bwa bwasw` already run.".format(samfile))
        return

    cmd = "bwa bwasw -t 32 {0} {1} ".format(dbfile, readfile)
    # Only append user parameters when present, so a None value can never end
    # up as the literal text "None" on the command line.
    if extra:
        cmd += "{0}".format(extra)
    sh(cmd, grid=grid, outfile=samfile)
Beispiel #3
0
def bwasw(args):
    """
    %prog bwasw database.fasta long_read.fastq

    Wrapper for `bwa bwasw`. Output will be long_read.sam.
    """
    # The docstring is handed to OptionParser as usage text, so it must stay
    # exactly as written.
    p = OptionParser(bwasw.__doc__)
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # A usage error should not exit with status 0: print_help() returns
        # None, hence `not None` -> True -> exit status 1.
        sys.exit(not p.print_help())

    extra = opts.extra
    grid = opts.grid

    dbfile, readfile = args
    # The helpers' return values are not used in this function.
    check_index(dbfile, grid=grid)
    # NOTE(review): this is the same helper the `bwa aln` wrappers use; verify
    # it is required before running `bwa bwasw`.
    check_aln(dbfile, readfile, grid=grid)

    # Derive <read prefix>.sam by dropping the last extension of the read file.
    samfile = readfile.rsplit(".", 1)[0] + ".sam"
    if op.exists(samfile):
        logging.error("`{0}` exists. `bwa bwasw` already run.".format(samfile))
        return

    cmd = "bwa bwasw -t 32 {0} {1} ".format(dbfile, readfile)
    # Guard against a missing value being formatted as the string "None".
    if extra:
        cmd += "{0}".format(extra)
    sh(cmd, grid=grid, outfile=samfile)
Beispiel #4
0
def map(args):
    """
    %prog map reference fastqfiles

    Use `clc_ref_assemble` to map the read files to a reference. Use a non-zero
    -s option to turn on paired end mode.
    """
    # The docstring above doubles as the optparse usage text; keep it verbatim.
    p = OptionParser(map.__doc__)
    p.add_option("-o", dest="outfile", default=None,
            help="Output prefix.cas file [default: %default]")
    p.add_option("-s", dest="size", default=0, type="int",
            help="Use paired end mapping with insert [default: %default]")
    p.add_option("--short", default=False, action="store_true",
            help="Use `clc_ref_assemble_short` as the mapper [default: %default]")
    p.add_option("--orientations", default="fb",
            help="The reads have the orientations [default: %default]")
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)
    if len(args) < 2:
        sys.exit(not p.print_help())

    # Copy the license file from the home directory when the working directory
    # does not have one yet.
    license_file = "license.properties"
    if not op.exists(license_file):
        sh("cp ~/{0} .".format(license_file))

    ref, fastqfiles = args[0], args[1:]
    assert op.exists(ref), "Reference `{0}` not found".format(ref)
    size = opts.size
    orientations = opts.orientations
    assert orientations in ("fb", "bf", "ff", "bb"), \
        "--orientations must be one of fb|bf|ff|bb"

    cmd = "clc_ref_assemble_short" if opts.short else "clc_ref_assemble_long"

    # Default output name is <first read prefix>.<reference prefix>.cas, where
    # a prefix is the basename up to its first dot.
    readprefix = op.basename(fastqfiles[0]).split(".", 1)[0]
    refprefix = op.basename(ref).split(".", 1)[0]
    outfile = opts.outfile or "{0}.{1}".format(readprefix, refprefix)
    if not outfile.endswith(".cas"):
        outfile += ".cas"

    cmd += " --cpus 16"
    cmd += " -d {0} -o {1} -q ".format(ref, outfile)
    fastqs = " ".join(fastqfiles)
    if size == 0:
        # Single-end mode: just list the read files.
        cmd += fastqs
    else:
        # Paired-end mode needs exactly one pair of files.
        assert len(fastqfiles) == 2, "Paired end mode takes two read files"
        # Accept inserts within a quarter of the expected size on either side.
        # Floor division keeps the bounds integral under both Python 2 and
        # true-division semantics.
        stddev = size // 4
        lb, ub = size - stddev, size + stddev
        cmd += " -p {0} ss {1} {2} -i {3} ".format(orientations, lb, ub, fastqs)

    if opts.extra:
        cmd += " " + opts.extra

    if not opts.short:
        cmd += " -l 0.8 -s 0.98"

    sh(cmd, grid=opts.grid)
Beispiel #5
0
def map(args):
    """
    %prog map reference fastqfiles

    Use `clc_ref_assemble` to map the read files to a reference. Use a non-zero
    -s option to turn on paired end mode.
    """
    # The docstring is passed to OptionParser as usage text; leave it as is.
    p = OptionParser(map.__doc__)
    p.add_option("-o", dest="outfile", default=None,
            help="Output prefix.cas file [default: %default]")
    p.add_option("-s", dest="size", default=0, type="int",
            help="Use paired end mapping with insert [default: %default]")
    p.add_option("--short", default=False, action="store_true",
            help="Use `clc_ref_assemble_short` as the mapper [default: %default]")
    p.add_option("--orientations", default="fb",
            help="The reads have the orientations [default: %default]")
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)
    if len(args) < 2:
        sys.exit(not p.print_help())

    # Write the license contents (CLCLICENSE) into the working directory.
    write_file("license.properties", CLCLICENSE)

    ref, fastqfiles = args[0], args[1:]
    assert op.exists(ref), "Reference `{0}` not found".format(ref)
    size = opts.size
    orientations = opts.orientations
    assert orientations in ("fb", "bf", "ff", "bb"), \
        "--orientations must be one of fb|bf|ff|bb"

    cmd = "clc_ref_assemble_short" if opts.short else "clc_ref_assemble_long"

    # Default output name is <first read prefix>.<reference prefix>.cas, where
    # a prefix is the basename up to its first dot.
    readprefix = op.basename(fastqfiles[0]).split(".", 1)[0]
    refprefix = op.basename(ref).split(".", 1)[0]
    outfile = opts.outfile or "{0}.{1}".format(readprefix, refprefix)
    if not outfile.endswith(".cas"):
        outfile += ".cas"

    cmd += " --cpus 16"
    cmd += " -d {0} -o {1} -q ".format(ref, outfile)
    fastqs = " ".join(fastqfiles)
    if size == 0:
        # Single-end mode: just list the read files.
        cmd += fastqs
    else:
        # Paired-end mode needs exactly one pair of files.
        assert len(fastqfiles) == 2, "Paired end mode takes two read files"
        # Insert-size window is size +/- size/4. Floor division keeps the
        # bounds integral regardless of the interpreter's `/` semantics.
        stddev = size // 4
        lb, ub = size - stddev, size + stddev
        cmd += " -p {0} ss {1} {2} -i {3} ".format(orientations, lb, ub, fastqs)

    if opts.extra:
        cmd += " " + opts.extra

    if not opts.short:
        cmd += " -l 0.8 -s 0.98"

    sh(cmd, grid=opts.grid)
Beispiel #6
0
def sampe(args):
    """
    %prog sampe database.fasta read1.fq read2.fq

    Wrapper for `bwa sampe`. Output will be read1.sam.
    """
    # The docstring above doubles as the optparse usage text; keep it verbatim.
    p = OptionParser(sampe.__doc__)
    p.add_option("--bam",
                 default=False,
                 action="store_true",
                 help="write to bam file [default: %default]")
    # type="int" keeps a command-line value the same type as the default.
    p.add_option("--cpus",
                 default=32,
                 type="int",
                 help="Number of cpus to use [default: %default]")
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 3:
        # print_help() returns None, so `not ...` gives a non-zero exit status
        # for a usage error.
        sys.exit(not p.print_help())

    extra = opts.extra
    grid = opts.grid

    dbfile, read1file, read2file = args
    # The index helper's return value is not needed below.
    check_index(dbfile, grid=grid)
    # One .sai alignment file per mate; both are inputs to `bwa sampe`.
    sai1file = check_aln(dbfile, read1file, grid=grid, cpus=opts.cpus)
    sai2file = check_aln(dbfile, read2file, grid=grid, cpus=opts.cpus)

    # Output is named after read 1, with .bam or .sam replacing its extension.
    prefix = read1file.rsplit(".", 1)[0]
    samfile = prefix + (".bam" if opts.bam else ".sam")
    if op.exists(samfile):
        # The message names the command this function actually runs.
        logging.error("`{0}` exists. `bwa sampe` already run.".format(samfile))
        return

    cmd = "bwa sampe {0} {1} {2} {3} {4} ".format(dbfile, sai1file, sai2file,
                                                  read1file, read2file)
    # Skip a missing value so it is never rendered as the text "None".
    if extra:
        cmd += "{0}".format(extra)
    if opts.bam:
        cmd += " | samtools view -bS -F 4 - "
    sh(cmd, grid=grid, outfile=samfile)
Beispiel #7
0
def index(args):
    """
    %prog index database.fasta

    Wrapper for `bwa index`. Same interface, only adds grid submission.
    """
    # The docstring above doubles as the optparse usage text; keep it verbatim.
    p = OptionParser(index.__doc__)
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        # print_help() returns None, so `not ...` gives a non-zero exit status
        # for a usage error.
        sys.exit(not p.print_help())

    # NOTE(review): opts.extra (registered by set_params) is accepted but not
    # forwarded anywhere in this function.
    dbfile, = args
    # All the work is delegated to check_index; its return value is unused.
    check_index(dbfile, grid=opts.grid)
Beispiel #8
0
def index(args):
    """
    %prog index database.fasta

    Wrapper for `bwa index`. Same interface, only adds grid submission.
    """
    # The docstring is passed to OptionParser as usage text; leave it as is.
    p = OptionParser(index.__doc__)
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        # A usage error should not exit with status 0: print_help() returns
        # None, hence `not None` -> True -> exit status 1.
        sys.exit(not p.print_help())

    # NOTE(review): the extra parameters option from set_params is parsed but
    # never used by this function.
    dbfile, = args
    # check_index does the actual work; nothing is done with what it returns.
    check_index(dbfile, grid=opts.grid)
Beispiel #9
0
def sampe(args):
    """
    %prog sampe database.fasta read1.fq read2.fq

    Wrapper for `bwa sampe`. Output will be read1.sam.
    """
    # The docstring is passed to OptionParser as usage text; leave it as is.
    p = OptionParser(sampe.__doc__)
    p.add_option("--bam", default=False, action="store_true",
                 help="write to bam file [default: %default]")
    # Parse --cpus as an integer so it matches the type of the default.
    p.add_option("--cpus", default=32, type="int",
                 help="Number of cpus to use [default: %default]")
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 3:
        # A usage error should not exit with status 0: print_help() returns
        # None, hence `not None` -> True -> exit status 1.
        sys.exit(not p.print_help())

    extra = opts.extra
    grid = opts.grid

    dbfile, read1file, read2file = args
    # Return value of the index helper is not used here.
    check_index(dbfile, grid=grid)
    # Each mate gets its own .sai file, both of which feed `bwa sampe`.
    sai1file = check_aln(dbfile, read1file, grid=grid, cpus=opts.cpus)
    sai2file = check_aln(dbfile, read2file, grid=grid, cpus=opts.cpus)

    # The output shares read 1's prefix; --bam switches the extension.
    prefix = read1file.rsplit(".", 1)[0]
    samfile = prefix + (".bam" if opts.bam else ".sam")
    if op.exists(samfile):
        # Report the command that is really being wrapped (sampe, not samse).
        logging.error("`{0}` exists. `bwa sampe` already run.".format(samfile))
        return

    cmd = "bwa sampe {0} {1} {2} {3} {4} ".format(dbfile, sai1file, sai2file,
            read1file, read2file)
    # Guard against a missing value being formatted as the string "None".
    if extra:
        cmd += "{0}".format(extra)
    if opts.bam:
        cmd += " | samtools view -bS -F 4 - "
    sh(cmd, grid=grid, outfile=samfile)
Beispiel #10
0
def aln(args):
    """
    %prog aln database.fasta *.fastq

    Wrapper for `bwa aln` except this will run over a set of files.
    """
    # The docstring is passed to OptionParser as usage text; leave it as is.
    p = OptionParser(aln.__doc__)
    # Parse --cpus as an integer so it matches the type of the default.
    p.add_option("--cpus", default=32, type="int",
                 help="Number of cpus to use [default: %default]")
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) < 2:
        # A usage error should not exit with status 0: print_help() returns
        # None, hence `not None` -> True -> exit status 1.
        sys.exit(not p.print_help())

    # NOTE(review): the extra parameters option from set_params is parsed but
    # never passed on to check_aln -- confirm that is intended.
    grid = opts.grid

    # Database first, then one or more read files.
    dbfile, readfiles = args[0], args[1:]
    # Neither helper's return value is used; they are called for their
    # effects (presumably creating missing outputs -- confirm in the helpers).
    check_index(dbfile, grid=grid)
    for readfile in readfiles:
        check_aln(dbfile, readfile, grid=grid, cpus=opts.cpus)
Beispiel #11
0
def main():
    """
    %prog database.fa query.fa [options]

    Run LASTZ similar to the BLAST interface, and generates -m8 tabular format
    """
    # The docstring above doubles as the optparse usage text; keep it verbatim.
    p = OptionParser(main.__doc__)

    supported_formats = tuple(x.strip() for x in \
        "lav, lav+text, axt, axt+, maf, maf+, maf-, sam, softsam, "\
        "sam-, softsam-, cigar, BLASTN, BLASTN-, differences, rdotplot, text".split(','))

    p.add_option("-a", "-A", dest="cpus", default=1, type="int",
            help="parallelize job to multiple cpus [default: %default]")
    p.add_option("--format", default="BLASTN-", choices=supported_formats,
            help="output format, one of {0} [default: %default]".\
                 format("|".join(supported_formats)))
    p.add_option("--path", dest="lastz_path", default=None,
            help="specify LASTZ path")
    p.add_option("--mask", dest="mask", default=False, action="store_true",
            help="treat lower-case letters as mask info [default: %default]")

    set_params(p)
    set_outfile(p)
    set_grid(p)

    # No argument list is passed, so optparse reads sys.argv[1:].
    opts, args = p.parse_args()

    if len(args) != 2:
        # print_help() returns None, so `not ...` gives a non-zero exit status
        # for a usage error.
        sys.exit(not p.print_help())

    # Positional order is database (b) first, then query (a).
    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    grid = opts.grid
    if grid:
        sys.stderr.write("Running jobs on JCVI grid\n")

    extra = opts.extra
    lastz_bin = opts.lastz_path or "lastz"
    assert lastz_bin.endswith("lastz"), "You need to include lastz in your path"

    mask = opts.mask
    cpus = opts.cpus
    logging.debug("Dispatch job to %d cpus" % cpus)
    fmt = opts.format
    blastline = (fmt == "BLASTN-")

    # The axt, maf, etc. format can only be run on splitted database (i.e. one
    # FASTA record per file). The splitted files are then parallelized for the
    # computation, as opposed to splitting queries through "subsample".
    outdir = "outdir"
    if not blastline:
        from jcvi.formats.fasta import Fasta
        from jcvi.formats.chain import faToTwoBit

        mkdir(outdir)

        bfasta_2bit = faToTwoBit(bfasta_fn)
        bids = list(Fasta(bfasta_fn, lazy=True).iterkeys_ordered())

        apf = op.basename(afasta_fn).split(".")[0]
        # One job tuple per database record:
        # bfasta_fn, afasta_fn, outfile, lastz_bin, extra, mask, format, grid
        jobs = []
        for seqid in bids:
            bfasta = "/".join((bfasta_2bit, seqid))
            outfile = op.join(outdir, "{0}.{1}.{2}".format(apf, seqid, fmt))
            jobs.append((bfasta, afasta_fn, outfile,
                         lastz_bin, extra, mask, fmt, grid))

        if grid:
            cmds = [lastz_2bit(x) for x in jobs]
            g = Grid(cmds)
            g.run()
            g.writestatus()

        # NOTE(review): when --grid is set, execution still falls through to
        # the local pool below -- confirm this is intended.
        pool = Pool(cpus)
        pool.map(lastz_2bit, jobs)

        return

    lock = Lock()

    if grid:
        # Job numbers passed to lastz() are 1-based (k + 1 of cpus).
        cmds = [lastz(k + 1, cpus, bfasta_fn, afasta_fn, out_fh,
                lock, lastz_bin, extra, mask, grid) for k in range(cpus)]
        mkdir(outdir)
        g = Grid(cmds, outfiles=[op.join(outdir, "out.{0}.lastz").\
                format(i) for i in range(len(cmds))])
        g.run()
        g.writestatus()

    else:
        jobs = [(k + 1, cpus, bfasta_fn, afasta_fn, out_fh,
                lock, lastz_bin, extra, mask) for k in range(cpus)]
        g = Jobs(target=lastz, args=jobs)
        g.run()
Beispiel #12
0
def assemble(args):
    """
    Run `cap3` on a single multi FASTA file containing reads or a folder containing several
    multi FASTA files. Allows for tweaking of `cap3` parameters max_gap_len, ovl_pct_id, etc.
    """
    # The docstring above doubles as the optparse usage text; keep it verbatim.
    p = OptionParser(assemble.__doc__)
    g1 = OptionGroup(p, "Input file options (required)",
            "Note: Please choose from and provide values for one of the following parameters")
    g1.add_option("--input_file", default=None,
            help="input file of reads [default: %default]")
    g1.add_option("--input_folder", default=None,
            help="input folder containing multi FASTA files of reads [default: %default]")
    g1.add_option("--input_file_list", default=None,
            help="list file containing paths to multi FASTA files of reads [default: %default]")
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Optional parameters",
            "Note: If not specified, `cap3` defaults will be used")
    g2.add_option("-f", "--max_gap_len", default=20, type="int",
            help="maximum gap length in any overlap [default: %default]\n" +\
                 "Same as cap3 `-f` parameter.")
    g2.add_option("-p", "--ovl_pct_id", default=90, type="int",
            help="overlap percent identity cutoff [default: %default]\n" +\
                 "Same as cap3 `-p` parameter.")
    g2.add_option("-s", "--ovl_sim_score", default=900, type="int",
            help="overlap similarity score cutoff [default: %default]\n" +\
                 "Same as cap3 `-s` parameter.")
    g2.add_option("-x", "--prefix", dest="prefix", default="cap3",
            help="prefix string for output file name [default: %default]")
    p.add_option_group(g2)

    set_grid(p)
    set_params(p)

    opts, args = p.parse_args(args)

    # Each option must be strictly greater than its floor. The values are
    # compared directly (no truthiness guard) so that 0 is rejected as well.
    # Only the first violation is reported, in this order.
    floors = (("max_gap_len", 1), ("ovl_pct_id", 65), ("ovl_sim_score", 250))
    for optname, floor in floors:
        if getattr(opts, optname) <= floor:
            logging.error("--{0} should be > {1}".format(optname, floor))
            sys.exit(1)

    # Input source precedence: list file, then folder, then single file.
    file_list = []
    if opts.input_file_list:
        if not op.isfile(opts.input_file_list):
            logging.error("Input file list {0} does not exist".format(opts.input_file_list))
            sys.exit(1)
        with open(opts.input_file_list, 'r') as f:
            # One path per line; blank lines are skipped.
            file_list = [line for line in f.read().splitlines() if line.strip()]
    elif opts.input_folder:
        if not op.isdir(opts.input_folder):
            logging.error("Input folder {0} does not exist".format(opts.input_folder))
            sys.exit(1)

        # Keep only *.fa / *.fasta entries (case-insensitive) and prefix them
        # with the folder, minus any trailing slashes.
        folder = opts.input_folder.rstrip('/')
        file_list = [folder + "/" + name for name in os.listdir(opts.input_folder)
                     if name.lower().endswith(('.fa', '.fasta'))]
    elif opts.input_file:
        file_list.append(opts.input_file)
    else:
        logging.error("Please specify one of the options for input files")
        sys.exit(not p.print_help())

    if not file_list:
        # Reported as a warning, so the exit status stays 0 as before.
        logging.warning("List of files to process is empty. Please check your input!")
        sys.exit()

    for fastafile in file_list:
        if not op.isfile(fastafile):
            # A missing file is skipped, not fatal.
            logging.warning("Input file {0} does not exist".format(fastafile))
            continue

        cmd = "cap3 {0} -f {1} -p {2} -s {3} -x {4}".format(fastafile,
                opts.max_gap_len, opts.ovl_pct_id, opts.ovl_sim_score, opts.prefix)
        if opts.extra:
            cmd += " {0}".format(opts.extra)
        # The command's output goes to <input>.<prefix>.log.
        logfile = "{0}.{1}.log".format(fastafile, opts.prefix)

        sh(cmd, outfile=logfile, grid=opts.grid)
Beispiel #13
0
def main():
    """
    %prog database.fa query.fa [options]

    Run LASTZ similar to the BLAST interface, and generates -m8 tabular format
    """
    # The docstring is passed to OptionParser as usage text; leave it as is.
    p = OptionParser(main.__doc__)

    supported_formats = tuple(x.strip() for x in \
        "lav, lav+text, axt, axt+, maf, maf+, maf-, sam, softsam, "\
        "sam-, softsam-, cigar, BLASTN, BLASTN-, differences, rdotplot, text".split(','))

    p.add_option("-a",
                 "-A",
                 dest="cpus",
                 default=1,
                 type="int",
                 help="parallelize job to multiple cpus [default: %default]")
    p.add_option("--format", default="BLASTN-", choices=supported_formats,
            help="output format, one of {0} [default: %default]".\
                 format("|".join(supported_formats)))
    p.add_option("--path",
                 dest="lastz_path",
                 default=None,
                 help="specify LASTZ path")
    p.add_option(
        "--mask",
        dest="mask",
        default=False,
        action="store_true",
        help="treat lower-case letters as mask info [default: %default]")
    p.add_option(
        "--similar",
        default=False,
        action="store_true",
        help="Use options tuned for close comparison [default: %default]")

    set_params(p)
    set_outfile(p)
    set_grid(p)

    # No argument list is passed, so optparse reads sys.argv[1:].
    opts, args = p.parse_args()

    if len(args) != 2:
        # A usage error should not exit with status 0: print_help() returns
        # None, hence `not None` -> True -> exit status 1.
        sys.exit(not p.print_help())

    # Positional order is database (b) first, then query (a).
    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    grid = opts.grid
    if grid:
        sys.stderr.write("Running jobs on JCVI grid\n")

    extra = opts.extra
    if opts.similar:
        # NOTE(review): assumes opts.extra is a string and that
        # similarOptions carries its own leading separator -- confirm.
        extra += similarOptions

    lastz_bin = opts.lastz_path or "lastz"
    assert lastz_bin.endswith(
        "lastz"), "You need to include lastz in your path"

    mask = opts.mask
    cpus = opts.cpus
    logging.debug("Dispatch job to %d cpus" % cpus)
    fmt = opts.format
    blastline = (fmt == "BLASTN-")

    # The axt, maf, etc. format can only be run on splitted database (i.e. one
    # FASTA record per file). The splitted files are then parallelized for the
    # computation, as opposed to splitting queries through "subsample".
    outdir = "outdir"
    if not blastline:
        from jcvi.formats.fasta import Fasta
        from jcvi.formats.chain import faToTwoBit

        mkdir(outdir)

        bfasta_2bit = faToTwoBit(bfasta_fn)
        bids = list(Fasta(bfasta_fn, lazy=True).iterkeys_ordered())

        apf = op.basename(afasta_fn).split(".")[0]
        # One job tuple per database record:
        # bfasta_fn, afasta_fn, outfile, lastz_bin, extra, mask, format, grid
        jobs = []
        for seqid in bids:
            bfasta = "/".join((bfasta_2bit, seqid))
            outfile = op.join(outdir, "{0}.{1}.{2}".format(apf, seqid, fmt))
            jobs.append((bfasta, afasta_fn, outfile,
                         lastz_bin, extra, mask, fmt, grid))

        if grid:
            cmds = [lastz_2bit(x) for x in jobs]
            g = Grid(cmds)
            g.run()
            g.writestatus()

        # NOTE(review): when --grid is set, execution still falls through to
        # the local pool below -- confirm this is intended.
        pool = Pool(cpus)
        pool.map(lastz_2bit, jobs)

        return

    lock = Lock()

    if grid:
        # Job numbers passed to lastz() are 1-based (k + 1 of cpus).
        cmds = [lastz(k + 1, cpus, bfasta_fn, afasta_fn, out_fh,
                lock, lastz_bin, extra, mask, grid) for k in range(cpus)]
        mkdir(outdir)
        g = Grid(cmds, outfiles=[op.join(outdir, "out.{0}.lastz").\
                format(i) for i in range(len(cmds))])
        g.run()
        g.writestatus()

    else:
        jobs = [(k + 1, cpus, bfasta_fn, afasta_fn, out_fh, lock, lastz_bin,
                 extra, mask) for k in range(cpus)]
        g = Jobs(target=lastz, args=jobs)
        g.run()
Beispiel #14
0
def main(args):
    """
    %prog database.fasta query.fasta


    Run LAST by calling LASTDB, LASTAL and LASTEX.
    """
    # The docstring above doubles as the optparse usage text; keep it verbatim.

    supported_formats = ("tab", "maf", "blast")

    p = OptionParser(main.__doc__)
    p.add_option("-a", "-A", dest="cpus", default=1, type="int",
            help="parallelize job to multiple cpus [default: %default]")
    p.add_option("--path", help="specify LAST path")
    p.add_option("--format", default="blast", choices=supported_formats,
                 help="Output format, one of {0} [default: %default]".\
                      format("|".join(supported_formats)))
    p.add_option("--eval", default=False, action="store_true",
                 help="Use lastex to recalculate E-value [default: %default]")

    set_params(p)
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    if opts.eval and opts.cpus > 1:
        # Call form of raise: valid on Python 2 and Python 3 alike.
        raise Exception("Option --eval cannnot work with multiple threads")

    path = opts.path

    def getpath(binary):
        # Prefix the executable with --path when one was given.
        return op.join(path, binary) if path else binary

    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")
    lastex_bin = getpath("lastex")

    # The database name is the subject file without its last extension.
    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(infile=subject, outfile=subjectdb + ".prj", lastdb_bin=lastdb_bin)

    cpus = opts.cpus
    logging.debug("Dispatch job to {0} cpus".format(cpus))

    if opts.format == "maf":
        # Emit the MAF header line once, before any alignment output.
        cmd = 'echo "##maf version=1"'
        sh(cmd)

    cmd = "{0} -u 0".format(lastal_bin)
    # The value passed to -f is the format's position in supported_formats.
    f = supported_formats.index(opts.format)
    cmd += " -f {0}".format(f)
    cmd += " {0} -".format(subjectdb)

    extra = opts.extra
    if extra:
        cmd += " " + extra

    if opts.eval:
        querydb = query.rsplit(".", 1)[0]
        # Pass lastdb_bin so --path is honoured for the query database too,
        # matching the subject database call above.
        run_lastdb(infile=query, outfile=querydb + ".prj", lastdb_bin=lastdb_bin)

        cmd += " | {0} {1}.prj {2}.prj -".format(lastex_bin, subjectdb, querydb)

    out_fh = must_open(opts.outfile, "w")
    lock = Lock()

    # One job tuple per cpu; job numbers are 1-based (k + 1 of cpus).
    jobs = [(k + 1, cpus, out_fh, cmd, query, lock) for k in range(cpus)]
    g = Jobs(target=last, args=jobs)
    g.run()