Exemple #1
0
def array(args):
    """
    %prog array commands.list

    Parallelize a set of commands on grid using array jobs.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    p = OptionParser(array.__doc__)
    p.set_grid_opts(array=True)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (cmds,) = args
    # Number of lines in the commands list; used as the array size below.
    with open(cmds) as fp:
        N = sum(1 for _ in fp)

    # The run script shares the commands file's prefix, so the commands file
    # itself must not already end in `.sh`.
    pf = cmds.rsplit(".", 1)[0]
    runfile = pf + ".sh"
    assert runfile != cmds, "Commands list file should not have a `.sh` extension"

    engine = get_grid_engine()
    threaded = opts.threaded or 1
    if engine == "SGE":
        contents = arraysh.format(cmds)
    else:
        contents = arraysh_ua.format(N, threaded, cmds)
    write_file(runfile, contents)

    # For PBS only the run script is written; nothing is submitted from here.
    if engine == "PBS":
        return

    # Raw strings: "\$" is not a valid Python escape sequence. The raw form
    # yields the same two characters (backslash + "$") without the warning.
    outfile = "{0}.{1}.out".format(pf, r"\$TASK_ID")
    errfile = "{0}.{1}.err".format(pf, r"\$TASK_ID")
    # Fixed: the array size was previously passed as the undefined name
    # `ncmds`, which raised NameError; the count is held in `N`.
    p = GridProcess(
        "sh {0}".format(runfile),
        outfile=outfile,
        errfile=errfile,
        arr=N,
        grid_opts=opts,
    )
    p.start()
Exemple #2
0
def pairs(args):
    """
    %prog pairs folder reference.fasta

    Estimate insert size distribution. Compatible with a variety of aligners,
    including CLC, BOWTIE and BWA.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    p = OptionParser(pairs.__doc__)
    p.set_firstN()
    p.set_mates()
    p.set_aligner()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    cwd = os.getcwd()
    aligner = opts.aligner
    work = "-".join(("pairs", aligner))
    mkdir(work)

    # Pick the pair-statistics routine and the aligner entry point lazily so
    # only the backend actually requested gets imported.
    if aligner == "clc":
        from jcvi.apps.clc import align
        from jcvi.formats.cas import pairs as ps
    else:
        from jcvi.formats.sam import pairs as ps

    if aligner == "bowtie":
        from jcvi.apps.bowtie import align
    elif aligner == "bwa":
        from jcvi.apps.bwa import align

    folder, ref = args
    ref = get_abs_path(ref)
    messages = []
    # `fastqs` used to be named `p`, which shadowed the option parser.
    for fastqs, prefix in iter_project(folder, 2):
        samplefq = op.join(work, prefix + ".first.fastq")
        first([str(opts.firstN)] + fastqs + ["-o", samplefq])

        # Alignment runs inside the work folder; always return to the
        # original directory, even when the aligner or stats step raises.
        os.chdir(work)
        try:
            align_args = [ref, op.basename(samplefq)]
            outfile, logfile = align(align_args)
            bedfile, stats = ps([outfile, "--rclip={0}".format(opts.rclip)])
        finally:
            os.chdir(cwd)

        median = stats.median
        tag = "MP" if median > 1000 else "PE"
        # Round the median up to its first two digits, e.g. 3456 -> 3500.
        median = str(median)
        pf, sf = median[:2], median[2:]
        if sf and int(sf) != 0:
            pf = str(int(pf) + 1)  # Get the first two effective digits
        lib = "{0}-{1}".format(tag, pf + "0" * len(sf))
        for i, xp in enumerate(fastqs):
            suffix = "fastq.gz" if xp.endswith(".gz") else "fastq"
            link = "{0}-{1}.{2}.{3}".format(lib, prefix.replace("-", ""), i + 1, suffix)
            m = "\t".join(str(x) for x in (xp, link))
            messages.append(m)

    messages = "\n".join(messages)
    write_file("f.meta", messages, tee=True)
Exemple #3
0
def main():
    """
    %prog scriptname.py

    create a minimal boilerplate for a new script
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    parser = OptionParser(main.__doc__)
    parser.add_option(
        "-g",
        "--graphic",
        default=False,
        action="store_true",
        help="Create boilerplate for a graphic script",
    )

    opts, args = parser.parse_args()
    if len(args) != 1:
        sys.exit(not parser.print_help())

    script = args[0]
    if opts.graphic:
        template = graphic_template
    else:
        template = default_template
    write_file(script, template)

    # Build the log line, then capitalise only its first character.
    prefix = "graphic " if opts.graphic else ""
    message = prefix + "template writes to `{0}`".format(script)
    logging.debug(message[0].upper() + message[1:])
Exemple #4
0
def somatic(args):
    """
    %prog somatic ref.fasta *.bam

    Useful to identify somatic mutations in each sample compared to all other
    samples. Script using SPEEDSEQ-somatic will be written to `somatic.sh`.
    """
    # NOTE: the docstring above is the CLI usage text. It previously claimed
    # the script went to stdout, but the commands are written to `somatic.sh`.
    p = OptionParser(somatic.__doc__)
    opts, args = p.parse_args(args)

    # Needs the reference plus at least two BAM files.
    if len(args) < 3:
        sys.exit(not p.print_help())

    ref, bams = args[0], args[1:]
    tcmd = "~/export/speedseq/bin/speedseq somatic"
    tcmd += " -t 32 -F .2 -C 3 -q 30"
    all_bams = set(bams)  # built once instead of once per BAM
    cmds = []
    for b in bams:
        # Output prefix is everything before the first "." in the BAM name.
        pf = b.split(".")[0]
        cmd = tcmd
        cmd += " -o {0}".format(pf)
        # One command per BAM: every other BAM, comma-joined, then this one.
        others = ",".join(sorted(all_bams - {b}))
        cmd += " {0} {1} {2}".format(ref, others, b)
        cmds.append(cmd)

    write_file("somatic.sh", "\n".join(cmds))
Exemple #5
0
def somatic(args):
    """
    %prog somatic ref.fasta *.bam

    Useful to identify somatic mutations in each sample compared to all other
    samples. Script using SPEEDSEQ-somatic will be written to `somatic.sh`.
    """
    # NOTE: the docstring above is the CLI usage text. It previously claimed
    # the script went to stdout, but the commands are written to `somatic.sh`.
    p = OptionParser(somatic.__doc__)
    opts, args = p.parse_args(args)

    # Needs the reference plus at least two BAM files.
    if len(args) < 3:
        sys.exit(not p.print_help())

    ref, bams = args[0], args[1:]
    tcmd = "~/export/speedseq/bin/speedseq somatic"
    tcmd += " -t 32 -F .2 -C 3 -q 30"
    all_bams = set(bams)  # built once instead of once per BAM
    cmds = []
    for b in bams:
        # Output prefix is everything before the first "." in the BAM name.
        pf = b.split(".")[0]
        cmd = tcmd
        cmd += " -o {0}".format(pf)
        # One command per BAM: every other BAM, comma-joined, then this one.
        others = ",".join(sorted(all_bams - {b}))
        cmd += " {0} {1} {2}".format(ref, others, b)
        cmds.append(cmd)

    write_file("somatic.sh", "\n".join(cmds))
Exemple #6
0
def array(args):
    """
    %prog array commands.list

    Parallelize a set of commands on grid using array jobs.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    p = OptionParser(array.__doc__)
    p.set_grid_opts(array=True)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (cmds,) = args
    # Number of lines in the commands list; used as the array size below.
    with open(cmds) as fp:
        ncmds = sum(1 for _ in fp)

    # The run script shares the commands file's prefix, so the commands file
    # itself must not already end in `.sh`.
    pf = cmds.rsplit(".", 1)[0]
    runfile = pf + ".sh"
    assert runfile != cmds, \
            "Commands list file should not have a `.sh` extension"

    contents = arraysh.format(cmds)
    write_file(runfile, contents, meta="run script")

    # Raw string: "\$" is not a valid Python escape sequence. The raw form
    # yields the same two characters (backslash + "$") without the warning.
    outfile = "{0}.{1}.out".format(pf, r"\$TASK_ID")
    # stdout and stderr are both directed to the same per-task file.
    p = GridProcess("sh {0}".format(runfile),
                    outfile=outfile,
                    errfile=outfile,
                    arr=ncmds,
                    grid_opts=opts)
    p.start()
Exemple #7
0
def pairs(args):
    """
    %prog pairs folder reference.fasta

    Estimate insert size distribution. Compatible with a variety of aligners,
    including BOWTIE and BWA.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    p = OptionParser(pairs.__doc__)
    p.set_firstN()
    p.set_mates()
    p.set_aligner()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    cwd = os.getcwd()
    aligner = opts.aligner
    work = "-".join(("pairs", aligner))
    mkdir(work)

    from jcvi.formats.sam import pairs as ps

    # Import only the aligner backend that was requested.
    if aligner == "bowtie":
        from jcvi.apps.bowtie import align
    elif aligner == "bwa":
        from jcvi.apps.bwa import align

    folder, ref = args
    ref = get_abs_path(ref)
    messages = []
    # `fastqs` used to be named `p`, which shadowed the option parser.
    for fastqs, prefix in iter_project(folder):
        # Subsample each of the two mate files separately.
        samplefq = []
        for i in range(2):
            samplefq.append(
                op.join(work, prefix + "_{0}.first.fastq".format(i + 1)))
            first([str(opts.firstN)] + [fastqs[i]] + ["-o", samplefq[i]])

        # Alignment runs inside the work folder; always return to the
        # original directory, even when the aligner or stats step raises.
        os.chdir(work)
        try:
            align_args = [ref] + [op.basename(fq) for fq in samplefq]
            outfile, logfile = align(align_args)
            bedfile, stats = ps([outfile, "--rclip={0}".format(opts.rclip)])
        finally:
            os.chdir(cwd)

        median = stats.median
        tag = "MP" if median > 1000 else "PE"
        # Round the median up to its first two digits, e.g. 3456 -> 3500.
        median = str(median)
        pf, sf = median[:2], median[2:]
        if sf and int(sf) != 0:
            pf = str(int(pf) + 1)  # Get the first two effective digits
        lib = "{0}-{1}".format(tag, pf + "0" * len(sf))
        for i, xp in enumerate(fastqs):
            suffix = "fastq.gz" if xp.endswith(".gz") else "fastq"
            link = "{0}-{1}.{2}.{3}".format(lib, prefix.replace("-", ""),
                                            i + 1, suffix)
            m = "\t".join(str(x) for x in (xp, link))
            messages.append(m)

    messages = "\n".join(messages)
    write_file("f.meta", messages, tee=True)
Exemple #8
0
def array(args):
    """
    %prog array commands.list

    Parallelize a set of commands on grid using array jobs.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    p = OptionParser(array.__doc__)
    p.set_grid_opts(array=True)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (cmds,) = args
    # Number of lines in the commands list; used as the array size below.
    with open(cmds) as fp:
        ncmds = sum(1 for _ in fp)

    # The run script shares the commands file's prefix, so the commands file
    # itself must not already end in `.sh`.
    pf = cmds.rsplit(".", 1)[0]
    runfile = pf + ".sh"
    assert runfile != cmds, \
            "Commands list file should not have a `.sh` extension"

    contents = arraysh.format(cmds)
    write_file(runfile, contents, meta="run script")

    # Raw string: "\$" is not a valid Python escape sequence. The raw form
    # yields the same two characters (backslash + "$") without the warning.
    outfile = "{0}.{1}.out".format(pf, r"\$TASK_ID")
    # stdout and stderr are both directed to the same per-task file.
    p = GridProcess("sh {0}".format(runfile), outfile=outfile, errfile=outfile,
                    arr=ncmds, grid_opts=opts)
    p.start()
Exemple #9
0
def prepare(args):
    """
    %prog prepare genomesize *.fastq

    Prepare MERACULOUS configuration file. Genome size should be entered in Mb.
    """
    # NOTE: the docstring above (plus FastqNamings) is the CLI usage text; the
    # "configuation" typo in it has been corrected.
    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("-K", default=51, type="int", help="K-mer size")
    p.set_cpus(cpus=32)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    # The size is entered in Mb (see usage); the value written to the config
    # is that number divided by 1000.
    genomesize = float(args[0]) / 1000
    fnames = args[1:]
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    s = comment_banner("Meraculous params file") + "\n"
    s += comment_banner("Basic parameters") + "\n"
    s += "# Describe the libraries ( one line per library )\n"
    s += "# " + " ".join(header.split()) + "\n"

    libs = get_libs(fnames)
    lib_seqs = []
    rank = 0
    for lib, fs in libs:
        # Libraries reporting size 0 are left out and do not consume a rank.
        if lib.size == 0:
            continue
        rank += 1
        library_name = lib.library_name
        name = library_name.replace("-", "")
        wildcard = "{0}*.1.*,{0}*.2.*".format(library_name)
        # Use the largest read length found across this library's files.
        rl = max(readlen([x]) for x in fs)
        lib_seq = lib.get_lib_seq(wildcard, name, rl, rank)
        lib_seqs.append(lib_seq)

    s += "\n" + "\n".join(load_csv(None, lib_seqs, sep=" ")) + "\n"
    params = [("genome_size", genomesize),
              ("is_diploid", 0),
              ("mer_size", opts.K),
              ("num_prefix_blocks", 1),
              ("no_read_validation", 0),
              ("local_num_procs", opts.cpus)]
    s += "\n" + "\n".join(load_csv(None, params, sep=" ")) + "\n"

    cfgfile = "meraculous.config"
    write_file(cfgfile, s, tee=True)

    # Companion launcher script that points at the config just written.
    s = "~/export/meraculous/bin/run_meraculous.sh -c {0}"\
                .format(cfgfile)
    runsh = "run.sh"
    write_file(runsh, s)
Exemple #10
0
def prepare(args):
    """
    %prog prepare genomesize *.fastq

    Prepare MERACULOUS configuration file. Genome size should be entered in Mb.
    """
    # NOTE: the docstring above (plus FastqNamings) is the CLI usage text; the
    # "configuation" typo in it has been corrected.
    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("-K", default=51, type="int", help="K-mer size")
    p.set_cpus(cpus=32)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    # The size is entered in Mb (see usage); the value written to the config
    # is that number divided by 1000.
    genomesize = float(args[0]) / 1000
    fnames = args[1:]
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    s = comment_banner("Meraculous params file") + "\n"
    s += comment_banner("Basic parameters") + "\n"
    s += "# Describe the libraries ( one line per library )\n"
    s += "# " + " ".join(header.split()) + "\n"

    libs = get_libs(fnames)
    lib_seqs = []
    rank = 0
    for lib, fs in libs:
        # Libraries reporting size 0 are left out and do not consume a rank.
        if lib.size == 0:
            continue
        rank += 1
        library_name = lib.library_name
        name = library_name.replace("-", "")
        wildcard = "{0}*.1.*,{0}*.2.*".format(library_name)
        # Use the largest read length found across this library's files.
        rl = max(readlen([x]) for x in fs)
        lib_seq = lib.get_lib_seq(wildcard, name, rl, rank)
        lib_seqs.append(lib_seq)

    s += "\n" + "\n".join(load_csv(None, lib_seqs, sep=" ")) + "\n"
    params = [("genome_size", genomesize), ("is_diploid", 0),
              ("mer_size", opts.K), ("num_prefix_blocks", 1),
              ("no_read_validation", 0), ("local_num_procs", opts.cpus)]
    s += "\n" + "\n".join(load_csv(None, params, sep=" ")) + "\n"

    cfgfile = "meraculous.config"
    write_file(cfgfile, s, tee=True)

    # Companion launcher script that points at the config just written.
    s = "~/export/meraculous/bin/run_meraculous.sh -c {0}"\
                .format(cfgfile)
    runsh = "run.sh"
    write_file(runsh, s)
Exemple #11
0
def dn(args):
    """
    %prog dn folder

    Run Trinity-DN on a folder of reads. When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain "_1_" and "_2_".
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    p = OptionParser(dn.__doc__)
    p.add_option("--paired", default=False, action="store_true",
                 help="Paired-end mode [default: %default]")
    p.set_home("trinity")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    paired = opts.paired
    thome = opts.trinity_home
    tfolder = folder + "_DN"

    cwd = os.getcwd()
    mkdir(tfolder)
    # Everything below runs inside the `<folder>_DN` directory; the finally
    # clause restores the original directory even if a step raises.
    os.chdir(tfolder)
    try:
        flist = glob("../" + folder + "/*")
        if paired:
            # Split by mate using the "_1_"/".1." and "_2_"/".2." markers.
            f1 = [x for x in flist if "_1_" in x or ".1." in x]
            f2 = [x for x in flist if "_2_" in x or ".2." in x]
            assert len(f1) == len(f2)
            r1, r2 = "left.fastq", "right.fastq"
            reads = ((f1, r1), (f2, r2))
        else:
            r = "single.fastq"
            reads = ((flist, r), )

        # Merge each group of input files into its single target fastq.
        for fl, target in reads:
            fm = FileMerger(fl, target)
            fm.merge(checkexists=True)

        cmd = op.join(thome, "Trinity.pl")
        cmd += " --seqType fq --JM 100G --CPU {0}".format(opts.cpus)
        if paired:
            cmd += " --left {0} --right {1}".format(reads[0][-1], reads[1][-1])
        else:
            cmd += " --single {0}".format(reads[0][-1])

        runfile = "run.sh"
        write_file(runfile, cmd, meta="run script")
    finally:
        os.chdir(cwd)
Exemple #12
0
def snp(args):
    """
    %prog snp reference.fasta

    Run SNP calling on GSNAP native output after apps.gsnap.align --snp. Files
    *native.gz in the current folder will be used as input.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    parser = OptionParser(snp.__doc__)
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    ref = args[0]
    # Fill the `alignsh` template with the CPU count and the reference.
    script = alignsh.format(opts.cpus, ref)
    write_file("align.sh", script)
Exemple #13
0
def tigrload(args):
    """
    %prog tigrload db ev_type

    Load EVM results into TIGR db. Actually, just write a load.sh script. The
    ev_type should be set, e.g. "EVM1", "EVM2", etc.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    parser = OptionParser(tigrload.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    db, ev_type = args

    # Only the script is generated here; nothing is executed.
    script = EVMLOAD.format(db, ev_type)
    write_file("load.sh", script, meta="run script")
Exemple #14
0
def write_libraries(fastqs, aligner=None):
    """
    Write `libraries.txt` with one space-separated line per library.

    Each line holds: `lib<N>` (1-based), the aligner name when one is given,
    the library's file names, `lib.size`, the constant 0.75, and "RF" for
    outward-facing reads or "FR" otherwise. Returns the file name written.
    """
    from jcvi.assembly.base import get_libs

    libs = get_libs(fastqs)
    assert libs

    libtxt = "libraries.txt"
    lines = []
    for index, (lib, fns) in enumerate(libs, start=1):
        joined = " ".join(fns)
        if lib.read_orientation == "outward":
            orientation = "RF"
        else:
            orientation = "FR"
        fields = ["lib{0}".format(index)]
        # The aligner, when provided, goes right after the library name.
        if aligner:
            fields.append(aligner)
        fields.extend([joined, lib.size, 0.75, orientation])
        lines.append(" ".join(str(field) for field in fields))

    write_file(libtxt, "\n".join(lines), tee=True)
    return libtxt
Exemple #15
0
def tigrprepare(args):
    """
    %prog tigrprepare asmbl.fasta asmbl.ids db pasa.terminal_exons.gff3

    Run EVM in TIGR-only mode.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    p = OptionParser(tigrprepare.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    fastafile, asmbl_id, db, pasa_db = args
    # "all" means: derive the ids file from the fasta (regenerated only when
    # out of date); anything else is taken as the path to an ids file.
    if asmbl_id == "all":
        idsfile = fastafile + ".ids"
        if need_update(fastafile, idsfile):
            ids([fastafile, "-o", idsfile])
    else:
        idsfile = asmbl_id

    # Only the first id is needed; read it with a context manager so the
    # file handle is closed instead of leaked.
    with open(idsfile) as fp:
        oneid = next(fp).strip()

    weightsfile = "weights.txt"
    if need_update(idsfile, weightsfile):
        cmd = "$EVM/TIGR-only/create_sample_weights_file.dbi"
        cmd += " {0} {1} | tee weights.txt".format(db, oneid)
        sh(cmd)

    evs = [
        "gene_predictions.gff3",
        "transcript_alignments.gff3",
        "protein_alignments.gff3",
    ]
    if need_update(weightsfile, evs):
        cmd = "$EVM/TIGR-only/write_GFF3_files.dbi"
        cmd += " --db {0} --asmbl_id {1} --weights {2}".format(db, idsfile, weightsfile)
        sh(cmd)

    # Replace the transcript evidence entry with whatever fix_transcript()
    # returns before partitioning and templating.
    evs[1] = fix_transcript()

    partition(evs)
    runfile = "run.sh"
    contents = EVMRUN.format(*evs)
    write_file(runfile, contents)
Exemple #16
0
def array(args):
    """
    %prog array commands.list

    Parallelize a set of commands on grid using array jobs.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    parser = OptionParser(array.__doc__)
    parser.set_grid_opts(array=True)
    parser.set_params(prog="grid")
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    cmds = args[0]
    # Count the lines of the commands list; this becomes the array size.
    with open(cmds) as handle:
        ntasks = sum(1 for _ in handle)

    prefix = cmds.rsplit(".", 1)[0]
    runfile = prefix + ".sh"
    assert runfile != cmds, "Commands list file should not have a `.sh` extension"

    engine = get_grid_engine()
    threaded = opts.threaded or 1
    if engine == "SGE":
        contents = arraysh.format(cmds)
    else:
        contents = arraysh_ua.format(ntasks, threaded, cmds)
    write_file(runfile, contents)

    # For PBS only the run script is written; nothing is submitted from here.
    if engine == "PBS":
        return

    task_id = r"\$TASK_ID"
    job = GridProcess(
        "sh {0}".format(runfile),
        outfile="{0}.{1}.out".format(prefix, task_id),
        errfile="{0}.{1}.err".format(prefix, task_id),
        arr=ntasks,
        extra_opts=opts.extra,
        grid_opts=opts,
    )
    job.start()
Exemple #17
0
def tigrprepare(args):
    """
    %prog tigrprepare asmbl.fasta asmbl.ids db pasa.terminal_exons.gff3

    Run EVM in TIGR-only mode.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    p = OptionParser(tigrprepare.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    fastafile, asmbl_id, db, pasa_db = args
    # "all" means: derive the ids file from the fasta (regenerated only when
    # out of date); anything else is taken as the path to an ids file.
    if asmbl_id == 'all':
        idsfile = fastafile + ".ids"
        if need_update(fastafile, idsfile):
            ids([fastafile, "-o", idsfile])
    else:
        idsfile = asmbl_id

    # Fixed: `open(idsfile).next()` only exists on Python 2 file objects and
    # left the handle open. The builtin next() works on both major versions,
    # and the context manager closes the file.
    with open(idsfile) as fp:
        oneid = next(fp).strip()

    weightsfile = "weights.txt"
    if need_update(idsfile, weightsfile):
        cmd = "$EVM/TIGR-only/create_sample_weights_file.dbi"
        cmd += " {0} {1} | tee weights.txt".format(db, oneid)
        sh(cmd)

    evs = ["gene_predictions.gff3", "transcript_alignments.gff3",
           "protein_alignments.gff3"]
    if need_update(weightsfile, evs):
        cmd = "$EVM/TIGR-only/write_GFF3_files.dbi"
        cmd += " --db {0} --asmbl_id {1} --weights {2}".\
                format(db, idsfile, weightsfile)
        sh(cmd)

    # Replace the transcript evidence entry with whatever fix_transcript()
    # returns before partitioning and templating.
    evs[1] = fix_transcript()

    partition(evs)
    runfile = "run.sh"
    contents = EVMRUN.format(*evs)
    write_file(runfile, contents, meta="run script")
Exemple #18
0
def merge(args):
    """
    %prog merge outdir output.gff

    Follow-up command after grid jobs are completed after parallel().
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    from jcvi.formats.gff import merge as gmerge

    p = OptionParser(merge.__doc__)
    p.set_home("maker")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    outdir, outputgff = args
    fsnames, suffix = get_fsnames(outdir)
    nfs = len(fsnames)
    cmd = op.join(opts.maker_home, "bin/gff3_merge")

    outfile = "merge.sh"
    write_file(outfile, mergesh.format(suffix, cmd))

    # Generate per split directory
    # Note that gff3_merge write to /tmp, so I limit processes here to avoid
    # filling up disk space
    sh("parallel -j 8 merge.sh {} ::: " + " ".join(fsnames))

    # One final output
    gffnames = glob("*.all.gff")
    assert len(gffnames) == nfs

    # Again, DO NOT USE gff3_merge to merge with a smallish /tmp/ area
    gfflist = "gfflist"
    # Context managers make sure the list file is flushed and closed before
    # it is re-read, and that neither handle is leaked.
    with open(gfflist, "w") as fw:
        print("\n".join(gffnames), file=fw)

    with open(gfflist) as fp:
        nlines = sum(1 for _ in fp)
    assert nlines == nfs  # Be extra, extra careful to include all results
    gmerge([gfflist, "-o", outputgff])
    logging.debug("Merged GFF file written to `{0}`".format(outputgff))
Exemple #19
0
def merge(args):
    """
    %prog merge outdir output.gff

    Follow-up command after grid jobs are completed after parallel().
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    from jcvi.formats.gff import merge as gmerge

    p = OptionParser(merge.__doc__)
    p.set_home("maker")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    outdir, outputgff = args
    fsnames, suffix = get_fsnames(outdir)
    nfs = len(fsnames)
    cmd = op.join(opts.maker_home, "bin/gff3_merge")

    outfile = "merge.sh"
    write_file(outfile, mergesh.format(suffix, cmd))

    # Generate per split directory
    # Note that gff3_merge write to /tmp, so I limit processes here to avoid
    # filling up disk space
    sh("parallel -j 8 merge.sh {} ::: " + " ".join(fsnames))

    # One final output
    gffnames = glob("*.all.gff")
    assert len(gffnames) == nfs

    # Again, DO NOT USE gff3_merge to merge with a smallish /tmp/ area
    gfflist = "gfflist"
    # Context managers make sure the list file is flushed and closed before
    # it is re-read, and that neither handle is leaked.
    with open(gfflist, "w") as fw:
        print("\n".join(gffnames), file=fw)

    with open(gfflist) as fp:
        nlines = sum(1 for _ in fp)
    assert nlines == nfs  # Be extra, extra careful to include all results
    gmerge([gfflist, "-o", outputgff])
    logging.debug("Merged GFF file written to `{0}`".format(outputgff))
Exemple #20
0
def close(args):
    """
    %prog close scaffolds.fasta PE*.fastq

    Run GapFiller to fill gaps.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    parser = OptionParser(close.__doc__)
    parser.set_home("gapfiller")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if not args:
        sys.exit(not parser.print_help())

    # First positional is the scaffolds file; the rest are the fastq files.
    scaffolds, fastqs = args[0], args[1:]
    libtxt = write_libraries(fastqs, aligner="bwa")

    # Only the run script is written here; GapFiller is not launched.
    script = op.join(opts.gapfiller_home, "GapFiller.pl")
    flags = " -l {0} -s {1} -T {2}".format(libtxt, scaffolds, opts.cpus)
    write_file("run.sh", "perl " + script + flags)
Exemple #21
0
def close(args):
    """
    %prog close scaffolds.fasta PE*.fastq

    Run GapFiller to fill gaps.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    parser = OptionParser(close.__doc__)
    parser.set_home("gapfiller")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if not args:
        sys.exit(not parser.print_help())

    # First positional is the scaffolds file; the rest are the fastq files.
    scaffolds, fastqs = args[0], args[1:]
    libtxt = write_libraries(fastqs, aligner="bwa")

    # Only the run script is written here; GapFiller is not launched.
    script = op.join(opts.gapfiller_home, "GapFiller.pl")
    flags = " -l {0} -s {1} -T {2}".format(libtxt, scaffolds, opts.cpus)
    write_file("run.sh", "perl " + script + flags)
Exemple #22
0
def scaffold(args):
    """
    %prog scaffold contigs.fasta MP*.fastq

    Run SSPACE scaffolding.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    parser = OptionParser(scaffold.__doc__)
    parser.set_home("sspace")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if not args:
        sys.exit(not parser.print_help())

    # First positional is the contigs file; the rest are the fastq files.
    contigs, fastqs = args[0], args[1:]
    libtxt = write_libraries(fastqs)

    # Only the run script is written here; SSPACE is not launched.
    script = op.join(opts.sspace_home, "SSPACE_Basic_v2.0.pl")
    flags = " -l {0} -s {1} -T {2}".format(libtxt, contigs, opts.cpus)
    write_file("run.sh", "perl " + script + flags)
Exemple #23
0
def prepare(args):
    """
    %prog prepare alignAssembly.config est.fasta ref.fasta

    Generate PASA run script.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    parser = OptionParser(prepare.__doc__)
    parser.set_home("pasa")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    cfg, est, ref = args
    launcher = op.join(opts.pasa_home, "scripts/Launch_PASA_pipeline.pl")
    # Assemble the command line piece by piece, then write it as run.sh.
    pieces = [
        launcher,
        " -c {0} --CPU {1}".format(cfg, opts.cpus),
        " -C -R --ALIGNERS blat,gmap",
        " -t {0} -g {1}".format(est, ref),
    ]
    write_file("run.sh", "".join(pieces), meta="run script")
Exemple #24
0
def prepare(args):
    """
    %prog prepare alignAssembly.config est.fasta ref.fasta

    Generate PASA run script.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    parser = OptionParser(prepare.__doc__)
    parser.set_home("pasa")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    cfg, est, ref = args
    launcher = op.join(opts.pasa_home, "scripts/Launch_PASA_pipeline.pl")
    # Assemble the command line piece by piece, then write it as run.sh.
    pieces = [
        launcher,
        " -c {0} --CPU {1}".format(cfg, opts.cpus),
        " -C -R --ALIGNERS blat,gmap",
        " -t {0} -g {1}".format(est, ref),
    ]
    write_file("run.sh", "".join(pieces), meta="run script")
Exemple #25
0
def main():
    """
    %prog scriptname.py

    create a minimal boilerplate for a new script
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    parser = OptionParser(main.__doc__)
    parser.add_option(
        "--graphic",
        default=False,
        action="store_true",
        help="Create boilerplate for a graphic script",
    )

    opts, args = parser.parse_args()
    if len(args) != 1:
        sys.exit(not parser.print_help())

    script = args[0]
    if opts.graphic:
        template = graphic_template
    else:
        template = default_template
    write_file(script, template, meta="python script")

    # Build the log line, then capitalise only its first character.
    prefix = "graphic " if opts.graphic else ""
    message = prefix + "template writes to `{0}`".format(script)
    logging.debug(message[0].upper() + message[1:])
Exemple #26
0
def scaffold(args):
    """
    %prog scaffold contigs.fasta MP*.fastq

    Run SSPACE scaffolding.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    parser = OptionParser(scaffold.__doc__)
    parser.set_aligner(aligner="bwa")
    parser.set_home("sspace")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if not args:
        sys.exit(not parser.print_help())

    # First positional is the contigs file; the rest are the fastq files.
    contigs, fastqs = args[0], args[1:]
    libtxt = write_libraries(fastqs, aligner=opts.aligner)
    # Requires getopts.pl which may be missing
    download("http://mflib.org/xampp/perl/lib/getopts.pl")

    # Only the run script is written here; SSPACE is not launched.
    script = op.join(opts.sspace_home, "SSPACE_Standard_v3.0.pl")
    flags = " -l {0} -s {1} -T {2}".format(libtxt, contigs, opts.cpus)
    write_file("run.sh", "perl " + script + flags)
Exemple #27
0
def scaffold(args):
    """
    %prog scaffold contigs.fasta MP*.fastq

    Run SSPACE scaffolding.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    parser = OptionParser(scaffold.__doc__)
    parser.set_aligner(aligner="bwa")
    parser.set_home("sspace")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if not args:
        sys.exit(not parser.print_help())

    # First positional is the contigs file; the rest are the fastq files.
    contigs, fastqs = args[0], args[1:]
    libtxt = write_libraries(fastqs, aligner=opts.aligner)
    # Requires getopts.pl which may be missing
    download("http://web.vims.edu/bridge/bridge2/aw/lib/getopts.pl")

    # Only the run script is written here; SSPACE is not launched.
    script = op.join(opts.sspace_home, "SSPACE_Standard_v3.0.pl")
    flags = " -l {0} -s {1} -T {2}".format(libtxt, contigs, opts.cpus)
    write_file("run.sh", "perl " + script + flags)
Exemple #28
0
def assemble(args):
    """
    %prog assemble pasa_db_name genome.fasta transcripts-dn.fasta [transcript-gg.fasta]

    Run the PASA alignment assembly pipeline

    If two transcript fasta files (Trinity denovo and genome guided) are provided
    and the `--compreh` param is enabled, the PASA Comprehensive Transcriptome DB
    protocol is followed <http://pasa.sourceforge.net/#A_ComprehensiveTranscriptome>

    Using the `--prepare` option creates a shell script with the run commands without
    executing the pipeline
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    p = OptionParser(assemble.__doc__)
    p.set_pasa_opts()
    p.add_option("--prepare", default=False, action="store_true",
            help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    pasa_db, genome, dnfasta = args[:3]
    # The genome-guided transcripts file is the optional fourth argument.
    ggfasta = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error("PASA_HOME={0} directory does not exist".format(PASA_HOME))
        # Fixed: exit with a non-zero status. A bare sys.exit() reported
        # success (status 0) to the calling shell on this error path.
        sys.exit(1)

    aligners = opts.aligners.split(",")
    for aligner in aligners:
        if aligner not in ALLOWED_ALIGNERS:
            logging.error("Error: Unknown aligner `{0}`".format(aligner))
            logging.error("Can be any of {0}, ".format("|".join(ALLOWED_ALIGNERS)) +
                    "combine multiple aligners in list separated by comma")
            # Fixed: non-zero status for the same reason as above.
            sys.exit(1)

    clean = opts.clean
    seqclean = op.join(opts.tgi_home, "seqclean")

    accn_extract = which(op.join(PASA_HOME, "misc_utilities",
            "accession_extractor.pl"))
    launch_pasa = which(op.join(PASA_HOME, "scripts",
            "Launch_PASA_pipeline.pl"))
    build_compreh_trans = which(op.join(PASA_HOME, "scripts",
            "build_comprehensive_transcriptome.dbi"))

    fl_accs = opts.fl_accs
    cpus = opts.cpus
    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"
    pctcov, pctid = opts.pctcov, opts.pctid
    compreh_pctid = opts.compreh_pctid
    compreh_pctcov, bpsplice = opts.compreh_pctcov, opts.bpsplice

    # In --prepare mode commands are collected here and written to `run.sh`
    # at the end; otherwise each command is executed through sh() as it is
    # built.
    cmds = []

    # set PASAHOME env variable if preparing shell script
    if prepare:
        env_cmd = 'export PASAHOME="{0}"'.format(PASA_HOME)
        cmds.append(env_cmd)

    if ggfasta:
        # Two transcript sets: merge them and extract the de novo accessions.
        transcripts = FileMerger([dnfasta, ggfasta], tfasta).merge()
        accn_extract_cmd = "cat {0} | {1} > {2}".format(dnfasta, accn_extract, tdn)
        cmds.append(accn_extract_cmd)
        if not prepare:
            sh(accn_extract_cmd)
    else:
        symlink(dnfasta, tfasta)
        transcripts = tfasta

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    # Whatever sh() returns is stored in opts.hold_jid before the next sh()
    # call (presumably a grid job dependency chain -- confirm against sh()).
    prjobid = None
    if clean:
        ccpus = min(cpus, 16)  # the seqclean step is capped at 16 CPUs
        cleancmd = "{0} {1} -c {2} -l 60".format(seqclean, transcripts, ccpus)
        if prepare:
            cmds.append(cleancmd)
        else:
            prjobid = sh(cleancmd, grid=grid, grid_opts=opts)

    aafw = must_open(aaconf, "w")
    print(alignAssembly_conf.format("{0}_pasa".format(pasa_db),
            pctcov, pctid, bpsplice), file=aafw)
    aafw.close()

    symlink(genome, gfasta)

    aacmd = "{0} -c {1} -C -R -g {2}".format(launch_pasa, aaconf, gfasta)
    # With --clean the `.clean` output is used as the transcript input.
    aacmd += " -t {0}.clean -T -u {0}".format(transcripts) if clean else \
             " -t {0}".format(transcripts)
    if fl_accs:
        symlink(fl_accs, flaccs)
        aacmd += " -f {0}".format(flaccs)
    if ggfasta:
        aacmd += " --TDN {0}".format(tdn)
    aacmd += " --ALIGNERS {0} -I {1} --CPU {2}".format(",".join(aligners),
            opts.intron, cpus)

    if prepare:
        cmds.append(aacmd)
    else:
        opts.hold_jid = prjobid
        prjobid = sh(aacmd, grid=grid, grid_opts=opts)

    # The comprehensive transcriptome step needs both --compreh and the
    # genome-guided transcripts file.
    if opts.compreh and ggfasta:
        comprehcmd = "{0} -c {1} -t {2}".format(build_compreh_trans, aaconf, transcripts)
        comprehcmd += " --min_per_ID {0} --min_per_aligned {1}".format(compreh_pctid, compreh_pctcov)

        if prepare:
            cmds.append(comprehcmd)
        else:
            opts.hold_jid = prjobid
            prjobid = sh(comprehcmd, grid=grid, grid_opts=opts)

    if prepare:
        write_file(runfile, "\n".join(cmds))  # initialize run script
Exemple #29
0
def prepare(args):
    """
    %prog prepare [--options] folder [--bam rnaseq.coordSorted.bam]

    Run Trinity on a folder of reads.  When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain the patterns
    ("_1_" and "_2_") or (".1." and ".2.") or ("_1." and "_2.").

    By default, prepare script for DN-Trinity.

    If coord-sorted BAM is provided, prepare script for GG-Trinity, using BAM
    as starting point.

    Newer versions of trinity can take multiple fastq files as input.
    If "--merge" is specified, the fastq files are merged together before assembling
    """
    # The docstring above is handed to OptionParser as the usage text, so
    # maintainer notes are kept in comments instead.
    #
    # args -- command-line tokens; the first positional is the reads folder
    #         (a second positional is accepted but not read by this function).
    # Effect: creates `<prefix>_<DN|GG>/`, optionally merges FASTQ files into
    # it and writes `run.sh` there. Returns None.
    p = OptionParser(prepare.__doc__)
    p.add_option("--paired", default=False, action="store_true",
                 help="Paired-end mode [default: %default]")
    p.add_option("--merge", default=False, action="store_true",
                 help="Merge individual input fastq's into left/right/single" + \
                      " file(s) [default: %default]")
    p.set_trinity_opts()
    p.set_fastq_names()
    p.set_grid()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    inparam, = args[:1]

    paired = opts.paired
    merge = opts.merge
    trinity_home = opts.trinity_home
    hpc_grid_runner_home = opts.hpcgridrunner_home

    # Genome-guided (GG) mode is selected only when --bam names an existing
    # file; otherwise fall back to de novo (DN).
    method = "DN"
    bam = opts.bam
    if bam and op.exists(bam):
        bam = op.abspath(bam)
        method = "GG"

    pf = inparam.split(".")[0]
    tfolder = "{0}_{1}".format(pf, method)

    cwd = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)
    try:
        # Each entry becomes one line of run.sh.
        cmds = []

        # set TRINITY_HOME env variable when preparing shell script
        env_cmd = 'export TRINITY_HOME="{0}"'.format(trinity_home)
        cmds.append(env_cmd)

        if method == "DN":
            # We are now inside tfolder, hence the "../" prefix.
            assert op.exists("../" + inparam)

            flist = iglob("../" + inparam, opts.names)
            if paired:
                # Besides the patterns listed in the usage text, "_R1"/"_R2"
                # are also recognised as mate markers.
                f1 = [x for x in flist if "_1_" in x or ".1." in x or "_1." in x or "_R1" in x]
                f2 = [x for x in flist if "_2_" in x or ".2." in x or "_2." in x or "_R2" in x]
                assert len(f1) == len(f2)
                if merge:
                    r1, r2 = "left.fastq", "right.fastq"
                    reads = ((f1, r1), (f2, r2))
            else:
                if merge:
                    r = "single.fastq"
                    reads = ((flist, r), )

            if merge:
                for fl, r in reads:
                    fm = FileMerger(fl, r)
                    fm.merge(checkexists=True)

        cmd = op.join(trinity_home, "Trinity")
        cmd += " --seqType fq --max_memory {0} --CPU {1}".format(opts.max_memory, opts.cpus)
        cmd += " --min_contig_length {0}".format(opts.min_contig_length)

        if opts.bflyGCThreads:
            cmd += " --bflyGCThreads {0}".format(opts.bflyGCThreads)

        if method == "GG":
            cmd += " --genome_guided_bam {0}".format(bam)
            cmd += " --genome_guided_max_intron {0}".format(opts.max_intron)
        else:
            if paired:
                if merge:
                    cmd += " --left {0} --right {1}".format(reads[0][-1], reads[1][-1])
                else:
                    cmd += " --left {0}".format(",".join(f1))
                    cmd += " --right {0}".format(",".join(f2))
            else:
                if merge:
                    cmd += " --single {0}".format(reads[0][-1])
                else:
                    for f in flist:
                        cmd += " --single {0}".format(f)

        if opts.grid and opts.grid_conf_file:
            hpc_grid_runner = op.join(hpc_grid_runner_home, "hpc_cmds_GridRunner.pl")
            hpc_grid_conf_file = op.join(hpc_grid_runner_home, "hpc_conf", opts.grid_conf_file)
            assert op.exists(hpc_grid_conf_file), "HpcGridRunner conf file does not exist: {0}".format(hpc_grid_conf_file)

            cmd += ' --grid_exec "{0} --grid_conf {1} -c"'.format(hpc_grid_runner, hpc_grid_conf_file)

        if opts.extra:
            cmd += " {0}".format(opts.extra)

        cmds.append(cmd)

        if opts.cleanup:
            # NOTE(review): `!(...)` is extended-glob syntax; presumably the
            # generated script is run by a shell with extglob on -- confirm.
            cleanup_cmd = 'rm -rf !("Trinity.fasta"|"Trinity.gene_trans_map"|"Trinity.timing")' \
                if method == "DN" else \
                'rm -rf !("Trinity-GG.fasta"|"Trinity-GG.gene_trans_map"|"Trinity.timing")'
            # Fix: append to the command *list*; `cmd` is a str and has no
            # .append(), so --cleanup used to raise AttributeError.
            cmds.append(cleanup_cmd)

        runfile = "run.sh"
        write_file(runfile, "\n".join(cmds))
    finally:
        # Always return to the caller's directory, even if a step failed.
        os.chdir(cwd)
Exemple #30
0
def prepare(args):
    """
    %prog prepare *.fastq

    Scan input fastq files (see below) and write SOAP config files based
    on inputfiles. Use "--scaffold contigs.fasta" to perform scaffolding.
    """
    # The docstring (plus FastqNamings) is the usage text, so maintainer notes
    # live in comments.
    #
    # args -- command-line tokens: one or more existing read files + options.
    # Effect: writes `soap.config`, `soap.gc.config` and `run.sh` in the
    # current directory and echoes the config blocks to stderr. Returns None.
    from jcvi.formats.base import write_file

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("-K",
                 default=45,
                 type="int",
                 help="K-mer size [default: %default]")
    p.add_option(
        "--assemble_1st_rank_only",
        default=False,
        action="store_true",
        help=
        "Assemble the first rank only, other libs asm_flags=2 [default: %default]"
    )
    p.add_option("--scaffold",
                 help="Only perform scaffolding [default: %default]")
    p.add_option("--gapclose",
                 help="Only perform gap closure [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fnames = args
    K = opts.K
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    a1st = opts.assemble_1st_rank_only

    cfgfile = "soap.config"
    gc_cfgfile = "soap.gc.config"
    # Context managers guarantee both config files are closed even when one
    # of the helpers below raises.
    with open(cfgfile, "w") as fw, open(gc_cfgfile, "w") as fw_gc:
        libs = get_libs(fnames)
        max_rd_len = max(readlen([f]) for f in fnames)

        block = "max_rd_len={0}\n".format(max_rd_len)
        for stream in (sys.stderr, fw, fw_gc):
            print(block, file=stream)

        # Collect singletons first: libraries with size 0 carry no insert
        # size and are attached to the rank-1 library below.
        singletons = []
        for lib, fs in libs:
            if lib.size == 0:
                singletons += fs

        rank = 0
        for lib, fs in libs:
            size = lib.size
            if size == 0:
                continue

            rank += 1
            block = "[LIB]\n"
            block += "avg_ins={0}\n".format(size)
            block += "reverse_seq={0}\n".format(lib.reverse_seq)
            # With --assemble_1st_rank_only every library after the first is
            # forced to asm_flags=2.
            asm_flags = 2 if (rank > 1 and a1st) else lib.asm_flags
            block += "asm_flags={0}\n".format(asm_flags)
            block += "rank={0}\n".format(rank)
            if lib.reverse_seq:
                pair_num_cutoff = 3
                block += "pair_num_cutoff={0}\n".format(pair_num_cutoff)
            block += "map_len=35\n"

            # NOTE(review): a file name containing neither ".1." nor ".2."
            # leaves `tag` unset (or stale from the previous file); the
            # intended tag for such files is not visible here -- confirm.
            for f in fs:
                if ".1." in f:
                    tag = "q1"
                elif ".2." in f:
                    tag = "q2"
                block += "{0}={1}\n".format(tag, f)

            if rank == 1:
                for s in singletons:
                    tag = "q" if is_fastq(s) else "f"
                    block += tag + "={0}\n".format(s)

            print(block, file=sys.stderr)
            print(block, file=fw)

            # Only libraries with asm_flags above 2 go into the gap-closure
            # config.
            if asm_flags > 2:
                print(block, file=fw_gc)

    runfile = "run.sh"
    scaffold = opts.scaffold
    # Pick the binary by k-mer size: the 63mer build up to K=63, else 127mer.
    bb = 63 if K <= 63 else 127
    binary = "SOAPdenovo-{0}mer".format(bb)
    header = SOAPHEADER.format(opts.cpus, K, binary)
    if opts.gapclose:
        gapclose = opts.gapclose
        outfile = gapclose.rsplit(".", 1)[0] + ".closed.fasta"
        template = header + GCRUNG.format(gapclose, outfile)
    else:
        template = header + (SCFRUN % scaffold if scaffold else SOAPRUN)

    write_file(runfile, template)
Exemple #31
0
def patch(args):
    """
    %prog patch reference.fasta reads.fasta

    Run PBJelly with reference and reads.
    """
    # The docstring is the usage text handed to OptionParser; maintainer notes
    # are kept in comments.
    #
    # args -- command-line tokens: reference FASTA, reads file, options.
    # Effect: copies inputs under `data/`, writes the protocol XML through
    # Protocol.write_xml() and writes `run.sh`. Returns None (also when the
    # environment check fails, after logging an error).
    from jcvi.formats.base import write_file
    from jcvi.formats.fasta import format

    p = OptionParser(patch.__doc__)
    p.add_option("--cleanfasta", default=False, action="store_true",
                 help="Clean FASTA to remove description [default: %default]")
    p.add_option("--highqual", default=False, action="store_true",
                 help="Reads are of high quality [default: %default]")
    p.set_home("pbjelly")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, reads = args
    cmd = op.join(opts.pbjelly_home, "setup.sh")
    # Fix: build `setup` unconditionally. It is the first line of run.sh
    # below, and used to be undefined (NameError) whenever fakeQuals.py was
    # already on PATH.
    setup = "source {0}".format(cmd)
    if not which("fakeQuals.py"):
        sh(setup)

    # Check environment
    try:
        import networkx
        # The attribute lookup itself is the check; it fails on networkx
        # releases that do not expose `version`.
        networkx.version
    except Exception:
        logging.error("You need networkx==1.1 to run PBJELLY")
        return

    try:
        import argparse
    except ImportError:
        logging.error("You need Python2.7 or at least argparse lib")
        return

    pf = ref.rsplit(".", 1)[0]
    pr, px = reads.rsplit(".", 1)
    # Remove description line
    if opts.cleanfasta:
        oref = pf + ".f.fasta"
        oreads = pr + ".f.fasta"
        format([ref, oref])
        format([reads, oreads])
        ref, reads = oref, oreads

    # Check if the FASTA has qual
    ref, refq = fake_quals(ref)
    # Reads whose extension already says FASTQ are used as they are.
    convert_reads = px not in ("fq", "fastq", "txt")
    if convert_reads:
        reads, readsq = fake_quals(reads)
        readsfiles = " ".join((reads, readsq))
    else:
        readsfiles = reads

    # Make directory structure
    dref, dreads = "data/reference", "data/reads"
    sh("mkdir -p {0}".format(dref))
    sh("mkdir -p {0}".format(dreads))
    sh("cp {0} {1}/".format(" ".join((ref, refq)), dref))
    sh("cp {0} {1}/".format(readsfiles, dreads))
    cwd = os.getcwd()

    outputDir = cwd
    reference = op.join(cwd, "{0}/{1}".format(dref, ref))
    reads = op.join(cwd, "{0}/{1}".format(dreads, reads))
    p = Protocol(outputDir, reference, reads, highqual=opts.highqual)
    p.write_xml()

    # Make sure we have the patched version of Extraction.py
    # See discussion <http://seqanswers.com/forums/showthread.php?t=27599>
    # This check has been removed

    # Build the pipeline
    runsh = [setup]
    for action in "setup|mapping|support|extraction".split("|"):
        runsh.append("Jelly.py {0} Protocol.xml".format(action))

    #pcmds = """find assembly -name "ref*" -exec echo \\
    #    "Assembly.py {} \\
    #    > {}/assembly.out 2> {}/assembly.err" \; > commands.list"""
    #runsh.append(pcmds)

    runsh.append("Jelly.py assembly Protocol.xml")
    runsh.append("cp assembly/assembly_chunk0.sh commands.list")
    runsh.append("parallel < commands.list")
    runsh.append("Jelly.py output Protocol.xml")

    runfile = "run.sh"
    contents = "\n".join(runsh)
    write_file(runfile, contents)
Exemple #32
0
def trim(args):
    """
    %prog trim fastqfiles

    Trim reads using TRIMMOMATIC. If two fastqfiles are given, then it invokes
    the paired reads mode. See manual:

    <http://www.usadellab.org/cms/index.php?page=trimmomatic>
    """
    # The docstring is the usage text handed to OptionParser; maintainer notes
    # are kept in comments.
    #
    # args -- command-line tokens: one (single-end) or two (paired-end) FASTQ
    #         files plus options.
    # Effect: makes sure the jar and `adapters.fasta` are available, then runs
    # the assembled java command through sh(). Returns None.
    version = "0.32"
    jar_name = "trimmomatic-{0}.jar".format(version)

    p = OptionParser(trim.__doc__)
    p.add_option("--path", default=op.join("~/bin", jar_name),
                 help="Path to trimmomatic jar file [default: %default]")
    p.add_option("--phred", default=None, choices=("33", "64"),
                 help="Phred score offset [default: guess]")
    p.add_option("--nofrags", default=False, action="store_true",
                 help="Discard frags file in PE mode [default: %default]")
    p.add_option("--minqv", default=15, type="int",
                 help="Average qv after trimming [default: %default]")
    p.add_option("--minlen", default=36, type="int",
                 help="Minimum length after trimming [default: %default]")
    p.add_option("--adapteronly", default=False, action="store_true",
                 help="Only trim adapters with no qv trimming [default: %default]")
    p.add_option("--nogz", default=False, action="store_true",
                 help="Do not write to gzipped files [default: %default]")
    p.add_option("--log", default=None, dest="trimlog",
                 help="Specify a `trimlog` file [default: %default]")
    p.set_cpus(cpus=4)
    opts, args = p.parse_args(args)

    nfiles = len(args)
    if nfiles < 1 or nfiles > 2:
        sys.exit(not p.print_help())

    jar = op.expanduser(opts.path)
    if not op.exists(jar):
        # Jar is not where --path points: download the release archive,
        # unpack it once, and use the jar from the unpacked folder.
        zip_url = "http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-{0}.zip".format(version)
        archive = download(zip_url)
        unzipped = "Trimmomatic-" + version
        if not op.exists(unzipped):
            sh("unzip " + archive)
        os.remove(archive)
        jar = op.join(unzipped, jar_name)

    assert op.exists(jar), \
        "Couldn't find Trimmomatic jar file at `{0}`".format(jar)

    # Drop a copy of the bundled adapter sequences into the working directory.
    adaptersfile = "adapters.fasta"
    adapter_text = must_open(op.join(datadir, adaptersfile)).read()
    write_file(adaptersfile, adapter_text, skipcheck=True)

    assert op.exists(adaptersfile), \
        "Please place the illumina adapter sequence in `{0}`".format(adaptersfile)

    # Quality offset: explicit --phred wins, otherwise guess from first file.
    offset = guessoffset([args[0]]) if opts.phred is None else int(opts.phred)

    frags_ext = ".frags.fastq"
    pairs_ext = ".pairs.fastq"
    if not opts.nogz:
        frags_ext += ".gz"
        pairs_ext += ".gz"

    def stem(path):
        # Base name without ".gz" and without its last extension.
        return op.basename(path).replace(".gz", "").rsplit(".", 1)[0]

    # Space-separated pieces of the final command line, in order.
    pieces = ["java -Xmx4g -jar {0}".format(jar)]
    pieces.append("SE" if nfiles == 1 else "PE")
    pieces.append("-phred{0}".format(offset))
    pieces.append("-threads {0}".format(opts.cpus))
    if opts.trimlog:
        pieces.append("-trimlog {0}".format(opts.trimlog))

    if nfiles == 1:
        fastqfile, = args
        io_files = (fastqfile, stem(fastqfile) + frags_ext)
    else:
        fastqfile1, fastqfile2 = args
        stem1, stem2 = stem(fastqfile1), stem(fastqfile2)
        if opts.nofrags:
            # Unpaired survivors are thrown away.
            frags1 = frags2 = "/dev/null"
        else:
            frags1, frags2 = stem1 + frags_ext, stem2 + frags_ext
        io_files = (fastqfile1, fastqfile2,
                    stem1 + pairs_ext, frags1,
                    stem2 + pairs_ext, frags2)
    pieces.append(" ".join(io_files))

    pieces.append("ILLUMINACLIP:{0}:2:30:10".format(adaptersfile))

    if not opts.adapteronly:
        pieces.append("LEADING:3 TRAILING:3")
        pieces.append("SLIDINGWINDOW:4:{0}".format(opts.minqv))

    pieces.append("MINLEN:{0}".format(opts.minlen))

    # Non-33 input is converted so the output is always phred33.
    if offset != 33:
        pieces.append("TOPHRED33")

    sh(" ".join(pieces))
Exemple #33
0
def prepare(args):
    """
    %prog prepare *.fastq

    Generate run.sh script to run clc_novo_assemble.
    """
    # The docstring (plus FastqNamings) is the usage text, so maintainer notes
    # live in comments.
    #
    # args -- command-line tokens: one or more existing read files + options.
    # Effect: writes `license.properties` and `run.sh` in the current
    # directory. Returns None.
    from itertools import groupby

    from jcvi.assembly.base import FastqNamings, Library

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fnames = args
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    def library_name(x):
        # First two dash-separated tokens of the base name, before any ".".
        return "-".join(op.basename(x).split(".")[0].split("-")[:2])

    # Fix: groupby() only merges *adjacent* items with equal keys, so the
    # input must be sorted by that key; otherwise a library whose files are
    # not contiguous on the command line is split into several groups.
    libs = [(Library(x), sorted(fs)) for x, fs in
            groupby(sorted(fnames, key=library_name), key=library_name)]

    libs.sort(key=lambda x: x[0].size)
    singletons = []
    pairs = []

    write_file("license.properties", CLCLICENSE, skipcheck=True)

    for lib, fs in libs:
        size = lib.size
        stddev = lib.stddev

        # Size 0 marks a library without insert size: plain `-q` input.
        if size == 0:
            singletons += fs
            continue

        # Accepted insert range is mean +/- two standard deviations.
        minsize, maxsize = size - 2 * stddev, size + 2 * stddev

        for f in fs:
            # Files tagged ".corr." are always treated as forward-backward.
            reverse_seq = 0 if ".corr." in f else lib.reverse_seq
            fb = "bf" if reverse_seq else "fb"
            pair_opt = "-p {0} ss {1} {2} ".format(fb, minsize, maxsize)

            if ".1." in f:
                # One `-i` entry with a "?" wildcard covers both mates.
                f = f.replace(".1.", ".?.")
                pairs.append(pair_opt + "-i {0}".format(f))
            elif ".2." in f:
                # Already covered by the wildcard of its ".1." mate.
                continue
            else:
                pairs.append(pair_opt + f)

    cmd = "clc_novo_assemble --cpus {0} -o contigs.fasta \\\n".format(opts.cpus)
    cmd += "\t-q {0} \\\n".format(" ".join(singletons))
    cmd += "\n".join("\t{0} \\".format(x) for x in pairs)

    runfile = "run.sh"
    write_file(runfile, cmd, meta="run script")
Exemple #34
0
def prepare(args):
    """
    %prog prepare barcode_key.csv reference.fasta

    Prepare TASSEL pipeline.
    """
    # The docstring is the usage text handed to OptionParser; maintainer notes
    # are kept in comments.
    #
    # args -- command-line tokens: barcode key file, reference FASTA, options.
    # Effect: creates the pipeline folder layout and writes `run.sh`; nothing
    # is executed here. Returns None.
    #
    # Fix: a "|" was missing at the end of the third literal, which fused two
    # names into the bogus choice "PstI-MspIPstI-TaqI" and made both
    # "PstI-MspI" and "PstI-TaqI" impossible to select.
    valid_enzymes = "ApeKI|ApoI|BamHI|EcoT22I|HinP1I|HpaII|MseI|MspI|" \
                    "NdeI|PasI|PstI|Sau3AI|SbfI|AsiSI-MspI|BssHII-MspI|" \
                    "FseI-MspI|PaeR7I-HhaI|PstI-ApeKI|PstI-EcoT22I|PstI-MspI|" \
                    "PstI-TaqI|SalI-MspI|SbfI-MspI".split("|")
    p = OptionParser(prepare.__doc__)
    p.add_option("--enzyme", default="ApeKI", choices=valid_enzymes,
                 help="Restriction enzyme used [default: %default]")
    p.set_home("tassel")
    p.set_aligner(aligner="bwa")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    barcode, reference = args
    thome = opts.tassel_home
    reference = get_abs_path(reference)
    folders = ("fastq", "tagCounts", "mergedTagCounts", "topm",
               "tbt", "mergedTBT", "hapmap", "hapmap/raw",
               "hapmap/mergedSNPs", "hapmap/filt", "hapmap/bpec")
    for f in folders:
        mkdir(f)

    # Build the pipeline: each entry of `runsh` is one line of run.sh.
    runsh = []
    o = "-i fastq -k {0} -e {1} -o tagCounts".format(barcode, opts.enzyme)
    cmd = run_pipeline(thome, "FastqToTagCountPlugin", o)
    runsh.append(cmd)

    o = "-i tagCounts -o mergedTagCounts/myMasterTags.cnt"
    o += " -c 5 -t mergedTagCounts/myMasterTags.cnt.fq"
    cmd = run_pipeline(thome, "MergeMultipleTagCountPlugin", o)
    runsh.append(cmd)

    # The alignment step runs from inside mergedTagCounts, then steps back.
    runsh.append("cd mergedTagCounts")

    cmd = "python -m jcvi.apps.{0} align --cpus {1}".\
                format(opts.aligner, opts.cpus)
    cmd += " {0} myMasterTags.cnt.fq".format(reference)
    runsh.append(cmd)
    runsh.append("cd ..")

    o = "-i mergedTagCounts/*.sam -o topm/myMasterTags.topm"
    cmd = run_pipeline(thome, "SAMConverterPlugin", o)
    runsh.append(cmd)

    o = "-i mergedTBT/myStudy.tbt.byte -y -m topm/myMasterTags.topm"
    o += " -mUpd topm/myMasterTagsWithVariants.topm"
    o += " -o hapmap/raw/myGBSGenos_chr+.hmp.txt"
    o += " -mnF 0.8 -p myPedigreeFile.ped -mnMAF 0.02 -mnMAC 100000"
    o += " -ref {0} -sC 1 -eC 10".format(reference)
    cmd = run_pipeline(thome, "TagsToSNPByAlignmentPlugin", o)
    runsh.append(cmd)

    o = "-hmp hapmap/raw/myGBSGenos_chr+.hmp.txt"
    o += " -o hapmap/mergedSNPs/myGBSGenos_mergedSNPs_chr+.hmp.txt"
    o += " -misMat 0.1 -p myPedigreeFile.ped -callHets -sC 1 -eC 10"
    cmd = run_pipeline(thome, "MergeDuplicateSNPsPlugin", o)
    runsh.append(cmd)

    o = "-hmp hapmap/mergedSNPs/myGBSGenos_mergedSNPs_chr+.hmp.txt"
    o += " -o hapmap/filt/myGBSGenos_mergedSNPsFilt_chr+.hmp.txt"
    o += " -mnTCov 0.01 -mnSCov 0.2 -mnMAF 0.01 -sC 1 -eC 10"
    #o += "-hLD -mnR2 0.2 -mnBonP 0.005"
    cmd = run_pipeline(thome, "GBSHapMapFiltersPlugin", o)
    runsh.append(cmd)

    runfile = "run.sh"
    write_file(runfile, "\n".join(runsh))
Exemple #35
0
def patch(args):
    """
    %prog patch reference.fasta reads.fasta

    Run PBJelly with reference and reads.
    """
    # The docstring is the usage text handed to OptionParser; maintainer notes
    # are kept in comments.
    #
    # args -- command-line tokens: reference FASTA, reads file, options.
    # Effect: copies inputs under `data/`, writes the protocol XML through
    # Protocol.write_xml() and writes `run.sh`. Returns None (also when the
    # environment check fails, after logging an error).
    from jcvi.formats.base import write_file
    from jcvi.formats.fasta import format

    p = OptionParser(patch.__doc__)
    p.add_option("--cleanfasta", default=False, action="store_true",
                 help="Clean FASTA to remove description [default: %default]")
    p.add_option("--highqual", default=False, action="store_true",
                 help="Reads are of high quality [default: %default]")
    p.set_home("pbjelly")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, reads = args
    cmd = op.join(opts.pbjelly_home, "setup.sh")
    # Fix: build `setup` unconditionally. It is the first line of run.sh
    # below, and used to be undefined (NameError) whenever fakeQuals.py was
    # already on PATH.
    setup = "source {0}".format(cmd)
    if not which("fakeQuals.py"):
        sh(setup)

    # Check environment
    try:
        import networkx
        # The attribute lookup itself is the check; it fails on networkx
        # releases that do not expose `version`.
        networkx.version
    except Exception:
        logging.error("You need networkx==1.1 to run PBJELLY")
        return

    try:
        import argparse
    except ImportError:
        logging.error("You need Python2.7 or at least argparse lib")
        return

    pf = ref.rsplit(".", 1)[0]
    pr, px = reads.rsplit(".", 1)
    # Remove description line
    if opts.cleanfasta:
        oref = pf + ".f.fasta"
        oreads = pr + ".f.fasta"
        format([ref, oref])
        format([reads, oreads])
        ref, reads = oref, oreads

    # Check if the FASTA has qual
    ref, refq = fake_quals(ref)
    # Reads whose extension already says FASTQ are used as they are.
    convert_reads = px not in ("fq", "fastq", "txt")
    if convert_reads:
        reads, readsq = fake_quals(reads)
        readsfiles = " ".join((reads, readsq))
    else:
        readsfiles = reads

    # Make directory structure
    dref, dreads = "data/reference", "data/reads"
    sh("mkdir -p {0}".format(dref))
    sh("mkdir -p {0}".format(dreads))
    sh("cp {0} {1}/".format(" ".join((ref, refq)), dref))
    sh("cp {0} {1}/".format(readsfiles, dreads))
    cwd = os.getcwd()

    outputDir = cwd
    reference = op.join(cwd, "{0}/{1}".format(dref, ref))
    reads = op.join(cwd, "{0}/{1}".format(dreads, reads))
    p = Protocol(outputDir, reference, reads, highqual=opts.highqual)
    p.write_xml()

    # Make sure we have the patched version of Extraction.py
    # See discussion <http://seqanswers.com/forums/showthread.php?t=27599>
    # This check has been removed

    # Build the pipeline
    runsh = [setup]
    for action in "setup|mapping|support|extraction".split("|"):
        runsh.append("Jelly.py {0} Protocol.xml".format(action))

    #pcmds = """find assembly -name "ref*" -exec echo \\
    #    "Assembly.py {} \\
    #    > {}/assembly.out 2> {}/assembly.err" \; > commands.list"""
    #runsh.append(pcmds)

    runsh.append("Jelly.py assembly Protocol.xml")
    runsh.append("cp assembly/assembly_chunk0.sh commands.list")
    runsh.append("parallel < commands.list")
    runsh.append("Jelly.py output Protocol.xml")

    runfile = "run.sh"
    contents = "\n".join(runsh)
    write_file(runfile, contents, meta="run script")
Exemple #36
0
def parallel(args):
    """
    %prog parallel genome.fasta N

    Partition the genome into parts and run separately. This is useful if MAKER
    is to be run on the grid.
    """
    # The docstring is the usage text handed to OptionParser; maintainer notes
    # are kept in comments.
    #
    # args -- command-line tokens: genome FASTA, number of partitions N,
    #         options.
    # Effect: splits the genome, creates one working folder per part with its
    # own control files, and writes `jobs`, `array.sh` and (unless the grid
    # engine is PBS) `qsub.sh`. Returns None.
    from jcvi.formats.base import split

    p = OptionParser(parallel.__doc__)
    p.set_home("maker")
    p.set_tmpdir(tmpdir="tmp")
    p.set_grid_opts(array=True)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    genome, NN = args
    threaded = opts.threaded or 1
    tmpdir = opts.tmpdir

    mkdir(tmpdir)
    tmpdir = get_abs_path(tmpdir)

    N = int(NN)
    # Fix: the message used to say "1 < N" while the check accepts N == 1.
    assert 1 <= N < 1000, "Required: 1 <= N < 1000!"

    outdir = "outdir"
    fs = split([genome, outdir, NN])

    c = CTLFile("maker_opts.ctl")
    c.update_abs_path()
    if threaded > 1:
        c.update_tag("cpus", threaded)

    cwd = os.getcwd()
    dirs = []
    for name in fs.names:
        fn = get_abs_path(name)
        bn = op.basename(name)
        dirs.append(bn)
        # Point this part's control file at its own slice of the genome.
        c.update_tag("genome", fn)
        mkdir(bn)
        sh("cp *.ctl {0}".format(bn))

        # Overwrite the copied maker_opts.ctl with the per-part version.
        os.chdir(bn)
        try:
            c.write_file("maker_opts.ctl")
        finally:
            os.chdir(cwd)

    # One working folder per line.
    jobs = "jobs"
    with open(jobs, "w") as fw:
        print("\n".join(dirs), file=fw)

    # Submit to grid
    ncmds = len(dirs)
    runfile = "array.sh"
    cmd = op.join(opts.maker_home, "bin/maker")
    if tmpdir:
        cmd += " -TMP {0}".format(tmpdir)

    engine = get_grid_engine()
    contents = arraysh.format(jobs, cmd) if engine == "SGE" \
                else arraysh_ua.format(N, threaded, jobs, cmd)
    write_file(runfile, contents)

    if engine == "PBS":
        return

    # qsub script
    # "\\$" has the same runtime value as the former "\$" (a literal
    # backslash + dollar) without the invalid-escape warning.
    outfile = "maker.\\$TASK_ID.out"
    p = GridProcess(runfile,
                    outfile=outfile,
                    errfile=outfile,
                    arr=ncmds,
                    grid_opts=opts)
    qsubfile = "qsub.sh"
    qsub = p.build()
    write_file(qsubfile, qsub)
Exemple #37
0
def align(args):
    """
    %prog align reference fastqfiles

    Use `clc_ref_assemble` to map the read files to a reference. Use a non-zero
    -s option to turn on paired end mode.
    """
    # The docstring is the usage text handed to OptionParser; maintainer notes
    # are kept in comments.
    #
    # args -- command-line tokens: reference, one or more read files, options.
    # Returns a 2-tuple `(outfile, None)` where outfile is the `.cas` path
    # given to the mapper's -o flag.
    p = OptionParser(align.__doc__)
    p.add_option("-o", dest="outfile", default=None,
            help="Output prefix.cas file [default: %default]")
    p.add_option("-s", dest="size", default=0, type="int",
            help="Use paired end mapping with insert [default: %default]")
    p.add_option("--short", default=False, action="store_true",
            help="Use `clc_ref_assemble_short` as the mapper [default: %default]")
    p.add_option("--orientations", default="fb",
            help="The reads have the orientations [default: %default]")
    p.add_option("--fraction", default=0.5,
            help="Fraction of the read that must match [default: %default]")
    p.add_option("--similarity", default=0.95,
            help="Similarity of the matching region [default: %default]")
    p.set_params()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    write_file("license.properties", CLCLICENSE, skipcheck=True)

    ref = args[0]
    assert op.exists(ref)
    fastqfiles = args[1:]
    size = opts.size
    orientations = opts.orientations
    assert orientations in ("fb", "bf", "ff", "bb")

    cmd = "clc_ref_assemble_short" if opts.short else "clc_ref_assemble_long"
    # Default output name is "<reads prefix>.<reference prefix>.cas".
    readprefix = op.basename(fastqfiles[0]).split(".", 1)[0]
    refprefix = op.basename(ref).split(".", 1)[0]
    outfile = opts.outfile or "{0}.{1}".format(readprefix, refprefix)
    if not outfile.endswith(".cas"):
        outfile += ".cas"

    cmd += " --cpus {0}".format(opts.cpus)
    cmd += " -d {0} -o {1} -q ".format(ref, outfile)
    fastqs = " ".join(fastqfiles)
    if size == 0:
        cmd += fastqs
    else:
        assert len(fastqfiles) == 2
        # Fix: floor division keeps the bounds integral. Under Python 3 true
        # division this used to emit floats such as "375.0 625.0".
        stddev = size // 4
        lb, ub = size - stddev, size + stddev
        cmd += " -p {0} ss {1} {2} -i {3} ".format(orientations, lb, ub, fastqs)

    if opts.extra:
        cmd += " " + opts.extra

    # The match-fraction and similarity flags are only passed to the long
    # mapper.
    if not opts.short:
        cmd += " -l {0} -s {1}".format(opts.fraction, opts.similarity)

    sh(cmd)
    return outfile, None
Exemple #38
0
def prepare(args):
    """
    %prog prepare *.fastq

    Generate run.sh script to run clc_novo_assemble.
    """
    # The docstring (plus FastqNamings) is the usage text, so maintainer notes
    # live in comments.
    #
    # args -- command-line tokens: one or more existing read files + options.
    # Effect: writes `license.properties` and `run.sh` in the current
    # directory. Returns None.
    from itertools import groupby

    from jcvi.assembly.base import FastqNamings, Library

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fnames = args
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    def library_name(x):
        # First two dash-separated tokens of the base name, before any ".".
        return "-".join(op.basename(x).split(".")[0].split("-")[:2])

    # Fix: groupby() only merges *adjacent* items with equal keys, so the
    # input must be sorted by that key; otherwise a library whose files are
    # not contiguous on the command line is split into several groups.
    libs = [(Library(x), sorted(fs)) for x, fs in
            groupby(sorted(fnames, key=library_name), key=library_name)]

    libs.sort(key=lambda x: x[0].size)
    singletons = []
    pairs = []

    write_file("license.properties", CLCLICENSE, skipcheck=True)

    for lib, fs in libs:
        size = lib.size
        stddev = lib.stddev

        # Size 0 marks a library without insert size: plain `-q` input.
        if size == 0:
            singletons += fs
            continue

        # Accepted insert range is mean +/- two standard deviations.
        minsize, maxsize = size - 2 * stddev, size + 2 * stddev

        for f in fs:
            # Files tagged ".corr." are always treated as forward-backward.
            reverse_seq = 0 if ".corr." in f else lib.reverse_seq
            fb = "bf" if reverse_seq else "fb"
            pair_opt = "-p {0} ss {1} {2} ".format(fb, minsize, maxsize)

            if ".1." in f:
                # One `-i` entry with a "?" wildcard covers both mates.
                f = f.replace(".1.", ".?.")
                pairs.append(pair_opt + "-i {0}".format(f))
            elif ".2." in f:
                # Already covered by the wildcard of its ".1." mate.
                continue
            else:
                pairs.append(pair_opt + f)

    cmd = "clc_novo_assemble --cpus {0} -o contigs.fasta \\\n".format(opts.cpus)
    cmd += "\t-q {0} \\\n".format(" ".join(singletons))
    cmd += "\n".join("\t{0} \\".format(x) for x in pairs)

    runfile = "run.sh"
    write_file(runfile, cmd)
Exemple #39
0
def maker(args):
    """
    %prog maker maker.gff3 genome.fasta

    Prepare EVM inputs by separating tracks from MAKER.
    """
    # The docstring is the usage text handed to OptionParser; maintainer notes
    # are kept in comments.
    #
    # args -- command-line tokens: MAKER GFF3 file and genome FASTA.
    # Effect: writes `type.ids`, `weights.txt`, one GFF per evidence class and
    # `run.sh`. Returns None.
    from jcvi.formats.base import SetFile, FileShredder

    A, T, P = "ABINITIO_PREDICTION", "TRANSCRIPT", "PROTEIN"
    # Source name -> (evidence class, default weight)
    Registry = {
        "maker": (A, 5),
        "augustus_masked": (A, 1),
        "snap_masked": (A, 1),
        "genemark": (A, 1),
        "est2genome": (T, 5),
        "est_gff": (T, 5),
        "protein2genome": (P, 5),
        "blastx": (P, 1),
    }

    p = OptionParser(maker.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gffile, fastafile = args

    # Distinct values of GFF column 2, refreshed only when stale.
    typesfile = "type.ids"
    if need_update(gffile, typesfile):
        sh("cut -f2 -s {0} | sort -u".format(gffile), outfile=typesfile)

    tracks_by_class = defaultdict(list)
    weight_lines = []
    for source in SetFile(typesfile):
        # Sources may carry a ":suffix"; only the part before it is looked up.
        base = source.split(":")[0]
        if base in Registry:
            ev_class, weight = Registry[base]
            tracks_by_class[ev_class].append(source)
            weight_lines.append("{0}\t{1}\t{2}".format(ev_class, source, weight))

    weight_lines.sort()
    write_file("weights.txt", "\n".join(weight_lines), meta="weights file")

    evs = ["{0}.gff".format(x) for x in (A, T, P)]
    # Clear previous outputs first, because the grep below appends.
    FileShredder(evs)

    for ev_class in tracks_by_class:
        for track in tracks_by_class[ev_class]:
            sh("grep '\t{0}' {1} | grep -v '_match\t' >> {2}.gff".format(track, gffile, ev_class))

    partition(evs)
    write_file("run.sh", EVMRUN.format(*evs), meta="run script")
Exemple #40
0
def patch(args):
    """
    %prog patch reference.fasta reads.fasta

    Run PBJelly with reference and reads.
    """
    # The docstring is the usage text handed to OptionParser; maintainer notes
    # are kept in comments.
    #
    # args -- command-line tokens: reference FASTA, reads file, options.
    # Effect: stages inputs under `data/`, writes the protocol XML through
    # Protocol.write_xml() and writes `run.sh`. Returns None.
    from jcvi.formats.base import write_file
    from jcvi.formats.fasta import format as fasta_format

    p = OptionParser(patch.__doc__)
    p.add_option("--cleanfasta", default=False, action="store_true",
                 help="Clean FASTA to remove description [default: %default]")
    p.add_option("--highqual", default=False, action="store_true",
                 help="Reads are of high quality [default: %default]")
    p.set_home("pbjelly")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, reads = args

    # Source the environment script now only if the tools are not on PATH;
    # the same line always goes first in run.sh.
    setup = "source {0}".format(op.join(opts.pbjelly_home, "setup.sh"))
    if not which("fakeQuals.py"):
        sh(setup)

    ref_stem = ref.rsplit(".", 1)[0]
    reads_stem, reads_ext = reads.rsplit(".", 1)

    # Remove description line
    if opts.cleanfasta:
        clean_ref = ref_stem + ".f.fasta"
        clean_reads = reads_stem + ".f.fasta"
        fasta_format([ref, clean_ref])
        fasta_format([reads, clean_reads])
        ref, reads = clean_ref, clean_reads

    # Check if the FASTA has qual
    ref, refq = fake_quals(ref)
    if reads_ext in ("fq", "fastq", "txt"):
        # Extension says the reads are already FASTQ.
        readsfiles = reads
    else:
        reads, readsq = fake_quals(reads)
        readsfiles = " ".join((reads, readsq))

    # Make directory structure; each input is staged only when missing.
    dref, dreads = "data/reference", "data/reads"
    cwd = os.getcwd()
    reference = op.join(cwd, "/".join((dref, ref)))
    reads = op.join(cwd, "/".join((dreads, reads)))
    if not op.exists(reference):
        sh("mkdir -p {0}".format(dref))
        sh("cp {0} {1}/".format(" ".join((ref, refq)), dref))
    if not op.exists(reads):
        sh("mkdir -p {0}".format(dreads))
        sh("cp {0} {1}/".format(readsfiles, dreads))

    protocol = Protocol(cwd, reference, reads, highqual=opts.highqual)
    protocol.write_xml()

    # Build the pipeline
    stages = ("setup", "mapping", "support", "extraction")
    runsh = [setup]
    runsh.extend("Jelly.py {0} Protocol.xml".format(stage) for stage in stages)
    runsh.append('Jelly.py assembly Protocol.xml -x "--nproc={0}"'.format(opts.cpus))
    runsh.append("Jelly.py output Protocol.xml")

    write_file("run.sh", "\n".join(runsh))
Exemple #41
0
def patch(args):
    """
    %prog patch reference.fasta reads.fasta

    Run PBJelly with reference and reads.
    """
    # NOTE: the docstring above doubles as the usage text given to
    # OptionParser, so its wording is part of the command-line interface.
    from jcvi.formats.base import write_file
    from jcvi.formats.fasta import format

    p = OptionParser(patch.__doc__)
    p.add_option("--cleanfasta", default=False, action="store_true",
                 help="Clean FASTA to remove description [default: %default]")
    p.add_option("--highqual", default=False, action="store_true",
                 help="Reads are of high quality [default: %default]")
    p.set_home("pbjelly")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, reads = args
    nproc = opts.cpus
    setup = "source {0}".format(op.join(opts.pbjelly_home, "setup.sh"))
    # Only run the setup script when fakeQuals.py is not already on the PATH
    if not which("fakeQuals.py"):
        sh(setup)

    ref_stem = ref.rsplit(".", 1)[0]
    reads_stem, reads_ext = reads.rsplit(".", 1)

    # Optionally rewrite both inputs as `<stem>.f.fasta` (description removed)
    if opts.cleanfasta:
        cleaned = []
        for src, stem in ((ref, ref_stem), (reads, reads_stem)):
            dst = stem + ".f.fasta"
            format([src, dst])
            cleaned.append(dst)
        ref, reads = cleaned

    # The reference always goes through fake_quals(); reads only do so when
    # their original extension is not one of the FASTQ-like ones.
    ref, refq = fake_quals(ref)
    if reads_ext in ("fq", "fastq", "txt"):
        readsfiles = reads
    else:
        reads, readsq = fake_quals(reads)
        readsfiles = " ".join((reads, readsq))

    # Stage the inputs under data/reference and data/reads in the current dir
    cwd = os.getcwd()
    refdir, readsdir = "data/reference", "data/reads"
    reference = op.join(cwd, "{0}/{1}".format(refdir, ref))
    reads = op.join(cwd, "{0}/{1}".format(readsdir, reads))

    if not op.exists(reference):
        sh("mkdir -p {0}".format(refdir))
        sh("cp {0} {1}/".format(" ".join((ref, refq)), refdir))

    if not op.exists(reads):
        sh("mkdir -p {0}".format(readsdir))
        sh("cp {0} {1}/".format(readsfiles, readsdir))

    protocol = Protocol(cwd, reference, reads, highqual=opts.highqual)
    protocol.write_xml()

    # Assemble run.sh: environment setup first, then each Jelly.py stage
    stages = ("setup", "mapping", "support", "extraction")
    script = [setup]
    script.extend("Jelly.py {0} Protocol.xml".format(s) for s in stages)
    script.append(
        'Jelly.py assembly Protocol.xml -x "--nproc={0}"'.format(nproc))
    script.append("Jelly.py output Protocol.xml")

    write_file("run.sh", "\n".join(script))
Exemple #42
0
def prepare(args):
    """
    %prog prepare barcode_key.csv reference.fasta

    Prepare TASSEL pipeline.
    """
    # Writes `run.sh`, a shell script chaining the TASSEL GBS plugins, after
    # creating the working folders the plugins read from / write to.
    # Expects exactly two positional args: the barcode key file and the
    # reference FASTA. Returns None.

    # Adjacent string literals are joined with no separator, so every line
    # except the last must end with "|" for split() to yield one enzyme per
    # entry.
    valid_enzymes = "ApeKI|ApoI|BamHI|EcoT22I|HinP1I|HpaII|MseI|MspI|" \
                    "NdeI|PasI|PstI|Sau3AI|SbfI|AsiSI-MspI|BssHII-MspI|" \
                    "FseI-MspI|PaeR7I-HhaI|PstI-ApeKI|PstI-EcoT22I|PstI-MspI|" \
                    "PstI-TaqI|SalI-MspI|SbfI-MspI".split("|")
    p = OptionParser(prepare.__doc__)
    p.add_option("--enzyme",
                 default="ApeKI",
                 choices=valid_enzymes,
                 help="Restriction enzyme used [default: %default]")
    p.set_home("tassel")
    p.set_aligner(aligner="bwa")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    barcode, reference = args
    thome = opts.tassel_home
    # The generated script changes directory before aligning, so the
    # reference path is converted with get_abs_path() up front.
    reference = get_abs_path(reference)
    folders = ("fastq", "tagCounts", "mergedTagCounts", "topm", "tbt",
               "mergedTBT", "hapmap", "hapmap/raw", "hapmap/mergedSNPs",
               "hapmap/filt", "hapmap/bpec")
    for f in folders:
        mkdir(f)

    # Build the pipeline: each step appends one command line to runsh
    runsh = []
    o = "-i fastq -k {0} -e {1} -o tagCounts".format(barcode, opts.enzyme)
    cmd = run_pipeline(thome, "FastqToTagCountPlugin", o)
    runsh.append(cmd)

    o = "-i tagCounts -o mergedTagCounts/myMasterTags.cnt"
    o += " -c 5 -t mergedTagCounts/myMasterTags.cnt.fq"
    cmd = run_pipeline(thome, "MergeMultipleTagCountPlugin", o)
    runsh.append(cmd)
    runsh.append("cd mergedTagCounts")

    # Align the merged tags with the selected jcvi aligner module
    cmd = "python -m jcvi.apps.{0} align --cpus {1}".\
                format(opts.aligner, opts.cpus)
    cmd += " {0} myMasterTags.cnt.fq".format(reference)
    runsh.append(cmd)
    runsh.append("cd ..")

    o = "-i mergedTagCounts/*.sam -o topm/myMasterTags.topm"
    cmd = run_pipeline(thome, "SAMConverterPlugin", o)
    runsh.append(cmd)

    o = "-i mergedTBT/myStudy.tbt.byte -y -m topm/myMasterTags.topm"
    o += " -mUpd topm/myMasterTagsWithVariants.topm"
    o += " -o hapmap/raw/myGBSGenos_chr+.hmp.txt"
    o += " -mnF 0.8 -p myPedigreeFile.ped -mnMAF 0.02 -mnMAC 100000"
    o += " -ref {0} -sC 1 -eC 10".format(reference)
    cmd = run_pipeline(thome, "TagsToSNPByAlignmentPlugin", o)
    runsh.append(cmd)

    o = "-hmp hapmap/raw/myGBSGenos_chr+.hmp.txt"
    o += " -o hapmap/mergedSNPs/myGBSGenos_mergedSNPs_chr+.hmp.txt"
    o += " -misMat 0.1 -p myPedigreeFile.ped -callHets -sC 1 -eC 10"
    cmd = run_pipeline(thome, "MergeDuplicateSNPsPlugin", o)
    runsh.append(cmd)

    o = "-hmp hapmap/mergedSNPs/myGBSGenos_mergedSNPs_chr+.hmp.txt"
    o += " -o hapmap/filt/myGBSGenos_mergedSNPsFilt_chr+.hmp.txt"
    o += " -mnTCov 0.01 -mnSCov 0.2 -mnMAF 0.01 -sC 1 -eC 10"
    #o += "-hLD -mnR2 0.2 -mnBonP 0.005"
    cmd = run_pipeline(thome, "GBSHapMapFiltersPlugin", o)
    runsh.append(cmd)

    runfile = "run.sh"
    write_file(runfile, "\n".join(runsh), meta="run script")
Exemple #43
0
def align(args):
    """
    %prog align reference fastqfiles

    Use `clc_ref_assemble` to map the read files to a reference. Use a non-zero
    -s option to turn on paired end mode.
    """
    # Builds and runs a single clc_ref_assemble_{long,short} command.
    # Returns a tuple (outfile, None), where outfile is the `.cas` file name
    # passed to the mapper via -o.
    p = OptionParser(align.__doc__)
    p.add_option("-o", dest="outfile", default=None,
            help="Output prefix.cas file [default: %default]")
    p.add_option("-s", dest="size", default=0, type="int",
            help="Use paired end mapping with insert [default: %default]")
    p.add_option("--short", default=False, action="store_true",
            help="Use `clc_ref_assemble_short` as the mapper [default: %default]")
    p.add_option("--orientations", default="fb",
            help="The reads have the orientations [default: %default]")
    p.add_option("--fraction", default=0.5,
            help="Fraction of the read that must match [default: %default]")
    p.add_option("--similarity", default=0.95,
            help="Similarity of the matching region [default: %default]")
    p.set_params()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    # The license file is (re)written into the working directory on each call
    write_file("license.properties", CLCLICENSE, skipcheck=True)

    ref = args[0]
    assert op.exists(ref)
    fastqfiles = args[1:]
    size = opts.size
    orientations = opts.orientations
    assert orientations in ("fb", "bf", "ff", "bb")

    cmd = "clc_ref_assemble_short" if opts.short else "clc_ref_assemble_long"
    # Default output name is `<reads prefix>.<reference prefix>.cas`, where a
    # prefix is the basename up to its first dot.
    readprefix = op.basename(fastqfiles[0]).split(".", 1)[0]
    refprefix = op.basename(ref).split(".", 1)[0]
    outfile = opts.outfile or "{0}.{1}".format(readprefix, refprefix)
    if not outfile.endswith(".cas"):
        outfile += ".cas"

    cmd += " --cpus {0}".format(opts.cpus)
    cmd += " -d {0} -o {1} -q ".format(ref, outfile)
    fastqs = " ".join(fastqfiles)
    if size == 0:
        # Unpaired mode: all read files are listed directly after -q
        cmd += fastqs
    else:
        # Paired mode: exactly two files, insert range is size +/- size/4
        assert len(fastqfiles) == 2
        # Floor division keeps the bounds integral on both Python 2 and 3;
        # true division would render them as floats in the command line.
        stddev = size // 4
        lb, ub = size - stddev, size + stddev
        cmd += " -p {0} ss {1} {2} -i {3} ".format(orientations, lb, ub, fastqs)

    if opts.extra:
        cmd += " " + opts.extra

    # -l / -s thresholds are only passed to the long-read mapper
    if not opts.short:
        cmd += " -l {0} -s {1}".format(opts.fraction, opts.similarity)

    sh(cmd)
    return outfile, None
Exemple #44
0
def trim(args):
    """
    %prog trim fastqfiles

    Trim reads using TRIMMOMATIC. If two fastqfiles are given, then it invokes
    the paired reads mode. See manual:

    <http://www.usadellab.org/cms/index.php?page=trimmomatic>
    """
    # NOTE: the docstring above is also the usage text given to OptionParser.
    version = "0.32"
    jarname = "trimmomatic-{0}.jar".format(version)

    p = OptionParser(trim.__doc__)
    p.add_option("--path", default=op.join("~/bin", jarname),
            help="Path to trimmomatic jar file [default: %default]")
    p.add_option("--phred", default=None, choices=("33", "64"),
            help="Phred score offset [default: guess]")
    p.add_option("--nofrags", default=False, action="store_true",
            help="Discard frags file in PE mode [default: %default]")
    p.add_option("--minqv", default=15, type="int",
            help="Average qv after trimming [default: %default]")
    p.add_option("--minlen", default=36, type="int",
            help="Minimum length after trimming [default: %default]")
    p.add_option("--adapteronly", default=False, action="store_true",
            help="Only trim adapters with no qv trimming [default: %default]")
    p.add_option("--nogz", default=False, action="store_true",
            help="Do not write to gzipped files [default: %default]")
    p.add_option("--log", default=None, dest="trimlog",
            help="Specify a `trimlog` file [default: %default]")
    p.set_cpus(cpus=4)
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    # Locate the jar; if it is not at --path, download and unzip the release
    jar = op.expanduser(opts.path)
    if not op.exists(jar):
        url = "http://www.usadellab.org/cms/uploads/supplementary/" \
              "Trimmomatic/Trimmomatic-{0}.zip".format(version)
        archive = download(url)
        unzipped = "Trimmomatic-" + version
        if not op.exists(unzipped):
            sh("unzip " + archive)
        os.remove(archive)
        jar = op.join(unzipped, jarname)

    assert op.exists(jar), \
        "Couldn't find Trimmomatic jar file at `{0}`".format(jar)

    # Copy the bundled adapter sequences into the working directory
    adaptersfile = "adapters.fasta"
    adapter_seqs = must_open(op.join(datadir, adaptersfile)).read()
    write_file(adaptersfile, adapter_seqs, skipcheck=True)

    assert op.exists(adaptersfile), \
        "Please place the illumina adapter sequence in `{0}`".format(adaptersfile)

    # Quality offset: taken from --phred, else guessed from the first file
    offset = guessoffset([args[0]]) if opts.phred is None else int(opts.phred)

    def stem(fastq):
        # basename with every ".gz" removed and the last extension dropped
        return op.basename(fastq).replace(".gz", "").rsplit(".", 1)[0]

    gz = "" if opts.nogz else ".gz"
    frags_suffix = ".frags.fastq" + gz
    pairs_suffix = ".pairs.fastq" + gz

    single_end = len(args) == 1
    parts = ["java -Xmx4g -jar {0}".format(jar)]
    parts.append(" SE" if single_end else " PE")
    parts.append(" -phred{0}".format(offset))
    parts.append(" -threads {0}".format(opts.cpus))
    if opts.trimlog:
        parts.append(" -trimlog {0}".format(opts.trimlog))

    if single_end:
        fastqfile, = args
        io_files = (fastqfile, stem(fastqfile) + frags_suffix)
    else:
        fq1, fq2 = args
        stem1, stem2 = stem(fq1), stem(fq2)
        if opts.nofrags:
            # Unpaired survivors are discarded
            frags1 = frags2 = "/dev/null"
        else:
            frags1 = stem1 + frags_suffix
            frags2 = stem2 + frags_suffix
        io_files = (fq1, fq2,
                    stem1 + pairs_suffix, frags1,
                    stem2 + pairs_suffix, frags2)
    parts.append(" {0}".format(" ".join(io_files)))

    # Trimming steps, in the order they appear on the command line
    parts.append(" ILLUMINACLIP:{0}:2:30:10".format(adaptersfile))
    if not opts.adapteronly:
        parts.append(" LEADING:3 TRAILING:3")
        parts.append(" SLIDINGWINDOW:4:{0}".format(opts.minqv))
    parts.append(" MINLEN:{0}".format(opts.minlen))
    if offset != 33:
        parts.append(" TOPHRED33")

    sh("".join(parts))
Exemple #45
0
def prepare(args):
    """
    %prog prepare [--options] folder [genome.fasta]

    Run Trinity on a folder of reads.  When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain the patterns
    ("_1_" and "_2_") or (".1." and ".2.") or ("_1." and "_2.").

    By default, prepare script for DN

    If genome.fasta is provided, prepare script for GG-Trinity.
    If coord-sorted BAM is provided, then it will use it as starting point.

    Since GG-Trinity jobs are partitioned DN-Trinity jobs run on relatively small
    regions, lesser amount of CPU can be specified for each DN job using `--gg_cpu`
    In such cases, the `--cpu` should be set to a larger value to help speedup
    upstream steps such as GSNAP read mapping or coordinate sorting of BAM files.

    Newer versions of trinity can take multiple fastq files as input.
    If "--merge" is specified, the fastq files are merged together before assembling
    """
    # Creates `<prefix>_<DN|GG>/run.sh` containing one Trinity command line.
    # The working directory is changed into that folder while the script is
    # assembled and is always restored before returning or raising.
    p = OptionParser(prepare.__doc__)
    p.add_option("--paired",
                 default=False,
                 action="store_true",
                 help="Paired-end mode [default: %default]")
    p.add_option("--merge", default=False, action="store_true",
                 help="Merge individual input fastq's into left/right/single" + \
                      " file(s) [default: %default]")
    p.set_trinity_opts()
    p.set_grid()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    inparam, = args[:1]
    assert op.exists(inparam)

    # A second positional argument switches to genome-guided (GG) mode
    genome = args[1] if len(args) == 2 else None
    method = "GG" if genome is not None else "DN"

    paired = opts.paired
    merge = opts.merge
    thome = opts.trinity_home
    use_bam = opts.use_bam
    gg_cpu = opts.gg_cpu

    pf = inparam.split(".")[0]
    tfolder = "{0}_{1}".format(pf, method)

    cwd = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)
    try:
        # Paths are relative to tfolder, hence the "../" prefix
        flist = iglob("../" + inparam, opts.names)
        if paired:
            f1 = [x for x in flist if "_1_" in x or ".1." in x or "_1." in x]
            f2 = [x for x in flist if "_2_" in x or ".2." in x or "_2." in x]
            assert len(f1) == len(f2)
            if merge:
                r1, r2 = "left.fastq", "right.fastq"
                reads = ((f1, r1), (f2, r2))
        else:
            if merge:
                r = "single.fastq"
                reads = ((flist, r), )

        # `reads` is only defined (and only used) when --merge is given
        if merge:
            for fl, r in reads:
                fm = FileMerger(fl, r)
                fm.merge(checkexists=True)

        cmd = op.join(thome, "Trinity")
        cmd += " --seqType fq --max_memory {0} --CPU {1}".format(
            opts.max_memory, opts.cpus)
        cmd += " --min_contig_length {0}".format(opts.min_contig_length)
        if opts.bflyGCThreads:
            cmd += " --bflyGCThreads {0}".format(opts.bflyGCThreads)

        if method == "GG":
            cmd += " --genome {0} --genome_guided_max_intron {1}".format(
                genome, opts.max_intron)
            if use_bam:
                cmd += " --genome_guided_use_bam {0}".format(use_bam)
            if gg_cpu:
                cmd += " --genome_guided_CPU {0}".format(gg_cpu)
        if opts.grid and opts.grid_conf_file:
            cmd += " --grid_conf_file={0}".format(opts.grid_conf_file)

        # Input files: merged file(s), or one flag per individual file
        if paired:
            if merge:
                cmd += " --left {0} --right {1}".format(
                    reads[0][-1], reads[1][-1])
            else:
                for lf, rf in zip(f1, f2):
                    cmd += " --left {0}".format(lf)
                    cmd += " --right {0}".format(rf)
        else:
            if merge:
                cmd += " --single {0}".format(reads[0][-1])
            else:
                for f in flist:
                    cmd += " --single {0}".format(f)
        if opts.extra:
            cmd += " {0}".format(opts.extra)

        cmd += " --bypass_java_version_check"

        runfile = "run.sh"
        write_file(runfile, cmd)
    finally:
        # Return to the starting directory even if a step above failed
        os.chdir(cwd)
Exemple #46
0
def prepare(args):
    """
    %prog prepare [--options] folder [genome.fasta]

    Run Trinity on a folder of reads.  When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain the patterns
    ("_1_" and "_2_") or (".1." and ".2.") or ("_1." and "_2.").

    By default, prepare script for DN

    If genome.fasta is provided, prepare script for GG-Trinity.
    If coord-sorted BAM is provided, then it will use it as starting point.

    Since GG-Trinity jobs are partitioned DN-Trinity jobs run on relatively small
    regions, lesser amount of CPU can be specified for each DN job using `--gg_cpu`
    In such cases, the `--cpu` should be set to a larger value to help speedup
    upstream steps such as GSNAP read mapping or coordinate sorting of BAM files.

    Newer versions of trinity can take multiple fastq files as input.
    If "--merge" is specified, the fastq files are merged together before assembling
    """
    # Creates `<prefix>_<DN|GG>/run.sh` containing one Trinity command line.
    # The working directory is changed into that folder while the script is
    # assembled and is always restored before returning or raising.
    p = OptionParser(prepare.__doc__)
    p.add_option("--paired", default=False, action="store_true",
                 help="Paired-end mode [default: %default]")
    p.add_option("--merge", default=False, action="store_true",
                 help="Merge individual input fastq's into left/right/single" + \
                      " file(s) [default: %default]")
    p.set_trinity_opts()
    p.set_grid()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    inparam, = args[:1]
    # A second positional argument switches to genome-guided (GG) mode
    genome = args[1] if len(args) == 2 else None
    method = "GG" if genome is not None else "DN"

    paired = opts.paired
    merge = opts.merge
    thome = opts.trinity_home
    use_bam = opts.use_bam
    gg_cpu = opts.gg_cpu

    pf = inparam.split(".")[0]
    tfolder = "{0}_{1}".format(pf, method)

    cwd = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)
    try:
        # Paths are relative to tfolder, hence the "../" prefix
        flist = iglob("../" + inparam,
                      "*.fq", "*.fastq", "*.fq.gz", "*.fastq.gz")
        if paired:
            f1 = [x for x in flist if "_1_" in x or ".1." in x or "_1." in x]
            f2 = [x for x in flist if "_2_" in x or ".2." in x or "_2." in x]
            assert len(f1) == len(f2)
            if merge:
                r1, r2 = "left.fastq", "right.fastq"
                reads = ((f1, r1), (f2, r2))
        else:
            if merge:
                r = "single.fastq"
                reads = ((flist, r), )

        # `reads` is only defined (and only used) when --merge is given
        if merge:
            for fl, r in reads:
                fm = FileMerger(fl, r)
                fm.merge(checkexists=True)

        cmd = op.join(thome, "Trinity")
        cmd += " --seqType fq --JM {0} --CPU {1}".format(opts.JM, opts.cpus)
        cmd += " --min_contig_length {0}".format(opts.min_contig_length)
        if opts.bflyGCThreads:
            cmd += " --bflyGCThreads {0}".format(opts.bflyGCThreads)

        if method == "GG":
            cmd += " --genome {0} --genome_guided_max_intron {1}".format(
                genome, opts.max_intron)
            if use_bam:
                cmd += " --genome_guided_use_bam {0}".format(use_bam)
            if gg_cpu:
                cmd += " --genome_guided_CPU {0}".format(gg_cpu)
        if opts.grid and opts.grid_conf_file:
            cmd += " --grid_conf_file={0}".format(opts.grid_conf_file)

        # Input files: merged file(s), or one flag per individual file
        if paired:
            if merge:
                cmd += " --left {0} --right {1}".format(
                    reads[0][-1], reads[1][-1])
            else:
                for lf, rf in zip(f1, f2):
                    cmd += " --left {0}".format(lf)
                    cmd += " --right {0}".format(rf)
        else:
            if merge:
                cmd += " --single {0}".format(reads[0][-1])
            else:
                for f in flist:
                    cmd += " --single {0}".format(f)
        if opts.extra:
            cmd += " {0}".format(opts.extra)

        runfile = "run.sh"
        write_file(runfile, cmd)
    finally:
        # Return to the starting directory even if a step above failed
        os.chdir(cwd)
Exemple #47
0
def snpflow(args):
    """
    %prog snpflow trimmed reference.fasta

    Run SNP calling pipeline until allele_counts are generated. This includes
    generation of native files, SNP_Het file. Speedup for fragmented genomes
    are also supported.
    """
    # Registers every pipeline step with a MakeManager and writes the result
    # via mm.write(); nothing is executed here apart from creating folders
    # and, in supercat mode, writing `speedup.sh`.
    # Each mm.add() call is given (inputs, outputs, command(s)) -- presumably
    # makefile-style dependencies, targets and recipe; confirm in MakeManager.
    p = OptionParser(snpflow.__doc__)
    p.set_fastq_names()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trimmed, ref = args
    nseqs = len(Fasta(ref))
    # References with 1000 or more sequences are handled in "supercat" mode
    supercat = nseqs >= 1000
    if supercat:
        logging.debug("Total seqs in ref: {0} (supercat={1})".\
                      format(nseqs, supercat))

    reads, samples = scan_read_files(trimmed, opts.names)

    # Set up directory structure
    nativedir, countsdir = "native", "allele_counts"
    for d in (nativedir, countsdir):
        mkdir(d)

    mm = MakeManager()
    # Step 0 - index database
    db = op.join(*check_index(ref, supercat=supercat, go=False))
    cmd = "python -m jcvi.apps.gmap index {0}".format(ref)
    if supercat:
        cmd += " --supercat"
        coordsfile = db + ".coords"
        supercatfile = ref.rsplit(".", 1)[0] + ".supercat.fasta"
        mm.add(ref, (db, coordsfile), cmd)
    else:
        mm.add(ref, db, cmd)

    # Step 1 - GSNAP alignment and conversion to native file
    allnatives = []
    allsamstats = []
    gmapdb = supercatfile if supercat else ref
    for f in reads:
        prefix = get_prefix(f, ref)
        gsnapfile = op.join(nativedir, prefix + ".gsnap")
        nativefile = op.join(nativedir, prefix + ".unique.native")
        samstatsfile = op.join(nativedir, prefix + ".unique.sam.stats")
        cmd = "python -m jcvi.apps.gmap align {0} {1}".format(gmapdb, f)
        cmd += " --outdir={0} --native --cpus=1".format(nativedir)
        mm.add((f, db), nativefile, cmd)

        cmd = "python -m jcvi.apps.gmap bam {0} {1} --cpus=1".\
                format(gsnapfile, gmapdb)
        mm.add(nativefile, samstatsfile, cmd)
        allnatives.append(nativefile)
        allsamstats.append(samstatsfile)

    # Step 2 - call SNP discovery
    if supercat:
        # Convert native files to `native-converted` using the coords file,
        # then run SNP discovery through the generated speedup.sh
        nativeconverted = nativedir + "-converted"
        mkdir(nativeconverted)
        allnativesc = [op.join(nativeconverted, op.basename(x)) for x in allnatives]
        cmd = "tGBS-Convert_Pseudo_Genome_NATIVE_Coordinates.pl"
        cmd += " -i {0}/*.native -o {1}".format(nativedir, nativeconverted)
        cmd += " -c {0}".format(coordsfile)
        cmds = ["rm -rf {0}".format(nativeconverted), cmd]
        mm.add(allnatives + [coordsfile], allnativesc, cmds)

        runfile = "speedup.sh"
        write_file(runfile, speedupsh.format(nativeconverted, opts.cpus))
        # From here on all later steps read from the converted folder
        nativedir = nativeconverted
        allsnps = [op.join(nativedir, "{0}.SNPs_Het.txt".format(x)) for x in samples]
        mm.add(allnativesc, allsnps, "./{0}".format(runfile))
    else:
        for s in samples:
            snpfile = op.join(nativedir, "{0}.SNPs_Het.txt".format(s))
            cmd = "SNP_Discovery-short.pl"
            cmd += " -native {0}/{1}.*unique.native".format(nativedir, s)
            cmd += " -o {0} -a 2 -ac 0.3 -c 0.8".format(snpfile)
            # Native files whose basename (up to the first dot) is the sample
            flist = [x for x in allnatives if op.basename(x).split(".")[0] == s]
            mm.add(flist, snpfile, cmd)

    # Step 3 - generate equal file
    allsnps = [op.join(nativedir, "{0}.SNPs_Het.txt".format(x)) for x in samples]
    for s in samples:
        equalfile = op.join(nativedir, "{0}.equal".format(s))
        cmd = "extract_reference_alleles.pl"
        cmd += " --native {0}/{1}.*unique.native".format(nativedir, s)
        cmd += " --genotype {0}/{1}.SNPs_Het.txt".format(nativedir, s)
        cmd += " --allgenotypes {0}/*.SNPs_Het.txt".format(nativedir)
        cmd += " --fasta {0} --output {1}".format(ref, equalfile)
        mm.add(allsnps, equalfile, cmd)

    # Step 4 - generate snp matrix
    allequals = [op.join(nativedir, "{0}.equal".format(x)) for x in samples]
    matrix = "snps.matrix.txt"
    cmd = "generate_matrix.pl"
    cmd += " --tables {0}/*SNPs_Het.txt --equal {0}/*equal".format(nativedir)
    cmd += " --fasta {0} --output {1}".format(ref, matrix)
    mm.add(allsnps + allequals, matrix, cmd)

    # Step 5 - generate allele counts
    allcounts = []
    for s in samples:
        allele_counts = op.join(countsdir, "{0}.SNPs_Het.allele_counts".format(s))
        cmd = "count_reads_per_allele.pl -m snps.matrix.txt"
        cmd += " -s {0} --native {1}/{0}.*unique.native".format(s, nativedir)
        cmd += " -o {0}".format(allele_counts)
        mm.add(matrix, allele_counts, cmd)
        allcounts.append(allele_counts)

    # Step 6 - generate raw snps
    rawsnps = "Genotyping.H3.txt"
    # NOTE(review): the flag must be a literal option name; a value with `*`
    # in it would be glob-expanded by the shell. `--homo 3` matches the "H3"
    # in the output name -- confirm against SamplesGenotyping.pl usage.
    cmd = "/home/shared/scripts/delin/SamplesGenotyping.pl --homo 3"
    cmd += " -pf allele_counts -f {0} --outfile {1}".format(countsdir, rawsnps)
    cmds = ["rm -f {0}".format(rawsnps), cmd]
    mm.add(allcounts, rawsnps, cmds)

    # Step 7 - generate alignment report
    sam_summary = "sam.summary"
    cmd = "/home/shared/scripts/eddyyeh/alignment_stats.pl"
    cmd += " -f {0} -o {1}".format(" ".join(allsamstats), sam_summary)
    mm.add(allsamstats, sam_summary, cmd)

    native_summary = "native.summary"
    cmd = "/home/shared/scripts/eddyyeh/alignment_stats.pl"
    cmd += " -n {0} -o {1}".format(" ".join(allnatives), native_summary)
    mm.add(allnatives, native_summary, cmd)

    mm.write()
Exemple #48
0
def snpflow(args):
    """
    %prog snpflow trimmed reference.fasta

    Run SNP calling pipeline until allele_counts are generated. This includes
    generation of native files, SNP_Het file. Speedup for fragmented genomes
    are also supported.
    """
    # Registers every pipeline step with a MakeManager and writes the result
    # via mm.write(); nothing is executed here apart from creating folders
    # and, in supercat mode, writing `speedup.sh`.
    # Each mm.add() call is given (inputs, outputs, command(s)) -- presumably
    # makefile-style dependencies, targets and recipe; confirm in MakeManager.
    p = OptionParser(snpflow.__doc__)
    p.set_fastq_names()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trimmed, ref = args
    nseqs = len(Fasta(ref))
    # References with 1000 or more sequences are handled in "supercat" mode
    supercat = nseqs >= 1000
    if supercat:
        logging.debug("Total seqs in ref: {0} (supercat={1})".\
                      format(nseqs, supercat))

    reads, samples = scan_read_files(trimmed, opts.names)

    # Set up directory structure
    nativedir, countsdir = "native", "allele_counts"
    for d in (nativedir, countsdir):
        mkdir(d)

    mm = MakeManager()
    # Step 0 - index database
    db = op.join(*check_index(ref, supercat=supercat, go=False))
    cmd = "python -m jcvi.apps.gmap index {0}".format(ref)
    if supercat:
        cmd += " --supercat"
        coordsfile = db + ".coords"
        supercatfile = ref.rsplit(".", 1)[0] + ".supercat.fasta"
        mm.add(ref, (db, coordsfile), cmd)
    else:
        mm.add(ref, db, cmd)

    # Step 1 - GSNAP alignment and conversion to native file
    allnatives = []
    allsamstats = []
    gmapdb = supercatfile if supercat else ref
    for f in reads:
        prefix = get_prefix(f, ref)
        gsnapfile = op.join(nativedir, prefix + ".gsnap")
        nativefile = op.join(nativedir, prefix + ".unique.native")
        samstatsfile = op.join(nativedir, prefix + ".unique.sam.stats")
        cmd = "python -m jcvi.apps.gmap align {0} {1}".format(gmapdb, f)
        cmd += " --outdir={0} --native --cpus=1".format(nativedir)
        mm.add((f, db), nativefile, cmd)

        cmd = "python -m jcvi.apps.gmap bam {0} {1} --cpus=1".\
                format(gsnapfile, gmapdb)
        mm.add(nativefile, samstatsfile, cmd)
        allnatives.append(nativefile)
        allsamstats.append(samstatsfile)

    # Step 2 - call SNP discovery
    if supercat:
        # Convert native files to `native-converted` using the coords file,
        # then run SNP discovery through the generated speedup.sh
        nativeconverted = nativedir + "-converted"
        mkdir(nativeconverted)
        allnativesc = [op.join(nativeconverted, op.basename(x)) for x in allnatives]
        cmd = "tGBS-Convert_Pseudo_Genome_NATIVE_Coordinates.pl"
        cmd += " -i {0}/*.native -o {1}".format(nativedir, nativeconverted)
        cmd += " -c {0}".format(coordsfile)
        cmds = ["rm -rf {0}".format(nativeconverted), cmd]
        mm.add(allnatives + [coordsfile], allnativesc, cmds)

        runfile = "speedup.sh"
        write_file(runfile, speedupsh.format(nativeconverted, opts.cpus))
        # From here on all later steps read from the converted folder
        nativedir = nativeconverted
        allsnps = [op.join(nativedir, "{0}.SNPs_Het.txt".format(x)) for x in samples]
        mm.add(allnativesc, allsnps, "./{0}".format(runfile))
    else:
        for s in samples:
            snpfile = op.join(nativedir, "{0}.SNPs_Het.txt".format(s))
            cmd = "SNP_Discovery-short.pl"
            cmd += " -native {0}/{1}.*unique.native".format(nativedir, s)
            cmd += " -o {0} -a 2 -ac 0.3 -c 0.8".format(snpfile)
            # Native files whose basename (up to the first dot) is the sample
            flist = [x for x in allnatives if op.basename(x).split(".")[0] == s]
            mm.add(flist, snpfile, cmd)

    # Step 3 - generate equal file
    allsnps = [op.join(nativedir, "{0}.SNPs_Het.txt".format(x)) for x in samples]
    for s in samples:
        equalfile = op.join(nativedir, "{0}.equal".format(s))
        cmd = "extract_reference_alleles.pl"
        cmd += " --native {0}/{1}.*unique.native".format(nativedir, s)
        cmd += " --genotype {0}/{1}.SNPs_Het.txt".format(nativedir, s)
        cmd += " --allgenotypes {0}/*.SNPs_Het.txt".format(nativedir)
        cmd += " --fasta {0} --output {1}".format(ref, equalfile)
        mm.add(allsnps, equalfile, cmd)

    # Step 4 - generate snp matrix
    allequals = [op.join(nativedir, "{0}.equal".format(x)) for x in samples]
    matrix = "snps.matrix.txt"
    cmd = "generate_matrix.pl"
    cmd += " --tables {0}/*SNPs_Het.txt --equal {0}/*equal".format(nativedir)
    cmd += " --fasta {0} --output {1}".format(ref, matrix)
    mm.add(allsnps + allequals, matrix, cmd)

    # Step 5 - generate allele counts
    allcounts = []
    for s in samples:
        allele_counts = op.join(countsdir, "{0}.SNPs_Het.allele_counts".format(s))
        cmd = "count_reads_per_allele.pl -m snps.matrix.txt"
        cmd += " -s {0} --native {1}/{0}.*unique.native".format(s, nativedir)
        cmd += " -o {0}".format(allele_counts)
        mm.add(matrix, allele_counts, cmd)
        allcounts.append(allele_counts)

    # Step 6 - generate raw snps
    rawsnps = "Genotyping.H3.txt"
    # NOTE(review): the flag must be a literal option name; a value with `*`
    # in it would be glob-expanded by the shell. `--homo 3` matches the "H3"
    # in the output name -- confirm against SamplesGenotyping.pl usage.
    cmd = "/home/shared/scripts/delin/SamplesGenotyping.pl --homo 3"
    cmd += " -pf allele_counts -f {0} --outfile {1}".format(countsdir, rawsnps)
    cmds = ["rm -f {0}".format(rawsnps), cmd]
    mm.add(allcounts, rawsnps, cmds)

    # Step 7 - generate alignment report
    sam_summary = "sam.summary"
    cmd = "/home/shared/scripts/eddyyeh/alignment_stats.pl"
    cmd += " -f {0} -o {1}".format(" ".join(allsamstats), sam_summary)
    mm.add(allsamstats, sam_summary, cmd)

    native_summary = "native.summary"
    cmd = "/home/shared/scripts/eddyyeh/alignment_stats.pl"
    cmd += " -n {0} -o {1}".format(" ".join(allnatives), native_summary)
    mm.add(allnatives, native_summary, cmd)

    mm.write()
Exemple #49
0
def prepare(args):
    """
    %prog prepare "B. oleracea" *.fastq

    Scan input fastq files (see below) and create `in_groups.csv` and
    `in_libs.csv`. The species name does not really matter.
    """
    # NOTE: the docstring above is reused verbatim as the command-line usage
    # text (it is handed to OptionParser below), so implementation notes are
    # kept in comments rather than in the docstring.
    from jcvi.utils.table import write_csv
    from jcvi.formats.base import write_file
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("--corr", default=False, action="store_true",
                 help="Extra parameters for corrected data [default: %default]")
    p.add_option("--norun", default=False, action="store_true",
                 help="Don't write `run.sh` script [default: %default]")
    p.add_option("--ploidy", default="2", choices=("1", "2"),
                 help="Ploidy [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    organism_name = args[0]
    # Project name is the upper-cased initials of the organism name
    project_name = "".join(x[0] for x in organism_name.split()).upper()
    # When only the organism name is given, pick up fastq files from the cwd
    fnames = sorted(glob("*.fastq*") if len(args) == 1 else args[1:])
    if not fnames:
        # Without this guard `fnames[0]` below dies with a bare IndexError
        logging.error("No fastq files found.")
        sys.exit(1)
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    # The first file sets the quality offset; every other file must agree
    offset = guessoffset([fnames[0]])
    phred64 = offset == 64

    assert all(guessoffset([x]) == offset for x in fnames[1:]), \
        "Input files do not share the same quality offset ({0}).".format(offset)

    groupheader = "group_name library_name file_name".split()
    libheader = "library_name project_name organism_name type paired "\
        "frag_size frag_stddev insert_size insert_stddev read_orientation "\
        "genomic_start genomic_end".split()
    groupcontents = []
    libs = []  # library names in order of first appearance
    for file_name in fnames:
        # Group name is the basename up to the first dot; the library name
        # is made of its first two dash-separated fields
        group_name = op.basename(file_name).split(".")[0]
        library_name = "-".join(group_name.split("-")[:2])

        # Handle paired files and convert to wildcard: the `.1.` file stands
        # for the pair, the `.2.` file is skipped so the pair is listed once
        if ".1." in file_name:
            file_name = file_name.replace(".1.", ".?.")
        elif ".2." in file_name:
            continue

        groupcontents.append((group_name, library_name, file_name))
        if library_name not in libs:
            libs.append(library_name)

    libcontents = []
    for library_name in libs:
        L = Library(library_name)
        libtype = L.type
        # Missing size / stddev are written out as empty cells
        size = L.size or ""
        stddev = L.stddev or ""

        # Fragment libraries fill the frag_* columns, all others insert_*
        is_fragment = libtype == "fragment"
        frag_size = size if is_fragment else ""
        frag_stddev = stddev if is_fragment else ""
        insert_size = "" if is_fragment else size
        insert_stddev = "" if is_fragment else stddev
        genomic_start, genomic_end = "", ""
        libcontents.append((library_name, project_name, organism_name,
                            libtype, L.paired, frag_size, frag_stddev,
                            insert_size, insert_stddev, L.read_orientation,
                            genomic_start, genomic_end))

    write_csv(groupheader, groupcontents, filename="in_groups.csv", tee=True)
    logging.debug("`in_groups.csv` created (# of groups = {0}).".
                  format(len(groupcontents)))

    write_csv(libheader, libcontents, filename="in_libs.csv", tee=True)
    logging.debug("`in_libs.csv` created (# of libs = {0}).".
                  format(len(libcontents)))

    runfile = "run.sh"

    # Extra settings appended to the run script for corrected data (--corr)
    extra = ""
    if opts.corr:
        extra += "FE_NUM_CYCLES=1 EC_K=28 FE_QUAL_CEIL_RADIUS=0"
        extra += " REMOVE_DODGY_READS_FRAG=False FE_MAX_KMER_FREQ_TO_MARK=1"

    if not opts.norun:
        contents = ALLPATHSRUN.format(opts.ploidy, opts.cpus, phred64, extra)
        write_file(runfile, contents)
Exemple #50
0
def compare(args):
    """
    %prog compare pasa_db_name [--annots_gff3=annotation.gff3]

    Run the PASA annotation comparison pipeline

    This assumes that PASA alignment assembly has already been completed and
    run directory contains `genome.fasta` and `transcript.fasta` files.

    If `--annots_gff3` is specified, the PASA database is loaded with the annotations
    first before starting annotation comparison. Otherwise, it uses previously
    loaded annotation data.

    Using the `--prepare` option creates a shell script with the run commands without
    executing the pipeline
    """
    # NOTE: the docstring above is reused as the command-line usage text, so
    # implementation notes are kept in comments.
    p = OptionParser(compare.__doc__)
    p.set_pasa_opts(action="compare")
    p.add_option("--prepare", default=False, action="store_true",
            help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    # Exactly one positional argument is unpacked below; anything else gets
    # the usage message instead of a ValueError from the unpacking
    if len(args) != 1:
        sys.exit(not p.print_help())

    pasa_db, = args

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error("PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit(1)  # non-zero status: this is an error exit

    launch_pasa = which(op.join(PASA_HOME, "scripts",
                                "Launch_PASA_pipeline.pl"))

    annots_gff3 = opts.annots_gff3
    grid = opts.grid
    prepare_only, runfile = opts.prepare, "run.sh"

    # Everything below works with paths relative to the PASA run directory
    os.chdir(pasa_db)

    if prepare_only:
        write_file(runfile, "", append=True, skipcheck=True)  # initialize run script

    # Write the annotation-comparison config file from the template
    acfw = must_open(acconf, "w")
    try:
        print(annotCompare_conf.format("{0}_pasa".format(pasa_db),
              opts.pctovl, opts.pct_coding, opts.pctid_prot, opts.pctlen_FL,
              opts.pctlen_nonFL, opts.orf_size, opts.pct_aln, opts.pctovl_gene,
              opts.stompovl, opts.trust_FL, opts.utr_exons), file=acfw)
    finally:
        acfw.close()

    if not op.exists(gfasta):
        sys.exit("Genome fasta file `{0}` does not exist".format(gfasta))

    transcripts = tfasta
    if not op.exists(transcripts):
        sys.exit("Transcript fasta file `{0}` does not exist".format(transcripts))

    # Use the `.clean` version of the transcripts when one is present
    cleaned = "{0}.clean".format(transcripts)
    if op.exists(cleaned):
        transcripts = cleaned

    accmd = "{0} -c {1} -A -g {2} -t {3} --GENETIC_CODE {4}".format(
        launch_pasa, acconf, gfasta, transcripts, opts.genetic_code)

    if annots_gff3:
        if not op.exists(annots_gff3):
            sys.exit("Annotation gff3 file `{0}` does not exist".format(annots_gff3))
        # Link the annotations under the fixed name and ask PASA to load them
        symlink(annots_gff3, annotation)
        accmd += " -L --annots_gff3 {0}".format(annotation)

    # Either record the command in the run script or execute it right away
    if prepare_only:
        write_file(runfile, accmd, append=True)
    else:
        sh(accmd, grid=grid, grid_opts=opts)
Exemple #51
0
def prepare(args):
    """
    %prog prepare *.fastq

    Scan input fastq files (see below) and write SOAP config files based
    on inputfiles. Use "--scaffold contigs.fasta" to perform scaffolding.
    """
    # NOTE: the docstring above is reused as the command-line usage text, so
    # implementation notes are kept in comments.
    from jcvi.formats.base import write_file

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("-K", default=45, type="int", help="K-mer size [default: %default]")
    p.add_option(
        "--assemble_1st_rank_only",
        default=False,
        action="store_true",
        help="Assemble the first rank only, other libs asm_flags=2 [default: %default]",
    )
    p.add_option("--scaffold", help="Only perform scaffolding [default: %default]")
    p.add_option("--gapclose", help="Only perform gap closure [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fnames = args
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    a1st = opts.assemble_1st_rank_only

    cfgfile = "soap.config"
    gc_cfgfile = "soap.gc.config"

    libs = get_libs(fnames)
    max_rd_len = max(readlen([f]) for f in fnames)

    # Collect singletons first: files of libraries with size 0 are attached
    # to the first ranked library further down
    singletons = []
    for lib, fs in libs:
        if lib.size == 0:
            singletons += fs

    # `with` guarantees both config files are closed even if a step fails
    with open(cfgfile, "w") as fw, open(gc_cfgfile, "w") as fw_gc:
        block = "max_rd_len={0}\n".format(max_rd_len)
        for stream in (sys.stderr, fw, fw_gc):
            print(block, file=stream)

        rank = 0
        for lib, fs in libs:
            size = lib.size
            if size == 0:
                continue

            rank += 1
            block = "[LIB]\n"
            block += "avg_ins={0}\n".format(size)
            block += "reverse_seq={0}\n".format(lib.reverse_seq)
            # With --assemble_1st_rank_only, every library after the first
            # is forced to asm_flags=2
            asm_flags = 2 if (rank > 1 and a1st) else lib.asm_flags
            block += "asm_flags={0}\n".format(asm_flags)
            block += "rank={0}\n".format(rank)
            if lib.reverse_seq:
                pair_num_cutoff = 3
                block += "pair_num_cutoff={0}\n".format(pair_num_cutoff)
            block += "map_len=35\n"

            for f in fs:
                # NOTE(review): a file name with neither `.1.` nor `.2.`
                # leaves `tag` unset (or stale from an earlier file) --
                # confirm the naming convention guarantees one of the two.
                if ".1." in f:
                    tag = "q1"
                elif ".2." in f:
                    tag = "q2"
                block += "{0}={1}\n".format(tag, f)

            if rank == 1:
                for s in singletons:
                    block += "q={0}\n".format(s)

            print(block, file=sys.stderr)
            print(block, file=fw)

            # Only libraries with asm_flags > 2 go into the gap-closing config
            if asm_flags > 2:
                print(block, file=fw_gc)

    runfile = "run.sh"
    scaffold = opts.scaffold
    header = SOAPHEADER.format(opts.cpus, opts.K)
    if opts.gapclose:
        gapclose = opts.gapclose
        outfile = gapclose.rsplit(".", 1)[0] + ".closed.fasta"
        template = header + GCRUNG.format(gapclose, outfile)
    else:
        template = header + (SCFRUN % scaffold if scaffold else SOAPRUN)

    write_file(runfile, template, meta="run script")
Exemple #52
0
def parallel(args):
    """
    %prog parallel genome.fasta N

    Partition the genome into parts and run separately. This is useful if MAKER
    is to be run on the grid.
    """
    # NOTE: the docstring above is reused as the command-line usage text, so
    # implementation notes are kept in comments.
    from jcvi.formats.base import split

    p = OptionParser(parallel.__doc__)
    p.set_home("maker")
    p.set_tmpdir(tmpdir="tmp")
    p.set_grid_opts(array=True)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    genome, NN = args
    threaded = opts.threaded or 1
    tmpdir = opts.tmpdir

    mkdir(tmpdir)
    tmpdir = get_abs_path(tmpdir)

    N = int(NN)
    # Message now matches the condition actually enforced (N == 1 is allowed)
    assert 1 <= N < 1000, "Required: 1 <= N < 1000!"

    outdir = "outdir"
    fs = split([genome, outdir, NN])

    c = CTLFile("maker_opts.ctl")
    c.update_abs_path()
    if threaded > 1:
        c.update_tag("cpus", threaded)

    # One working directory per genome part, each with its own copy of the
    # control files and a `maker_opts.ctl` pointing at that part
    cwd = os.getcwd()
    dirs = []
    for name in fs.names:
        fn = get_abs_path(name)
        bn = op.basename(name)
        dirs.append(bn)
        c.update_tag("genome", fn)
        mkdir(bn)
        sh("cp *.ctl {0}".format(bn))

        os.chdir(bn)
        try:
            c.write_file("maker_opts.ctl")
        finally:
            # Always return to the starting directory, even on failure
            os.chdir(cwd)

    # The jobs file lists one working directory per line
    jobs = "jobs"
    with open(jobs, "w") as fw:
        print("\n".join(dirs), file=fw)

    # Submit to grid
    ncmds = len(dirs)
    runfile = "array.sh"
    cmd = op.join(opts.maker_home, "bin/maker")
    if tmpdir:
        cmd += " -TMP {0}".format(tmpdir)

    engine = get_grid_engine()
    if engine == "SGE":
        contents = arraysh.format(jobs, cmd)
    else:
        contents = arraysh_ua.format(N, threaded, jobs, cmd)
    write_file(runfile, contents)

    if engine == "PBS":
        return

    # qsub script; the backslash is kept literally in front of $TASK_ID
    # (same string value as before, written as a valid escape)
    outfile = "maker.\\$TASK_ID.out"
    p = GridProcess(runfile, outfile=outfile, errfile=outfile,
                    arr=ncmds, grid_opts=opts)
    qsubfile = "qsub.sh"
    qsub = p.build()
    write_file(qsubfile, qsub)