Example #1
def dn(args):
    """
    %prog dn folder

    Run Trinity-DN on a folder of reads. When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain the patterns
    ("_1_" and "_2_") or (".1." and ".2.").
    """
    p = OptionParser(dn.__doc__)
    p.add_option("--paired", default=False, action="store_true",
                 help="Paired-end mode [default: %default]")
    p.set_home("trinity")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    paired = opts.paired
    thome = opts.trinity_home
    tfolder = folder + "_DN"

    cwd = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)

    flist = glob("../" + folder + "/*")
    if paired:
        f1 = [x for x in flist if "_1_" in x or ".1." in x]
        f2 = [x for x in flist if "_2_" in x or ".2." in x]
        assert len(f1) == len(f2)
        r1, r2 = "left.fastq", "right.fastq"
        reads = ((f1, r1), (f2, r2))
    else:
        r = "single.fastq"
        reads = ((flist, r), )

    for fl, r in reads:
        fm = FileMerger(fl, r)
        fm.merge(checkexists=True)

    cmd = op.join(thome, "Trinity.pl")
    cmd += " --seqType fq --JM 100G --CPU {0}".format(opts.cpus)
    if paired:
        cmd += " --left {0} --right {1}".format(reads[0][-1], reads[1][-1])
    else:
        cmd += " --single {0}".format(reads[0][-1])

    runfile = "run.sh"
    write_file(runfile, cmd, meta="run script")
    os.chdir(cwd)
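
Since dn() only writes run.sh rather than executing Trinity, a minimal usage sketch follows. The folder name and CPU count are illustrative, and --cpus is assumed to be the option added by p.set_cpus().

# Hypothetical call (illustrative folder name): merge reads from ../reads into
# reads_DN/left.fastq + right.fastq and write reads_DN/run.sh.
dn(["reads", "--paired", "--cpus=16"])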
Example #2
def build(args):
    """
    %prog build input.bed scaffolds.fasta

    Build associated genome FASTA file and CHAIN file that can be used to lift
    old coordinates to new coordinates. The CHAIN file will be used to lift the
    original marker positions to new positions in the reconstructed genome. The
    new positions of the markers will be reported in *.lifted.bed.
    """
    p = OptionParser(build.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, scaffolds = args
    pf = inputbed.rsplit(".", 1)[0]
    mapbed = pf + ".bed"
    chr_agp = pf + ".chr.agp"
    chr_fasta = pf + ".chr.fasta"
    if need_update((chr_agp, scaffolds), chr_fasta):
        agp_build([chr_agp, scaffolds, chr_fasta])

    unplaced_agp = pf + ".unplaced.agp"
    if need_update((chr_agp, scaffolds), unplaced_agp):
        write_unplaced_agp(chr_agp, scaffolds, unplaced_agp)

    unplaced_fasta = pf + ".unplaced.fasta"
    if need_update((unplaced_agp, scaffolds), unplaced_fasta):
        agp_build([unplaced_agp, scaffolds, unplaced_fasta])

    combined_agp = pf + ".agp"
    if need_update((chr_agp, unplaced_agp), combined_agp):
        FileMerger((chr_agp, unplaced_agp), combined_agp).merge()

    combined_fasta = pf + ".fasta"
    if need_update((chr_fasta, unplaced_fasta), combined_fasta):
        FileMerger((chr_fasta, unplaced_fasta), combined_fasta).merge()

    chainfile = pf + ".chain"
    if need_update((combined_agp, scaffolds, combined_fasta), chainfile):
        fromagp([combined_agp, scaffolds, combined_fasta])

    liftedbed = mapbed.rsplit(".", 1)[0] + ".lifted.bed"
    if need_update((mapbed, chainfile), liftedbed):
        cmd = "liftOver -minMatch=1 {0} {1} {2} unmapped".\
                format(mapbed, chainfile, liftedbed)
        sh(cmd)

    sort([liftedbed, "-i"])  # Sort bed in place
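
A hedged usage sketch, assuming the companion markers.chr.agp already sits next to the input BED; filenames are illustrative only.

# Hypothetical call (illustrative filenames): build markers.fasta and
# markers.chain, then lift markers.bed to markers.lifted.bed via liftOver.
build(["markers.bed", "scaffolds.fasta"])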
Example #3
def merge(args):
    """
    %prog merge folder1 ...

    Consolidate split contents in the folders. The folders can be generated by
    the split() process and several samples may be in separate fastq files. This
    program merges them.
    """
    p = OptionParser(merge.__doc__)
    p.add_option("--outdir",
                 default="outdir",
                 help="Output final reads in [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    folders = args
    outdir = opts.outdir
    mkdir(outdir)

    files = flatten(glob("{0}/*.*.fastq".format(x)) for x in folders)
    files = list(files)
    key = lambda x: op.basename(x).split(".")[0]
    files.sort(key=key)
    for id, fns in groupby(files, key=key):
        fns = list(fns)
        outfile = op.join(outdir, "{0}.fastq".format(id))
        FileMerger(fns, outfile=outfile).merge(checkexists=True)
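
The grouping step above depends on sorting before itertools.groupby. A self-contained sketch of that pattern, with toy paths not taken from the snippet:

# Standalone sketch of the grouping logic: groupby only merges adjacent items,
# so the file list must be sorted by the same key first.
import os.path as op
from itertools import groupby

files = ["a/s1.0.fastq", "b/s2.0.fastq", "a/s1.1.fastq"]
key = lambda x: op.basename(x).split(".")[0]
for sample, fns in groupby(sorted(files, key=key), key=key):
    print(sample, list(fns))  # s1 -> two chunks, s2 -> one chunk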
Example #4
def refine(args):
    """
    %prog refine breakpoints.bed gaps.bed

    Find gaps within or near breakpoint region.
    """
    p = OptionParser(refine.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    breakpointsbed, gapsbed = args
    ncols = len(next(open(breakpointsbed)).split())
    logging.debug("File {0} contains {1} columns.".format(
        breakpointsbed, ncols))
    cmd = "intersectBed -wao -a {0} -b {1}".format(breakpointsbed, gapsbed)

    pf = "{0}.{1}".format(breakpointsbed.split(".")[0], gapsbed.split(".")[0])
    ingapsbed = pf + ".bed"
    sh(cmd, outfile=ingapsbed)

    fp = open(ingapsbed)
    data = [x.split() for x in fp]

    nogapsbed = pf + ".nogaps.bed"
    largestgapsbed = pf + ".largestgaps.bed"
    nogapsfw = open(nogapsbed, "w")
    largestgapsfw = open(largestgapsbed, "w")
    for b, gaps in groupby(data, key=lambda x: x[:ncols]):
        gaps = list(gaps)
        gap = gaps[0]
        if len(gaps) == 1 and gap[-1] == "0":
            assert gap[-2] == "."
            print("\t".join(b), file=nogapsfw)
            continue

        gaps = [(int(x[-1]), x) for x in gaps]
        maxgap = max(gaps)[1]
        print("\t".join(maxgap), file=largestgapsfw)

    nogapsfw.close()
    largestgapsfw.close()

    closestgapsbed = pf + ".closestgaps.bed"
    cmd = "closestBed -a {0} -b {1} -d".format(nogapsbed, gapsbed)
    sh(cmd, outfile=closestgapsbed)

    refinedbed = pf + ".refined.bed"
    FileMerger([largestgapsbed, closestgapsbed], outfile=refinedbed).merge()

    # Clean-up
    toclean = [nogapsbed, largestgapsbed, closestgapsbed]
    FileShredder(toclean)

    return refinedbed
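
The groupby block above keeps, for each breakpoint, the gap with the largest overlap (the last intersectBed -wao column). A self-contained sketch of that selection, with made-up rows:

# Standalone sketch: rows share the first ncols fields (the breakpoint) and end
# with an overlap length; the row with the largest overlap wins.
from itertools import groupby

ncols = 3
rows = [
    ["chr1", "100", "200", "chr1", "150", "180", "30"],
    ["chr1", "100", "200", "chr1", "190", "210", "10"],
]
for bp, hits in groupby(rows, key=lambda x: x[:ncols]):
    best = max(hits, key=lambda x: int(x[-1]))
    print("\t".join(best))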
Example #5
def augustus(args):
    """
    %prog augustus fastafile

    Run parallel AUGUSTUS. Final results can be reformatted using
    annotation.reformat.augustus().
    """
    p = OptionParser(augustus.__doc__)
    p.add_option("--species",
                 default="maize",
                 help="Use species model for prediction")
    p.add_option("--hintsfile", help="Hint-guided AUGUSTUS")
    p.add_option("--nogff3",
                 default=False,
                 action="store_true",
                 help="Turn --gff3=off")
    p.set_home("augustus")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile, ) = args
    cpus = opts.cpus
    mhome = opts.augustus_home
    gff3 = not opts.nogff3
    suffix = ".gff3" if gff3 else ".out"
    cfgfile = op.join(mhome, "config/extrinsic/extrinsic.M.RM.E.W.cfg")

    outdir = mkdtemp(dir=".")
    fs = split([fastafile, outdir, str(cpus)])

    augustuswrap_params = partial(
        augustuswrap,
        species=opts.species,
        gff3=gff3,
        cfgfile=cfgfile,
        hintsfile=opts.hintsfile,
    )
    g = Jobs(augustuswrap_params, fs.names)
    g.run()

    gff3files = [x.rsplit(".", 1)[0] + suffix for x in fs.names]
    outfile = fastafile.rsplit(".", 1)[0] + suffix
    FileMerger(gff3files, outfile=outfile).merge()
    shutil.rmtree(outdir)

    if gff3:
        from jcvi.annotation.reformat import augustus as reformat_augustus

        reformat_outfile = outfile.replace(".gff3", ".reformat.gff3")
        reformat_augustus([outfile, "--outfile={0}".format(reformat_outfile)])
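
The partial + Jobs pattern above fans one worker out over the split chunks. A rough equivalent using only the standard library, with a stand-in worker (fake_augustuswrap is hypothetical, not part of jcvi):

# Minimal fan-out sketch: bind fixed keyword arguments with partial, then map
# the worker over the chunk files (Jobs from jcvi.apps.grid plays this role above).
from functools import partial
from multiprocessing import Pool

def fake_augustuswrap(chunkfile, species="maize", gff3=True):
    return "{0}: species={1}, gff3={2}".format(chunkfile, species, gff3)

if __name__ == "__main__":
    run = partial(fake_augustuswrap, species="maize", gff3=True)
    with Pool(2) as pool:
        print(pool.map(run, ["part.00.fasta", "part.01.fasta"]))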
Example #6
def parallel_musclewrap(clustfile, cpus, minsamp=0):
    musclewrap_minsamp = partial(musclewrap, minsamp=minsamp)
    if cpus == 1:
        return musclewrap_minsamp(clustfile)

    from jcvi.apps.grid import Jobs

    outdir = mkdtemp(dir=".")
    fs = split([clustfile, outdir, str(cpus), "--format=clust"])
    g = Jobs(musclewrap_minsamp, fs.names)
    g.run()

    clustnames = [x.replace(".clust", ".clustS") for x in fs.names]
    clustSfile = clustfile.replace(".clust", ".clustS")
    FileMerger(clustnames, outfile=clustSfile).merge()
    shutil.rmtree(outdir)
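
A hedged usage sketch; the filename and parameter values are illustrative only.

# Hypothetical call: split sample.clust into 8 chunks, align each with
# musclewrap, and merge the chunk outputs back into sample.clustS.
parallel_musclewrap("sample.clust", 8, minsamp=4)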
Example #7
def scaffold(args):
    """
    %prog scaffold ctgfasta reads1.fasta mapping1.bed
                            reads2.fasta mapping2.bed ...

    Run BAMBUS on set of contigs, reads and read mappings.
    """

    from jcvi.formats.base import FileMerger
    from jcvi.formats.bed import mates
    from jcvi.formats.contig import frombed
    from jcvi.formats.fasta import join
    from jcvi.utils.iter import grouper

    p = OptionParser(scaffold.__doc__)
    p.add_option("--conf",
                 help="BAMBUS configuration file [default: %default]")
    p.add_option(
        "--prefix",
        default=False,
        action="store_true",
        help="Only keep links between IDs with same prefix [default: %default]"
    )
    opts, args = p.parse_args(args)

    nargs = len(args)
    if nargs < 3 or nargs % 2 != 1:
        sys.exit(not p.print_help())

    ctgfasta = args[0]
    duos = list(grouper(2, args[1:]))
    trios = []
    for fastafile, bedfile in duos:
        prefix = bedfile.rsplit(".", 1)[0]
        matefile = prefix + ".mates"
        matebedfile = matefile + ".bed"
        if need_update(bedfile, [matefile, matebedfile]):
            matesopt = [bedfile, "--lib", "--nointra"]
            if opts.prefix:
                matesopt += ["--prefix"]
            matefile, matebedfile = mates(matesopt)
        trios.append((fastafile, matebedfile, matefile))

    # Merge the readfasta, bedfile and matefile
    bbfasta, bbbed, bbmate = "bambus.reads.fasta", "bambus.bed", "bambus.mates"

    for files, outfile in zip(zip(*trios), (bbfasta, bbbed, bbmate)):
        FileMerger(files, outfile=outfile).merge(checkexists=True)

    ctgfile = "bambus.contig"
    idsfile = "bambus.ids"
    frombedInputs = [bbbed, ctgfasta, bbfasta]
    if need_update(frombedInputs, ctgfile):
        frombed(frombedInputs)

    inputfasta = "bambus.contigs.fasta"
    singletonfasta = "bambus.singletons.fasta"
    cmd = "faSomeRecords {0} {1} ".format(ctgfasta, idsfile)
    sh(cmd + inputfasta)
    sh(cmd + singletonfasta + " -exclude")

    # Run bambus
    prefix = "bambus"
    cmd = "goBambus -c {0} -m {1} -o {2}".format(ctgfile, bbmate, prefix)
    if opts.conf:
        cmd += " -C {0}".format(opts.conf)
    sh(cmd)

    cmd = "untangle -e {0}.evidence.xml -s {0}.out.xml -o {0}.untangle.xml".\
            format(prefix)
    sh(cmd)

    final = "final"
    cmd = "printScaff -e {0}.evidence.xml -s {0}.untangle.xml -l {0}.lib " \
          "-merge -detail -oo -sum -o {1}".format(prefix, final)
    sh(cmd)

    oofile = final + ".oo"
    join([inputfasta, "--oo={0}".format(oofile)])
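
A hedged usage sketch for the argument layout documented in the docstring (contigs followed by reads/mapping pairs); all filenames are illustrative.

# Hypothetical call: scaffold contigs with two read libraries and their
# read-to-contig mappings.
scaffold(["contigs.fasta",
          "lib1.reads.fasta", "lib1.mapping.bed",
          "lib2.reads.fasta", "lib2.mapping.bed"])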
Example #8
def refine(args):
    """
    %prog refine breakpoints.bed gaps.bed

    Find gaps within or near breakpoint region.

    For breakpoint regions with no gaps, there are two options:
    - Break in the middle of the region
    - Break at the closest gap (--closest)
    """
    p = OptionParser(refine.__doc__)
    p.add_option(
        "--closest",
        default=False,
        action="store_true",
        help="In case of no gaps, use closest",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    breakpointsbed, gapsbed = args
    ncols = len(next(open(breakpointsbed)).split())
    logging.debug("File {0} contains {1} columns.".format(breakpointsbed, ncols))
    cmd = "intersectBed -wao -a {0} -b {1}".format(breakpointsbed, gapsbed)

    pf = "{0}.{1}".format(breakpointsbed.split(".")[0], gapsbed.split(".")[0])
    ingapsbed = pf + ".bed"
    sh(cmd, outfile=ingapsbed)

    fp = open(ingapsbed)
    data = [x.split() for x in fp]

    nogapsbed = pf + ".nogaps.bed"
    largestgapsbed = pf + ".largestgaps.bed"
    nogapsfw = open(nogapsbed, "w")
    largestgapsfw = open(largestgapsbed, "w")
    for b, gaps in groupby(data, key=lambda x: x[:ncols]):
        gaps = list(gaps)
        gap = gaps[0]
        if len(gaps) == 1 and gap[-1] == "0":
            assert gap[-3] == "."
            print("\t".join(b), file=nogapsfw)
            continue

        gaps = [(int(x[-1]), x) for x in gaps]
        maxgap = max(gaps)[1]
        print("\t".join(maxgap), file=largestgapsfw)

    nogapsfw.close()
    largestgapsfw.close()
    beds = [largestgapsbed]
    toclean = [nogapsbed, largestgapsbed]

    if opts.closest:
        closestgapsbed = pf + ".closestgaps.bed"
        cmd = "closestBed -a {0} -b {1} -d".format(nogapsbed, gapsbed)
        sh(cmd, outfile=closestgapsbed)
        beds += [closestgapsbed]
        toclean += [closestgapsbed]
    else:
        pointbed = pf + ".point.bed"
        pbed = Bed()
        bed = Bed(nogapsbed)
        for b in bed:
            pos = (b.start + b.end) // 2
            b.start, b.end = pos, pos
            pbed.append(b)
        pbed.print_to_file(pointbed)
        beds += [pointbed]
        toclean += [pointbed]

    refinedbed = pf + ".refined.bed"
    FileMerger(beds, outfile=refinedbed).merge()

    # Clean-up
    FileShredder(toclean)

    return refinedbed
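
A hedged usage sketch of the two modes described in the docstring; filenames are illustrative.

# Hypothetical calls: default mode breaks gap-less regions at their midpoint;
# --closest breaks at the nearest gap instead.
refine(["breakpoints.bed", "gaps.bed"])
refine(["breakpoints.bed", "gaps.bed", "--closest"])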
Example #9
def assemble(args):
    """
    %prog assemble pasa_db_name genome.fasta transcripts-dn.fasta [transcript-gg.fasta]

    Run the PASA alignment assembly pipeline

    If two transcript fasta files (Trinity de novo and genome guided) are
    provided and the `--compreh` param is enabled, the PASA Comprehensive
    Transcriptome DB protocol is followed
    <http://pasa.sourceforge.net/#A_ComprehensiveTranscriptome>.

    Using the `--prepare` option creates a shell script with the run commands
    without executing the pipeline.
    """
    p = OptionParser(assemble.__doc__)
    p.set_pasa_opts()
    p.add_option("--prepare", default=False, action="store_true",
            help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    pasa_db, genome, dnfasta, = args[:3]
    ggfasta = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error("PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    aligners = opts.aligners.split(",")
    for aligner in aligners:
        if aligner not in ALLOWED_ALIGNERS:
            logging.error("Error: Unknown aligner `{0}`".format(aligner))
            logging.error("Can be any of {0}, ".format("|".join(ALLOWED_ALIGNERS)) + \
                    "combine multiple aligners in list separated by comma")
            sys.exit()

    clean = opts.clean
    seqclean = op.join(opts.tgi_home, "seqclean")

    accn_extract = which(op.join(PASA_HOME, "misc_utilities", \
            "accession_extractor.pl"))
    launch_pasa = which(op.join(PASA_HOME, "scripts", \
            "Launch_PASA_pipeline.pl"))
    build_compreh_trans = which(op.join(PASA_HOME, "scripts", \
            "build_comprehensive_transcriptome.dbi"))

    fl_accs = opts.fl_accs
    cpus = opts.cpus
    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"
    pctcov, pctid = opts.pctcov, opts.pctid
    compreh_pctid = opts.compreh_pctid
    compreh_pctcov, bpsplice = opts.compreh_pctcov, opts.bpsplice

    cmds = []

    # set PASAHOME env variable if preparing shell script
    if prepare:
        env_cmd = 'export PASAHOME="{0}"'.format(PASA_HOME)
        cmds.append(env_cmd)

    if ggfasta:
        transcripts = FileMerger([dnfasta, ggfasta], tfasta).merge()
        accn_extract_cmd = "cat {0} | {1} > {2}".format(dnfasta, accn_extract, tdn)
        cmds.append(accn_extract_cmd)
        if not prepare:
            sh(accn_extract_cmd)
    else:
        symlink(dnfasta, tfasta)
        transcripts = tfasta

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    prjobid = None
    if clean:
        ccpus = 16 if cpus >= 16 else cpus
        cleancmd = "{0} {1} -c {2} -l 60".format(seqclean, transcripts, ccpus)
        if prepare:
            cmds.append(cleancmd)
        else:
            prjobid = sh(cleancmd, grid=grid, grid_opts=opts)

    aafw = must_open(aaconf, "w")
    print(alignAssembly_conf.format("{0}_pasa".format(pasa_db), \
            pctcov, pctid, bpsplice), file=aafw)
    aafw.close()

    symlink(genome, gfasta)

    aacmd = "{0} -c {1} -C -R -g {2}".format(launch_pasa, aaconf, gfasta)
    aacmd += " -t {0}.clean -T -u {0}".format(transcripts) if clean else \
             " -t {0}".format(transcripts)
    if fl_accs:
        symlink(fl_accs, flaccs)
        aacmd += " -f {0}".format(flaccs)
    if ggfasta:
        aacmd += " --TDN {0}".format(tdn)
    aacmd += " --ALIGNERS {0} -I {1} --CPU {2}".format(",".join(aligners), \
            opts.intron, cpus)

    if prepare:
        cmds.append(aacmd)
    else:
        opts.hold_jid = prjobid
        prjobid = sh(aacmd, grid=grid, grid_opts=opts)

    if opts.compreh and ggfasta:
        comprehcmd = "{0} -c {1} -t {2}".format(build_compreh_trans, aaconf, transcripts)
        comprehcmd += " --min_per_ID {0} --min_per_aligned {1}".format(compreh_pctid, compreh_pctcov)

        if prepare:
            cmds.append(comprehcmd)
        else:
            opts.hold_jid = prjobid
            prjobid = sh(comprehcmd, grid=grid, grid_opts=opts)

    if prepare:
        write_file(runfile, "\n".join(cmds))  # initialize run script
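
A hedged usage sketch based on the docstring; the database name and filenames are illustrative, --compreh is assumed to be provided by p.set_pasa_opts(), and --prepare only writes run.sh.

# Hypothetical calls: plain alignment assembly, and the comprehensive-DB route
# with both transcript sets, writing commands to run.sh instead of running them.
assemble(["mydb", "genome.fasta", "trinity-dn.fasta"])
assemble(["mydb", "genome.fasta", "trinity-dn.fasta", "trinity-gg.fasta",
          "--compreh", "--prepare"])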
Example #10
def prepare(args):
    """
    %prog prepare [--options] folder [genome.fasta]

    Run Trinity on a folder of reads.  When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain the patterns
    ("_1_" and "_2_") or (".1." and ".2.") or ("_1." and "_2.").

    By default, prepare script for DN-Trinity.

    If genome.fasta is provided, prepare script for GG-Trinity.
    If a coord-sorted BAM is provided, it will be used as the starting point.

    Since GG-Trinity jobs are partitioned into DN-Trinity jobs that run on
    relatively small regions, fewer CPUs can be assigned to each DN job using
    `--gg_cpu`. In such cases, `--cpu` should be set to a larger value to help
    speed up upstream steps such as GSNAP read mapping or coordinate sorting of
    BAM files.

    Newer versions of Trinity can take multiple fastq files as input.
    If "--merge" is specified, the fastq files are merged together before
    assembling.
    """
    p = OptionParser(prepare.__doc__)
    p.add_option("--paired", default=False, action="store_true",
                 help="Paired-end mode [default: %default]")
    p.add_option("--merge", default=False, action="store_true",
                 help="Merge individual input fastq's into left/right/single" + \
                      " file(s) [default: %default]")
    p.set_trinity_opts()
    p.set_grid()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    inparam, = args[:1]
    genome = args[1] if len(args) == 2 else None
    method = "GG" if genome is not None else "DN"

    paired = opts.paired
    merge = opts.merge
    thome = opts.trinity_home
    use_bam = opts.use_bam
    gg_cpu = opts.gg_cpu

    pf = inparam.split(".")[0]
    tfolder = "{0}_{1}".format(pf, method)

    cwd = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)

    flist = iglob("../" + inparam, "*.fq", "*.fastq", "*.fq.gz", "*.fastq.gz")
    if paired:
        f1 = [x for x in flist if "_1_" in x or ".1." in x or "_1." in x]
        f2 = [x for x in flist if "_2_" in x or ".2." in x or "_2." in x]
        assert len(f1) == len(f2)
        if merge:
            r1, r2 = "left.fastq", "right.fastq"
            reads = ((f1, r1), (f2, r2))
    else:
        if merge:
            r = "single.fastq"
            reads = ((flist, r), )

    if merge:
        for fl, r in reads:
            fm = FileMerger(fl, r)
            fm.merge(checkexists=True)

    cmd = op.join(thome, "Trinity")
    cmd += " --seqType fq --JM {0} --CPU {1}".format(opts.JM, opts.cpus)
    cmd += " --min_contig_length {0}".format(opts.min_contig_length)
    if opts.bflyGCThreads:
        cmd += " --bflyGCThreads {0}".format(opts.bflyGCThreads)

    if method == "GG":
        cmd += " --genome {0} --genome_guided_max_intron {1}".format(genome, opts.max_intron)
        if use_bam:
            cmd += " --genome_guided_use_bam {0}".format(use_bam)
        if gg_cpu:
            cmd += " --genome_guided_CPU {0}".format(gg_cpu)
    if opts.grid and opts.grid_conf_file:
        cmd += " --grid_conf_file={0}".format(opts.grid_conf_file)

    if paired:
        if merge:
            cmd += " --left {0} --right {1}".format(reads[0][-1], reads[1][-1])
        else:
            for lf, rf in zip(f1, f2):
                cmd += " --left {0}".format(lf)
                cmd += " --right {0}".format(rf)
    else:
        if merge:
             cmd += " --single {0}".format(reads[0][-1])
        else:
            for f in flist:
                cmd += " --single {0}".format(f)
    if opts.extra:
        cmd += " {0}".format(opts.extra)

    runfile = "run.sh"
    write_file(runfile, cmd)
    os.chdir(cwd)
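
A hedged usage sketch of the DN and GG modes described in the docstring; the folder and genome names are illustrative, and nothing is executed here beyond writing <prefix>_DN/run.sh or <prefix>_GG/run.sh.

# Hypothetical calls: de novo from a read folder, or genome-guided when a
# genome FASTA is supplied.
prepare(["reads", "--paired", "--merge"])
prepare(["reads", "genome.fasta", "--paired"])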
Example #11
def prepare(args):
    """
    %prog prepare [--options] folder [--bam rnaseq.coordSorted.bam]

    Run Trinity on a folder of reads.  When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain the patterns
    ("_1_" and "_2_") or (".1." and ".2.") or ("_1." and "_2.").

    By default, prepare script for DN-Trinity.

    If coord-sorted BAM is provided, prepare script for GG-Trinity, using BAM
    as starting point.

    Newer versions of Trinity can take multiple fastq files as input.
    If "--merge" is specified, the fastq files are merged together before
    assembling.
    """
    p = OptionParser(prepare.__doc__)
    p.add_option("--paired", default=False, action="store_true",
                 help="Paired-end mode [default: %default]")
    p.add_option("--merge", default=False, action="store_true",
                 help="Merge individual input fastq's into left/right/single" + \
                      " file(s) [default: %default]")
    p.set_trinity_opts()
    p.set_fastq_names()
    p.set_grid()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    inparam, = args[:1]

    paired = opts.paired
    merge = opts.merge
    trinity_home = opts.trinity_home
    hpc_grid_runner_home = opts.hpcgridrunner_home

    method = "DN"
    bam = opts.bam
    if bam and op.exists(bam):
        bam = op.abspath(bam)
        method = "GG"

    pf = inparam.split(".")[0]
    tfolder = "{0}_{1}".format(pf, method)

    cwd = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)

    cmds = []

    # set TRINITY_HOME env variable when preparing shell script
    env_cmd = 'export TRINITY_HOME="{0}"'.format(trinity_home)
    cmds.append(env_cmd)

    if method == "DN":
        assert op.exists("../" + inparam)

        flist = iglob("../" + inparam, opts.names)
        if paired:
            f1 = [x for x in flist if "_1_" in x or ".1." in x or "_1." in x or "_R1" in x]
            f2 = [x for x in flist if "_2_" in x or ".2." in x or "_2." in x or "_R2" in x]
            assert len(f1) == len(f2)
            if merge:
                r1, r2 = "left.fastq", "right.fastq"
                reads = ((f1, r1), (f2, r2))
        else:
            if merge:
                r = "single.fastq"
                reads = ((flist, r), )

        if merge:
            for fl, r in reads:
                fm = FileMerger(fl, r)
                fm.merge(checkexists=True)

    cmd = op.join(trinity_home, "Trinity")
    cmd += " --seqType fq --max_memory {0} --CPU {1}".format(opts.max_memory, opts.cpus)
    cmd += " --min_contig_length {0}".format(opts.min_contig_length)

    if opts.bflyGCThreads:
        cmd += " --bflyGCThreads {0}".format(opts.bflyGCThreads)

    if method == "GG":
        cmd += " --genome_guided_bam {0}".format(bam)
        cmd += " --genome_guided_max_intron {0}".format(opts.max_intron)
    else:
        if paired:
            if merge:
                cmd += " --left {0} --right {1}".format(reads[0][-1], reads[1][-1])
            else:
                cmd += " --left {0}".format(",".join(f1))
                cmd += " --right {0}".format(",".join(f2))
        else:
            if merge:
                 cmd += " --single {0}".format(reads[0][-1])
            else:
                for f in flist:
                    cmd += " --single {0}".format(f)

    if opts.grid and opts.grid_conf_file:
        hpc_grid_runner = op.join(hpc_grid_runner_home, "hpc_cmds_GridRunner.pl")
        hpc_grid_conf_file = op.join(hpc_grid_runner_home, "hpc_conf", opts.grid_conf_file)
        assert op.exists(hpc_grid_conf_file), "HpcGridRunner conf file does not exist: {0}".format(hpc_grid_conf_file)

        cmd += ' --grid_exec "{0} --grid_conf {1} -c"'.format(hpc_grid_runner, hpc_grid_conf_file)

    if opts.extra:
        cmd += " {0}".format(opts.extra)

    cmds.append(cmd)

    if opts.cleanup:
        cleanup_cmd = 'rm -rf !("Trinity.fasta"|"Trinity.gene_trans_map"|"Trinity.timing")' \
            if method == "DN" else \
            'rm -rf !("Trinity-GG.fasta"|"Trinity-GG.gene_trans_map"|"Trinity.timing")'
        cmds.append(cleanup_cmd)

    runfile = "run.sh"
    write_file(runfile, "\n".join(cmds))
    os.chdir(cwd)
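
A hedged usage sketch for this variant, where a coordinate-sorted BAM switches the script to genome-guided mode. Paths are illustrative, and --cleanup is assumed to be one of the options added by p.set_trinity_opts().

# Hypothetical calls: DN from fastq files matched by the --names patterns, or
# GG starting from a coordinate-sorted BAM; run.sh also exports TRINITY_HOME.
prepare(["reads", "--paired"])
prepare(["reads", "--bam=rnaseq.coordSorted.bam", "--cleanup"])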
Example #12
def prepare(args):
    """
    %prog prepare [--options] folder [genome.fasta]

    Run Trinity on a folder of reads.  When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain the patterns
    ("_1_" and "_2_") or (".1." and ".2.") or ("_1." and "_2.").

    By default, prepare script for DN-Trinity.

    If genome.fasta is provided, prepare script for GG-Trinity.
    If a coord-sorted BAM is provided, it will be used as the starting point.

    Since GG-Trinity jobs are partitioned into DN-Trinity jobs that run on
    relatively small regions, fewer CPUs can be assigned to each DN job using
    `--gg_cpu`. In such cases, `--cpu` should be set to a larger value to help
    speed up upstream steps such as GSNAP read mapping or coordinate sorting of
    BAM files.

    Newer versions of Trinity can take multiple fastq files as input.
    If "--merge" is specified, the fastq files are merged together before
    assembling.
    """
    p = OptionParser(prepare.__doc__)
    p.add_option("--paired",
                 default=False,
                 action="store_true",
                 help="Paired-end mode [default: %default]")
    p.add_option("--merge", default=False, action="store_true",
                 help="Merge individual input fastq's into left/right/single" + \
                      " file(s) [default: %default]")
    p.set_trinity_opts()
    p.set_grid()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    inparam, = args[:1]
    assert op.exists(inparam)

    genome = args[1] if len(args) == 2 else None
    method = "GG" if genome is not None else "DN"

    paired = opts.paired
    merge = opts.merge
    thome = opts.trinity_home
    use_bam = opts.use_bam
    gg_cpu = opts.gg_cpu

    pf = inparam.split(".")[0]
    tfolder = "{0}_{1}".format(pf, method)

    cwd = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)

    flist = iglob("../" + inparam, opts.names)
    if paired:
        f1 = [x for x in flist if "_1_" in x or ".1." in x or "_1." in x]
        f2 = [x for x in flist if "_2_" in x or ".2." in x or "_2." in x]
        assert len(f1) == len(f2)
        if merge:
            r1, r2 = "left.fastq", "right.fastq"
            reads = ((f1, r1), (f2, r2))
    else:
        if merge:
            r = "single.fastq"
            reads = ((flist, r), )

    if merge:
        for fl, r in reads:
            fm = FileMerger(fl, r)
            fm.merge(checkexists=True)

    cmd = op.join(thome, "Trinity")
    cmd += " --seqType fq --max_memory {0} --CPU {1}".format(
        opts.max_memory, opts.cpus)
    cmd += " --min_contig_length {0}".format(opts.min_contig_length)
    if opts.bflyGCThreads:
        cmd += " --bflyGCThreads {0}".format(opts.bflyGCThreads)

    if method == "GG":
        cmd += " --genome {0} --genome_guided_max_intron {1}".format(
            genome, opts.max_intron)
        if use_bam:
            cmd += " --genome_guided_use_bam {0}".format(use_bam)
        if gg_cpu:
            cmd += " --genome_guided_CPU {0}".format(gg_cpu)
    if opts.grid and opts.grid_conf_file:
        cmd += " --grid_conf_file={0}".format(opts.grid_conf_file)

    if paired:
        if merge:
            cmd += " --left {0} --right {1}".format(reads[0][-1], reads[1][-1])
        else:
            for lf, rf in zip(f1, f2):
                cmd += " --left {0}".format(lf)
                cmd += " --right {0}".format(rf)
    else:
        if merge:
            cmd += " --single {0}".format(reads[0][-1])
        else:
            for f in flist:
                cmd += " --single {0}".format(f)
    if opts.extra:
        cmd += " {0}".format(opts.extra)

    cmd += " --bypass_java_version_check"

    runfile = "run.sh"
    write_file(runfile, cmd)
    os.chdir(cwd)
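
A hedged usage sketch emphasizing the split CPU budget the docstring describes; paths are illustrative, and the --cpus spelling is assumed to come from the shared option helpers.

# Hypothetical GG call: a larger --cpus for upstream steps, with a smaller
# per-partition budget passed through --gg_cpu.
prepare(["reads", "genome.fasta", "--paired", "--cpus=32", "--gg_cpu=4"])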