Esempio n. 1
0
def blat(args):
    """
    %prog blat old.fasta new.fasta

    Generate psl file using blat.
    """
    p = OptionParser(blat.__doc__)
    p.add_option("--minscore", default=100, type="int",
                 help="Matches minus mismatches gap penalty [default: %default]")
    p.add_option("--minid", default=98, type="int",
                 help="Minimum sequence identity [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    oldfasta, newfasta = args
    twobitfiles = []
    for fastafile in args:
        tbfile = faToTwoBit(fastafile)
        twobitfiles.append(tbfile)

    oldtwobit, newtwobit = twobitfiles
    cmd = "pblat -threads={0}".format(opts.cpus) if which("pblat") else "blat"
    cmd += " {0} {1}".format(oldtwobit, newfasta)
    cmd += " -tileSize=12 -minScore={0} -minIdentity={1} ".\
                format(opts.minscore, opts.minid)
    pslfile = "{0}.{1}.psl".format(*(op.basename(x).split('.')[0] \
                for x in (newfasta, oldfasta)))
    cmd += pslfile
    sh(cmd)
Esempio n. 2
0
def get_cookies(cookies=PHYTOZOME_COOKIES):
    from jcvi.utils.console import console

    # Check if cookies is still good
    if op.exists(cookies) and last_updated(cookies) < 3600:
        return cookies

    if console.is_terminal:
        username = console.input("[bold green]Phytozome Login: "******"[bold green]Phytozome Password: "******"curl")
    if curlcmd is None:
        logging.error("curl command not installed. Aborting.")
        return None
    cmd = "{} https://signon.jgi.doe.gov/signon/create".format(curlcmd)
    cmd += " --data-urlencode 'login={0}' --data-urlencode 'password={1}' -b {2} -c {2}".format(
        username, pw, cookies
    )
    sh(cmd, outfile="/dev/null", errfile="/dev/null", log=False)
    if not op.exists(cookies):
        logging.error("Cookies file `{}` not created. Aborting.".format(cookies))
        return None

    return cookies
Esempio n. 3
0
def blat(args):
    """
    %prog blat old.fasta new.fasta

    Generate psl file using blat.
    """
    p = OptionParser(blat.__doc__)
    p.add_option("--minscore", default=100, type="int",
                 help="Matches minus mismatches gap penalty [default: %default]")
    p.add_option("--minid", default=98, type="int",
                 help="Minimum sequence identity [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    oldfasta, newfasta = args
    twobitfiles = []
    for fastafile in args:
        tbfile = faToTwoBit(fastafile)
        twobitfiles.append(tbfile)

    oldtwobit, newtwobit = twobitfiles
    cmd = "pblat -threads={0}".format(opts.cpus) if which("pblat") else "blat"
    cmd += " {0} {1}".format(oldtwobit, newfasta)
    cmd += " -tileSize=12 -minScore={0} -minIdentity={1} ".\
                format(opts.minscore, opts.minid)
    pslfile = "{0}.{1}.psl".format(*(op.basename(x).split('.')[0] \
                for x in (newfasta, oldfasta)))
    cmd += pslfile
    sh(cmd)
Esempio n. 4
0
def test_get_cookies(mock_username, mock_password):
    from jcvi.apps.fetch import get_cookies, PHYTOZOME_COOKIES
    from jcvi.apps.base import remove_if_exists, which

    remove_if_exists(PHYTOZOME_COOKIES)
    if which("curl"):
        assert get_cookies() == PHYTOZOME_COOKIES
    else:
        assert get_cookies() is None  # errored out with "curl not found"
Esempio n. 5
0
def run_blat(infile=None, outfile=None, db="UniVec_Core", pctid=95, hitlen=50, cpus=16, overwrite=True):

    cmd = "pblat -threads={0}".format(cpus) if which("pblat") else "blat"
    cmd += " {0} {1} -out=blast8 {2}".format(db, infile, outfile)
    sh(cmd)

    blatfile = outfile
    filtered_blatfile = outfile + ".P{0}L{1}".format(pctid, hitlen)
    run_blast_filter(infile=blatfile, outfile=filtered_blatfile, pctid=pctid, hitlen=hitlen)
    if overwrite:
        shutil.move(filtered_blatfile, blatfile)
Esempio n. 6
0
File: tsp.py Progetto: Hensonmw/jcvi
    def run_concorde(self, tspfile, seed=666):
        outfile = op.join(self.work_dir, "data.sol")
        if op.exists(outfile):
            os.remove(outfile)

        cc = "concorde"
        assert which(cc), "You must install `concorde` on your PATH" + \
                          " [http://www.math.uwaterloo.ca/tsp/concorde.html]"
        cmd = "{0} -s {1} -x -o {2} {3}".format(cc, seed, outfile, tspfile)

        outf = None if self.verbose else "/dev/null"
        retcode = sh(cmd, outfile=outf, errfile=outf)
        return retcode, outfile
Esempio n. 7
0
def run_blat(infile=None, outfile=None, db="UniVec_Core",
             pctid=95, hitlen=50, cpus=16, overwrite=True):

    cmd = "pblat -threads={0}".format(cpus) if which("pblat") else "blat"
    cmd += ' {0} {1} -out=blast8 {2}'.format(db, infile, outfile)
    sh(cmd)

    blatfile = outfile
    filtered_blatfile = outfile + ".P{0}L{1}".format(pctid, hitlen)
    run_blast_filter(infile=blatfile, outfile=filtered_blatfile,
            pctid=pctid, hitlen=hitlen)
    if overwrite:
        shutil.move(filtered_blatfile, blatfile)
Esempio n. 8
0
    def run_concorde(self, tspfile, seed=666):
        outfile = op.join(self.work_dir, "data.sol")
        if op.exists(outfile):
            os.remove(outfile)

        cc = "concorde"
        assert which(cc), ("You must install `concorde` on your PATH" +
                           " [http://www.math.uwaterloo.ca/tsp/concorde.html]")
        cmd = "{0} -s {1} -x -o {2} {3}".format(cc, seed, outfile, tspfile)

        outf = None if self.verbose else "/dev/null"
        retcode = sh(cmd, outfile=outf, errfile=outf)
        return retcode, outfile
Esempio n. 9
0
def getpath(cmd, name=None, url=None, cfg="~/.jcvirc"):
    """
    Get install locations of common binaries
    First, check ~/.jcvirc file to get the full path
    If not present, ask on the console and and store
    """
    p = which(cmd)  # if in PATH, just returns it
    if p:
        return p

    PATH = "Path"
    config = ConfigParser.RawConfigParser()
    cfg = op.expanduser(cfg)
    changed = False
    if op.exists(cfg):
        config.read(cfg)

    assert name is not None, "Need a program name"

    try:
        fullpath = config.get(PATH, name)
    except ConfigParser.NoSectionError:
        config.add_section(PATH)
        changed = True
    except:
        pass

    try:
        fullpath = config.get(PATH, name)
    except ConfigParser.NoOptionError:
        msg = "=== Configure path for {0} ===\n".format(name, cfg)
        if url:
            msg += "URL: {0}\n".format(url)
        msg += "[Directory that contains `{0}`]: ".format(cmd)
        fullpath = raw_input(msg).strip()
        config.set(PATH, name, fullpath)
        changed = True

    path = op.join(op.expanduser(fullpath), cmd)
    assert is_exe(path), \
            "Cannot execute binary `{0}`. Please verify and rerun.".format(path)

    if changed:
        configfile = open(cfg, "w")
        config.write(configfile)
        logging.debug("Configuration written to `{0}`.".format(cfg))

    return path
Esempio n. 10
0
def getpath(cmd, name=None, url=None, cfg="~/.jcvirc"):
    """
    Get install locations of common binaries
    First, check ~/.jcvirc file to get the full path
    If not present, ask on the console and and store
    """
    p = which(cmd)  # if in PATH, just returns it
    if p:
        return p

    PATH = "Path"
    config = ConfigParser.RawConfigParser()
    cfg = op.expanduser(cfg)
    changed = False
    if op.exists(cfg):
        config.read(cfg)

    assert name is not None, "Need a program name"

    try:
        fullpath = config.get(PATH, name)
    except ConfigParser.NoSectionError:
        config.add_section(PATH)
        changed = True
    except:
        pass

    try:
        fullpath = config.get(PATH, name)
    except ConfigParser.NoOptionError:
        msg = "=== Configure path for {0} ===\n".format(name, cfg)
        if url:
            msg += "URL: {0}\n".format(url)
        msg += "[Directory that contains `{0}`]: ".format(cmd)
        fullpath = raw_input(msg).strip()
        config.set(PATH, name, fullpath)
        changed = True

    path = op.join(op.expanduser(fullpath), cmd)
    assert is_exe(path), \
            "Cannot execute binary `{0}`. Please verify and rerun.".format(path)

    if changed:
        configfile = open(cfg, "w")
        config.write(configfile)
        logging.debug("Configuration written to `{0}`.".format(cfg))

    return path
Esempio n. 11
0
def fromsra(args):
    """
    %prog fromsra srafile

    Convert sra file to fastq using the sratoolkit `fastq-dump`
    """
    p = OptionParser(fromsra.__doc__)
    p.add_option(
        "--paired",
        default=False,
        action="store_true",
        help="Specify if library layout is paired-end",
    )
    p.add_option(
        "--compress",
        default=None,
        choices=["gzip", "bzip2"],
        help="Compress output fastq files",
    )
    p.set_outdir()
    p.set_grid()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (srafile, ) = args
    paired = opts.paired
    compress = opts.compress
    outdir = opts.outdir

    script_path = which("fastq-dump")
    if not script_path:
        logging.error("Cannot find `fastq-dump` in the PATH")
        sys.exit()

    cmd = [script_path]
    if compress:
        cmd.append("--{0}".format(compress))
    if paired:
        cmd.append("--split-files")
    if outdir:
        cmd.append("--outdir {0}".format(outdir))
    cmd.append(srafile)

    outcmd = " ".join(cmd)
    sh(outcmd, grid=opts.grid)
Esempio n. 12
0
    def __init__(self, filename, select=None):
        assert op.exists(filename), "File `{0}` not found".format(filename)

        # filename can be both .sizes file or FASTA formatted file
        sizesname = filename

        if not filename.endswith(".sizes"):
            sizesname = filename + ".sizes"
            filename = get_abs_path(filename)
            if need_update(filename, sizesname):
                cmd = "faSize"
                if which(cmd):
                    cmd += " -detailed {0}".format(filename)
                    sh(cmd, outfile=sizesname)
                else:
                    from jcvi.formats.fasta import Fasta

                    f = Fasta(filename)
                    fw = open(sizesname, "w")
                    for k, size in f.itersizes_ordered():
                        print >> fw, "\t".join((k, str(size)))
                    fw.close()

            filename = sizesname

        assert filename.endswith(".sizes")

        super(Sizes, self).__init__(filename)
        self.fp = open(filename)
        self.filename = filename

        # get sizes for individual contigs, both in list and dict
        # this is to preserve the input order in the sizes file
        sizes = list(self.iter_sizes())
        if select:
            assert select > 0
            sizes = [x for x in sizes if x[1] >= select]
        self.sizes_mapping = dict(sizes)

        # get cumulative sizes, both in list and dict
        ctgs, sizes = zip(*sizes)
        self.sizes = sizes
        cumsizes = np.cumsum([0] + list(sizes))
        self.ctgs = ctgs
        self.cumsizes = cumsizes
        self.cumsizes_mapping = dict(zip(ctgs, cumsizes))
Esempio n. 13
0
    def __init__(self, filename, select=None):
        assert op.exists(filename), "File `{0}` not found".format(filename)

        # filename can be both .sizes file or FASTA formatted file
        sizesname = filename

        if not filename.endswith(".sizes"):
            sizesname = filename + ".sizes"
            filename = get_abs_path(filename)
            if need_update(filename, sizesname):
                cmd = "faSize"
                if which(cmd):
                    cmd += " -detailed {0}".format(filename)
                    sh(cmd, outfile=sizesname)
                else:
                    from jcvi.formats.fasta import Fasta

                    f = Fasta(filename)
                    fw = open(sizesname, "w")
                    for k, size in f.itersizes_ordered():
                        print("\t".join((k, str(size))), file=fw)
                    fw.close()

            filename = sizesname

        assert filename.endswith(".sizes")

        super(Sizes, self).__init__(filename)
        self.fp = open(filename)
        self.filename = filename

        # get sizes for individual contigs, both in list and dict
        # this is to preserve the input order in the sizes file
        sizes = list(self.iter_sizes())
        if select:
            assert select > 0
            sizes = [x for x in sizes if x[1] >= select]
        self.sizes_mapping = dict(sizes)

        # get cumulative sizes, both in list and dict
        ctgs, sizes = zip(*sizes)
        self.sizes = sizes
        cumsizes = np.cumsum([0] + list(sizes))
        self.ctgs = ctgs
        self.cumsizes = cumsizes
        self.cumsizes_mapping = dict(zip(ctgs, cumsizes))
Esempio n. 14
0
def fromsra(args):
    """
    %prog fromsra srafile

    Convert sra file to fastq using the sratoolkit `fastq-dump`
    """
    p = OptionParser(fromsra.__doc__)
    p.add_option(
        "--paired",
        default=False,
        action="store_true",
        help="Specify if library layout is paired-end " + "[default: %default]",
    )
    p.add_option(
        "--compress", default=None, choices=["gzip", "bzip2"], help="Compress output fastq files [default: %default]"
    )
    p.set_outdir()
    p.set_grid()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    srafile, = args
    paired = opts.paired
    compress = opts.compress
    outdir = opts.outdir

    script_path = which("fastq-dump")
    if not script_path:
        logging.error("Cannot find `fastq-dump` in the PATH")
        sys.exit()

    cmd = [script_path]
    if compress:
        cmd.append("--{0}".format(compress))
    if paired:
        cmd.append("--split-3")
    if outdir:
        cmd.append("--outdir {0}".format(outdir))
    cmd.append(srafile)

    outcmd = " ".join(cmd)
    sh(outcmd, grid=opts.grid)
Esempio n. 15
0
def get_cookies(cookies=PHYTOZOME_COOKIES):
    from getpass import getpass

    # Check if cookies is still good
    if op.exists(cookies) and last_updated(cookies) < 3600:
        return cookies

    username = input("Phytozome Login: "******"Phytozome Password: "******"curl")
    if curlcmd is None:
        print("curl command not installed. Aborting.", file=sys.stderr)
        return None
    cmd = "{} https://signon.jgi.doe.gov/signon/create".format(curlcmd)
    cmd += " --data-urlencode 'login={0}' --data-urlencode 'password={1}' -b {2} -c {2}".format(
        username, pw, cookies)
    sh(cmd, outfile="/dev/null", errfile="/dev/null", log=False)
    if not op.exists(cookies):
        print("Cookies file `{}` not created. Aborting.".format(cookies),
              file=sys.stderr)
        return None

    return cookies
Esempio n. 16
0
def patch(args):
    """
    %prog patch reference.fasta reads.fasta

    Run PBJelly with reference and reads.
    """
    from jcvi.formats.base import write_file
    from jcvi.formats.fasta import format

    p = OptionParser(patch.__doc__)
    p.add_option("--cleanfasta",
                 default=False,
                 action="store_true",
                 help="Clean FASTA to remove description [default: %default]")
    p.add_option("--highqual",
                 default=False,
                 action="store_true",
                 help="Reads are of high quality [default: %default]")
    p.set_home("pbjelly")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, reads = args
    cpus = opts.cpus
    cmd = op.join(opts.pbjelly_home, "setup.sh")
    setup = "source {0}".format(cmd)
    if not which("fakeQuals.py"):
        sh(setup)

    pf = ref.rsplit(".", 1)[0]
    pr, px = reads.rsplit(".", 1)
    # Remove description line
    if opts.cleanfasta:
        oref = pf + ".f.fasta"
        oreads = pr + ".f.fasta"
        format([ref, oref])
        format([reads, oreads])
        ref, reads = oref, oreads

    # Check if the FASTA has qual
    ref, refq = fake_quals(ref)
    convert_reads = not px in ("fq", "fastq", "txt")
    if convert_reads:
        reads, readsq = fake_quals(reads)
        readsfiles = " ".join((reads, readsq))
    else:
        readsfiles = reads

    # Make directory structure
    dref, dreads = "data/reference", "data/reads"
    cwd = os.getcwd()
    reference = op.join(cwd, "{0}/{1}".format(dref, ref))
    reads = op.join(cwd, "{0}/{1}".format(dreads, reads))
    if not op.exists(reference):
        sh("mkdir -p {0}".format(dref))
        sh("cp {0} {1}/".format(" ".join((ref, refq)), dref))
    if not op.exists(reads):
        sh("mkdir -p {0}".format(dreads))
        sh("cp {0} {1}/".format(readsfiles, dreads))

    outputDir = cwd
    p = Protocol(outputDir, reference, reads, highqual=opts.highqual)
    p.write_xml()

    # Build the pipeline
    runsh = [setup]
    for action in "setup|mapping|support|extraction".split("|"):
        runsh.append("Jelly.py {0} Protocol.xml".format(action))

    runsh.append(
        'Jelly.py assembly Protocol.xml -x "--nproc={0}"'.format(cpus))
    runsh.append("Jelly.py output Protocol.xml")

    runfile = "run.sh"
    contents = "\n".join(runsh)
    write_file(runfile, contents)
Esempio n. 17
0
def compare(args):
    """
    %prog compare pasa_db_name genome.fasta transcripts.fasta [annotation.gff]

    Run the PASA annotation comparison pipeline

    If annotation.gff file is provided, the PASA database is loaded with the annotations
    first before starting annotation comparison. Otherwise, it uses previously
    loaded annotation data.

    Using the `--prepare` option creates a shell script with the run commands without
    executing the pipeline
    """
    p = OptionParser(compare.__doc__)
    p.set_pasa_opts(action="compare")
    p.add_option("--prepare", default=False, action="store_true",
            help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    pasa_db, genome, transcripts, = args[:3]
    annotation = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error("PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    launch_pasa = which(op.join(PASA_HOME, "scripts", \
            "Launch_PASA_pipeline.pl"))

    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"

    os.chdir(pasa_db)

    if prepare:
        write_file(runfile, "")  # initialize run script

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    acfw = must_open(acconf, "w")
    print >> acfw, annotCompare_conf.format("{0}_pasa".format(pasa_db), \
            opts.pctovl, opts.pct_coding, opts.pctid_prot, opts.pctlen_FL, \
            opts.pctlen_nonFL, opts.orf_size, opts.pct_aln, opts.pctovl_gene, \
            opts.stompovl, opts.trust_FL, opts.utr_exons)
    acfw.close()

    if op.exists("{0}.clean".format(transcripts)):
        transcripts = "{0}.clean".format(transcripts)

    accmd = "{0} -c {1} -A -g {2} -t {3} --GENETIC_CODE {4}".format(launch_pasa, \
            acconf, genome, transcripts, opts.genetic_code)
    if annotation:
        accmd += " -L --annots_gff3 {0}".format(annotation)
    if prepare:
        write_file(runfile, accmd, append=True)
    else:
        sh(accmd, grid=grid, grid_opts=opts)
Esempio n. 18
0
def patch(args):
    """
    %prog patch reference.fasta reads.fasta

    Run PBJelly with reference and reads.
    """
    from jcvi.formats.base import write_file
    from jcvi.formats.fasta import format

    p = OptionParser(patch.__doc__)
    p.add_option("--cleanfasta", default=False, action="store_true",
                 help="Clean FASTA to remove description [default: %default]")
    p.add_option("--highqual", default=False, action="store_true",
                 help="Reads are of high quality [default: %default]")
    p.set_home("pbjelly")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, reads = args
    cmd = op.join(opts.pbjelly_home, "setup.sh")
    if not which("fakeQuals.py"):
        setup = "source {0}".format(cmd)
        sh(setup)

    # Check environment
    try:
        import networkx
        version = networkx.version
    except:
        logging.error("You need networkx==1.1 to run PBJELLY")
        return

    try:
        import argparse
    except ImportError:
        logging.error("You need Python2.7 or at least argparse lib")
        return

    pf = ref.rsplit(".", 1)[0]
    pr, px = reads.rsplit(".", 1)
    # Remove description line
    if opts.cleanfasta:
        oref = pf + ".f.fasta"
        oreads = pr + ".f.fasta"
        format([ref, oref])
        format([reads, oreads])
        ref, reads = oref, oreads

    # Check if the FASTA has qual
    ref, refq = fake_quals(ref)
    convert_reads = not px in ("fq", "fastq", "txt")
    if convert_reads:
        reads, readsq = fake_quals(reads)
        readsfiles = " ".join((reads, readsq))
    else:
        readsfiles = reads

    # Make directory structure
    dref, dreads = "data/reference", "data/reads"
    sh("mkdir -p {0}".format(dref))
    sh("mkdir -p {0}".format(dreads))
    sh("cp {0} {1}/".format(" ".join((ref, refq)), dref))
    sh("cp {0} {1}/".format(readsfiles, dreads))
    cwd = os.getcwd()

    outputDir = cwd
    reference = op.join(cwd, "{0}/{1}".format(dref, ref))
    reads = op.join(cwd, "{0}/{1}".format(dreads, reads))
    p = Protocol(outputDir, reference, reads, highqual=opts.highqual)
    p.write_xml()

    # Make sure we have the patched version of Extraction.py
    # See discussion <http://seqanswers.com/forums/showthread.php?t=27599>
    # This check has been removed

    # Build the pipeline
    runsh = [setup]
    for action in "setup|mapping|support|extraction".split("|"):
        runsh.append("Jelly.py {0} Protocol.xml".format(action))

    #pcmds = """find assembly -name "ref*" -exec echo \\
    #    "Assembly.py {} \\
    #    > {}/assembly.out 2> {}/assembly.err" \; > commands.list"""
    #runsh.append(pcmds)

    runsh.append("Jelly.py assembly Protocol.xml")
    runsh.append("cp assembly/assembly_chunk0.sh commands.list")
    runsh.append("parallel < commands.list")
    runsh.append("Jelly.py output Protocol.xml")

    runfile = "run.sh"
    contents = "\n".join(runsh)
    write_file(runfile, contents)
Esempio n. 19
0
def is_usetex():
    """Check if latex command is available
    """
    return bool(which("latex")) and bool(which("lp"))
Esempio n. 20
0
def patch(args):
    """
    %prog patch reference.fasta reads.fasta

    Run PBJelly with reference and reads.
    """
    from jcvi.formats.base import write_file
    from jcvi.formats.fasta import format

    p = OptionParser(patch.__doc__)
    p.add_option("--cleanfasta", default=False, action="store_true",
                 help="Clean FASTA to remove description [default: %default]")
    p.add_option("--highqual", default=False, action="store_true",
                 help="Reads are of high quality [default: %default]")
    p.set_home("pbjelly")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, reads = args
    cpus = opts.cpus
    cmd = op.join(opts.pbjelly_home, "setup.sh")
    setup = "source {0}".format(cmd)
    if not which("fakeQuals.py"):
        sh(setup)

    pf = ref.rsplit(".", 1)[0]
    pr, px = reads.rsplit(".", 1)
    # Remove description line
    if opts.cleanfasta:
        oref = pf + ".f.fasta"
        oreads = pr + ".f.fasta"
        format([ref, oref])
        format([reads, oreads])
        ref, reads = oref, oreads

    # Check if the FASTA has qual
    ref, refq = fake_quals(ref)
    convert_reads = not px in ("fq", "fastq", "txt")
    if convert_reads:
        reads, readsq = fake_quals(reads)
        readsfiles = " ".join((reads, readsq))
    else:
        readsfiles = reads

    # Make directory structure
    dref, dreads = "data/reference", "data/reads"
    cwd = os.getcwd()
    reference = op.join(cwd, "{0}/{1}".format(dref, ref))
    reads = op.join(cwd, "{0}/{1}".format(dreads, reads))
    if not op.exists(reference):
        sh("mkdir -p {0}".format(dref))
        sh("cp {0} {1}/".format(" ".join((ref, refq)), dref))
    if not op.exists(reads):
        sh("mkdir -p {0}".format(dreads))
        sh("cp {0} {1}/".format(readsfiles, dreads))

    outputDir = cwd
    p = Protocol(outputDir, reference, reads, highqual=opts.highqual)
    p.write_xml()

    # Build the pipeline
    runsh = [setup]
    for action in "setup|mapping|support|extraction".split("|"):
        runsh.append("Jelly.py {0} Protocol.xml".format(action))

    runsh.append('Jelly.py assembly Protocol.xml -x "--nproc={0}"'.format(cpus))
    runsh.append("Jelly.py output Protocol.xml")

    runfile = "run.sh"
    contents = "\n".join(runsh)
    write_file(runfile, contents)
Esempio n. 21
0
def assemble(args):
    """
    %prog assemble pasa_db_name genome.fasta transcripts-dn.fasta [transcript-gg.fasta]

    Run the PASA alignment assembly pipeline

    If two transcript fasta files (Trinity denovo and genome guided) are provided
    and the `--compreh` param is enabled, the PASA Comprehensive Transcriptome DB
    protocol is followed <http://pasa.sourceforge.net/#A_ComprehensiveTranscriptome>

    Using the `--prepare` option creates a shell script with the run commands without
    executing the pipeline
    """
    p = OptionParser(assemble.__doc__)
    p.set_pasa_opts()
    p.add_option("--prepare", default=False, action="store_true",
            help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    pasa_db, genome, dnfasta, = args[:3]
    ggfasta = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error("PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    aligners = opts.aligners.split(",")
    for aligner in aligners:
        if aligner not in ALLOWED_ALIGNERS:
            logging.error("Error: Unknown aligner `{0}`".format(aligner))
            logging.error("Can be any of {0}, ".format("|".join(ALLOWED_ALIGNERS)) + \
                    "combine multiple aligners in list separated by comma")
            sys.exit()

    clean = opts.clean
    seqclean = op.join(opts.tgi_home, "seqclean")

    accn_extract = which(op.join(PASA_HOME, "misc_utilities", \
            "accession_extractor.pl"))
    launch_pasa = which(op.join(PASA_HOME, "scripts", \
            "Launch_PASA_pipeline.pl"))
    build_compreh_trans = which(op.join(PASA_HOME, "scripts", \
            "build_comprehensive_transcriptome.dbi"))

    fl_accs = opts.fl_accs
    cpus = opts.cpus
    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"
    pctcov, pctid = opts.pctcov, opts.pctid
    compreh_pctid = opts.compreh_pctid
    compreh_pctcov, bpsplice = opts.compreh_pctcov, opts.bpsplice

    cmds = []

    # set PASAHOME env variable if preparing shell script
    if prepare:
        env_cmd = 'export PASAHOME="{0}"'.format(PASA_HOME)
        cmds.append(env_cmd)

    if ggfasta:
        transcripts = FileMerger([dnfasta, ggfasta], tfasta).merge()
        accn_extract_cmd = "cat {0} | {1} > {2}".format(dnfasta, accn_extract, tdn)
        cmds.append(accn_extract_cmd)
        if not prepare:
            sh(accn_extract_cmd)
    else:
        symlink(dnfasta, tfasta)
        transcripts = tfasta

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    prjobid = None
    if clean:
        ccpus = 16 if cpus >= 16 else cpus
        cleancmd = "{0} {1} -c {2} -l 60".format(seqclean, transcripts, ccpus)
        if prepare:
            cmds.append(cleancmd)
        else:
            prjobid = sh(cleancmd, grid=grid, grid_opts=opts)

    aafw = must_open(aaconf, "w")
    print(alignAssembly_conf.format("{0}_pasa".format(pasa_db), \
            pctcov, pctid, bpsplice), file=aafw)
    aafw.close()

    symlink(genome, gfasta)

    aacmd = "{0} -c {1} -C -R -g {2}".format(launch_pasa, aaconf, gfasta)
    aacmd += " -t {0}.clean -T -u {0}".format(transcripts) if clean else \
             " -t {0}".format(transcripts)
    if fl_accs:
        symlink(fl_accs, flaccs)
        aacmd += " -f {0}".format(flaccs)
    if ggfasta:
        aacmd += " --TDN {0}".format(tdn)
    aacmd += " --ALIGNERS {0} -I {1} --CPU {2}".format(",".join(aligners), \
            opts.intron, cpus)

    if prepare:
        cmds.append(aacmd)
    else:
        opts.hold_jid = prjobid
        prjobid = sh(aacmd, grid=grid, grid_opts=opts)

    if opts.compreh and ggfasta:
        comprehcmd = "{0} -c {1} -t {2}".format(build_compreh_trans, aaconf, transcripts)
        comprehcmd += " --min_per_ID {0} --min_per_aligned {1}".format(compreh_pctid, compreh_pctcov)

        if prepare:
            cmds.append(comprehcmd)
        else:
            opts.hold_jid = prjobid
            prjobid = sh(comprehcmd, grid=grid, grid_opts=opts)

    if prepare:
        write_file(runfile, "\n".join(cmds))  # initialize run script
Esempio n. 22
0
def compare(args):
    """
    %prog compare pasa_db_name [--annots_gff3=annotation.gff3]

    Run the PASA annotation comparison pipeline

    This assumes that PASA alignment assembly has alredy been completed and
    run directory contains `genome.fasta` and `transcript.fasta` files.

    If `--annots_gff3` is specified, the PASA database is loaded with the annotations
    first before starting annotation comparison. Otherwise, it uses previously
    loaded annotation data.

    Using the `--prepare` option creates a shell script with the run commands without
    executing the pipeline
    """
    p = OptionParser(compare.__doc__)
    p.set_pasa_opts(action="compare")
    p.add_option("--prepare", default=False, action="store_true",
            help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    pasa_db, = args

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error("PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    launch_pasa = which(op.join(PASA_HOME, "scripts", \
            "Launch_PASA_pipeline.pl"))

    annots_gff3 = opts.annots_gff3
    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"

    os.chdir(pasa_db)

    if prepare:
        write_file(runfile, "", append=True, skipcheck=True)  # initialize run script

    acfw = must_open(acconf, "w")
    print(annotCompare_conf.format("{0}_pasa".format(pasa_db), \
            opts.pctovl, opts.pct_coding, opts.pctid_prot, opts.pctlen_FL, \
            opts.pctlen_nonFL, opts.orf_size, opts.pct_aln, opts.pctovl_gene, \
            opts.stompovl, opts.trust_FL, opts.utr_exons), file=acfw)
    acfw.close()

    if not op.exists(gfasta):
        sys.exit("Genome fasta file `{0}` does not exist".format(gfasta))

    transcripts = tfasta
    if not op.exists(transcripts):
        sys.exit("Transcript fasta file `{0}` does not exist".format(transcripts))

    if op.exists("{0}.clean".format(transcripts)):
        transcripts = "{0}.clean".format(transcripts)

    accmd = "{0} -c {1} -A -g {2} -t {3} --GENETIC_CODE {4}".format(launch_pasa, \
            acconf, gfasta, transcripts, opts.genetic_code)

    if annots_gff3:
        if not op.exists(annots_gff3):
            sys.exit("Annotation gff3 file `{0}` does not exist".format(annots_gff3))
        symlink(annots_gff3, annotation)
        accmd += " -L --annots_gff3 {0}".format(annotation)

    if prepare:
        write_file(runfile, accmd, append=True)
    else:
        sh(accmd, grid=grid, grid_opts=opts)
Esempio n. 23
0
File: pasa.py Progetto: BrokeW/jcvi
def compare(args):
    """
    %prog compare pasa_db_name genome.fasta transcripts.fasta [annotation.gff]

    Run the PASA annotation comparison pipeline

    If annotation.gff file is provided, the PASA database is loaded with the annotations
    first before starting annotation comparison. Otherwise, it uses previously
    loaded annotation data.

    Using the `--prepare` option creates a shell script with the run commands without
    executing the pipeline
    """
    p = OptionParser(compare.__doc__)
    p.set_pasa_opts(action="compare")
    p.add_option(
        "--prepare",
        default=False,
        action="store_true",
        help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    pasa_db, genome, transcripts, = args[:3]
    annotation = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error(
            "PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    launch_pasa = which(op.join(PASA_HOME, "scripts", \
            "Launch_PASA_pipeline.pl"))

    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"

    os.chdir(pasa_db)

    if prepare:
        write_file(runfile, "")  # initialize run script

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    acfw = must_open(acconf, "w")
    print >> acfw, annotCompare_conf.format("{0}_pasa".format(pasa_db), \
            opts.pctovl, opts.pct_coding, opts.pctid_prot, opts.pctlen_FL, \
            opts.pctlen_nonFL, opts.orf_size, opts.pct_aln, opts.pctovl_gene, \
            opts.stompovl, opts.trust_FL, opts.utr_exons)
    acfw.close()

    if op.exists("{0}.clean".format(transcripts)):
        transcripts = "{0}.clean".format(transcripts)

    accmd = "{0} -c {1} -A -g {2} -t {3} --GENETIC_CODE {4}".format(launch_pasa, \
            acconf, genome, transcripts, opts.genetic_code)
    if annotation:
        accmd += " -L --annots_gff3 {0}".format(annotation)
    if prepare:
        write_file(runfile, accmd, append=True)
    else:
        sh(accmd, grid=grid, grid_opts=opts)
Esempio n. 24
0
File: pasa.py Progetto: yangjl/jcvi
def assemble(args):
    """
    %prog assemble pasa_db_name genome.fasta transcripts-dn.fasta [transcript-gg.fasta]

    Run the PASA alignment assembly pipeline

    If two transcript fasta files (Trinity denovo and genome guided) are provided,
    the PASA Comprehensive Transcriptome protocol is followed
    <http://pasa.sourceforge.net/#A_ComprehensiveTranscriptome>

    Using the `--prepare` option creates a shell script with the run commands without
    executing the pipeline
    """
    p = OptionParser(assemble.__doc__)
    p.set_home("pasa")
    p.set_align(pctid=95, pctcov=90, intron=2000, bpsplice=3, compreh_pctcov=30)
    p.add_option("--aligners", default="blat,gmap",
            help="Specify splice aligners to use for mapping [default: %default]")
    p.add_option("--clean", default=False, action="store_true",
            help="Clean transcripts using tgi seqclean [default: %default]")
    p.set_cpus()
    p.set_grid()
    p.set_grid_opts()
    p.add_option("--prepare", default=False, action="store_true",
            help="Prepare PASA run script with commands [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    pasa_db, genome, dnfasta, = args[:3]
    ggfasta = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error("PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    aligners = opts.aligners.split(",")
    for aligner in aligners:
        if aligner not in ALLOWED_ALIGNERS:
            logging.error("Error: Unknown aligner `{0}`".format(aligner))
            logging.error("Can be any of {0}, ".format("|".join(ALLOWED_ALIGNERS)) + \
                    "combine multiple aligners in list separated by comma")
            sys.exit()

    clean = opts.clean
    seqclean = which("seqclean")
    if clean and not seqclean:
        logging.error("Cannot find tgi seqclean in PATH")
        sys.exit()

    accn_extract = which(op.join(PASA_HOME, "misc_utilities", "accession_extractor.pl"))
    launch_pasa = which(op.join(PASA_HOME, "scripts", "Launch_PASA_pipeline.pl"))
    build_compreh_trans = which(op.join(PASA_HOME, "scripts", "build_comprehensive_transcriptome.dbi"))

    cpus = opts.cpus
    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"
    pctcov, pctid = opts.pctcov, opts.pctid
    compreh_pctcov, bpsplice = opts.compreh_pctcov, opts.bpsplice

    mkdir(pasa_db)
    os.chdir(pasa_db)

    if prepare:
        write_file(runfile, "")  # initialize run script

    if ggfasta:
        transcripts = FileMerger([dnfasta, ggfasta], tfasta).merge()
        accn_extract_cmd = "cat {0} | {1} > {2}".format(dnfasta, accn_extract, tdn)
        write_file(runfile, accn_extract_cmd, append=True) \
                if prepare else sh(accn_extract_cmd)
    else:
        transcripts = dnfasta

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    prjobid = None
    if clean:
        cleancmd = "{0} {1} -c {2} -l 60".format(seqclean, transcripts, cpus)
        if prepare:
            write_file(runfile, cleancmd, append=True)
        else:
            prjobid = sh(cleancmd, grid=grid, grid_opts=opts)

    aafw = must_open(aaconf, "w")
    print >> aafw, alignAssembly_conf.format("{0}_pasa".format(pasa_db), pctcov, pctid, bpsplice)
    aafw.close()

    aacmd = "{0} -c {1} -C -R -g {2}".format(launch_pasa, aaconf, genome)
    aacmd += " -t {0}.clean -T -u {0} ".format(transcripts) if clean else \
             " -t {0} ".format(transcripts)
    if ggfasta:
        aacmd += " --TDN {0} ".format(tdn)
    aacmd += " --ALIGNERS {0} -I {1}".format(",".join(aligners), opts.intron)

    if prepare:
        write_file(runfile, aacmd, append=True)
    else:
        opts.hold_jid = prjobid
        prjobid = sh(aacmd, grid=grid, grid_opts=opts)

    if ggfasta:
        comprehcmd = "{0} -c {1} -t {2}".format(build_compreh_trans, aaconf, transcripts)
        comprehcmd += "--min_per_ID {0} --min_per_aligned {1}".format(pctid, pctcov)

        if prepare:
            write_file(runfile, comprehcmd, append=True)
        else:
            opts.hold_jid = prjobid
            prjobid = sh(comprehcmd, grid=grid, grid_opts=opts)
Esempio n. 25
0
def patch(args):
    """
    %prog patch reference.fasta reads.fasta

    Run PBJelly with reference and reads.
    """
    from jcvi.formats.base import write_file
    from jcvi.formats.fasta import format

    p = OptionParser(patch.__doc__)
    p.add_option("--cleanfasta", default=False, action="store_true",
                 help="Clean FASTA to remove description [default: %default]")
    p.add_option("--highqual", default=False, action="store_true",
                 help="Reads are of high quality [default: %default]")
    p.set_home("pbjelly")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, reads = args
    cmd = op.join(opts.pbjelly_home, "setup.sh")
    if not which("fakeQuals.py"):
        setup = "source {0}".format(cmd)
        sh(setup)

    # Check environment
    try:
        import networkx
        version = networkx.version
    except:
        logging.error("You need networkx==1.1 to run PBJELLY")
        return

    try:
        import argparse
    except ImportError:
        logging.error("You need Python2.7 or at least argparse lib")
        return

    pf = ref.rsplit(".", 1)[0]
    pr, px = reads.rsplit(".", 1)
    # Remove description line
    if opts.cleanfasta:
        oref = pf + ".f.fasta"
        oreads = pr + ".f.fasta"
        format([ref, oref])
        format([reads, oreads])
        ref, reads = oref, oreads

    # Check if the FASTA has qual
    ref, refq = fake_quals(ref)
    convert_reads = not px in ("fq", "fastq", "txt")
    if convert_reads:
        reads, readsq = fake_quals(reads)
        readsfiles = " ".join((reads, readsq))
    else:
        readsfiles = reads

    # Make directory structure
    dref, dreads = "data/reference", "data/reads"
    sh("mkdir -p {0}".format(dref))
    sh("mkdir -p {0}".format(dreads))
    sh("cp {0} {1}/".format(" ".join((ref, refq)), dref))
    sh("cp {0} {1}/".format(readsfiles, dreads))
    cwd = os.getcwd()

    outputDir = cwd
    reference = op.join(cwd, "{0}/{1}".format(dref, ref))
    reads = op.join(cwd, "{0}/{1}".format(dreads, reads))
    p = Protocol(outputDir, reference, reads, highqual=opts.highqual)
    p.write_xml()

    # Make sure we have the patched version of Extraction.py
    # See discussion <http://seqanswers.com/forums/showthread.php?t=27599>
    # This check has been removed

    # Build the pipeline
    runsh = [setup]
    for action in "setup|mapping|support|extraction".split("|"):
        runsh.append("Jelly.py {0} Protocol.xml".format(action))

    #pcmds = """find assembly -name "ref*" -exec echo \\
    #    "Assembly.py {} \\
    #    > {}/assembly.out 2> {}/assembly.err" \; > commands.list"""
    #runsh.append(pcmds)

    runsh.append("Jelly.py assembly Protocol.xml")
    runsh.append("cp assembly/assembly_chunk0.sh commands.list")
    runsh.append("parallel < commands.list")
    runsh.append("Jelly.py output Protocol.xml")

    runfile = "run.sh"
    contents = "\n".join(runsh)
    write_file(runfile, contents, meta="run script")