Ejemplo n.º 1
0
def merge(args):
    """
    %prog merge ref.fasta query.fasta *.delta

    Merge delta files into a single delta.
    """
    p = OptionParser(merge.__doc__)
    p.set_outfile(outfile="merged_results.delta")
    opts, args = p.parse_args(args)

    # Two FASTA paths plus at least one delta file are required.
    if len(args) < 3:
        sys.exit(not p.print_help())

    ref, query = args[:2]
    deltafiles = args[2:]
    outfile = opts.outfile

    ref = get_abs_path(ref)
    query = get_abs_path(query)
    # Write the two header lines once. `write` behaves the same on Python 2
    # and Python 3, whereas `print >> fw` only works as a Python 2 statement.
    fw = must_open(outfile, "w")
    fw.write(" ".join((ref, query)) + "\n")
    fw.write("NUCMER\n")
    fw.close()

    # Append each delta file minus its own first two lines (awk `NR > 2`).
    for d in deltafiles:
        cmd = "awk 'NR > 2 {{print $0}}' {0}".format(d)
        sh(cmd, outfile=outfile, append=True)
Ejemplo n.º 2
0
def merge(args):
    """
    %prog merge ref.fasta query.fasta *.delta

    Merge delta files into a single delta.
    """
    parser = OptionParser(merge.__doc__)
    parser.set_outfile(outfile="merged_results.delta")
    opts, args = parser.parse_args(args)

    # Two FASTA paths plus at least one delta file are required.
    if len(args) < 3:
        sys.exit(not parser.print_help())

    ref = get_abs_path(args[0])
    query = get_abs_path(args[1])
    merged = opts.outfile

    # Header: the absolute reference/query paths, then the NUCMER line.
    header = must_open(merged, "w")
    print(" ".join((ref, query)), file=header)
    print("NUCMER", file=header)
    header.close()

    # Every delta contributes all lines after its own first two.
    for delta in args[2:]:
        sh("awk 'NR > 2 {{print $0}}' {0}".format(delta),
           outfile=merged, append=True)
Ejemplo n.º 3
0
def size(args):
    """
    find folder -type l | %prog size

    Get the size for all the paths that are pointed by the links
    """
    from jcvi.utils.cbook import human_size

    # NOTE: `args` is never parsed here; all input comes from stdin.
    p = OptionParser(size.__doc__)
    fp = sys.stdin

    results = []
    for link_name in fp:
        link_name = link_name.strip()
        # Only symlinks are reported; every other path is ignored.
        if not op.islink(link_name):
            continue

        source = get_abs_path(link_name)

        link_name = op.basename(link_name)
        filesize = op.getsize(source)
        results.append((filesize, link_name))

    # sort by descending file size
    for filesize, link_name in sorted(results, reverse=True):
        filesize = human_size(filesize, a_kilobyte_is_1024_bytes=True)
        # `write` works on both Python 2 and 3, unlike `print >>sys.stderr`.
        sys.stderr.write("%10s\t%s\n" % (filesize, link_name))
Ejemplo n.º 4
0
def link(args):
    """
    %prog link metafile

    Link source to target based on a tabular file.
    """
    from jcvi.apps.base import mkdir

    p = OptionParser(link.__doc__)
    p.add_option("--dir",
                 help="Place links in a subdirectory [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    meta, = args
    d = opts.dir
    if d:
        mkdir(d)

    # The context manager closes the metafile even if a row fails.
    with open(meta) as fp:
        for row in fp:
            # A blank row has no source/target pair to unpack; skip it.
            if not row.strip():
                continue
            source, target = row.split()
            source = get_abs_path(source)
            if d:
                target = op.join(d, target)
            lnsf(source, target, log=True)
Ejemplo n.º 5
0
def run_megablast(infile=None, outfile=None, db=None, wordsize=None, \
        pctid=98, hitlen=100, best=None, evalue=0.01, task="megablast", cpus=16):
    """
    Run `blastn` with `infile` as query against `db`, writing to `outfile`.

    When both `pctid` and `hitlen` are truthy the raw output is passed
    through `run_blast_filter` and the filtered file replaces `outfile`.
    """
    assert db, "Need to specify database fasta file."

    db = get_abs_path(db)
    # Use the `.00.nin` name when that file exists, otherwise plain `.nin`.
    volume_index = db + ".00.nin"
    index_file = volume_index if op.exists(volume_index) else db + ".nin"
    run_formatdb(infile=db, outfile=index_file)

    parts = [
        "blastn",
        "-query {0} -db {1} -out {2}".format(infile, db, outfile),
        "-evalue {0} -outfmt 6 -num_threads {1}".format(evalue, cpus),
        "-task {0}".format(task),
    ]
    # Optional flags are appended only when their value is truthy.
    optional = (
        ("-word_size", wordsize),
        ("-perc_identity", pctid),
        ("-max_target_seqs", best),
    )
    for flag, value in optional:
        if value:
            parts.append("{0} {1}".format(flag, value))
    sh(" ".join(parts))

    if not (pctid and hitlen):
        return

    # Filter into a side file, then move it over the raw BLAST output.
    filtered = outfile + ".P{0}L{1}".format(pctid, hitlen)
    run_blast_filter(infile=outfile, outfile=filtered,
                     pctid=pctid, hitlen=hitlen)
    shutil.move(filtered, outfile)
Ejemplo n.º 6
0
def run_megablast(infile=None, outfile=None, db=None, wordsize=None, \
        pctid=98, hitlen=100, best=None, evalue=0.01, task="megablast", cpus=16):
    """
    Run `blastn` with `infile` as query against `db`, writing to `outfile`.

    When both `pctid` and `hitlen` are truthy the raw output is passed
    through `run_blast_filter` and the filtered file replaces `outfile`.
    """
    assert db, "Need to specify database fasta file."

    db = get_abs_path(db)
    # Use the `.00.nin` name when that file exists, otherwise plain `.nin`.
    volume_index = db + ".00.nin"
    index_file = volume_index if op.exists(volume_index) else db + ".nin"
    run_formatdb(infile=db, outfile=index_file)

    parts = [
        "blastn",
        "-query {0} -db {1} -out {2}".format(infile, db, outfile),
        "-evalue {0} -outfmt 6 -num_threads {1}".format(evalue, cpus),
        "-task {0}".format(task),
    ]
    # Optional flags are appended only when their value is truthy.
    optional = (
        ("-word_size", wordsize),
        ("-perc_identity", pctid),
        ("-max_target_seqs", best),
    )
    for flag, value in optional:
        if value:
            parts.append("{0} {1}".format(flag, value))
    sh(" ".join(parts))

    if not (pctid and hitlen):
        return

    # Filter into a side file, then move it over the raw BLAST output.
    filtered = outfile + ".P{0}L{1}".format(pctid, hitlen)
    run_blast_filter(infile=outfile, outfile=filtered,
                     pctid=pctid, hitlen=hitlen)
    shutil.move(filtered, outfile)
Ejemplo n.º 7
0
def contamination(args):
    """
    %prog contamination folder Ecoli.fasta

    Remove contaminated reads. The FASTQ files in the folder will automatically
    pair and filtered against Ecoli.fasta to remove contaminants using BOWTIE2.
    """
    from jcvi.apps.bowtie import align

    parser = OptionParser(contamination.__doc__)
    parser.add_option("--mapped", default=False, action="store_true",
                      help="Retain contaminated reads instead [default: %default]")
    parser.set_cutoff(cutoff=800)
    parser.set_mateorientation(mateorientation="+-")
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    folder, ecoli = args
    ecoli = get_abs_path(ecoli)
    # Keep the reads that hit the contaminant only when --mapped is given.
    keep_flag = "--mapped" if opts.mapped else "--unmapped"
    for reads, _prefix in iter_project(folder, 2):
        align_opts = [ecoli] + reads + [keep_flag]
        align_opts.append("--cutoff={0}".format(opts.cutoff))
        align_opts.append("--null")
        if opts.mateorientation:
            align_opts.append(
                "--mateorientation={0}".format(opts.mateorientation))
        # align() is expected to return a pair; the values are not used here.
        _samfile, _logfile = align(align_opts)
Ejemplo n.º 8
0
def pairs(args):
    """
    %prog pairs folder reference.fasta

    Estimate insert size distribution. Compatible with a variety of aligners,
    including BOWTIE and BWA.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(pairs.__doc__)
    p.set_firstN()
    p.set_mates()
    p.set_aligner()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    # Remember the starting directory; the loop below chdir()s into `work`
    # for each alignment and then returns here.
    cwd = os.getcwd()
    aligner = opts.aligner
    work = "-".join(("pairs", aligner))
    mkdir(work)

    from jcvi.formats.sam import pairs as ps

    # NOTE(review): `align` stays unbound for any other aligner value;
    # presumably set_aligner() restricts the choices - confirm.
    if aligner == "bowtie":
        from jcvi.apps.bowtie import align
    elif aligner == "bwa":
        from jcvi.apps.bwa import align

    folder, ref = args
    ref = get_abs_path(ref)
    messages = []
    # `p` is rebound here from the option parser to the per-project file list.
    for p, prefix in iter_project(folder):
        samplefq = []
        # One sample file per mate (indices 0 and 1), written inside `work`.
        for i in range(2):
            samplefq.append(
                op.join(work, prefix + "_{0}.first.fastq".format(i + 1)))
            first([str(opts.firstN)] + [p[i]] + ["-o", samplefq[i]])

        # The aligner runs inside `work`, so sample files go by basename.
        os.chdir(work)
        align_args = [ref] + [op.basename(fq) for fq in samplefq]
        outfile, logfile = align(align_args)
        bedfile, stats = ps([outfile, "--rclip={0}".format(opts.rclip)])
        os.chdir(cwd)

        median = stats.median
        # Medians above 1000 are tagged "MP", everything else "PE".
        tag = "MP" if median > 1000 else "PE"
        # Round up to two leading digits using the decimal string, e.g.
        # "3456" -> "35" + "00". Assumes str(median) is all digits -
        # TODO confirm the type of stats.median.
        median = str(median)
        pf, sf = median[:2], median[2:]
        if sf and int(sf) != 0:
            pf = str(int(pf) + 1)  # Get the first two effective digits
        lib = "{0}-{1}".format(tag, pf + "0" * len(sf))
        # One "<original path>\t<new link name>" row per input file.
        for i, xp in enumerate(p):
            suffix = "fastq.gz" if xp.endswith(".gz") else "fastq"
            link = "{0}-{1}.{2}.{3}".format(lib, prefix.replace("-", ""),
                                            i + 1, suffix)
            m = "\t".join(str(x) for x in (xp, link))
            messages.append(m)

    messages = "\n".join(messages)
    write_file("f.meta", messages, tee=True)
Ejemplo n.º 9
0
def size(args):
    """
    find folder -type l | %prog size

    Get the size for all the paths that are pointed by the links
    """
    from jcvi.utils.cbook import human_size

    p = OptionParser(size.__doc__)

    # Collect (bytes, basename) for every stdin line that names a symlink.
    sized = []
    for line in sys.stdin:
        path = line.strip()
        if op.islink(path):
            target = get_abs_path(path)
            sized.append((op.getsize(target), op.basename(path)))

    # Largest first; equal sizes fall back to tuple comparison on the name.
    sized.sort(reverse=True)
    for nbytes, name in sized:
        readable = human_size(nbytes, a_kilobyte_is_1024_bytes=True)
        print("%10s\t%s" % (readable, name), file=sys.stderr)
Ejemplo n.º 10
0
def contamination(args):
    """
    %prog contamination folder Ecoli.fasta

    Remove contaminated reads. The FASTQ files in the folder will automatically
    pair and filtered against Ecoli.fasta to remove contaminants using BOWTIE2.
    """
    from jcvi.apps.bowtie import align

    parser = OptionParser(contamination.__doc__)
    parser.add_option("--mapped", default=False, action="store_true",
                      help="Retain contaminated reads instead [default: %default]")
    parser.set_cutoff(cutoff=800)
    parser.set_mateorientation(mateorientation="+-")
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    folder, ecoli = args
    ecoli = get_abs_path(ecoli)
    # Keep the reads that hit the contaminant only when --mapped is given.
    keep_flag = "--mapped" if opts.mapped else "--unmapped"
    for reads, _prefix in iter_project(folder, 2):
        align_opts = [ecoli] + reads + [keep_flag]
        align_opts.append("--cutoff={0}".format(opts.cutoff))
        align_opts.append("--null")
        if opts.mateorientation:
            align_opts.append(
                "--mateorientation={0}".format(opts.mateorientation))
        # align() is expected to return a pair; the values are not used here.
        _samfile, _logfile = align(align_opts)
Ejemplo n.º 11
0
def link(args):
    """
    %prog link metafile

    Link source to target based on a tabular file.
    """
    from jcvi.apps.base import mkdir

    p = OptionParser(link.__doc__)
    p.add_option("--dir", help="Place links in a subdirectory")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (meta, ) = args
    d = opts.dir
    if d:
        mkdir(d)

    # Sources are joined onto the metafile's own directory.
    cwd = op.dirname(get_abs_path(meta))
    # The context manager closes the metafile even if a row fails.
    with open(meta) as fp:
        for row in fp:
            # A blank row has no source/target pair to unpack; skip it.
            if not row.strip():
                continue
            source, target = row.split()
            source = op.join(cwd, source)
            if d:
                target = op.join(d, target)
            lnsf(source, target, log=True)
Ejemplo n.º 12
0
 def update_abs_path(self):
     # Replace each record's value with get_abs_path(value) when the value
     # is non-empty and names an existing path; other records are untouched.
     for rec in self:
         current = rec.value
         if not current or not op.exists(current):
             continue
         resolved = get_abs_path(current)
         logging.debug("{0}={1} => {2}".format(rec.tag, current, resolved))
         rec.value = resolved
Ejemplo n.º 13
0
 def update_abs_path(self):
     # Replace each record's value with get_abs_path(value) when the value
     # is non-empty and names an existing path; other records are untouched.
     for rec in self:
         current = rec.value
         if not current or not op.exists(current):
             continue
         resolved = get_abs_path(current)
         logging.debug("{0}={1} => {2}".format(rec.tag, current, resolved))
         rec.value = resolved
Ejemplo n.º 14
0
def pairs(args):
    """
    %prog pairs folder reference.fasta

    Estimate insert size distribution. Compatible with a variety of aligners,
    including CLC, BOWTIE and BWA.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(pairs.__doc__)
    p.set_firstN()
    p.set_mates()
    p.set_aligner()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    # Remember the starting directory; the loop below chdir()s into `work`
    # for each alignment and then returns here.
    cwd = os.getcwd()
    aligner = opts.aligner
    work = "-".join(("pairs", aligner))
    mkdir(work)

    # CLC gets its own aligner and pair parser; every other aligner uses
    # the SAM-based pair parser.
    if aligner == "clc":
        from jcvi.apps.clc import align
        from jcvi.formats.cas import pairs as ps
    else:
        from jcvi.formats.sam import pairs as ps

    # NOTE(review): `align` stays unbound for any value other than clc,
    # bowtie or bwa; presumably set_aligner() restricts the choices - confirm.
    if aligner == "bowtie":
        from jcvi.apps.bowtie import align
    elif aligner == "bwa":
        from jcvi.apps.bwa import align

    folder, ref = args
    ref = get_abs_path(ref)
    messages = []
    # `p` is rebound here from the option parser to the per-project file list.
    for p, prefix in iter_project(folder, 2):
        # A single sample file per project, written inside `work`.
        samplefq = op.join(work, prefix + ".first.fastq")
        first([str(opts.firstN)] + p + ["-o", samplefq])

        # The aligner runs inside `work`, so the sample file goes by basename.
        os.chdir(work)
        align_args = [ref, op.basename(samplefq)]
        outfile, logfile = align(align_args)
        bedfile, stats = ps([outfile, "--rclip={0}".format(opts.rclip)])
        os.chdir(cwd)

        median = stats.median
        # Medians above 1000 are tagged "MP", everything else "PE".
        tag = "MP" if median > 1000 else "PE"
        # Round up to two leading digits using the decimal string, e.g.
        # "3456" -> "35" + "00". Assumes str(median) is all digits -
        # TODO confirm the type of stats.median.
        median = str(median)
        pf, sf = median[:2], median[2:]
        if sf and int(sf) != 0:
            pf = str(int(pf) + 1)  # Get the first two effective digits
        lib = "{0}-{1}".format(tag, pf + "0" * len(sf))
        # One "<original path>\t<new link name>" row per input file.
        for i, xp in enumerate(p):
            suffix = "fastq.gz" if xp.endswith(".gz") else "fastq"
            link = "{0}-{1}.{2}.{3}".format(lib, prefix.replace("-", ""), i + 1, suffix)
            m = "\t".join(str(x) for x in (xp, link))
            messages.append(m)

    messages = "\n".join(messages)
    write_file("f.meta", messages, tee=True)
Ejemplo n.º 15
0
def align(args):
    """
    %prog align database.fasta read1.fq [read2.fq]

    Wrapper for three modes of BWA - mem (default), aln, bwasw (long reads).
    """
    valid_modes = ("bwasw", "aln", "mem")
    p = OptionParser(align.__doc__)
    p.add_option("--mode",
                 default="mem",
                 choices=valid_modes,
                 help="BWA mode [default: %default]")
    p.add_option("--readtype",
                 choices=("pacbio", "pbread"),
                 help="Read type in bwa-mem")
    p.set_cutoff(cutoff=800)
    p.set_sam_options()

    opts, args = p.parse_args(args)
    mode = opts.mode
    nargs = len(args)

    # One database plus one (single-end) or two (paired-end) read files.
    if nargs not in (2, 3):
        sys.exit(not p.print_help())

    paired = nargs == 3
    # `mem` is the command builder unless a mode-specific one applies.
    builder = mem
    if paired:
        assert mode != "bwasw", "Cannot use --bwasw with paired-end mode"
        if mode == "aln":
            builder = sampe
    elif mode == "bwasw":
        builder = bwasw
    elif mode == "aln":
        builder = samse

    layout = "Paired-end alignment" if paired else "Single-end alignment"
    logging.debug("bwa-{0}: ".format(mode) + layout)

    args[0] = get_abs_path(args[0])
    cmd, samfile = builder(args, opts)
    # An empty command is passed to sh() untouched.
    if cmd:
        cmd = output_bam(cmd, samfile)

    want_bam = opts.bam
    want_unmapped = opts.unmapped

    sh(cmd)
    if want_unmapped:
        # Extract the unmapped reads, then delete the alignment file itself.
        mopts = [samfile, "--unmapped"]
        if not want_bam:
            mopts.append("--sam")
        mapped(mopts)
        FileShredder([samfile])

    return samfile, None
Ejemplo n.º 16
0
    def make_link(self, firstN=0):
        """
        Create `self.link` inside the `self.genome` directory.

        With a positive `firstN` the link path is produced by `first()` from
        `self.fastq`; otherwise it becomes a symlink to the absolute path of
        `self.fastq`, replacing any symlink already at that location.
        """
        mkdir(self.genome)
        if firstN > 0:
            first([str(firstN), self.fastq, "--outfile={0}".format(self.link)])
        else:
            # Only an existing symlink is removed before re-linking.
            if op.islink(self.link):
                os.unlink(self.link)
            os.symlink(get_abs_path(self.fastq), self.link)
Ejemplo n.º 17
0
    def make_link(self, firstN=0):
        """
        Create `self.link` inside the `self.genome` directory.

        With a positive `firstN` the link path is produced by `first()` from
        `self.fastq`; otherwise it becomes a symlink to the absolute path of
        `self.fastq`, replacing any symlink already at that location.
        """
        mkdir(self.genome)
        if firstN > 0:
            first([str(firstN), self.fastq, "--outfile={0}".format(self.link)])
        else:
            # Only an existing symlink is removed before re-linking.
            if op.islink(self.link):
                os.unlink(self.link)
            os.symlink(get_abs_path(self.fastq), self.link)
Ejemplo n.º 18
0
Archivo: ca.py Proyecto: zjwang6/jcvi
def fastq(args):
    """
    %prog fastq fastqfile

    Convert reads formatted as FASTQ file, and convert to CA frg file.
    """
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(fastq.__doc__)
    p.add_option(
        "--outtie",
        dest="outtie",
        default=False,
        action="store_true",
        help="Are these outie reads?",
    )
    p.set_phred()
    p.set_size()

    opts, args = p.parse_args(args)

    # print_help() returns None, so `sys.exit(p.print_help())` would exit
    # with status 0; negating it reports the usage error as a failure.
    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = [get_abs_path(x) for x in args]
    size = opts.size
    outtie = opts.outtie
    if size > 1000 and (not outtie):
        logging.debug(
            "[warn] long insert size {0} but not outtie".format(size))

    # A non-zero insert size marks the library as mated.
    mated = size != 0
    # Library name: first dot-separated token of the first file's basename.
    libname = op.basename(args[0]).split(".")[0]
    libname = libname.replace("_1_sequence", "")

    frgfile = libname + ".frg"
    mean, sv = get_mean_sv(opts.size)

    cmd = "fastqToCA"
    cmd += " -libraryname {0} ".format(libname)
    fastqs = " ".join("-reads {0}".format(x) for x in fastqfiles)
    if mated:
        assert len(args) in (
            1, 2), "you need one or two fastq files for mated library"
        fastqs = "-mates {0}".format(",".join(fastqfiles))
        cmd += "-insertsize {0} {1} ".format(mean, sv)
    cmd += fastqs

    # Use the explicit phred offset when given, otherwise guess it from the
    # first file; offset 64 selects the illumina type.
    offset = int(opts.phred) if opts.phred else guessoffset([fastqfiles[0]])
    illumina = offset == 64
    if illumina:
        cmd += " -type illumina"
    if outtie:
        cmd += " -outtie"

    sh(cmd, outfile=frgfile)
Ejemplo n.º 19
0
def check_index(dbfile):
    """
    Run `bowtie2-build` on `dbfile` when `need_update` reports that the
    `.1.bt2` index file must be (re)built, and return the absolute path.
    """
    dbfile = get_abs_path(dbfile)
    safile = dbfile + ".1.bt2"
    if not need_update(dbfile, safile):
        logging.error("`{0}` exists. `bowtie2-build` already run.".format(safile))
        return dbfile

    sh("bowtie2-build {0} {0}".format(dbfile))
    return dbfile
Ejemplo n.º 20
0
def check_index(dbfile):
    """
    Run `bwa index` on `dbfile` unless its `.sa` file already exists, and
    return the absolute path of `dbfile`.
    """
    dbfile = get_abs_path(dbfile)
    safile = dbfile + ".sa"
    if op.exists(safile):
        logging.error("`{0}` exists. `bwa index` already run.".format(safile))
        return dbfile

    sh("bwa index {0}".format(dbfile))
    return dbfile
Ejemplo n.º 21
0
def check_index(dbfile):
    """
    Run `bwa index` on `dbfile` when `need_update` reports that the `.sa`
    file must be (re)built, and return the absolute path of `dbfile`.
    """
    dbfile = get_abs_path(dbfile)
    safile = dbfile + ".sa"
    if not need_update(dbfile, safile):
        logging.error("`{0}` exists. `bwa index` already run.".format(safile))
        return dbfile

    sh("bwa index {0}".format(dbfile))
    return dbfile
Ejemplo n.º 22
0
def check_index(dbfile):
    """
    Run `bowtie2-build` on `dbfile` when `need_update` reports that the
    `.1.bt2` index file must be (re)built, and return the absolute path.
    """
    dbfile = get_abs_path(dbfile)
    safile = dbfile + ".1.bt2"
    if not need_update(dbfile, safile):
        logging.error("`{0}` exists. `bowtie2-build` already run.".format(safile))
        return dbfile

    sh("bowtie2-build {0} {0}".format(dbfile))
    return dbfile
Ejemplo n.º 23
0
Archivo: ca.py Proyecto: arvin580/jcvi
def fastq(args):
    """
    %prog fastq fastqfile

    Convert reads formatted as FASTQ file, and convert to CA frg file.
    """
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(fastq.__doc__)
    p.add_option(
        "--outtie", dest="outtie", default=False, action="store_true", help="Are these outie reads? [default: %default]"
    )
    p.set_phred()
    p.set_size()

    opts, args = p.parse_args(args)

    # print_help() returns None, so `sys.exit(p.print_help())` would exit
    # with status 0; negating it reports the usage error as a failure.
    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = [get_abs_path(x) for x in args]
    size = opts.size
    outtie = opts.outtie
    if size > 1000 and (not outtie):
        logging.debug("[warn] long insert size {0} but not outtie".format(size))

    # A non-zero insert size marks the library as mated.
    mated = size != 0
    # Library name: first dot-separated token of the first file's basename.
    libname = op.basename(args[0]).split(".")[0]
    libname = libname.replace("_1_sequence", "")

    frgfile = libname + ".frg"
    mean, sv = get_mean_sv(opts.size)

    cmd = "fastqToCA"
    cmd += " -libraryname {0} ".format(libname)
    fastqs = " ".join("-reads {0}".format(x) for x in fastqfiles)
    if mated:
        assert len(args) in (1, 2), "you need one or two fastq files for mated library"
        fastqs = "-mates {0}".format(",".join(fastqfiles))
        cmd += "-insertsize {0} {1} ".format(mean, sv)
    cmd += fastqs

    # Use the explicit phred offset when given, otherwise guess it from the
    # first file; offset 64 selects the illumina type.
    offset = int(opts.phred) if opts.phred else guessoffset([fastqfiles[0]])
    illumina = offset == 64
    if illumina:
        cmd += " -type illumina"
    if outtie:
        cmd += " -outtie"

    sh(cmd, outfile=frgfile)
Ejemplo n.º 24
0
def run_vecscreen(infile=None, outfile=None, db="UniVec_Core", pctid=None, hitlen=None):
    """
    Run `blastn` for `infile` against `db` with a fixed parameter set and
    write tabular output (`-outfmt 6`) to `outfile`.

    `pctid` and `hitlen` are accepted but not used by this function.

    BLASTN parameters reference:
    http://www.ncbi.nlm.nih.gov/VecScreen/VecScreen_docs.html
    """
    db = get_abs_path(db)
    run_formatdb(infile=db, outfile=db + ".nin")

    pieces = (
        "blastn",
        "-task blastn",
        "-query {0} -db {1} -out {2}".format(infile, db, outfile),
        "-penalty -5 -gapopen 4 -gapextend 4 -dust yes -soft_masking true",
        "-searchsp 1750000000000 -evalue 0.01 -outfmt 6 -num_threads 8",
    )
    sh(" ".join(pieces))
Ejemplo n.º 25
0
def index(args):
    """
    %prog index database.fasta

    Wrapper for `bowtie2-build`. Same interface.
    """
    parser = OptionParser(index.__doc__)
    opts, args = parser.parse_args(args)

    # Exactly one database file is expected.
    if len(args) != 1:
        sys.exit(not parser.print_help())

    check_index(get_abs_path(args[0]))
Ejemplo n.º 26
0
def index(args):
    """
    %prog index database.fasta

    Wrapper for `bowtie2-build`. Same interface.
    """
    parser = OptionParser(index.__doc__)
    opts, args = parser.parse_args(args)

    # Exactly one database file is expected.
    if len(args) != 1:
        sys.exit(not parser.print_help())

    check_index(get_abs_path(args[0]))
Ejemplo n.º 27
0
def run_vecscreen(infile=None, outfile=None, db="UniVec_Core", pctid=None, hitlen=None):
    """
    Run `blastn` for `infile` against `db` with a fixed parameter set and
    write tabular output (`-outfmt 6`) to `outfile`.

    `pctid` and `hitlen` are accepted but not used by this function.

    BLASTN parameters reference:
    http://www.ncbi.nlm.nih.gov/VecScreen/VecScreen_docs.html
    """
    db = get_abs_path(db)
    run_formatdb(infile=db, outfile=db + ".nin")

    pieces = (
        "blastn",
        "-task blastn",
        "-query {0} -db {1} -out {2}".format(infile, db, outfile),
        "-penalty -5 -gapopen 4 -gapextend 4 -dust yes -soft_masking true",
        "-searchsp 1750000000000 -evalue 0.01 -outfmt 6 -num_threads 8",
    )
    sh(" ".join(pieces))
Ejemplo n.º 28
0
Archivo: gmap.py Proyecto: fw1121/jcvi
def check_index(dbfile):
    """
    Run `gmap_build` for `dbfile` when `need_update` reports that the
    `.genomecomp` file must be (re)built.

    Returns a `(dbdir, dbname)` tuple: the directory of `dbfile` and the
    database name passed to `gmap_build -d`.
    """
    dbfile = get_abs_path(dbfile)
    dbdir, filename = op.split(dbfile)
    dbdir = dbdir or "."
    stem = filename.rsplit(".", 1)[0]
    # The marker path uses the bare stem, before any ".db" suffix is added.
    safile = op.join(dbdir, "{0}/{0}.genomecomp".format(stem))
    # A file name without an extension gets ".db" appended as its db name.
    dbname = filename + ".db" if stem == filename else stem

    if not need_update(dbfile, safile):
        logging.error("`{0}` exists. `gmap_build` already run.".format(safile))
        return dbdir, dbname

    sh("gmap_build -D {0} -d {1} {2}".format(dbdir, dbname, filename))
    return dbdir, dbname
Ejemplo n.º 29
0
def check_index(dbfile):
    """
    Run `gmap_build` for `dbfile` when `need_update` reports that the
    `.salcpchilddc` file must be (re)built.

    Returns a `(dbdir, dbname)` tuple: the directory of `dbfile` and the
    database name passed to `gmap_build -d`.
    """
    dbfile = get_abs_path(dbfile)
    dbdir, filename = op.split(dbfile)
    dbdir = dbdir or "."
    stem = filename.rsplit(".", 1)[0]
    # The marker path uses the bare stem, before any ".db" suffix is added.
    safile = op.join(dbdir, "{0}/{0}.salcpchilddc".format(stem))
    # A file name without an extension gets ".db" appended as its db name.
    dbname = filename + ".db" if stem == filename else stem

    if not need_update(dbfile, safile):
        logging.error("`{0}` exists. `gmap_build` already run.".format(safile))
        return dbdir, dbname

    sh("gmap_build -D {0} -d {1} {2}".format(dbdir, dbname, filename))
    return dbdir, dbname
Ejemplo n.º 30
0
    def __init__(self, filename, select=None):
        """
        Load contig sizes from a `.sizes` file, building it first if needed.

        `filename` may be a `.sizes` file or any other file (treated as
        FASTA); in the latter case `<filename>.sizes` is generated with
        `faSize` when available, otherwise via `jcvi.formats.fasta.Fasta`.
        When `select` is truthy, only contigs of at least that size are kept.
        """
        assert op.exists(filename), "File `{0}` not found".format(filename)

        # filename can be both .sizes file or FASTA formatted file
        sizesname = filename

        if not filename.endswith(".sizes"):
            sizesname = filename + ".sizes"
            filename = get_abs_path(filename)
            if need_update(filename, sizesname):
                cmd = "faSize"
                if which(cmd):
                    cmd += " -detailed {0}".format(filename)
                    sh(cmd, outfile=sizesname)
                else:
                    from jcvi.formats.fasta import Fasta

                    f = Fasta(filename)
                    # Close the sizes file even if iteration fails midway.
                    with open(sizesname, "w") as fw:
                        for k, size in f.itersizes_ordered():
                            print("\t".join((k, str(size))), file=fw)

            filename = sizesname

        assert filename.endswith(".sizes")

        super(Sizes, self).__init__(filename)
        self.fp = open(filename)
        self.filename = filename

        # get sizes for individual contigs, both in list and dict
        # this is to preserve the input order in the sizes file
        sizes = list(self.iter_sizes())
        if select:
            assert select > 0
            sizes = [x for x in sizes if x[1] >= select]
        self.sizes_mapping = dict(sizes)

        # get cumulative sizes, both in list and dict
        # zip(*[]) yields nothing to unpack, so an empty result (empty file,
        # or `select` filtering everything out) is handled explicitly.
        ctgs, sizes = zip(*sizes) if sizes else ((), ())
        self.sizes = sizes
        cumsizes = np.cumsum([0] + list(sizes))
        self.ctgs = ctgs
        self.cumsizes = cumsizes
        self.cumsizes_mapping = dict(zip(ctgs, cumsizes))
Ejemplo n.º 31
0
    def __init__(self, filename, select=None):
        """
        Load contig sizes from a `.sizes` file, building it first if needed.

        `filename` may be a `.sizes` file or any other file (treated as
        FASTA); in the latter case `<filename>.sizes` is generated with
        `faSize` when available, otherwise via `jcvi.formats.fasta.Fasta`.
        When `select` is truthy, only contigs of at least that size are kept.
        """
        assert op.exists(filename), "File `{0}` not found".format(filename)

        # filename can be both .sizes file or FASTA formatted file
        sizesname = filename

        if not filename.endswith(".sizes"):
            sizesname = filename + ".sizes"
            filename = get_abs_path(filename)
            if need_update(filename, sizesname):
                cmd = "faSize"
                if which(cmd):
                    cmd += " -detailed {0}".format(filename)
                    sh(cmd, outfile=sizesname)
                else:
                    from jcvi.formats.fasta import Fasta

                    f = Fasta(filename)
                    # `write` works on Python 2 and 3 (unlike `print >> fw`);
                    # the context manager closes the file on errors too.
                    with open(sizesname, "w") as fw:
                        for k, size in f.itersizes_ordered():
                            fw.write("\t".join((k, str(size))) + "\n")

            filename = sizesname

        assert filename.endswith(".sizes")

        super(Sizes, self).__init__(filename)
        self.fp = open(filename)
        self.filename = filename

        # get sizes for individual contigs, both in list and dict
        # this is to preserve the input order in the sizes file
        sizes = list(self.iter_sizes())
        if select:
            assert select > 0
            sizes = [x for x in sizes if x[1] >= select]
        self.sizes_mapping = dict(sizes)

        # get cumulative sizes, both in list and dict
        # zip(*[]) yields nothing to unpack, so an empty result (empty file,
        # or `select` filtering everything out) is handled explicitly.
        ctgs, sizes = zip(*sizes) if sizes else ((), ())
        self.sizes = sizes
        cumsizes = np.cumsum([0] + list(sizes))
        self.ctgs = ctgs
        self.cumsizes = cumsizes
        self.cumsizes_mapping = dict(zip(ctgs, cumsizes))
Ejemplo n.º 32
0
def merge(args):
    """
    %prog merge merged_bams bams1_dir bams2_dir ...

    Merge BAM files. Treat the bams with the same prefix as a set.
    Output the commands first.
    """
    from jcvi.apps.grid import MakeManager

    parser = OptionParser(merge.__doc__)
    parser.set_sep(sep="_", help="Separator to group per prefix")
    opts, args = parser.parse_args(args)

    if len(args) < 2:
        sys.exit(not parser.print_help())

    merged_bams, bamdirs = args[0], args[1:]
    mkdir(merged_bams)

    # Gather every *.bam under the input folders, skipping "nsorted" paths.
    found = []
    for bamdir in bamdirs:
        found.extend(glob(op.join(bamdir, "*.bam")))
    bams = [bam for bam in found if "nsorted" not in bam]

    logging.debug("Found a total of {0} BAM files.".format(len(bams)))

    sep = opts.sep

    def prefix_of(path):
        # Group key: the basename up to the first separator.
        return op.basename(path).split(sep)[0]

    # groupby() only merges adjacent items, hence the sort by the same key.
    bams.sort(key=prefix_of)
    mm = MakeManager()
    for _prefix, group in groupby(bams, key=prefix_of):
        members = sorted(group)
        target = op.join(merged_bams, op.basename(members[0]))
        if len(members) == 1:
            # A lone BAM is symlinked instead of merged.
            source = get_abs_path(members[0])
            mm.add("", target, "ln -s {0} {1}".format(source, target))
        else:
            cmd = "samtools merge -@ 8 {0} {1}".format(target, " ".join(members))
            mm.add(members, target, cmd, remove=True)
    mm.write()
Ejemplo n.º 33
0
Archivo: sam.py Proyecto: arvin580/jcvi
def merge(args):
    """
    %prog merge merged_bams bams1_dir bams2_dir ...

    Merge BAM files. Treat the bams with the same prefix as a set.
    Output the commands first.
    """
    from jcvi.apps.grid import MakeManager

    parser = OptionParser(merge.__doc__)
    parser.set_sep(sep="_", help="Separator to group per prefix")
    opts, args = parser.parse_args(args)

    if len(args) < 2:
        sys.exit(not parser.print_help())

    merged_bams, bamdirs = args[0], args[1:]
    mkdir(merged_bams)

    # Gather every *.bam under the input folders, skipping "nsorted" paths.
    found = []
    for bamdir in bamdirs:
        found.extend(glob(op.join(bamdir, "*.bam")))
    bams = [bam for bam in found if "nsorted" not in bam]

    logging.debug("Found a total of {0} BAM files.".format(len(bams)))

    sep = opts.sep

    def prefix_of(path):
        # Group key: the basename up to the first separator.
        return op.basename(path).split(sep)[0]

    # groupby() only merges adjacent items, hence the sort by the same key.
    bams.sort(key=prefix_of)
    mm = MakeManager()
    for _prefix, group in groupby(bams, key=prefix_of):
        members = sorted(group)
        target = op.join(merged_bams, op.basename(members[0]))
        if len(members) == 1:
            # A lone BAM is symlinked instead of merged.
            source = get_abs_path(members[0])
            mm.add("", target, "ln -s {0} {1}".format(source, target))
        else:
            cmd = "samtools merge -@ 8 {0} {1}".format(target, " ".join(members))
            mm.add(members, target, cmd, remove=True)
    mm.write()
Ejemplo n.º 34
0
def cp(args):
    """
    find folder -type l | %prog cp

    Copy all the softlinks to the current folder, using absolute paths
    """
    p = OptionParser(cp.__doc__)

    for line in sys.stdin:
        path = line.strip()
        if op.exists(path):
            source = get_abs_path(path)
            local_name = op.basename(path)
            # Never overwrite an entry already present in the current folder.
            if not op.exists(local_name):
                os.symlink(source, local_name)
            logging.debug(" => ".join((source, local_name)))
Ejemplo n.º 35
0
def cp(args):
    """
    find folder -type l | %prog cp

    Copy all the softlinks to the current folder, using absolute paths
    """
    p = OptionParser(cp.__doc__)

    for line in sys.stdin:
        path = line.strip()
        if op.exists(path):
            source = get_abs_path(path)
            local_name = op.basename(path)
            # Never overwrite an entry already present in the current folder.
            if not op.exists(local_name):
                os.symlink(source, local_name)
            logging.debug(" => ".join((source, local_name)))
Ejemplo n.º 36
0
def touch(args):
    """
    find . -type l | %prog touch

    Linux commands `touch` wouldn't modify mtime for links, this script can.
    Use find to pipe in all the symlinks.
    """
    parser = OptionParser(touch.__doc__)
    opts, args = parser.parse_args(args)

    for line in sys.stdin:
        path = line.strip()
        # Re-link only symlinks for which op.exists() is true.
        if op.islink(path) and op.exists(path):
            lnsf(get_abs_path(path), path)
Ejemplo n.º 37
0
def touch(args):
    """
    find . -type l | %prog touch

    Linux commands `touch` wouldn't modify mtime for links, this script can.
    Use find to pipe in all the symlinks.
    """
    parser = OptionParser(touch.__doc__)
    opts, args = parser.parse_args(args)

    for line in sys.stdin:
        path = line.strip()
        # Re-link only symlinks for which op.exists() is true.
        if op.islink(path) and op.exists(path):
            lnsf(get_abs_path(path), path)
Ejemplo n.º 38
0
def soapX(args):
    """
    %prog soapX folder tag [*.fastq]

    Run SOAP on a folder of paired reads and apply tag before assembly.
    Optional *.fastq in the argument list will be symlinked in each folder and
    co-assembled.
    """
    parser = OptionParser(soapX.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) < 2:
        sys.exit(not parser.print_help())

    folder, tag = args[:2]
    # Any further arguments are extra files, resolved to absolute paths.
    extra = [get_abs_path(x) for x in args[2:]]
    tags = tag.split(",")
    for reads, prefix in iter_project(folder, n=3):
        soap_trios(reads, prefix, tags, extra)
Ejemplo n.º 39
0
def soapX(args):
    """
    %prog soapX folder tag [*.fastq]

    Run SOAP on a folder of paired reads and apply tag before assembly.
    Optional *.fastq in the argument list will be symlinked in each folder and
    co-assembled.
    """
    # Usage text comes straight from the docstring above.
    option_parser = OptionParser(soapX.__doc__)
    opts, args = option_parser.parse_args(args)

    if len(args) < 2:
        # Too few arguments: show the help and terminate.
        sys.exit(not option_parser.print_help())

    required, optional = args[:2], args[2:]
    folder, tag_spec = required
    # Optional read files are converted with get_abs_path() up front.
    extra_reads = [get_abs_path(path) for path in optional]
    # A comma separates multiple tags in the single `tag` argument.
    tag_list = tag_spec.split(",")

    # iter_project(folder, n=3) yields (project, prefix) pairs; each one is
    # assembled together with the shared extra read files.
    for proj, pf in iter_project(folder, n=3):
        soap_trios(proj, pf, tag_list, extra_reads)
Ejemplo n.º 40
0
def check_index(dbfile, supercat=False, go=True):
    """
    Ensure a GMAP database exists for `dbfile`, (re)building it when stale.

    Args:
        dbfile: path to the reference FASTA file.
        supercat: when true, `tGBS-Generate_Pseudo_Genome.pl` is run first
            (if needed) and the database is built from
            `<prefix>.supercat.fasta` instead of `dbfile`.
        go: when false, no external command is executed; only the database
            location is computed and returned.

    Returns:
        A `(dbdir, dbname)` tuple: the directory holding the database and the
        database name passed to `gmap_build -d`.
    """
    # Defined up front so the final restore step never touches unbound names.
    updated = False
    coordsfile = coordsbak = None

    if supercat:
        pf = dbfile.rsplit(".", 1)[0]
        supercatfile = pf + ".supercat"
        coordsfile = supercatfile + ".coords"
        if go and need_update(dbfile, supercatfile):
            cmd = "tGBS-Generate_Pseudo_Genome.pl"
            cmd += " -f {0} -o {1}".format(dbfile, supercatfile)
            sh(cmd)
            # Rename .coords file since gmap_build will overwrite it
            coordsbak = backup(coordsfile)
            updated = True
        dbfile = supercatfile + ".fasta"

    dbfile = get_abs_path(dbfile)
    dbdir, filename = op.split(dbfile)
    if not dbdir:
        dbdir = "."

    dbname = filename.rsplit(".", 1)[0]
    if dbname == filename:
        # No extension to strip: use a distinct name so the database folder
        # does not collide with the FASTA file itself.
        dbname = filename + ".db"

    # The sentinel must be derived from the *final* database name; computing
    # it before the ".db" adjustment pointed at a folder gmap_build never
    # creates, forcing a rebuild on every call.
    safile = op.join(dbdir, "{0}/{0}.genomecomp".format(dbname))

    if not go:
        return dbdir, dbname

    if need_update(dbfile, safile):
        # Pass the same path that need_update() checked; the bare file name
        # would only resolve when the working directory happens to be dbdir.
        cmd = "gmap_build -D {0} -d {1} {2}".format(dbdir, dbname, dbfile)
        sh(cmd)
    else:
        logging.error("`{0}` exists. `gmap_build` already run.".format(safile))

    # `updated` can only be true when both `supercat` and `go` are set.
    if updated:
        # Put the pseudo-genome .coords file back in place.
        sh("mv {0} {1}".format(coordsbak, coordsfile))

    return dbdir, dbname
Ejemplo n.º 41
0
def prepare(args):
    """
    %prog prepare barcode_key.csv reference.fasta

    Prepare TASSEL pipeline.
    """
    # The docstring above doubles as the usage text given to OptionParser.
    #
    # args: command-line tokens; after option parsing exactly two positional
    #       arguments are required (barcode key file, reference FASTA).
    # Effect: calls mkdir() for each working folder and writes the assembled
    #         pipeline commands to `run.sh`. Nothing is returned.

    # Enzymes accepted by --enzyme. Every entry must end with "|" before the
    # next string fragment, otherwise two names fuse into one bogus choice
    # (previously "PstI-MspI" and "PstI-TaqI" were merged that way).
    valid_enzymes = ("ApeKI|ApoI|BamHI|EcoT22I|HinP1I|HpaII|MseI|MspI|"
                     "NdeI|PasI|PstI|Sau3AI|SbfI|AsiSI-MspI|BssHII-MspI|"
                     "FseI-MspI|PaeR7I-HhaI|PstI-ApeKI|PstI-EcoT22I|PstI-MspI|"
                     "PstI-TaqI|SalI-MspI|SbfI-MspI").split("|")
    p = OptionParser(prepare.__doc__)
    p.add_option("--enzyme", default="ApeKI", choices=valid_enzymes,
                 help="Restriction enzyme used [default: %default]")
    p.set_home("tassel")
    p.set_aligner(aligner="bwa")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    barcode, reference = args
    thome = opts.tassel_home
    reference = get_abs_path(reference)
    folders = ("fastq", "tagCounts", "mergedTagCounts", "topm",
               "tbt", "mergedTBT", "hapmap", "hapmap/raw",
               "hapmap/mergedSNPs", "hapmap/filt", "hapmap/bpec")
    for f in folders:
        mkdir(f)

    # Build the pipeline: each step appends one shell command line to runsh.
    runsh = []
    o = "-i fastq -k {0} -e {1} -o tagCounts".format(barcode, opts.enzyme)
    cmd = run_pipeline(thome, "FastqToTagCountPlugin", o)
    runsh.append(cmd)

    o = "-i tagCounts -o mergedTagCounts/myMasterTags.cnt"
    o += " -c 5 -t mergedTagCounts/myMasterTags.cnt.fq"
    cmd = run_pipeline(thome, "MergeMultipleTagCountPlugin", o)
    runsh.append(cmd)

    # The alignment step runs inside mergedTagCounts, then steps back out.
    runsh.append("cd mergedTagCounts")
    cmd = "python -m jcvi.apps.{0} align --cpus {1}".format(opts.aligner,
                                                           opts.cpus)
    cmd += " {0} myMasterTags.cnt.fq".format(reference)
    runsh.append(cmd)
    runsh.append("cd ..")

    o = "-i mergedTagCounts/*.sam -o topm/myMasterTags.topm"
    cmd = run_pipeline(thome, "SAMConverterPlugin", o)
    runsh.append(cmd)

    o = "-i mergedTBT/myStudy.tbt.byte -y -m topm/myMasterTags.topm"
    o += " -mUpd topm/myMasterTagsWithVariants.topm"
    o += " -o hapmap/raw/myGBSGenos_chr+.hmp.txt"
    o += " -mnF 0.8 -p myPedigreeFile.ped -mnMAF 0.02 -mnMAC 100000"
    o += " -ref {0} -sC 1 -eC 10".format(reference)
    cmd = run_pipeline(thome, "TagsToSNPByAlignmentPlugin", o)
    runsh.append(cmd)

    o = "-hmp hapmap/raw/myGBSGenos_chr+.hmp.txt"
    o += " -o hapmap/mergedSNPs/myGBSGenos_mergedSNPs_chr+.hmp.txt"
    o += " -misMat 0.1 -p myPedigreeFile.ped -callHets -sC 1 -eC 10"
    cmd = run_pipeline(thome, "MergeDuplicateSNPsPlugin", o)
    runsh.append(cmd)

    o = "-hmp hapmap/mergedSNPs/myGBSGenos_mergedSNPs_chr+.hmp.txt"
    o += " -o hapmap/filt/myGBSGenos_mergedSNPsFilt_chr+.hmp.txt"
    o += " -mnTCov 0.01 -mnSCov 0.2 -mnMAF 0.01 -sC 1 -eC 10"
    # Optional LD-based filters, intentionally left disabled:
    #o += "-hLD -mnR2 0.2 -mnBonP 0.005"
    cmd = run_pipeline(thome, "GBSHapMapFiltersPlugin", o)
    runsh.append(cmd)

    runfile = "run.sh"
    write_file(runfile, "\n".join(runsh))
Ejemplo n.º 42
0
def parallel(args):
    """
    %prog parallel genome.fasta N

    Partition the genome into parts and run separately. This is useful if MAKER
    is to be run on the grid.
    """
    # The docstring above is also the usage text given to OptionParser.
    #
    # args: command-line tokens; two positionals are required -- the genome
    #       FASTA and the number of partitions N (1 <= N < 1000).
    # Effect: splits the genome, prepares one folder per part with its own
    #         copy of the *.ctl files, and writes `jobs`, `array.sh` and
    #         (unless the engine is PBS) `qsub.sh`. Nothing is returned.
    from jcvi.formats.base import split

    p = OptionParser(parallel.__doc__)
    p.set_home("maker")
    p.set_tmpdir(tmpdir="tmp")
    p.set_grid_opts(array=True)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    genome, NN = args
    threaded = opts.threaded or 1
    tmpdir = opts.tmpdir

    mkdir(tmpdir)
    tmpdir = get_abs_path(tmpdir)

    N = int(NN)
    # Kept as an assert so callers still see AssertionError; the message now
    # states the bound that is actually enforced.
    assert 1 <= N < 1000, "Required: 1 <= N < 1000!"

    outdir = "outdir"
    fs = split([genome, outdir, NN])

    c = CTLFile("maker_opts.ctl")
    c.update_abs_path()
    if threaded > 1:
        c.update_tag("cpus", threaded)

    cwd = os.getcwd()
    dirs = []
    for name in fs.names:
        fn = get_abs_path(name)
        bn = op.basename(name)
        dirs.append(bn)
        # Point this part's control file at its own slice of the genome.
        c.update_tag("genome", fn)
        mkdir(bn)
        sh("cp *.ctl {0}".format(bn))

        os.chdir(bn)
        try:
            c.write_file("maker_opts.ctl")
        finally:
            # Always return to the starting directory, even if the write
            # fails, so later relative paths are not silently misplaced.
            os.chdir(cwd)

    jobs = "jobs"
    # One folder name per line; `with` guarantees the handle is closed.
    with open(jobs, "w") as fw:
        print("\n".join(dirs), file=fw)

    # Submit to grid
    ncmds = len(dirs)
    runfile = "array.sh"
    cmd = op.join(opts.maker_home, "bin/maker")
    if tmpdir:
        cmd += " -TMP {0}".format(tmpdir)

    engine = get_grid_engine()
    contents = arraysh.format(jobs, cmd) if engine == "SGE" \
                else arraysh_ua.format(N, threaded, jobs, cmd)
    write_file(runfile, contents)

    # No qsub wrapper is generated for PBS.
    if engine == "PBS":
        return

    # qsub script
    # Raw string: the backslash must reach the shell literally, and "\$" is
    # not a valid Python escape sequence.
    outfile = r"maker.\$TASK_ID.out"
    grid_proc = GridProcess(runfile,
                            outfile=outfile,
                            errfile=outfile,
                            arr=ncmds,
                            grid_opts=opts)
    qsubfile = "qsub.sh"
    qsub = grid_proc.build()
    write_file(qsubfile, qsub)
Ejemplo n.º 43
0
def align(args):
    """
    %prog align database.fasta read1.fq [read2.fq]

    Wrapper for `bowtie2` single-end or paired-end, depending on the number of args.
    """
    # The docstring above is also the usage text given to OptionParser.
    #
    # args: command-line tokens; two positionals select single-end mode,
    #       three select paired-end mode, anything else prints the help.
    # Returns: (samfile, logfile). When --null is given, samfile is
    #          "/dev/null".
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(align.__doc__)
    p.set_firstN(firstN=0)
    p.add_option("--full", default=False, action="store_true",
                 help="Enforce end-to-end alignment [default: local]")
    p.add_option("--reorder", default=False, action="store_true",
                 help="Keep the input read order [default: %default]")
    p.add_option("--null", default=False, action="store_true",
                 help="Do not write to SAM/BAM output")
    p.set_cutoff(cutoff=800)
    p.set_mateorientation(mateorientation="+-")
    p.set_sam_options(bowtie=True)

    opts, args = p.parse_args(args)

    # Translate the mate orientation into the bowtie2 flag; "+-" needs none.
    mo = opts.mateorientation
    if mo == '+-':
        moflag = ""
    elif mo == '-+':
        moflag = "--rf"
    else:
        moflag = "--ff"
    # Join with a space so a user-supplied --extra value is never glued to
    # the orientation flag (e.g. "-N 1--rf").
    extra = " ".join(x for x in (opts.extra, moflag) if x)

    PE = True
    if len(args) == 2:
        logging.debug("Single-end alignment")
        PE = False
    elif len(args) == 3:
        logging.debug("Paired-end alignment")
    else:
        sys.exit(not p.print_help())

    firstN = opts.firstN
    mapped = opts.mapped
    unmapped = opts.unmapped
    gl = "--end-to-end" if opts.full else "--local"

    dbfile, readfile = args[0:2]
    dbfile = get_abs_path(dbfile)
    safile = check_index(dbfile)
    prefix = get_prefix(readfile, dbfile)
    samfile, mapped, unmapped = get_samfile(readfile, dbfile, bowtie=True,
                                            mapped=mapped, unmapped=unmapped,
                                            bam=opts.bam)
    logfile = prefix + ".log"
    offset = guessoffset([readfile])

    # Skip the run when the output is already up to date with the index.
    if not need_update(safile, samfile):
        logging.error("`{0}` exists. `bowtie2` already run.".format(samfile))
        return samfile, logfile

    cmd = "bowtie2 -x {0}".format(dbfile)
    if PE:
        r1, r2 = args[1:3]
        cmd += " -1 {0} -2 {1}".format(r1, r2)
        cmd += " --maxins {0}".format(opts.cutoff)
        mtag, utag = "--al-conc", "--un-conc"
    else:
        cmd += " -U {0}".format(readfile)
        mtag, utag = "--al", "--un"

    if mapped:
        cmd += " {0} {1}".format(mtag, mapped)
    if unmapped:
        cmd += " {0} {1}".format(utag, unmapped)

    if firstN:
        cmd += " --upto {0}".format(firstN)
    cmd += " -p {0}".format(opts.cpus)
    cmd += " --phred{0}".format(offset)
    cmd += " {0}".format(gl)
    if opts.reorder:
        cmd += " --reorder"

    cmd += " {0}".format(extra)
    # Finally the log
    cmd += " 2> {0}".format(logfile)

    if opts.null:
        samfile = "/dev/null"

    cmd = output_bam(cmd, samfile)
    sh(cmd)

    # Echo the bowtie2 log to stderr. sys.stderr.write works on Python 2 and
    # 3 alike (the old `print >>` statement fails at runtime on Python 3),
    # and the context manager closes the log file handle.
    with open(logfile) as logfh:
        sys.stderr.write(logfh.read() + "\n")

    return samfile, logfile
Ejemplo n.º 44
0
def parallel(args):
    """
    %prog parallel genome.fasta N

    Partition the genome into parts and run separately. This is useful if MAKER
    is to be run on the grid.
    """
    # The docstring above is also the usage text given to OptionParser.
    #
    # args: command-line tokens; two positionals are required -- the genome
    #       FASTA and the number of partitions N (1 <= N < 1000).
    # Effect: splits the genome, prepares one folder per part with its own
    #         copy of the *.ctl files, and writes `jobs`, `array.sh` and
    #         (unless the engine is PBS) `qsub.sh`. Nothing is returned.
    from jcvi.formats.base import split

    p = OptionParser(parallel.__doc__)
    p.set_home("maker")
    p.set_tmpdir(tmpdir="tmp")
    p.set_grid_opts(array=True)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    genome, NN = args
    threaded = opts.threaded or 1
    tmpdir = opts.tmpdir

    mkdir(tmpdir)
    tmpdir = get_abs_path(tmpdir)

    N = int(NN)
    # Kept as an assert so callers still see AssertionError; the message now
    # states the bound that is actually enforced.
    assert 1 <= N < 1000, "Required: 1 <= N < 1000!"

    outdir = "outdir"
    fs = split([genome, outdir, NN])

    c = CTLFile("maker_opts.ctl")
    c.update_abs_path()
    if threaded > 1:
        c.update_tag("cpus", threaded)

    cwd = os.getcwd()
    dirs = []
    for name in fs.names:
        fn = get_abs_path(name)
        bn = op.basename(name)
        dirs.append(bn)
        # Point this part's control file at its own slice of the genome.
        c.update_tag("genome", fn)
        mkdir(bn)
        sh("cp *.ctl {0}".format(bn))

        os.chdir(bn)
        try:
            c.write_file("maker_opts.ctl")
        finally:
            # Restore the working directory even when the write fails.
            os.chdir(cwd)

    jobs = "jobs"
    # One folder name per line; `with` guarantees the handle is closed.
    with open(jobs, "w") as fw:
        print("\n".join(dirs), file=fw)

    # Submit to grid
    ncmds = len(dirs)
    runfile = "array.sh"
    cmd = op.join(opts.maker_home, "bin/maker")
    if tmpdir:
        cmd += " -TMP {0}".format(tmpdir)

    engine = get_grid_engine()
    contents = arraysh.format(jobs, cmd) if engine == "SGE" \
                else arraysh_ua.format(N, threaded, jobs, cmd)
    write_file(runfile, contents)

    # No qsub wrapper is generated for PBS.
    if engine == "PBS":
        return

    # qsub script
    # Raw string: the backslash must reach the shell literally, and "\$" is
    # not a valid Python escape sequence.
    outfile = r"maker.\$TASK_ID.out"
    grid_proc = GridProcess(runfile, outfile=outfile, errfile=outfile,
                            arr=ncmds, grid_opts=opts)
    qsubfile = "qsub.sh"
    qsub = grid_proc.build()
    write_file(qsubfile, qsub)
Ejemplo n.º 45
0
def prepare(args):
    """
    %prog prepare barcode_key.csv reference.fasta

    Prepare TASSEL pipeline.
    """
    # The docstring above doubles as the usage text given to OptionParser.
    #
    # args: command-line tokens; after option parsing exactly two positional
    #       arguments are required (barcode key file, reference FASTA).
    # Effect: calls mkdir() for each working folder and writes the assembled
    #         pipeline commands to `run.sh`. Nothing is returned.

    # Enzymes accepted by --enzyme. Every entry must end with "|" before the
    # next string fragment, otherwise two names fuse into one bogus choice
    # (previously "PstI-MspI" and "PstI-TaqI" were merged that way).
    valid_enzymes = ("ApeKI|ApoI|BamHI|EcoT22I|HinP1I|HpaII|MseI|MspI|"
                     "NdeI|PasI|PstI|Sau3AI|SbfI|AsiSI-MspI|BssHII-MspI|"
                     "FseI-MspI|PaeR7I-HhaI|PstI-ApeKI|PstI-EcoT22I|PstI-MspI|"
                     "PstI-TaqI|SalI-MspI|SbfI-MspI").split("|")
    p = OptionParser(prepare.__doc__)
    p.add_option("--enzyme",
                 default="ApeKI",
                 choices=valid_enzymes,
                 help="Restriction enzyme used [default: %default]")
    p.set_home("tassel")
    p.set_aligner(aligner="bwa")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    barcode, reference = args
    thome = opts.tassel_home
    reference = get_abs_path(reference)
    folders = ("fastq", "tagCounts", "mergedTagCounts", "topm", "tbt",
               "mergedTBT", "hapmap", "hapmap/raw", "hapmap/mergedSNPs",
               "hapmap/filt", "hapmap/bpec")
    for f in folders:
        mkdir(f)

    # Build the pipeline: each step appends one shell command line to runsh.
    runsh = []
    o = "-i fastq -k {0} -e {1} -o tagCounts".format(barcode, opts.enzyme)
    cmd = run_pipeline(thome, "FastqToTagCountPlugin", o)
    runsh.append(cmd)

    o = "-i tagCounts -o mergedTagCounts/myMasterTags.cnt"
    o += " -c 5 -t mergedTagCounts/myMasterTags.cnt.fq"
    cmd = run_pipeline(thome, "MergeMultipleTagCountPlugin", o)
    runsh.append(cmd)

    # The alignment step runs inside mergedTagCounts, then steps back out.
    runsh.append("cd mergedTagCounts")
    cmd = "python -m jcvi.apps.{0} align --cpus {1}".format(opts.aligner,
                                                           opts.cpus)
    cmd += " {0} myMasterTags.cnt.fq".format(reference)
    runsh.append(cmd)
    runsh.append("cd ..")

    o = "-i mergedTagCounts/*.sam -o topm/myMasterTags.topm"
    cmd = run_pipeline(thome, "SAMConverterPlugin", o)
    runsh.append(cmd)

    o = "-i mergedTBT/myStudy.tbt.byte -y -m topm/myMasterTags.topm"
    o += " -mUpd topm/myMasterTagsWithVariants.topm"
    o += " -o hapmap/raw/myGBSGenos_chr+.hmp.txt"
    o += " -mnF 0.8 -p myPedigreeFile.ped -mnMAF 0.02 -mnMAC 100000"
    o += " -ref {0} -sC 1 -eC 10".format(reference)
    cmd = run_pipeline(thome, "TagsToSNPByAlignmentPlugin", o)
    runsh.append(cmd)

    o = "-hmp hapmap/raw/myGBSGenos_chr+.hmp.txt"
    o += " -o hapmap/mergedSNPs/myGBSGenos_mergedSNPs_chr+.hmp.txt"
    o += " -misMat 0.1 -p myPedigreeFile.ped -callHets -sC 1 -eC 10"
    cmd = run_pipeline(thome, "MergeDuplicateSNPsPlugin", o)
    runsh.append(cmd)

    o = "-hmp hapmap/mergedSNPs/myGBSGenos_mergedSNPs_chr+.hmp.txt"
    o += " -o hapmap/filt/myGBSGenos_mergedSNPsFilt_chr+.hmp.txt"
    o += " -mnTCov 0.01 -mnSCov 0.2 -mnMAF 0.01 -sC 1 -eC 10"
    # Optional LD-based filters, intentionally left disabled:
    #o += "-hLD -mnR2 0.2 -mnBonP 0.005"
    cmd = run_pipeline(thome, "GBSHapMapFiltersPlugin", o)
    runsh.append(cmd)

    runfile = "run.sh"
    write_file(runfile, "\n".join(runsh), meta="run script")
Ejemplo n.º 46
0
def align(args):
    """
    %prog align database.fasta read1.fq [read2.fq]

    Wrapper for `bowtie2` single-end or paired-end, depending on the number of args.
    """
    # The docstring above is also the usage text given to OptionParser.
    #
    # args: command-line tokens; two positionals select single-end mode,
    #       three select paired-end mode, anything else prints the help.
    # Returns: (samfile, logfile). When --null is given, samfile is
    #          "/dev/null".
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(align.__doc__)
    p.set_firstN(firstN=0)
    p.add_option("--full",
                 default=False,
                 action="store_true",
                 help="Enforce end-to-end alignment [default: local]")
    p.add_option("--reorder",
                 default=False,
                 action="store_true",
                 help="Keep the input read order [default: %default]")
    p.add_option("--null",
                 default=False,
                 action="store_true",
                 help="Do not write to SAM/BAM output")
    p.set_cutoff(cutoff=800)
    p.set_mateorientation(mateorientation="+-")
    p.set_sam_options(bowtie=True)

    opts, args = p.parse_args(args)

    # Translate the mate orientation into the bowtie2 flag; "+-" needs none.
    mo = opts.mateorientation
    if mo == '+-':
        moflag = ""
    elif mo == '-+':
        moflag = "--rf"
    else:
        moflag = "--ff"
    # Join with a space so a user-supplied --extra value is never glued to
    # the orientation flag (e.g. "-N 1--rf").
    extra = " ".join(x for x in (opts.extra, moflag) if x)

    PE = True
    if len(args) == 2:
        logging.debug("Single-end alignment")
        PE = False
    elif len(args) == 3:
        logging.debug("Paired-end alignment")
    else:
        sys.exit(not p.print_help())

    firstN = opts.firstN
    mapped = opts.mapped
    unmapped = opts.unmapped
    gl = "--end-to-end" if opts.full else "--local"

    dbfile, readfile = args[0:2]
    dbfile = get_abs_path(dbfile)
    safile = check_index(dbfile)
    prefix = get_prefix(readfile, dbfile)
    samfile, mapped, unmapped = get_samfile(readfile,
                                            dbfile,
                                            bowtie=True,
                                            mapped=mapped,
                                            unmapped=unmapped,
                                            bam=opts.bam)
    logfile = prefix + ".log"
    offset = guessoffset([readfile])

    # Skip the run when the output is already up to date with the index.
    if not need_update(safile, samfile):
        logging.error("`{0}` exists. `bowtie2` already run.".format(samfile))
        return samfile, logfile

    cmd = "bowtie2 -x {0}".format(dbfile)
    if PE:
        r1, r2 = args[1:3]
        cmd += " -1 {0} -2 {1}".format(r1, r2)
        cmd += " --maxins {0}".format(opts.cutoff)
        mtag, utag = "--al-conc", "--un-conc"
    else:
        cmd += " -U {0}".format(readfile)
        mtag, utag = "--al", "--un"

    if mapped:
        cmd += " {0} {1}".format(mtag, mapped)
    if unmapped:
        cmd += " {0} {1}".format(utag, unmapped)

    if firstN:
        cmd += " --upto {0}".format(firstN)
    cmd += " -p {0}".format(opts.cpus)
    cmd += " --phred{0}".format(offset)
    cmd += " {0}".format(gl)
    if opts.reorder:
        cmd += " --reorder"

    cmd += " {0}".format(extra)
    # Finally the log
    cmd += " 2> {0}".format(logfile)

    if opts.null:
        samfile = "/dev/null"

    cmd = output_bam(cmd, samfile)
    sh(cmd)

    # Echo the bowtie2 log to stderr. sys.stderr.write works on Python 2 and
    # 3 alike (the old `print >>` statement fails at runtime on Python 3),
    # and the context manager closes the log file handle.
    with open(logfile) as logfh:
        sys.stderr.write(logfh.read() + "\n")

    return samfile, logfile