Example #1
0
def count(args):
    """
    %prog count fastafile jf.db

    Run dump - jellyfish - bin - bincount in serial.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments.
    #
    # Steps performed here:
    #   1. every K-mer of every FASTA record is piped into `jellyfish query`;
    #   2. the second column of its output is captured in a temporary file;
    #   3. each captured value becomes one bit, serialized to
    #      `<fastafile>.<jfdb>.bin`;
    #   4. bincount() is invoked on that file, writing `<fastafile>.<jfdb>.cnt`.
    from bitarray import bitarray

    p = OptionParser(count.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, jfdb = args
    K = get_K(jfdb)
    cmd = "jellyfish query {0} -C | cut -d' ' -f 2".format(jfdb)
    t = must_open("tmp", "w")
    proc = Popen(cmd, stdin=PIPE, stdout=t)

    f = Fasta(fastafile, lazy=True)
    for _, rec in f.iteritems_ordered():
        print("\n".join(make_kmers(rec.seq, K)), file=proc.stdin)
    proc.stdin.close()
    logging.debug(cmd)
    proc.wait()
    # The child wrote straight to the file descriptor, so nothing is buffered
    # on our side; release the write handle before reading the results back.
    # NOTE(review): assumes the temp file survives close() -- it is removed
    # explicitly below, which suggests it is not auto-deleted; confirm against
    # must_open().
    t.close()

    a = bitarray()
    binfile = ".".join((fastafile, jfdb, "bin"))
    # Re-open by name for reading: the handle above was opened write-only.
    with open(t.name) as counts:
        for row in counts:
            # Any non-zero value sets the bit. Passing a real bool keeps this
            # working on bitarray releases that reject ints other than 0/1.
            a.append(int(row.strip()) != 0)
    # bitarray.tofile() emits raw bytes, hence binary mode.
    with open(binfile, "wb") as fw:
        a.tofile(fw)
    logging.debug("Serialize {0} bits to `{1}`.".format(len(a), binfile))
    sh("rm {0}".format(t.name))

    logging.debug(
        "Shared K-mers (K={0}) between `{1}` and `{2}` written to `{3}`.".format(
            K, fastafile, jfdb, binfile
        )
    )
    cntfile = ".".join((fastafile, jfdb, "cnt"))
    bincount([fastafile, binfile, "-o", cntfile, "-K {0}".format(K)])
    logging.debug("Shared K-mer counts written to `{0}`.".format(cntfile))
Example #2
0
def append(args):
    """
    %prog append bamfile

    Append /1 or /2 to read names. Useful for using the Tophat2 bam file for
    training AUGUSTUS gene models.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments.
    #
    # The input BAM is streamed as SAM text through `samtools view -h`, each
    # alignment's read name is rewritten, and the result is piped into a second
    # samtools process that writes `<input stem>.append.bam`.
    parser = OptionParser(append.__doc__)
    parser.add_option("--prepend", help="Prepend string to read names")
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    (bamfile,) = args
    prepend = opts.prepend

    icmd = "samtools view -h {0}".format(bamfile)
    outbam = bamfile.rsplit(".", 1)[0] + ".append.bam"
    ocmd = "samtools view -b -@ 64 - -o {0}".format(outbam)
    writer = Popen(ocmd, stdin=PIPE)
    for row in popen(icmd):
        if row[0] == "@":
            # Header lines are passed through untouched.
            print(row.strip(), file=writer.stdin)
        else:
            s = SamLine(row)
            if prepend:
                s.qname = prepend + "_" + s.qname
            else:
                s.update_readname()
            print(s, file=writer.stdin)

    # Signal end-of-input and wait for samtools to finish, so the output BAM is
    # complete by the time this function returns.
    writer.stdin.close()
    writer.wait()
Example #3
0
def append(args):
    """
    %prog append bamfile

    Append /1 or /2 to read names. Useful for using the Tophat2 bam file for
    training AUGUSTUS gene models.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments.
    #
    # The input BAM is streamed as SAM text through `samtools view -h`, each
    # alignment's read name is updated, and the result is piped into a second
    # samtools process that writes `<input stem>.append.bam`.
    parser = OptionParser(append.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    (bamfile,) = args
    icmd = "samtools view -h {0}".format(bamfile)
    outbam = bamfile.rsplit(".", 1)[0] + ".append.bam"
    ocmd = "samtools view -b -@ 64 - -o {0}".format(outbam)
    writer = Popen(ocmd, stdin=PIPE)
    # Explicit write() calls behave the same on Python 2 and 3; the
    # `print >> fh, x` form is only a print statement on Python 2.
    emit = writer.stdin.write
    for row in popen(icmd):
        if row[0] == "@":
            # Header lines are passed through untouched.
            emit(row.strip() + "\n")
        else:
            s = SamLine(row)
            s.update_readname()
            emit(str(s) + "\n")

    # Signal end-of-input and wait for samtools to finish, so the output BAM is
    # complete by the time this function returns.
    writer.stdin.close()
    writer.wait()
Example #4
0
def lastz_2bit(t):
    """
    Used for formats other than BLAST, i.e. lav, maf, etc. which requires the
    database file to contain a single FASTA record.

    `t` is a 7-tuple: (bfasta_fn, afasta_fn, outfile, lastz_bin, extra, mask,
    format). The LASTZ command is assembled from these pieces, launched, and
    its standard output is copied line by line into `outfile`.
    """
    # `fmt` rather than `format` so the builtin is not shadowed.
    bfasta_fn, afasta_fn, outfile, lastz_bin, extra, mask, fmt = t

    ref_tags = [Darkspace]
    qry_tags = [Darkspace]
    ref_tags, qry_tags = add_mask(ref_tags, qry_tags, mask=mask)

    lastz_cmd = Lastz_template.format(
        lastz_bin, bfasta_fn, ref_tags, afasta_fn, qry_tags
    )
    if extra:
        lastz_cmd += " " + extra.strip()

    lastz_cmd += " --format={0}".format(fmt)
    proc = Popen(lastz_cmd)
    logging.debug("job <%d> started: %s", proc.pid, lastz_cmd)

    # The context manager guarantees the output file is closed even when the
    # copy loop is interrupted.
    with open(outfile, "w") as out_fh:
        for row in proc.stdout:
            out_fh.write(row)
            # Flushed per row so partial results are visible while the job runs.
            out_fh.flush()

    # Reap the child once its output is exhausted.
    proc.wait()
    logging.debug("job <%d> finished", proc.pid)
Example #5
0
def lastz(k, n, bfasta_fn, afasta_fn, out_fh, lock, lastz_bin, extra,
          mask=False):
    """
    Run one LASTZ job and append its converted output to a shared handle.

    The command is built from `Lastz_template` using `lastz_bin`, the two
    FASTA file names and their tag lists; when `n` is not 1 a `Subsample`
    tag built from `k` and `n` is added to the query tags. `extra`, if
    non-empty, is appended verbatim to the command line. Every output line is
    passed through lastz_to_blast() and written to `out_fh` while holding
    `lock`.
    """
    ref_tags = [Multiple, Darkspace]
    qry_tags = [Darkspace]
    if n != 1:
        qry_tags.append(Subsample.format(k, n))

    ref_tags, qry_tags = add_mask(ref_tags, qry_tags, mask=mask)

    lastz_cmd = Lastz_template.format(
        lastz_bin, bfasta_fn, ref_tags, afasta_fn, qry_tags
    )
    if extra:
        lastz_cmd += " " + extra.strip()

    lastz_cmd += " --format=general-:%s" % lastz_fields
    # Original author's note: the above conversion is no longer necessary after
    # LASTZ v1.02.40 (to which the author contributed a patch); the alternative
    # is to append " --format=BLASTN-" instead.

    proc = Popen(lastz_cmd)

    logging.debug("job <%d> started: %s", proc.pid, lastz_cmd)
    for row in proc.stdout:
        row = lastz_to_blast(row)
        lock.acquire()
        try:
            # Explicit write() works on both Python 2 and 3; `print >> fh, x`
            # is only a print statement on Python 2.
            out_fh.write("{0}\n".format(row))
            out_fh.flush()
        finally:
            # Always release, otherwise a failed write would block every other
            # worker sharing this lock.
            lock.release()

    # Reap the child once its output is exhausted.
    proc.wait()
    logging.debug("job <%d> finished", proc.pid)
Example #6
0
def count(args):
    """
    %prog count fastafile jf.db

    Run dump - jellyfish - bin - bincount in serial.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments.
    #
    # Steps performed here:
    #   1. every K-mer of every FASTA record is piped into `jellyfish query`;
    #   2. the second column of its output is captured in a temporary file;
    #   3. each captured value becomes one bit, serialized to
    #      `<fastafile>.<jfdb>.bin`;
    #   4. bincount() is invoked on that file, writing `<fastafile>.<jfdb>.cnt`.
    from bitarray import bitarray

    p = OptionParser(count.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, jfdb = args
    K = get_K(jfdb)
    cmd = "jellyfish query {0} -C | cut -d' ' -f 2".format(jfdb)
    t = must_open("tmp", "w")
    proc = Popen(cmd, stdin=PIPE, stdout=t)

    f = Fasta(fastafile, lazy=True)
    for _, rec in f.iteritems_ordered():
        # Explicit write() works on both Python 2 and 3; `print >> fh, x` is
        # only a print statement on Python 2.
        proc.stdin.write("\n".join(make_kmers(rec.seq, K)) + "\n")
    proc.stdin.close()
    logging.debug(cmd)
    proc.wait()
    # The child wrote straight to the file descriptor, so nothing is buffered
    # on our side; release the write handle before reading the results back.
    # NOTE(review): assumes the temp file survives close() -- it is removed
    # explicitly below, which suggests it is not auto-deleted; confirm against
    # must_open().
    t.close()

    a = bitarray()
    binfile = ".".join((fastafile, jfdb, "bin"))
    # Re-open by name for reading: the handle above was opened for writing.
    with open(t.name) as counts:
        for row in counts:
            # Any non-zero value sets the bit. Passing a real bool keeps this
            # working on bitarray releases that reject ints other than 0/1.
            a.append(int(row.strip()) != 0)
    # bitarray.tofile() emits raw bytes, hence binary mode.
    with open(binfile, "wb") as fw:
        a.tofile(fw)
    logging.debug("Serialize {0} bits to `{1}`.".format(len(a), binfile))
    sh("rm {0}".format(t.name))

    logging.debug(
        "Shared K-mers (K={0}) between `{1}` and `{2}` written to `{3}`.".format(
            K, fastafile, jfdb, binfile
        )
    )
    cntfile = ".".join((fastafile, jfdb, "cnt"))
    bincount([fastafile, binfile, "-o", cntfile, "-K {0}".format(K)])
    logging.debug("Shared K-mer counts written to `{0}`.".format(cntfile))
Example #7
0
def blastplus(out_fh, cmd, query, lock):
    """
    Run `cmd` with ` -query <query>` appended and copy its output to `out_fh`.

    Lines whose first character is `#` are skipped. Each remaining line is
    written and flushed while holding `lock`.
    """
    cmd += " -query {0}".format(query)
    proc = Popen(cmd)

    logging.debug("job <%d> started: %s", proc.pid, cmd)
    for row in proc.stdout:
        if row[0] == '#':
            continue
        lock.acquire()
        try:
            out_fh.write(row)
            out_fh.flush()
        finally:
            # Always release, otherwise a failed write would block every other
            # worker sharing this lock.
            lock.release()

    # Reap the child once its output is exhausted.
    proc.wait()
    logging.debug("job <%d> finished", proc.pid)
Example #8
0
def last(k, n, out_fh, cmd, query, lock):
    """
    Feed a slice of `query` to `cmd` on stdin and copy its output to `out_fh`.

    Starting at the (k-1)-th FASTA record of `query`, every n-th record is
    written to the child's standard input. Output lines whose first character
    is `#` are skipped; each remaining line is written and flushed while
    holding `lock`.
    """
    proc = Popen(cmd, stdin=PIPE)
    logging.debug("job <%d> started: %s", proc.pid, cmd)

    # NOTE(review): all selected records are written before any output is
    # read. If the child emits a lot of output before it has consumed its
    # input, both sides could block on full pipes -- confirm this cannot
    # happen for the commands used here.
    parser = SeqIO.parse(query, "fasta")
    for rec in islice(parser, k - 1, None, n):
        SeqIO.write([rec], proc.stdin, "fasta")
    proc.stdin.close()

    for row in proc.stdout:
        if row[0] == '#':
            continue
        lock.acquire()
        try:
            out_fh.write(row)
            out_fh.flush()
        finally:
            # Always release, otherwise a failed write would block every other
            # worker sharing this lock.
            lock.release()

    # Reap the child once its output is exhausted.
    proc.wait()
    logging.debug("job <%d> finished", proc.pid)
Example #9
0
 def run(self):
     """Launch the command given by ``str(self)`` and return ``communicate()``'s result."""
     # The string form of this object is used as the command line.
     command = str(self)
     return Popen(command).communicate()
Example #10
0
File: ks.py Project: ascendo/jcvi
 def run(self):
     """Launch the command given by ``str(self)`` and return ``communicate()``'s result."""
     # The string form of this object is used as the command line.
     command = str(self)
     return Popen(command).communicate()