def count(args):
    """
    %prog count fastafile jf.db

    Run dump - jellyfish - bin - bincount in serial.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is intentionally kept short; implementation notes live in comments.
    from bitarray import bitarray

    p = OptionParser(count.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, jfdb = args
    K = get_K(jfdb)
    cmd = "jellyfish query {0} -C | cut -d' ' -f 2".format(jfdb)

    # The query output (second column only) is collected in a temp file.
    t = must_open("tmp", "w")
    proc = Popen(cmd, stdin=PIPE, stdout=t)
    t.flush()

    # Feed every K-mer of every record, one per line, to the query process.
    f = Fasta(fastafile, lazy=True)
    for _, rec in f.iteritems_ordered():
        print("\n".join(make_kmers(rec.seq, K)), file=proc.stdin)
    proc.stdin.close()
    logging.debug(cmd)
    proc.wait()

    # Re-open the temp file by name for reading: the handle above was
    # requested with mode "w", and a write-only file cannot be iterated.
    t.close()
    a = bitarray()
    with open(t.name) as counts:
        for row in counts:
            a.append(int(row.strip()))

    # bitarray.tofile() emits raw bytes, so the target must be a binary file;
    # the with-block guarantees the handle is closed even if the dump fails.
    binfile = ".".join((fastafile, jfdb, "bin"))
    with open(binfile, "wb") as fw:
        a.tofile(fw)
        logging.debug("Serialize {0} bits to `{1}`.".format(len(a), binfile))

    sh("rm {0}".format(t.name))

    logging.debug(
        "Shared K-mers (K={0}) between `{1}` and `{2}` written to `{3}`.".format(
            K, fastafile, jfdb, binfile
        )
    )
    cntfile = ".".join((fastafile, jfdb, "cnt"))
    bincount([fastafile, binfile, "-o", cntfile, "-K {0}".format(K)])
    logging.debug("Shared K-mer counts written to `{0}`.".format(cntfile))
def append(args):
    """
    %prog append bamfile

    Append /1 or /2 to read names. Useful for using the Tophat2 bam file for
    training AUGUSTUS gene models.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    p = OptionParser(append.__doc__)
    p.add_option("--prepend", help="Prepend string to read names")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (bamfile,) = args
    prepend = opts.prepend

    # Stream the input BAM as SAM text (header included) ...
    icmd = "samtools view -h {0}".format(bamfile)
    # ... and pipe the rewritten records into a second samtools that
    # re-encodes them as `<input stem>.append.bam`.
    bamfile = bamfile.rsplit(".", 1)[0] + ".append.bam"
    ocmd = "samtools view -b -@ 64 - -o {0}".format(bamfile)
    writer = Popen(ocmd, stdin=PIPE)

    for row in popen(icmd):
        if row[0] == "@":
            # Header lines pass through untouched.
            print(row.strip(), file=writer.stdin)
            continue

        s = SamLine(row)
        if prepend:
            s.qname = prepend + "_" + s.qname
        else:
            s.update_readname()
        print(s, file=writer.stdin)

    # Close the writer's stdin and wait for it to exit, so the output BAM is
    # complete by the time this function returns.
    writer.communicate()
def append(args):
    """
    %prog append bamfile

    Append /1 or /2 to read names. Useful for using the Tophat2 bam file for
    training AUGUSTUS gene models.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    p = OptionParser(append.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (bamfile,) = args

    # Stream the input BAM as SAM text (header included) ...
    icmd = "samtools view -h {0}".format(bamfile)
    # ... and pipe the rewritten records into a second samtools that
    # re-encodes them as `<input stem>.append.bam`.
    bamfile = bamfile.rsplit(".", 1)[0] + ".append.bam"
    ocmd = "samtools view -b -@ 64 - -o {0}".format(bamfile)
    writer = Popen(ocmd, stdin=PIPE)

    for row in popen(icmd):
        if row[0] == "@":
            # Header lines pass through untouched.
            print(row.strip(), file=writer.stdin)
        else:
            s = SamLine(row)
            s.update_readname()
            print(s, file=writer.stdin)

    # Close the writer's stdin and wait for it to exit, so the output BAM is
    # complete by the time this function returns.
    writer.communicate()
def lastz_2bit(t):
    """
    Used for formats other than BLAST, i.e. lav, maf, etc. which requires the
    database file to contain a single FASTA record.

    `t` is a 7-tuple, packed so the job can be passed as a single argument:
    (bfasta_fn, afasta_fn, outfile, lastz_bin, extra, mask, format).
    The LASTZ output is copied verbatim into `outfile`.
    """
    # `fmt` rather than `format` to avoid shadowing the builtin.
    bfasta_fn, afasta_fn, outfile, lastz_bin, extra, mask, fmt = t

    ref_tags = [Darkspace]
    qry_tags = [Darkspace]
    ref_tags, qry_tags = add_mask(ref_tags, qry_tags, mask=mask)

    lastz_cmd = Lastz_template.format(
        lastz_bin, bfasta_fn, ref_tags, afasta_fn, qry_tags
    )
    if extra:
        lastz_cmd += " " + extra.strip()
    lastz_cmd += " --format={0}".format(fmt)

    proc = Popen(lastz_cmd)
    # The with-block flushes and closes the output file even when copying
    # fails part-way; previously the handle was never closed.
    with open(outfile, "w") as out_fh:
        logging.debug("job <%d> started: %s" % (proc.pid, lastz_cmd))
        for row in proc.stdout:
            out_fh.write(row)
    logging.debug("job <%d> finished" % proc.pid)
def lastz(k, n, bfasta_fn, afasta_fn, out_fh, lock, lastz_bin, extra, mask=False):
    """
    Run one LASTZ job and append its converted output to a shared handle.

    k, n       -- when n != 1 a `Subsample` tag formatted with (k, n) is added
                  to the query tags
    bfasta_fn  -- reference FASTA, placed first in the command template
    afasta_fn  -- query FASTA
    out_fh     -- output handle shared between jobs; writes are guarded by lock
    lock       -- object with acquire()/release() serialising access to out_fh
    lastz_bin  -- LASTZ executable
    extra      -- extra command-line text appended verbatim (may be empty)
    mask       -- forwarded to add_mask()
    """
    ref_tags = [Multiple, Darkspace]
    qry_tags = [Darkspace]
    if n != 1:
        qry_tags.append(Subsample.format(k, n))
    ref_tags, qry_tags = add_mask(ref_tags, qry_tags, mask=mask)

    lastz_cmd = Lastz_template.format(
        lastz_bin, bfasta_fn, ref_tags, afasta_fn, qry_tags
    )
    if extra:
        lastz_cmd += " " + extra.strip()

    lastz_cmd += " --format=general-:%s" % lastz_fields
    # The above conversion is no longer necessary after LASTZ v1.02.40
    # (of which I contributed a patch)
    # lastz_cmd += " --format=BLASTN-"

    proc = Popen(lastz_cmd)
    logging.debug("job <%d> started: %s" % (proc.pid, lastz_cmd))
    for row in proc.stdout:
        row = lastz_to_blast(row)
        lock.acquire()
        try:
            # print() as a function, matching the rest of the file; the old
            # `print >> fh` statement form fails where print is a function.
            print(row, file=out_fh)
            out_fh.flush()
        finally:
            # Always hand the lock back, otherwise a failed write would
            # block every other job sharing out_fh.
            lock.release()
    logging.debug("job <%d> finished" % proc.pid)
def count(args):
    """
    %prog count fastafile jf.db

    Run dump - jellyfish - bin - bincount in serial.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is intentionally kept short; implementation notes live in comments.
    from bitarray import bitarray

    p = OptionParser(count.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, jfdb = args
    K = get_K(jfdb)
    cmd = "jellyfish query {0} -C | cut -d' ' -f 2".format(jfdb)

    # The query output (second column only) is collected in a temp file.
    t = must_open("tmp", "w")
    proc = Popen(cmd, stdin=PIPE, stdout=t)
    t.flush()

    # Feed every K-mer of every record, one per line, to the query process.
    # print() is used as a function, matching the rest of the file; the old
    # `print >> fh` statement form fails where print is a function.
    f = Fasta(fastafile, lazy=True)
    for _, rec in f.iteritems_ordered():
        print("\n".join(make_kmers(rec.seq, K)), file=proc.stdin)
    proc.stdin.close()
    logging.debug(cmd)
    proc.wait()

    # Re-open the temp file by name for reading: the handle above was
    # requested with mode "w", and a write-only file cannot be iterated.
    t.close()
    a = bitarray()
    with open(t.name) as counts:
        for row in counts:
            a.append(int(row.strip()))

    # bitarray.tofile() emits raw bytes, so the target must be a binary file;
    # the with-block guarantees the handle is closed even if the dump fails.
    binfile = ".".join((fastafile, jfdb, "bin"))
    with open(binfile, "wb") as fw:
        a.tofile(fw)
        logging.debug("Serialize {0} bits to `{1}`.".format(len(a), binfile))

    sh("rm {0}".format(t.name))

    logging.debug(
        "Shared K-mers (K={0}) between `{1}` and `{2}` written to `{3}`.".format(
            K, fastafile, jfdb, binfile
        )
    )
    cntfile = ".".join((fastafile, jfdb, "cnt"))
    bincount([fastafile, binfile, "-o", cntfile, "-K {0}".format(K)])
    logging.debug("Shared K-mer counts written to `{0}`.".format(cntfile))
def blastplus(out_fh, cmd, query, lock):
    """
    Run `cmd` with ` -query <query>` appended and copy its output to `out_fh`.

    Rows whose first character is '#' are skipped. Every write to `out_fh` is
    done while holding `lock`, since the handle is shared with other jobs.
    """
    cmd += " -query {0}".format(query)
    proc = Popen(cmd)

    logging.debug("job <%d> started: %s" % (proc.pid, cmd))
    for row in proc.stdout:
        if row[0] == "#":
            continue
        lock.acquire()
        try:
            out_fh.write(row)
            out_fh.flush()
        finally:
            # Release even if the write fails, so sibling jobs are not
            # left waiting on the lock forever.
            lock.release()
    logging.debug("job <%d> finished" % proc.pid)
def last(k, n, out_fh, cmd, query, lock):
    """
    Run `cmd` on a slice of the FASTA file `query` and copy output to `out_fh`.

    Records k, k+n, k+2n, ... (k is 1-based) are written to the process'
    stdin in FASTA format. Output rows whose first character is '#' are
    skipped; the rest are written to `out_fh` while holding `lock`, since the
    handle is shared with other jobs.
    """
    proc = Popen(cmd, stdin=PIPE)

    # Every n-th record, starting at index k - 1, is fed to this job.
    parser = SeqIO.parse(query, "fasta")
    for rec in islice(parser, k - 1, None, n):
        SeqIO.write([rec], proc.stdin, "fasta")
    # Closing stdin signals end-of-input to the child.
    proc.stdin.close()

    logging.debug("job <%d> started: %s" % (proc.pid, cmd))
    for row in proc.stdout:
        if row[0] == "#":
            continue
        lock.acquire()
        try:
            out_fh.write(row)
            out_fh.flush()
        finally:
            # Release even if the write fails, so sibling jobs are not
            # left waiting on the lock forever.
            lock.release()
    logging.debug("job <%d> finished" % proc.pid)
def run(self):
    """Launch the command given by ``str(self)`` and return what ``communicate()`` yields."""
    return Popen(str(self)).communicate()