def must_open(filename, mode="r", checkexists=False, skipcheck=False):
    """
    Accepts filename and returns filehandle.

    Checks on multiple files, stdin/stdout/stderr, .gz or .bz2 file.

    * A list of names is read through `fileinput.input` (read modes only).
    * "-"/"stdin", "stdout" and "stderr" map to the matching `sys` stream.
    * "tmp" with mode "w" returns a named temporary file that is NOT deleted
      on close; the caller is responsible for removing it.
    * `.gz`/`.bz2` names are read through `zcat`/`bzcat` and written through
      the `gzip`/`bz2` modules.
    * With `checkexists`, the file is only opened when it does not exist yet
      (`skipcheck`) or when `check_exists` allows it; otherwise None is
      returned.
    """
    if isinstance(filename, list):
        assert "r" in mode
        import fileinput

        return fileinput.input(filename)

    if filename in ("-", "stdin"):
        assert "r" in mode
        fp = sys.stdin

    elif filename == "stdout":
        assert "w" in mode
        fp = sys.stdout

    elif filename == "stderr":
        assert "w" in mode
        fp = sys.stderr

    elif filename == "tmp" and mode == "w":
        from tempfile import NamedTemporaryFile

        # Pass the text mode through: the default ("w+b") rejects str writes
        # on Python 3.
        fp = NamedTemporaryFile(mode=mode, delete=False)

    elif filename.endswith(".gz"):
        if "r" in mode:
            cmd = "zcat {0}".format(filename)
            fp = popen(cmd, debug=False)
        else:
            # Any non-read mode ("w", "a", ...) goes to gzip, so `fp` is
            # always bound when we reach the final return.
            import gzip

            fp = gzip.open(filename, mode)

    elif filename.endswith(".bz2"):
        if "r" in mode:
            cmd = "bzcat {0}".format(filename)
            fp = popen(cmd, debug=False)
        else:
            import bz2

            fp = bz2.BZ2File(filename, mode)

    else:
        if checkexists:
            assert mode == "w"
            overwrite = (not op.exists(filename)) if skipcheck \
                else check_exists(filename)
            if overwrite:
                fp = open(filename, "w")
            else:
                logging.debug("File `{0}` already exists. Skipped."
                              .format(filename))
                return None
        else:
            fp = open(filename, mode)

    return fp
def ls_s3(s3_store_obj_name):
    """
    List the entries found directly under an S3 location.

    The name is normalized with `s3ify`, `aws s3 ls <location>/` is run
    through `popen`, and the last whitespace-separated token of every output
    row is returned as a list.
    """
    location = s3ify(s3_store_obj_name)
    listing = popen("aws s3 ls {0}/".format(location))
    return [line.split()[-1] for line in listing]
def append(args):
    """
    %prog append bamfile

    Append /1 or /2 to read names. Useful for using the Tophat2 bam file for
    training AUGUSTUS gene models.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(append.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bamfile, = args

    # Stream the input BAM as SAM text, rewrite read names, and pipe the
    # result into a second samtools that writes `<prefix>.append.bam`.
    icmd = "samtools view -h {0}".format(bamfile)
    bamfile = bamfile.rsplit(".", 1)[0] + ".append.bam"
    ocmd = "samtools view -b -@ 64 - -o {0}".format(bamfile)
    p = Popen(ocmd, stdin=PIPE)
    try:
        for row in popen(icmd):
            if row[0] == '@':
                # Header lines are passed through untouched
                print(row.strip(), file=p.stdin)
            else:
                s = SamLine(row)
                s.update_readname()
                print(s, file=p.stdin)
    finally:
        # Close the pipe (EOF for samtools) and wait for it to exit, so the
        # output BAM is complete by the time this function returns.
        p.communicate()
def get_mixture(data, components):
    """
    Fit a mixture to log-transformed `data` with the external `gmm-bic` tool.

    Values greater than .05 are log-transformed and written, one per line, to
    a temporary file which is handed to `gmm-bic` along with `components` and
    the number of values. Output rows starting with '#' are split on commas;
    fields 1, 2 and 3 are collected as mu, sigma and prob respectively.

    Returns a tuple of three lists `(probs, mus, sigmas)`, e.g.

    probs = [.476, .509]
    mus = [.69069, -.15038]
    variances = [.468982e-1, .959052e-1]
    """
    from jcvi.apps.base import popen

    probs, mus, sigmas = [], [], []
    fw = must_open("tmp", "w")
    try:
        log_data = [log(x) for x in data if x > .05]
        data = "\n".join(["%.4f" % x for x in log_data]).replace("inf\n", "")
        fw.write(data)
        fw.close()

        cmd = "gmm-bic {0} {1} {2}".format(components, len(log_data), fw.name)
        pipe = popen(cmd)

        for row in pipe:
            # Only rows flagged with '#' carry the fitted parameters
            if row[0] != '#':
                continue

            atoms = row.split(",")
            mu, sigma, prob = (float(x) for x in atoms[1:4])
            mus.append(mu)
            sigmas.append(sigma)
            probs.append(prob)
    finally:
        # Always drop the temporary file, even when gmm-bic or parsing fails
        fw.close()
        os.remove(fw.name)

    return probs, mus, sigmas
def top10(args):
    """
    %prog top10 blastfile.best

    Count the most frequent 10 hits. Usually the BLASTFILE needs to be screened
    the get the best match. You can also provide an .ids file to query the ids.
    For example the ids file can contain the seqid to species mapping.

    The ids file is two-column, and can sometimes be generated by
    `jcvi.formats.fasta ids --description`.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.formats.base import DictFile

    p = OptionParser(top10.__doc__)
    p.add_option("--top", default=10, type="int",
                 help="Top N taxa to extract [default: %default]")
    p.add_option("--ids", default=None,
                 help="Two column ids file to query seqid [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args
    # Optional seqid => label lookup; falls back to the seqid itself
    mapping = DictFile(opts.ids, delimiter="\t") if opts.ids else {}

    # Tally column 2 with coreutils and keep the `--top` most frequent values
    cmd = "cut -f2 {0}".format(blastfile)
    cmd += " | sort | uniq -c | sort -k1,1nr | head -n {0}".format(opts.top)
    fp = popen(cmd)
    for row in fp:
        count, seqid = row.split()
        nseqid = mapping.get(seqid, seqid)
        print("\t".join((count, nseqid)))
def locate(args):
    """
    Scan `args.seq` for the motifs listed in `args.fi` and write the merged,
    windowed hit intervals to `args.fo`.

    Intermediate files are written to the current directory under a
    `tmp.lc<N>_*` prefix and removed at the end unless `args.debug` is set.
    Relies on external tools invoked through `sh`/`popen`: fimo, grep, bioawk,
    sortBed, mergeBed, bedtools and bed.py.
    """
    fi, fo = args.fi, args.fo
    seq = args.seq
    # Each entry unpacks as (motif id, width, score)
    mtfs = read_motif(fi, args.motif)
    # One `--motif <id>` flag per requested motif
    mtf_str = " ".join([f'--motif {mid}' for mid, wd, score in mtfs])
    # Prefix for scratch files; N is drawn from range(1000)
    pre = f"tmp.lc{random.randrange(1000)}"
    # Single fimo run covering all motifs; hits land in <pre>_0.txt
    sh(f'fimo --bfile --motif-- {mtf_str} --thresh 1e-4 --skip-matched-sequence --text {fi} {seq} > {pre}_0.txt' )
    for mid, wd, score in mtfs:
        # Keep only the rows whose first column is this motif id
        sh(f'grep -P "^{mid}\t" {pre}_0.txt > {pre}_0a.txt')
        # Use the motif's own score as threshold when it is set ...
        score_thresh = score
        if not score:
            # ... otherwise take a fraction of the top column-7 value
            xh = popen( f'cut -f7 {pre}_0a.txt | sed \'1d\' | sort -k1,1nr | head')
            max_score = float(xh.readline().decode("utf-8").strip())
            score_thresh = max_score * args.score_thresh
        # Rows with column 7 above the threshold become BED intervals
        sh("bioawk -tH '{if($7>%f) {print $1\"%%\"$3, $4-1, $5}}' %s_0a.txt > %s_1.bed" % (score_thresh, pre, pre))
        # Minimum window size kept by the final filter
        hwd = round(wd * args.motif_frac)
        if os.stat(f"{pre}_1.bed").st_size == 0:
            # No hit passed the threshold: emit an empty per-motif file
            sh(f'touch {pre}_4_{mid}.bed')
        else:
            sh(f'sortBed -i {pre}_1.bed | mergeBed > {pre}_2.bed')
            sh(f'bedtools makewindows -w {wd} -b {pre}_2.bed > {pre}_3.bed')
            sh(f'bed.py filter --minsize {hwd} {pre}_3.bed > {pre}_4_{mid}.bed' )
    # Concatenate the per-motif results into the requested output
    sh(f'cat {pre}_4_*.bed > {fo}')
    if not args.debug:
        sh(f'rm -rf {pre}_*')
def top10(args):
    """
    %prog top10 blastfile.best

    Count the most frequent 10 hits. Usually the BLASTFILE needs to be screened
    the get the best match. You can also provide an .ids file to query the ids.
    For example the ids file can contain the seqid to species mapping.

    The ids file is two-column, and can sometimes be generated by
    `jcvi.formats.fasta ids --description`.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.formats.base import DictFile

    p = OptionParser(top10.__doc__)
    p.add_option("--ids", default=None,
                 help="Two column ids file to query seqid [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args
    # Optional seqid => label lookup; falls back to the seqid itself
    mapping = DictFile(opts.ids, delimiter="\t") if opts.ids else {}

    # Tally column 2 with coreutils; `head` keeps its default number of rows
    cmd = "cut -f2 {0}".format(blastfile)
    cmd += " | sort | uniq -c | sort -k1,1nr | head"
    fp = popen(cmd)
    for row in fp:
        count, seqid = row.split()
        nseqid = mapping.get(seqid, seqid)
        print("\t".join((count, nseqid)))
def append(args):
    """
    %prog append bamfile

    Append /1 or /2 to read names. Useful for using the Tophat2 bam file for
    training AUGUSTUS gene models.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(append.__doc__)
    p.add_option("--prepend", help="Prepend string to read names")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (bamfile,) = args
    prepend = opts.prepend

    # Stream the input BAM as SAM text, rewrite read names, and pipe the
    # result into a second samtools that writes `<prefix>.append.bam`.
    icmd = "samtools view -h {0}".format(bamfile)
    bamfile = bamfile.rsplit(".", 1)[0] + ".append.bam"
    ocmd = "samtools view -b -@ 64 - -o {0}".format(bamfile)
    p = Popen(ocmd, stdin=PIPE)
    try:
        for row in popen(icmd):
            if row[0] == "@":
                # Header lines are passed through untouched
                print(row.strip(), file=p.stdin)
            else:
                s = SamLine(row)
                if prepend:
                    s.qname = prepend + "_" + s.qname
                else:
                    s.update_readname()
                print(s, file=p.stdin)
    finally:
        # Close the pipe (EOF for samtools) and wait for it to exit, so the
        # output BAM is complete by the time this function returns.
        p.communicate()
def validate(args):
    """
    %prog validate outdir genome.fasta

    Validate current folder after MAKER run and check for failures. Failed batch
    will be written to a directory for additional work.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.utils.counter import Counter

    p = OptionParser(validate.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    outdir, genome = args
    counter = Counter()

    # One datastore index log is expected per batch name
    fsnames, suffix = get_fsnames(outdir)
    dsfile = "{0}{1}/{0}.maker.output/{0}_master_datastore_index.log"
    dslogs = [dsfile.format(x, suffix) for x in fsnames]
    all_failed = []
    for f, d in zip(fsnames, dslogs):
        dslog = DatastoreIndexFile(d)
        counter.update(dslog.scaffold_status.values())
        all_failed.extend([(f, x) for x in dslog.failed])

    # Every batch must have reported "now finished" in its maker.*.out
    cmd = 'tail maker.*.out | grep -c "now finished"'
    n = int(popen(cmd).read())
    assert len(fsnames) == n
    print("ALL jobs have been finished", file=sys.stderr)

    nfailed = len(all_failed)
    if nfailed == 0:
        print("ALL scaffolds are completed with no errors", file=sys.stderr)
        return

    print("Scaffold status:", file=sys.stderr)
    print(counter, file=sys.stderr)
    failed = "FAILED"
    with open(failed, "w") as fw:
        print("\n".join(["\t".join((f, x)) for f, x in all_failed]), file=fw)

    # Sanity check: one line per failed (batch, scaffold) pair
    with open(failed) as fp:
        nlines = sum(1 for _ in fp)
    assert nlines == nfailed
    print("FAILED !! {0} instances.".format(nfailed), file=sys.stderr)

    # Rebuild the failed batch
    failed_ids = failed + ".ids"
    failed_fasta = failed + ".fasta"
    cmd = "cut -f2 {0}".format(failed)
    sh(cmd, outfile=failed_ids)
    if need_update((genome, failed_ids), failed_fasta):
        cmd = "faSomeRecords {0} {1} {2}".\
            format(genome, failed_ids, failed_fasta)
        sh(cmd)
def ls_s3(s3_store_obj_name, recursive=False):
    """
    List the entries under an S3 location.

    Runs `aws s3 ls <location>/` (adding ` --recursive` when `recursive` is
    true) and returns the last whitespace-separated token of each output row.
    """
    location = s3ify(s3_store_obj_name)
    flag = " --recursive" if recursive else ""
    listing = popen("aws s3 ls {0}/".format(location) + flag)
    return [line.split()[-1] for line in listing]
def compare_worker(arg):
    """
    Compare one set of calls against a truth set and return a summary row.

    `arg` is a `(cnvoutput, truths)` pair of file names. The number of
    intersections reported by `intersectBed -f .5 -F .5` is divided by the
    line count of each file to give precision and recall in percent.

    Returns a tab-joined string of: cnvoutput, truths, number of
    intersections, lines in cnvoutput, lines in truths, precision, recall.
    """
    cnvoutput, truths = arg
    cmd = "intersectBed -f .5 -F .5"
    cmd += " -a {} -b {} | wc -l".format(cnvoutput, truths)
    nlines = int(popen(cmd, debug=False).read())

    # Count lines without leaving the file handles open
    with open(cnvoutput) as fp:
        target_lines = sum(1 for _ in fp)
    with open(truths) as fp:
        truths_lines = sum(1 for _ in fp)

    # An empty file would otherwise raise ZeroDivisionError; report 0 instead
    precision = nlines * 100. / target_lines if target_lines else 0.
    recall = nlines * 100. / truths_lines if truths_lines else 0.
    d = "\t".join(str(x) for x in (cnvoutput, truths,
                                   nlines, target_lines, truths_lines,
                                   precision, recall))
    return d
def consolidate(nbedfile, obedfile, cbedfile):
    """
    Merge overlapping features of two BED files into `cbedfile`.

    Same-strand overlaps between `nbedfile` and `obedfile` are pooled and
    merged; features of `nbedfile` that overlap nothing are kept as they are.
    Merged accessions joined by ";" are de-duplicated before writing, and the
    result is finally sorted in place.
    """
    from pybedtools import BedTool

    new_bt = BedTool(nbedfile)
    old_bt = BedTool(obedfile)

    # Features of either file that overlap the other one on the same strand
    new_hits = new_bt.intersect(old_bt, s=True, u=True)
    old_hits = old_bt.intersect(new_bt, s=True, u=True)

    pooled = popen(
        "cat {0} {1} | sort -k1,1 -k2,2n".format(new_hits.fn, old_hits.fn)
    )
    ovl = BedTool(pooled.readlines())

    merged = ovl.merge(s=True, nms=True, scores="mean").sort()
    reread = popen("cat {0}".format(merged.fn), debug=False)
    ovl = BedTool(reread.readlines())

    # Features of the new file that overlap none of the merged ones
    notovl = new_bt.intersect(ovl.sort(), s=True, v=True)

    sources = "{0} {1}".format(notovl.fn, ovl.fn)
    tmpfile = "/tmp/reformat.{0}.bed".format(os.getpid())
    sh("sort -k1,1 -k2,2n", infile=sources, outfile=tmpfile)

    fp = open(cbedfile, "w")
    for b in Bed(tmpfile):
        if ";" in b.accn:
            # Collapse repeated accessions produced by the merge
            b.accn = ";".join(set(b.accn.split(";")))
        print(b, file=fp)
    fp.close()
    os.remove(tmpfile)

    sort([cbedfile, "-i"])
def get_splits(split_bed, gff_file, stype, key):
    """
    Use intersectBed to find the fused gene => split genes mappings.

    Returns a `defaultdict(set)` keyed by column 4 of the `intersectBed -wao`
    output, each holding the set of matching column-10 values.
    """
    bed_file = get_bed_file(gff_file, stype, key)
    cmd = "intersectBed -a {0} -b {1} -wao".format(split_bed, bed_file) \
        + " | cut -f4,10"

    splits = defaultdict(set)
    for line in popen(cmd):
        fused, part = line.split()
        splits[fused].add(part)

    return splits
def consolidate(nbedfile, obedfile, cbedfile):
    """
    Merge overlapping features of two BED files into `cbedfile`.

    Same-strand overlaps between `nbedfile` and `obedfile` are pooled and
    merged; features of `nbedfile` that overlap nothing are kept as they are.
    Merged accessions joined by ";" are de-duplicated before writing, and the
    result is finally sorted in place.
    """
    from pybedtools import BedTool

    nbedtool = BedTool(nbedfile)
    obedtool = BedTool(obedfile)

    # Features of either file that overlap the other one on the same strand
    ab = nbedtool.intersect(obedtool, s=True, u=True)
    ba = obedtool.intersect(nbedtool, s=True, u=True)

    cmd = "cat {0} {1} | sort -k1,1 -k2,2n".format(ab.fn, ba.fn)
    fp = popen(cmd)
    ovl = BedTool(fp.readlines())

    abmerge = ovl.merge(s=True, nms=True, scores="mean").sort()
    cmd = "cat {0}".format(abmerge.fn)
    fp = popen(cmd, debug=False)
    ovl = BedTool(fp.readlines())

    # Features of the new file that overlap none of the merged ones
    notovl = nbedtool.intersect(ovl.sort(), s=True, v=True)

    infile = "{0} {1}".format(notovl.fn, ovl.fn)
    tmpfile = "/tmp/reformat.{0}.bed".format(os.getpid())
    cmd = "sort -k1,1 -k2,2n"
    sh(cmd, infile=infile, outfile=tmpfile)

    # `with` closes the output even if a record fails to format
    with open(cbedfile, "w") as fw:
        for b in Bed(tmpfile):
            if ";" in b.accn:
                # Collapse repeated accessions produced by the merge
                b.accn = ";".join(set(b.accn.split(";")))
            print(b, file=fw)
    os.remove(tmpfile)

    sort([cbedfile, "-i"])
def start(self, path=sge):
    """
    Submit this job through `qsub` and record the resulting job id.

    Does nothing when the job is flagged `is_defunct`. When `path` is set the
    submission happens from that directory; the previous working directory is
    restored afterwards, including when submission fails. `self.jobid` is set
    to the id parsed from the qsub output with `self.pat`, or to "-1" when
    qsub printed nothing.
    """
    if self.is_defunct:
        return

    cwd = os.getcwd()
    if path:
        os.chdir(path)

    try:
        # Shell commands
        if "|" in self.cmd or "&&" in self.cmd or "||" in self.cmd:
            # Pick the quote character that is not used by single quotes
            quote = "\"" if "'" in self.cmd else "'"
            self.cmd = "sh -c {1}{0}{1}".format(self.cmd, quote)

        # qsub command (the project code is specific to jcvi)
        qsub = "qsub -P {0} -cwd".format(PCODE)
        if self.queue != "default":
            qsub += " -l {0}".format(self.queue)
        if self.threaded:
            qsub += " -pe threaded {0}".format(self.threaded)
        if self.infile:
            qsub += " -i {0}".format(self.infile)
        if self.outfile:
            qsub += " -o {0}".format(self.outfile)
        if self.errfile:
            qsub += " -e {0}".format(self.errfile)

        cmd = " ".join((qsub, self.cmd))
        # run the command and get the job-ID (important)
        output = popen(cmd, debug=False).read()

        if output.strip() != "":
            self.jobid = re.search(self.pat, output).group("id")
        else:
            self.jobid = "-1"

        msg = "[{0}] {1}".format(self.jobid, self.cmd)
        if self.infile:
            msg += " < {0} ".format(self.infile)
        if self.outfile:
            backup(self.outfile)
            msg += " > {0} ".format(self.outfile)
        if self.errfile:
            backup(self.errfile)
            msg += " 2> {0} ".format(self.errfile)

        logging.debug(msg)
    finally:
        # Never leave the process in `path` if anything above raised
        os.chdir(cwd)
def ls_s3(s3_store_obj_name, recursive=False):
    """
    List the entries under an S3 location as full names.

    Each row of `aws s3 ls <location>/` contributes its last token, joined
    onto the location. With `recursive`, every entry ending in "/" is listed
    in turn and its contents are appended after the top-level entries.
    """
    s3_store_obj_name = s3ify(s3_store_obj_name)
    listing = popen("aws s3 ls {0}/".format(s3_store_obj_name))
    contents = [op.join(s3_store_obj_name, row.split()[-1]) for row in listing]

    if recursive:
        # Snapshot the folders first: `contents` grows while we descend
        folders = [entry for entry in contents if entry.endswith("/")]
        for folder in folders:
            contents += ls_s3(folder.rstrip("/"), recursive=True)

    return contents
def gaps(args):
    """
    %prog gaps idsfile fractionationfile gapsbed

    Check gene locations against gaps. `idsfile` contains a list of IDs to query
    into `fractionationfile` in order to get expected locations.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.formats.base import DictFile
    from jcvi.apps.base import popen
    from jcvi.utils.cbook import percentage

    p = OptionParser(gaps.__doc__)
    p.add_option("--bdist", default=0, type="int",
                 help="Base pair distance [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    idsfile, frfile, gapsbed = args
    bdist = opts.bdist
    d = DictFile(frfile, keypos=1, valuepos=2)
    bedfile = idsfile + ".bed"

    # Write one BED interval per id, padded by `bdist` on both sides
    total = 0
    with open(bedfile, "w") as fw, open(idsfile) as fp:
        for row in fp:
            gene_id = row.strip()
            hit = d[gene_id]
            tag, pos = get_tag(hit, None)
            seqid, start, end = pos
            start, end = max(start - bdist, 1), end + bdist
            print("\t".join(str(x) for x in (seqid, start - 1, end, gene_id)),
                  file=fw)
            total += 1

    # Intervals NOT touching any gap; the remainder are in gaps
    cmd = "intersectBed -a {0} -b {1} -v | wc -l".format(bedfile, gapsbed)
    not_in_gaps = popen(cmd).read()
    not_in_gaps = int(not_in_gaps)
    in_gaps = total - not_in_gaps
    print("Ids in gaps: {1}".format(total, percentage(in_gaps, total)),
          file=sys.stderr)
def removecontains(args):
    """
    %prog removecontains 4-unitigger/best.contains asm.gkpStore

    Remove contained reads from gkpStore. This will improve assembly contiguity
    without sacrificing accuracy, when using bogart unitigger.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(removecontains.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    contains, gkpStore = args

    # First column of every non-comment row is a contained read iid
    contained = set()
    for line in open(contains):
        if line[0] != "#":
            contained.add(int(line.split()[0]))

    dump_cmd = "gatekeeper -dumpfragments -lastfragiid {}".format(gkpStore)
    last_iid = int(popen(dump_cmd).read().strip().split()[-1])

    editfile = "delete.edit"
    # Contained iids that actually exist in the store, in ascending order
    flagged = [iid for iid in range(1, last_iid + 1) if iid in contained]
    fw = open(editfile, "w")
    for iid in flagged:
        print("frg iid {0} isdeleted 1".format(iid), file=fw)
    fw.close()
    ndeleted = len(flagged)

    assert len(contained) == ndeleted
    logging.debug(
        "A total of {0} contained reads flagged as deleted.".format(ndeleted))
    print("Now you can run:", file=sys.stderr)
    print("$ gatekeeper --edit {0} {1}".format(editfile, gkpStore),
          file=sys.stderr)
def removecontains(args):
    """
    %prog removecontains 4-unitigger/best.contains asm.gkpStore

    Remove contained reads from gkpStore. This will improve assembly contiguity
    without sacrificing accuracy, when using bogart unitigger.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(removecontains.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    contains, gkpStore = args

    # First column of every non-comment row is a contained read iid
    s = set()
    with open(contains) as fp:
        for row in fp:
            if row[0] == '#':
                continue
            iid = int(row.split()[0])
            s.add(iid)

    cmd = "gatekeeper -dumpfragments -lastfragiid {}".format(gkpStore)
    gkpmsg = popen(cmd).read()
    last_iid = int(gkpmsg.strip().split()[-1])

    ndeleted = 0
    editfile = "delete.edit"
    with open(editfile, "w") as fw:
        for iid in range(1, last_iid + 1):
            if iid in s:
                print("frg iid {0} isdeleted 1".format(iid), file=fw)
                ndeleted += 1

    # Every contained iid must fall within 1..last_iid
    assert len(s) == ndeleted
    logging.debug("A total of {0} contained reads flagged as deleted."
                  .format(ndeleted))
    print("Now you can run:", file=sys.stderr)
    print("$ gatekeeper --edit {0} {1}".format(editfile, gkpStore),
          file=sys.stderr)
def start(self):
    """
    Run the command produced by `self.build()` and record the job id.

    `self.jobid` is the "id" group matched by `self.pat` in the command
    output, or "-1" when the output is blank. Existing output/error files are
    passed to `backup`, and a summary line is logged at debug level.
    """
    # run the command and get the job-ID (important)
    submitted = popen(self.build(), debug=False).read()

    if submitted.strip() != "":
        self.jobid = re.search(self.pat, submitted).group("id")
    else:
        self.jobid = "-1"

    pieces = ["[{0}] {1}".format(self.jobid, self.cmd)]
    if self.infile:
        pieces.append(" < {0} ".format(self.infile))
    if self.outfile:
        backup(self.outfile)
        pieces.append(" > {0} ".format(self.outfile))
    if self.errfile:
        backup(self.errfile)
        pieces.append(" 2> {0} ".format(self.errfile))

    logging.debug("".join(pieces))
def intersectBed_wao(abedfile, bbedfile, minOverlap=0):
    """
    Run `intersectBed -wao` on two BED files and yield `(a, b)` feature pairs.

    Each output row is split into the columns of the A feature, the columns of
    the B feature and a trailing overlap value. Rows whose overlap is below
    `minOverlap` are skipped. `b` is None when the B columns cannot be parsed
    as a `BedLine` (an AssertionError is raised by the parser).
    """
    abed = Bed(abedfile)
    bbed = Bed(bbedfile)
    print("`{0}` has {1} features.".format(abedfile, len(abed)),
          file=sys.stderr)
    print("`{0}` has {1} features.".format(bbedfile, len(bbed)),
          file=sys.stderr)

    cmd = "intersectBed -wao -a {0} -b {1}".format(abedfile, bbedfile)
    # Column counts are taken from the first record of each file
    acols = abed[0].nargs
    bcols = bbed[0].nargs
    fp = popen(cmd)
    for row in fp:
        atoms = row.split()
        aline = "\t".join(atoms[:acols])
        bline = "\t".join(atoms[acols:acols + bcols])
        c = int(atoms[-1])
        if c < minOverlap:
            continue
        a = BedLine(aline)
        try:
            b = BedLine(bline)
        except AssertionError:
            b = None

        yield a, b
def must_open(filename, mode="r", checkexists=False, skipcheck=False, oappend=False):
    """
    Accepts filename and returns filehandle.

    Checks on multiple files, stdin/stdout/stderr, .gz or .bz2 file.

    * A list of names is read as one stream (read modes only): `.gz` lists
      through `fileinput` with a gzip text opener, `.bz2` lists through a
      single `bzcat` call, anything else through plain `fileinput`. The type
      of the first name decides how the whole list is handled.
    * `s3://` names are first fetched with `pull_from_s3`.
    * "-"/"stdin", "stdout" and "stderr" map to the matching `sys` stream.
    * "tmp" with mode "w" returns a named temporary file that is NOT deleted
      on close; the caller is responsible for removing it.
    * With `checkexists`, the file is only opened when it does not exist yet
      (`skipcheck`) or when `check_exists` allows it; `oappend` then selects
      append over overwrite. Otherwise None is returned.
    """
    if isinstance(filename, list):
        assert "r" in mode
        import fileinput

        if filename[0].endswith(".gz"):
            import gzip

            def _open_gz_text(name, _mode):
                # fileinput hands us its own mode; always decode to text
                return gzip.open(name, "rt")

            # gzip.open cannot take a space-joined list of names, so chain
            # the members through fileinput instead.
            return fileinput.input(filename, openhook=_open_gz_text)

        if filename[0].endswith(".bz2"):
            # bzcat accepts several files on one command line
            filename = " ".join(filename)
        else:
            return fileinput.input(filename)

    if filename.startswith("s3://"):
        from jcvi.utils.aws import pull_from_s3

        filename = pull_from_s3(filename)

    if filename in ("-", "stdin"):
        assert "r" in mode
        fp = sys.stdin

    elif filename == "stdout":
        assert "w" in mode
        fp = sys.stdout

    elif filename == "stderr":
        assert "w" in mode
        fp = sys.stderr

    elif filename == "tmp" and mode == "w":
        from tempfile import NamedTemporaryFile

        fp = NamedTemporaryFile(mode=mode, delete=False)

    elif filename.endswith(".gz"):
        import gzip

        if "r" in mode:
            # Default to text, but honor an explicit "b"/"t" from the caller
            # ("rb" + "t" would be rejected by gzip.open).
            explicit = "b" in mode or "t" in mode
            fp = gzip.open(filename, mode if explicit else mode + "t")
        else:
            # Any non-read mode ("w", "a", ...) so `fp` is always bound
            fp = gzip.open(filename, mode)

    elif filename.endswith(".bz2"):
        if "r" in mode:
            cmd = "bzcat {0}".format(filename)
            fp = popen(cmd, debug=False)
        else:
            import bz2

            fp = bz2.BZ2File(filename, mode)

    else:
        if checkexists:
            assert mode == "w"
            overwrite = ((not op.exists(filename)) if skipcheck else
                         check_exists(filename, oappend))
            if overwrite:
                if oappend:
                    fp = open(filename, "a")
                else:
                    fp = open(filename, "w")
            else:
                logging.debug(
                    "File `{0}` already exists. Skipped.".format(filename))
                return None
        else:
            fp = open(filename, mode)

    return fp
def overlap(args):
    """
    %prog overlap <a|a.fasta> <b|b.fasta>

    Check overlaps between two fasta records. The arguments can be genBank IDs
    instead of FASTA files. In case of IDs, the sequences will be downloaded
    first.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    # Returns the Overlap object for the best HSP, or None when nothing hits.
    from jcvi.apps.command import BLPATH
    from jcvi.formats.blast import chain_HSPs

    p = OptionParser(overlap.__doc__)
    p.add_option("--dir", default=os.getcwd(),
                 help="Download sequences to dir [default: %default]")
    p.add_option("--qreverse", default=False, action="store_true",
                 help="Reverse seq a [default: %default]")
    p.add_option("--nochain", default=False, action="store_true",
                 help="Do not chain adjacent HSPs [default: chain HSPs]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    afasta, bfasta = args
    seqdir = opts.dir
    chain = not opts.nochain

    # Check first whether it is file or accession name
    if not op.exists(afasta):
        af = op.join(seqdir, afasta + ".fasta")
        if not op.exists(af):  # Check to avoid redownload
            fetch([afasta, "--skipcheck", "--outdir=" + seqdir])
        afasta = af

    if not op.exists(bfasta):
        bf = op.join(seqdir, bfasta + ".fasta")
        if not op.exists(bf):
            fetch([bfasta, "--skipcheck", "--outdir=" + seqdir])
        bfasta = bf

    assert op.exists(afasta) and op.exists(bfasta)

    cmd = BLPATH("blastn")
    cmd += " -query {0} -subject {1}".format(afasta, bfasta)
    cmd += " -evalue 0.01 -outfmt 6 -perc_identity {0}".format(GoodPct)

    fp = popen(cmd)
    hsps = fp.readlines()

    hsps = [BlastLine(x) for x in hsps]
    hsps = [x for x in hsps if x.hitlen >= GoodOverlap]
    dist = 2 * GoodOverlap  # Distance to chain the HSPs
    if chain:
        logging.debug("Chain HSPs in the Blast output.")
        hsps = chain_HSPs(hsps, xdist=dist, ydist=dist)

    if len(hsps) == 0:
        print("No match found.", file=sys.stderr)
        return None

    besthsp = hsps[0]

    # Only the first record of each FASTA file is considered
    aid, asize = next(Fasta(afasta).itersizes())
    bid, bsize = next(Fasta(bfasta).itersizes())
    o = Overlap(besthsp, asize, bsize)
    o.print_graphic(qreverse=opts.qreverse)
    print(str(o), file=sys.stderr)

    return o
def check_exists_s3(s3_store_obj_name):
    """
    Tell whether `aws s3 ls` prints anything for the given S3 name.

    Returns True when the line count of `aws s3 ls <name>` is non-zero.
    """
    location = s3ify(s3_store_obj_name)
    nrows = int(popen("aws s3 ls {0} | wc -l".format(location)).read())
    return nrows != 0
def overlap(args):
    """
    %prog overlap <a|a.fasta> <b|b.fasta>

    Check overlaps between two fasta records. The arguments can be genBank IDs
    instead of FASTA files. In case of IDs, the sequences will be downloaded
    first.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    # Returns the Overlap object for the best HSP, or None when nothing hits.
    from jcvi.formats.blast import chain_HSPs

    p = OptionParser(overlap.__doc__)
    p.add_option("--dir", default=os.getcwd(),
                 help="Download sequences to dir [default: %default]")
    p.add_option("--suffix", default="fasta",
                 help="Suffix of the sequence file in dir [default: %default]")
    p.add_option("--qreverse", default=False, action="store_true",
                 help="Reverse seq a [default: %default]")
    p.add_option("--nochain", default=False, action="store_true",
                 help="Do not chain adjacent HSPs [default: chain HSPs]")
    p.set_align(pctid=GoodPct, hitlen=GoodOverlap, evalue=.01)
    p.set_outfile(outfile=None)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    afasta, bfasta = args
    seqdir = opts.dir
    chain = not opts.nochain
    suffix = opts.suffix
    evalue = opts.evalue
    pctid = opts.pctid
    hitlen = opts.hitlen
    cutoff = Cutoff(pctid, hitlen)

    # Check first whether it is file or accession name
    if not op.exists(afasta):
        af = op.join(seqdir, ".".join((afasta, suffix)))
        if not op.exists(af):  # Check to avoid redownload
            entrez([afasta, "--skipcheck", "--outdir=" + seqdir])
        afasta = af

    if not op.exists(bfasta):
        bf = op.join(seqdir, ".".join((bfasta, suffix)))
        if not op.exists(bf):
            entrez([bfasta, "--skipcheck", "--outdir=" + seqdir])
        bfasta = bf

    assert op.exists(afasta) and op.exists(bfasta)

    cmd = "blastn -dust no"
    cmd += " -query {0} -subject {1}".format(afasta, bfasta)
    cmd += " -evalue {0} -outfmt 6 -perc_identity {1}".format(evalue, pctid)

    fp = popen(cmd)
    hsps = fp.readlines()

    hsps = [BlastLine(x) for x in hsps]
    hsps = [x for x in hsps if x.hitlen >= hitlen]
    if chain:
        logging.debug("Chain HSPs in the Blast output.")
        dist = 2 * hitlen  # Distance to chain the HSPs
        hsps = chain_HSPs(hsps, xdist=dist, ydist=dist)

    if len(hsps) == 0:
        print("No match found.", file=sys.stderr)
        return None

    besthsp = hsps[0]

    # Only the first record of each FASTA file is considered
    aid, asize = next(Fasta(afasta).itersizes())
    bid, bsize = next(Fasta(bfasta).itersizes())
    o = Overlap(besthsp, asize, bsize, cutoff, qreverse=opts.qreverse)
    o.print_graphic()

    if opts.outfile:
        fw = must_open(opts.outfile, "w")
        print(str(o), file=fw)
        fw.close()

    return o
def overlap(args):
    """
    %prog overlap best.contains iid

    Visualize overlaps for a given fragment. Must be run in 4-unitigger. All
    overlaps for iid were retrieved, excluding the ones matching best.contains.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.apps.console import green

    p = OptionParser(overlap.__doc__)
    p.add_option("--maxerr", default=2, type="int", help="Maximum error rate")
    p.add_option("--canvas", default=100, type="int", help="Canvas size")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bestcontains, iid = args
    canvas = opts.canvas

    # Cache the set of contained read ids next to best.contains
    bestcontainscache = bestcontains + ".cache"
    if need_update(bestcontains, bestcontainscache):
        exclude = set()
        with open(bestcontains) as fp:
            for row in fp:
                if row[0] == "#":
                    continue
                j = int(row.split()[0])
                exclude.add(j)
        # Pickle streams are binary; text mode breaks on Python 3
        with open(bestcontainscache, "wb") as fw:
            cPickle.dump(exclude, fw)

    # The cache is a local file written by this very function
    with open(bestcontainscache, "rb") as fp:
        exclude = cPickle.load(fp)
    logging.debug("A total of {0} reads to exclude".format(len(exclude)))

    cmd = "overlapStore -d ../asm.ovlStore -b {0} -e {0}".format(iid)
    cmd += " -E {0}".format(opts.maxerr)
    frags = []
    for row in popen(cmd):
        r = OverlapLine(row)
        if r.bid in exclude:
            continue
        frags.append(r)

    # Also include to query fragment
    frags.append(OverlapLine("{0} {0} N 0 0 0 0".format(iid)))
    frags.sort(key=lambda x: x.ahang)

    # Determine size of the query fragment
    cmd = "gatekeeper -b {0} -e {0}".format(iid)
    cmd += " -tabular -dumpfragments ../asm.gkpStore"
    fp = popen(cmd)
    next(fp)  # first row is discarded (presumably a header - TODO confirm)
    size = int(next(fp).split()[-1])

    # Determine size of canvas
    xmin = min(x.ahang for x in frags)
    xmax = max(x.bhang for x in frags)
    xsize = -xmin + size + xmax
    # Floor division keeps the integer arithmetic the drawing relies on
    # (str * n needs an int)
    ratio = xsize // canvas

    fw = sys.stdout
    for f in frags:
        fsize = -f.ahang + size + f.bhang
        a = (f.ahang - xmin) // ratio
        b = fsize // ratio
        t = "-" * b
        # Arrow head marks the orientation of the fragment
        if f.orientation == "N":
            t = t[:-1] + ">"
        else:
            t = "<" + t[1:]
        if f.ahang == 0 and f.bhang == 0:
            t = green(t)
        c = canvas - a - b
        fw.write(" " * a)
        fw.write(t)
        fw.write(" " * c)
        print("{0} ({1})".format(str(f.bid).rjust(10), f.erate_adj), file=fw)
def get_grid_engine():
    """
    Guess the grid engine from the output of `qsub --version`.

    Returns "PBS" when that output contains "PBS", otherwise "SGE".
    """
    version_text = popen("qsub --version", debug=False).read()
    if "PBS" in version_text:
        return "PBS"
    return "SGE"
def allocate(self, info, chr, start_id, end_id, id_table):
    """
    Assign gene names to the features in `info`, which sit between the
    anchor ranks `start_id` and `end_id` on chromosome `chr`.

    Gaps overlapping the region (looked up in `self.gapfile` through
    intersectBed) reserve roughly one id per 10Kb of gap score. Ids follow
    the best stride found by `Stride`; when none is found, ids are drawn from
    a prefixed range above `magic`. Results are written to `id_table`
    (accn => name) and every used rank is added to `self.black`. Progress is
    reported on stderr.
    """
    start_bp = info[0].start
    end_bp = info[-1].end

    current_chr = chr_number(chr)
    needed = info
    assert end_id > start_id, \
        "end ({0}) > start ({1})".format(end_id, start_id)

    spots = end_id - start_id - 1
    available = [x for x in range(start_id + 1, end_id) if
                 (current_chr, x) not in self.black]

    message = "{0} need {1} ids, has {2} spots ({3} available)".\
        format(chr, len(needed), spots, len(available))

    start_gene = gene_name(current_chr, start_id, prefix=self.prefix,
                           pad0=self.pad0, uc=self.uc)
    end_gene = gene_name(current_chr, end_id, prefix=self.prefix,
                         pad0=self.pad0, uc=self.uc)
    message += " between {0} - {1}\n".format(start_gene, end_gene)

    assert end_bp > start_bp

    # Gaps that intersect the region spanned by `info`
    b = "\t".join(str(x) for x in (chr, start_bp - 1, end_bp))
    cmd = "echo '{0}' |".format(b)
    cmd += " intersectBed -a {0} -b stdin".format(self.gapfile)
    gaps = list(BedLine(x) for x in popen(cmd, debug=False))
    ngaps = len(gaps)

    gapsexpanded = []
    GeneDensity = 10000.  # assume 10Kb per gene
    for gap in gaps:
        gap_bp = int(gap.score)
        gap_ids = int(round(gap_bp / GeneDensity))
        # Each gap takes up as many id slots as genes it could hold
        gapsexpanded += [gap] * gap_ids

    lines = sorted(info + gapsexpanded, key=lambda x: x.start)

    message += "between bp: {0} - {1}, there are {2} gaps (total {3} ids)".\
        format(start_bp, end_bp, ngaps, len(lines))

    needed = lines
    stride = Stride(needed, available)
    conf = stride.conf
    message += " stride: {0}".format(conf)
    print(message, file=sys.stderr)

    nneeded = len(needed)
    if conf is None:  # prefix rule - prepend version number for spills
        magic = 400000  # version 4
        firstdigit = 100000
        step = 10  # stride for the prefixed ids
        rank = start_id + magic
        if rank > magic + firstdigit:
            rank -= firstdigit
        available = []
        while len(available) != nneeded:
            rank += step
            if (current_chr, rank) in self.black:  # avoid blacklisted ids
                continue
            available.append(rank)

    else:  # follow the best stride
        available = stride.available
        if start_id == 0:  # follow right flank at start of chr
            available = available[-nneeded:]
        else:  # follow left flank otherwise
            available = available[:nneeded]

    # Finally assign the ids
    assert len(needed) == len(available)
    for b, rank in zip(needed, available):
        name = gene_name(current_chr, rank, prefix=self.prefix,
                         pad0=self.pad0, uc=self.uc)
        print("\t".join((str(b), name)), file=sys.stderr)
        id_table[b.accn] = name
        self.black.add((current_chr, rank))
    print(file=sys.stderr)
def overlap(args):
    """
    %prog overlap best.contains iid

    Visualize overlaps for a given fragment. Must be run in 4-unitigger. All
    overlaps for iid were retrieved, excluding the ones matching best.contains.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(overlap.__doc__)
    p.add_option("--maxerr", default=2, type="int", help="Maximum error rate")
    p.add_option("--canvas", default=100, type="int", help="Canvas size")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bestcontains, iid = args
    canvas = opts.canvas

    # Cache the set of contained read ids next to best.contains
    bestcontainscache = bestcontains + ".cache"
    if need_update(bestcontains, bestcontainscache):
        exclude = set()
        with open(bestcontains) as fp:
            for row in fp:
                if row[0] == "#":
                    continue
                j = int(row.split()[0])
                exclude.add(j)
        # Pickle streams are binary; text mode raises TypeError on Python 3
        with open(bestcontainscache, "wb") as fw:
            dump(exclude, fw)

    # The cache is a local file written by this very function
    with open(bestcontainscache, "rb") as fp:
        exclude = load(fp)
    logging.debug("A total of {0} reads to exclude".format(len(exclude)))

    cmd = "overlapStore -d ../asm.ovlStore -b {0} -e {0}".format(iid)
    cmd += " -E {0}".format(opts.maxerr)
    frags = []
    for row in popen(cmd):
        r = OverlapLine(row)
        if r.bid in exclude:
            continue
        frags.append(r)

    # Also include to query fragment
    frags.append(OverlapLine("{0} {0} N 0 0 0 0".format(iid)))
    frags.sort(key=lambda x: x.ahang)

    # Determine size of the query fragment
    cmd = "gatekeeper -b {0} -e {0}".format(iid)
    cmd += " -tabular -dumpfragments ../asm.gkpStore"
    fp = popen(cmd)
    next(fp)  # first row is discarded (presumably a header - TODO confirm)
    size = int(next(fp).split()[-1])

    # Determine size of canvas
    xmin = min(x.ahang for x in frags)
    xmax = max(x.bhang for x in frags)
    xsize = -xmin + size + xmax
    # Floor division: the offsets below are used as repeat counts (str * n),
    # which must be integers
    ratio = xsize // canvas

    for f in frags:
        fsize = -f.ahang + size + f.bhang
        a = (f.ahang - xmin) // ratio
        b = fsize // ratio
        t = "-" * b
        # Arrow head marks the orientation of the fragment
        if f.orientation == "N":
            t = t[:-1] + ">"
        else:
            t = "<" + t[1:]
        if f.ahang == 0 and f.bhang == 0:
            t = "[green]{}".format(t)
        c = canvas - a - b
        printf(
            "{}{}{}{} ({})".format(" " * a, t, " " * c,
                                   str(f.bid).rjust(10), f.erate_adj),
        )
def get_grid_engine():
    """
    Guess the grid engine from the output of `qsub --version`.

    Returns "PBS" when that output contains "PBS", otherwise "SGE".
    """
    cmd = "qsub --version"
    # Inspect what qsub printed, not the command string itself (which never
    # contains "PBS" and made this function always answer "SGE").
    ret = popen(cmd, debug=False).read()
    return "PBS" if "PBS" in ret else "SGE"
def allocate(self, info, chr, start_id, end_id, id_table, extended_stride=False):
    """
    Assign gene names to the features in `info`, which sit between the
    anchor ranks `start_id` and `end_id` on chromosome `chr`.

    Gaps overlapping the region (looked up in `self.gapfile` through
    intersectBed) reserve roughly one id per 10Kb of gap score. Ids follow
    the best stride found by `Stride` (`extended_stride` is forwarded as its
    `extended` argument); when none is found, ids are drawn from a prefixed
    range above `magic`. Results are written to `id_table` (accn => name) and
    every used rank is added to `self.black`. Progress is reported on stderr.
    """
    start_bp = info[0].start
    end_bp = info[-1].end

    current_chr = chr_number(chr)
    needed = info
    assert end_id > start_id, \
        "end ({0}) > start ({1})".format(end_id, start_id)

    spots = end_id - start_id - 1
    available = [x for x in range(start_id + 1, end_id) if
                 (current_chr, x) not in self.black]

    message = "{0} need {1} ids, has {2} spots ({3} available)".\
        format(chr, len(needed), spots, len(available))

    start_gene = gene_name(current_chr, start_id, prefix=self.prefix,
                           pad0=self.pad0, uc=self.uc)
    end_gene = gene_name(current_chr, end_id, prefix=self.prefix,
                         pad0=self.pad0, uc=self.uc)
    message += " between {0} - {1}\n".format(start_gene, end_gene)

    assert end_bp > start_bp

    # Gaps that intersect the region spanned by `info`
    b = "\t".join(str(x) for x in (chr, start_bp - 1, end_bp))
    cmd = "echo '{0}' |".format(b)
    cmd += " intersectBed -a {0} -b stdin".format(self.gapfile)
    gaps = list(BedLine(x) for x in popen(cmd, debug=False))
    ngaps = len(gaps)

    gapsexpanded = []
    GeneDensity = 10000.  # assume 10Kb per gene
    for gap in gaps:
        gap_bp = int(gap.score)
        gap_ids = int(round(gap_bp / GeneDensity))
        # Each gap takes up as many id slots as genes it could hold
        gapsexpanded += [gap] * gap_ids

    lines = sorted(info + gapsexpanded, key=lambda x: x.start)

    message += "between bp: {0} - {1}, there are {2} gaps (total {3} ids)".\
        format(start_bp, end_bp, ngaps, len(lines))

    needed = lines
    stride = Stride(needed, available, extended=extended_stride)
    conf = stride.conf
    message += " stride: {0}".format(conf)
    print(message, file=sys.stderr)

    nneeded = len(needed)
    if conf is None:  # prefix rule - prepend version number for spills
        magic = 400000  # version 4
        firstdigit = 100000
        step = 10  # stride for the prefixed ids
        rank = start_id + magic
        if rank > magic + firstdigit:
            rank -= firstdigit
        available = []
        while len(available) != nneeded:
            rank += step
            if (current_chr, rank) in self.black:  # avoid blacklisted ids
                continue
            available.append(rank)

    else:  # follow the best stride
        available = stride.available
        if start_id == 0:  # follow right flank at start of chr
            available = available[- nneeded:]
        else:  # follow left flank otherwise
            available = available[:nneeded]

    # Finally assign the ids
    assert len(needed) == len(available)
    for b, rank in zip(needed, available):
        name = gene_name(current_chr, rank, prefix=self.prefix,
                         pad0=self.pad0, uc=self.uc)
        print("\t".join((str(b), name)), file=sys.stderr)
        id_table[b.accn] = name
        self.black.add((current_chr, rank))
    print(file=sys.stderr)