def blast(args): """ %prog blast fastafile Run BLASTN against database (default is UniVec_Core). Output .bed format on the vector/contaminant ranges. """ p = OptionParser(blast.__doc__) p.add_option("--dist", dest="dist", default=100, type="int", help="Merge adjacent HSPs separated by [default: %default]") p.add_option("--db", dest="db", default=None, help="Use a different database rather than UniVec_Core") opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) fastafile, = args fastaprefix = fastafile.split(".", 1)[0] univec = opts.db or download( "ftp://ftp.ncbi.nih.gov/pub/UniVec/UniVec_Core") uniprefix = univec.split(".", 1)[0] fastablast = fastaprefix + ".{0}.blast".format(uniprefix) prog = run_megablast if opts.db else run_vecscreen prog(infile=fastafile, outfile=fastablast, db=univec, pctid=95, hitlen=50) fp = open(fastablast) ranges = [] for row in fp: b = BlastLine(row) ranges.append((b.query, b.qstart, b.qstop)) merged_ranges = range_merge(ranges, dist=opts.dist) bedfile = fastaprefix + ".{0}.bed".format(uniprefix) fw = must_open(bedfile, "w") for seqid, start, end in merged_ranges: print >> fw, "\t".join( str(x) for x in (seqid, start - 1, end, uniprefix)) return bedfile
def closest(args): """ %prog closest candidates.bed gaps.bed fastafile Identify the nearest gaps flanking suggested regions. """ p = OptionParser(closest.__doc__) p.add_option( "--om", default=False, action="store_true", help="The bedfile is OM blocks", ) opts, args = p.parse_args(args) if len(args) != 3: sys.exit(not p.print_help()) candidates, gapsbed, fastafile = args sizes = Sizes(fastafile).mapping bed = Bed(candidates) ranges = [] for b in bed: r = range_parse(b.accn) if opts.om else b ranges.append([r.seqid, r.start, r.end]) gapsbed = Bed(gapsbed) granges = [(x.seqid, x.start, x.end) for x in gapsbed] ranges = range_merge(ranges) for r in ranges: a = range_closest(granges, r) b = range_closest(granges, r, left=False) seqid = r[0] if a is not None and a[0] != seqid: a = None if b is not None and b[0] != seqid: b = None mmin = 1 if a is None else a[1] mmax = sizes[seqid] if b is None else b[2] print("\t".join(str(x) for x in (seqid, mmin - 1, mmax)))
def blast(args): """ %prog blast fastafile Run BLASTN against database (default is UniVec_Core). Output .bed format on the vector/contaminant ranges. """ p = OptionParser(blast.__doc__) p.add_option("--dist", default=100, type="int", help="Merge adjacent HSPs separated by [default: %default]") p.add_option("--db", help="Use a different database rather than UniVec_Core") opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) fastafile, = args fastaprefix = fastafile.split(".", 1)[0] univec = opts.db or download("ftp://ftp.ncbi.nih.gov/pub/UniVec/UniVec_Core") uniprefix = univec.split(".", 1)[0] fastablast = fastaprefix + ".{0}.blast".format(uniprefix) prog = run_megablast if opts.db else run_vecscreen prog(infile=fastafile, outfile=fastablast, db=univec, pctid=95, hitlen=50) fp = open(fastablast) ranges = [] for row in fp: b = BlastLine(row) ranges.append((b.query, b.qstart, b.qstop)) merged_ranges = range_merge(ranges, dist=opts.dist) bedfile = fastaprefix + ".{0}.bed".format(uniprefix) fw = must_open(bedfile, "w") for seqid, start, end in merged_ranges: print("\t".join(str(x) for x in (seqid, start - 1, end, uniprefix)), file=fw) return bedfile
def closest(args): """ %prog closest candidates.bed gaps.bed fastafile Identify the nearest gaps flanking suggested regions. """ p = OptionParser(closest.__doc__) p.add_option("--om", default=False, action="store_true", help="The bedfile is OM blocks [default: %default]") opts, args = p.parse_args(args) if len(args) != 3: sys.exit(not p.print_help()) candidates, gapsbed, fastafile = args sizes = Sizes(fastafile).mapping bed = Bed(candidates) ranges = [] for b in bed: r = range_parse(b.accn) if opts.om else b ranges.append([r.seqid, r.start, r.end]) gapsbed = Bed(gapsbed) granges = [(x.seqid, x.start, x.end) for x in gapsbed] ranges = range_merge(ranges) for r in ranges: a = range_closest(granges, r) b = range_closest(granges, r, left=False) seqid = r[0] if a is not None and a[0] != seqid: a = None if b is not None and b[0] != seqid: b = None mmin = 1 if a is None else a[1] mmax = sizes[seqid] if b is None else b[2] print "\t".join(str(x) for x in (seqid, mmin - 1, mmax))
def test_range_merge(ranges, dist, expected): from jcvi.utils.range import range_merge assert range_merge(ranges, dist) == expected