def uniq(args):
    """
    %prog uniq bedfile > newbedfile

    Remove overlapping features with higher scores.
    """
    # NOTE: the docstring above doubles as the command-line usage text handed
    # to OptionParser, so it is kept verbatim.
    #
    # Returns the name of the output file (`<prefix>.uniq.bed`), whether or
    # not it had to be regenerated.
    p = OptionParser(uniq.__doc__)
    _opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    uniqbedfile = bedfile.split(".")[0] + ".uniq.bed"

    # Check freshness first: parsing the BED file is pointless when the
    # output is already up to date.
    if not need_update(bedfile, uniqbedfile):
        return uniqbedfile

    bed = Bed(bedfile)
    # The position of each feature in `bed` is used as the Range id so the
    # chained ranges can be mapped back to the original BED lines.
    ranges = [Range(x.seqid, x.start, x.end, float(x.score), i)
              for i, x in enumerate(bed)]
    selected, _ = range_chain(ranges)

    newbed = Bed()
    newbed.extend(bed[x.id] for x in selected)
    newbed.print_to_file(uniqbedfile, sorted=True)
    logging.debug("Imported: {0}, Exported: {1}".format(len(bed), len(newbed)))

    return uniqbedfile
def get_segments(ranges, extra, minsegment=40):
    """
    Chain the given ranges and cut the axis into segments at their starts.

    `range_chain` selects the subset of `ranges` to keep; the start of every
    kept range becomes a candidate break point. `extra` supplies (left, right)
    pairs, the additional end breaks for chromosomes.

    Say the projection of the synteny blocks onto one axis looks like:

    1=====10......20====30....35====~~

    Breaks fall at the block starts (an arbitrary right extension rule), so
    each segment runs from one start to the position just before the next.

    Yields (left, right) tuples. A block start closer than `minsegment` to
    the current left edge is skipped and does not open a new segment.
    """
    from jcvi.utils.range import range_chain, LEFT, RIGHT

    # Tag for "start of a chained range".
    # NOTE(review): assumed to differ from LEFT and RIGHT - confirm in
    # jcvi.utils.range.
    NUL = 2
    chain, _ = range_chain(ranges)

    breaks = sorted(
        [(r.start, NUL) for r in chain]
        + [(pair[0], LEFT) for pair in extra]
        + [(pair[1], RIGHT) for pair in extra]
    )

    left = 0
    for pos, tag in breaks:
        if tag == LEFT:
            left = pos
        if tag == RIGHT:
            yield left, pos
        elif tag == NUL and pos - left >= minsegment:
            yield left, pos - 1
            left = pos
def select_bed(bed):
    """
    Pick a non-overlapping subset of the features in `bed`.

    Every feature becomes a Range scored by its `score` field and tagged with
    its position in `bed`; `range_chain` then resolves conflicts in favour of
    high scoring blocks over low scoring alignments. Returns the list of
    selected features, looked up in `bed` by position.
    """
    candidates = []
    for idx, feat in enumerate(bed):
        candidates.append(
            Range(feat.seqid, feat.start, feat.end, float(feat.score), idx)
        )

    chain, _ = range_chain(candidates)
    return [bed[r.id] for r in chain]
def uniq(args):
    """
    %prog uniq gffile cdsfasta

    Remove overlapping gene models. Similar to formats.gff.uniq(), overlapping
    'piles' are processed, one by one.

    Here, we use a different algorithm, that retains the best non-overlapping
    subset witin each pile, rather than single best model. Scoring function is
    also different, rather than based on score or span, we optimize for the
    subset that show the best combined score. Score is defined by:

    score = (1 - AED) * length
    """
    # NOTE: the docstring above doubles as the command-line usage text handed
    # to OptionParser, so it is kept verbatim.
    p = OptionParser(uniq.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gffile, cdsfasta = args
    gff = Gff(gffile)
    sizes = Sizes(cdsfasta).mapping

    # Score per gene, keyed by the mRNA's parent. If a gene has several
    # mRNAs, the last one seen wins.
    gene_register = {}
    for feat in gff:
        if feat.type != "mRNA":
            continue
        aed = float(feat.attributes["_AED"][0])
        gene_register[feat.parent] = (1 - aed) * sizes[feat.accn]

    allgenes = import_feats(gffile)
    piles = get_piles(allgenes)
    bestids = set()
    for group in piles:
        ranges = [to_range(x, score=gene_register[x.accn], id=x.accn)
                  for x in group]
        selected_chain, _ = range_chain(ranges)
        bestids |= set(x.id for x in selected_chain)

    removed = set(x.accn for x in allgenes) - bestids
    # Context manager so the handle is released even if the write fails.
    with open("removed.ids", "w") as fw:
        print("\n".join(sorted(removed)), file=fw)

    populate_children(opts.outfile, bestids, gffile, "gene")
def uniq(args):
    """
    %prog uniq gffile cdsfasta

    Remove overlapping gene models. Similar to formats.gff.uniq(), overlapping
    'piles' are processed, one by one.

    Here, we use a different algorithm, that retains the best non-overlapping
    subset witin each pile, rather than single best model. Scoring function is
    also different, rather than based on score or span, we optimize for the
    subset that show the best combined score. Score is defined by:

    score = (1 - AED) * length
    """
    # NOTE: the docstring above doubles as the command-line usage text handed
    # to OptionParser, so it is kept verbatim.
    p = OptionParser(uniq.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gffile, cdsfasta = args
    gff = Gff(gffile)
    sizes = Sizes(cdsfasta).mapping

    # Score per gene, keyed by the mRNA's parent. If a gene has several
    # mRNAs, the last one seen wins.
    gene_register = {}
    for feat in gff:
        if feat.type != "mRNA":
            continue
        aed = float(feat.attributes["_AED"][0])
        gene_register[feat.parent] = (1 - aed) * sizes[feat.accn]

    allgenes = import_feats(gffile)
    piles = get_piles(allgenes)
    bestids = set()
    for group in piles:
        ranges = [to_range(x, score=gene_register[x.accn], id=x.accn)
                  for x in group]
        selected_chain, _ = range_chain(ranges)
        bestids |= set(x.id for x in selected_chain)

    removed = set(x.accn for x in allgenes) - bestids
    # fw.write() replaces the Python 2-only `print >> fw` statement and runs
    # unchanged on Python 2 and 3; the context manager releases the handle
    # even if the write fails.
    with open("removed.ids", "w") as fw:
        fw.write("\n".join(sorted(removed)) + "\n")

    populate_children(opts.outfile, bestids, gffile, "gene")
def uniq(args):
    """
    %prog uniq bedfile

    Remove overlapping features with higher scores.
    """
    # The docstring above is the usage text given to OptionParser.
    #
    # Writes `<prefix>.uniq.bed` with the chained (kept) features and, when
    # anything was dropped, `<prefix>.leftover.bed` with the rest. Returns
    # the name of the uniq file.
    p = OptionParser(uniq.__doc__)
    p.add_option("--slen", default=False, action="store_true",
                 help="Use sequence length as score [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    prefix = bedfile.split(".")[0]
    uniqbedfile = prefix + ".uniq.bed"
    bed = Bed(bedfile)

    # Score is either the feature span (--slen) or its score column; the
    # Range id is the feature's position in `bed`.
    ranges = []
    for idx, feat in enumerate(bed):
        if opts.slen:
            weight = feat.end - feat.start
        else:
            weight = float(feat.score)
        ranges.append(Range(feat.seqid, feat.start, feat.end, weight, idx))

    chain, _ = range_chain(ranges)
    chain_ids = [r.id for r in chain]
    kept_ids = set(chain_ids)
    kept = [bed[idx] for idx in chain_ids]
    leftovers = [feat for idx, feat in enumerate(bed) if idx not in kept_ids]

    newbed = Bed()
    newbed.extend(kept)
    newbed.print_to_file(uniqbedfile, sorted=True)

    if leftovers:
        leftoverbed = Bed()
        leftoverbed.extend(leftovers)
        leftoverbed.print_to_file(prefix + ".leftover.bed", sorted=True)

    logging.debug("Imported: {0}, Exported: {1}".format(len(bed), len(newbed)))

    return uniqbedfile
def test_range_chain(ranges, expected):
    """Check that range_chain(ranges) returns exactly `expected`."""
    from jcvi.utils.range import range_chain

    observed = range_chain(ranges)
    assert observed == expected
def mcscan(args):
    """
    %prog mcscan bedfile anchorfile [options]

    Stack synteny blocks on a reference bed, MCSCAN style. The first column in
    the output is the reference order, given in the bedfile. Then each column
    next to it are separate 'tracks'.

    If --mergetandem=tandem_file is specified, tandem_file should have each
    tandem cluster as one line, tab separated.
    """
    # NOTE: the docstring above doubles as the command-line usage text handed
    # to OptionParser, so it is kept verbatim.
    p = OptionParser(mcscan.__doc__)
    p.add_option("--iter", default=100, type="int",
                 help="Max number of chains to output [default: %default]")
    p.add_option("--ascii", default=False, action="store_true",
                 help="Output symbols rather than gene names [default: %default]")
    p.add_option("--Nm", default=10, type="int",
                 help="Clip block ends to allow slight overlaps [default: %default]")
    p.add_option("--trackids", action="store_true",
                 help="Track block IDs in separate file [default: %default]")
    p.add_option("--mergetandem", default=None,
                 help="merge tandems genes in output acoording to PATH-TO-TANDEM_FILE, "
                 "cannot be used with --ascii")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, anchorfile = args
    use_ascii = opts.ascii
    clip = opts.Nm
    trackids = opts.trackids
    ofile = opts.outfile
    mergetandem = opts.mergetandem
    bed = Bed(bedfile)
    order = bed.order

    if trackids:
        olog = ofile + ".tracks"
        fwlog = must_open(olog, "w")

    if mergetandem:
        assert not use_ascii
        # Map every member of a tandem cluster to the ";"-joined cluster.
        tandems = {}
        # open() replaces the Python 2-only file() builtin; the context
        # manager closes the handle, which the original never did.
        with open(mergetandem) as fp:
            for row in fp:
                row = row.split()
                s = ";".join(row)
                for atom in row:
                    tandems[atom] = s

    ac = AnchorFile(anchorfile)
    ranges = []
    block_pairs = defaultdict(dict)
    blocks = ac.blocks
    for i, ib in enumerate(blocks):
        q, s, t = zip(*ib)
        # Orient the block so that `q` is on the reference bed.
        if q[0] not in order:
            q, s = s, q

        r = get_range(q, s, t, i, order, block_pairs, clip=clip)
        ranges.append(r)

        assert q[0] in order
        if s[0] not in order:
            continue

        # is_self comparison
        q, s = s, q
        r = get_range(q, s, t, i, order, block_pairs, clip=clip)
        ranges.append(r)

    # NOTE(review): fw / fwlog are left open as in the original; must_open may
    # hand back a shared stream such as stdout - confirm before closing them.
    fw = must_open(ofile, "w")

    tracks = []
    # stream.write() replaces the Python 2-only `print >> stream` statement.
    sys.stderr.write("Chain started: {0} blocks\n".format(len(ranges)))
    iteration = 0
    while ranges:
        if iteration >= opts.iter:
            break

        selected, score = range_chain(ranges)
        tracks.append(selected)
        selected = set(x.id for x in selected)
        if trackids:
            fwlog.write(",".join(str(x) for x in sorted(selected)) + "\n")

        # Blocks placed on this track are removed before the next round.
        ranges = [x for x in ranges if x.id not in selected]
        msg = "Chain {0}: score={1}".format(iteration, score)
        if ranges:
            msg += " {0} blocks remained..".format(len(ranges))
        else:
            msg += " done!"

        sys.stderr.write(msg + "\n")
        iteration += 1

    # Block ids per track do not depend on the bed row: compute them once
    # instead of once per gene.
    track_ids = [[x.id for x in track] for track in tracks]

    mbed = []
    for b in bed:
        gene = b.accn
        atoms = []
        for tids in track_ids:
            # Start from "no anchor" so an empty track cannot reuse the
            # value left over from the previous track.
            anchor = "."
            for tid in tids:
                anchor = block_pairs[tid].get(gene, ".")
                if anchor != ".":
                    break
            if use_ascii and anchor != ".":
                anchor = "x"
            atoms.append(anchor)
        mbed.append((gene, atoms))

    sep = "" if use_ascii else "\t"
    for gene, atoms in mbed:
        if mergetandem:
            atoms = [tandems.get(atom, atom) for atom in atoms]
        fw.write("\t".join((gene, sep.join(atoms))) + "\n")

    logging.debug("MCscan blocks written to `{0}`.".format(ofile))
    if trackids:
        logging.debug("Block IDs written to `{0}`.".format(olog))
def supermap(blast_file, filter="intersection", dialect="blast", clip=0):
    """
    Keep the lines of `blast_file` selected by range chaining.

    Hits are chained along the query axis, the reference axis, or both,
    depending on `filter`:

    - "query": lines kept by the query-axis chain
    - "ref": lines kept by the reference-axis chain
    - "intersection": lines kept by both chains
    - "union": lines kept by either chain

    `dialect` and `clip` are passed through to BlastOrCoordsLine. The kept
    lines are written to `<blast_file>[.<filter>].supermap` (no tag for
    "intersection"), the file is sorted with jcvi.formats.blast.sort, and its
    name is returned.

    Raises ValueError for an unknown `filter` and AssertionError when no line
    is selected.
    """
    # Fail early with a clear message; previously an unknown filter ran both
    # chains and then died with an UnboundLocalError.
    if filter not in ("ref", "query", "intersection", "union"):
        raise ValueError("unknown filter `{0}`".format(filter))

    # filter by query
    if filter != "ref":
        logging.debug("filter by query")
        ranges = list(BlastOrCoordsLine(blast_file, filter="query",
                                        dialect=dialect, clip=clip))
        query_selected, _ = range_chain(ranges)
        query_idx = set(x.id for x in query_selected)

    # filter by ref
    if filter != "query":
        logging.debug("filter by ref")
        ranges = list(BlastOrCoordsLine(blast_file, filter="ref",
                                        dialect=dialect, clip=clip))
        ref_selected, _ = range_chain(ranges)
        ref_idx = set(x.id for x in ref_selected)

    if filter == "ref":
        selected_idx = ref_idx
    elif filter == "query":
        selected_idx = query_idx
    elif filter == "intersection":
        logging.debug("perform intersection")
        selected_idx = ref_idx & query_idx
    else:  # "union"
        logging.debug("perform union")
        selected_idx = ref_idx | query_idx

    assert len(selected_idx) != 0

    # selected_idx is in fact the lineno in the BLAST file
    tag = "" if filter == "intersection" else "." + filter
    supermapfile = blast_file + tag + ".supermap"

    # Walk the file once, consuming the sorted line numbers in step with it.
    # next(it) replaces the Python 2-only it.next(); fw.write() replaces
    # `print >> fw`. Both handles are closed by the context manager.
    selected_idx = iter(sorted(selected_idx))
    selected = next(selected_idx)
    with open(blast_file) as fp, open(supermapfile, "w") as fw:
        for i, row in enumerate(fp):
            if i < selected:
                continue
            fw.write(row.rstrip() + "\n")
            try:
                selected = next(selected_idx)
            except StopIteration:
                break

    logging.debug("Write output file to `{0}`".format(supermapfile))

    from jcvi.formats.blast import sort

    ofilter = "ref" if filter == "ref" else "query"
    args = [supermapfile, "--" + ofilter]
    if dialect == "coords":
        args += ["--coords"]

    sort(args)

    return supermapfile
def mcscan(args):
    """
    %prog mcscan bedfile anchorfile

    Stack synteny blocks on a reference bed, MCSCAN style. The first column in
    the output is the reference order, given in the bedfile. Then each column
    next to it are separate 'tracks'.
    """
    # NOTE: the docstring above doubles as the command-line usage text handed
    # to OptionParser, so it is kept verbatim.
    from jcvi.utils.range import Range, range_chain

    p = OptionParser(mcscan.__doc__)
    p.add_option("--iter", default=100, type="int",
                 help="Max number of chains to output [default: %default]")
    p.add_option("--ascii", default=False, action="store_true",
                 help="Output symbols rather than gene names [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, anchorfile = args
    use_ascii = opts.ascii
    bed = Bed(bedfile)
    order = bed.order

    ac = AnchorFile(anchorfile)
    ranges = []
    block_pairs = {}
    for i, (q, s) in enumerate(ac.iter_blocks()):
        # Orient the block so that `q` is on the reference bed.
        if q[0] not in order:
            q, s = s, q

        block_pairs[i] = dict(zip(q, s))

        # One Range per block, spanning its extent in reference order and
        # scored by the number of anchors.
        q = sorted(order[x] for x in q)
        ranges.append(Range("0", q[0], q[-1], score=len(q), id=i))

    tracks = []
    # stream.write() replaces the Python 2-only `print >> stream` statement.
    sys.stderr.write("Chain started: {0} blocks\n".format(len(ranges)))
    iteration = 0
    while ranges:
        if iteration >= opts.iter:
            break

        selected, score = range_chain(ranges)
        tracks.append(selected)
        selected = set(x.id for x in selected)
        # Blocks placed on this track are removed before the next round.
        ranges = [x for x in ranges if x.id not in selected]
        msg = "Chain {0}: score={1}".format(iteration, score)
        if ranges:
            msg += " {0} blocks remained..".format(len(ranges))
        else:
            msg += " done!"

        sys.stderr.write(msg + "\n")
        iteration += 1

    # Block ids per track do not depend on the bed row: compute them once
    # instead of once per gene.
    track_ids = [[x.id for x in track] for track in tracks]
    sep = "" if use_ascii else "\t"

    for b in bed:
        gene = b.accn
        atoms = []
        for tids in track_ids:
            # Start from "no anchor" so an empty track cannot reuse the
            # value left over from the previous track.
            anchor = "."
            for tid in tids:
                anchor = block_pairs[tid].get(gene, ".")
                if anchor != ".":
                    break
            if use_ascii and anchor != ".":
                anchor = "x"
            atoms.append(anchor)

        # Replaces the Python 2-only `print` statement.
        sys.stdout.write("\t".join((gene, sep.join(atoms))) + "\n")
def deletion(args):
    """
    %prog deletion [mac.mic.bam|mac.mic.bed] mic.gaps.bed

    Find IES based on mapping MAC reads to MIC genome.
    """
    # NOTE: the docstring above doubles as the command-line usage text handed
    # to OptionParser, so it is kept verbatim.
    #
    # The pipeline is a chain of intermediate files that share the prefix
    # `pf`; each stage is redone only when need_update() says its output is
    # stale. Output uses fw.write() (portable across Python 2 and 3) instead
    # of the Python 2-only `print >> fw`, and every handle is managed by a
    # `with` block so it is closed even if a stage fails midway.
    p = OptionParser(deletion.__doc__)
    p.add_option("--mindepth", default=3, type="int",
                 help="Minimum depth to call a deletion")
    p.add_option("--minspan", default=30, type="int",
                 help="Minimum span to call a deletion")
    p.add_option("--split", default=False, action="store_true",
                 help="Break at cigar N into separate parts")
    p.set_tmpdir()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, gapsbedfile = args
    if bedfile.endswith(".bam"):
        bamfile = bedfile
        bedfile = bamfile.replace(".sorted.", ".").replace(".bam", ".bed")
        if need_update(bamfile, bedfile):
            cmd = "bamToBed -i {0}".format(bamfile)
            if opts.split:
                cmd += " -split"
            cmd += " | cut -f1-4"
            sh(cmd, outfile=bedfile)

    sort_tmpdir = "--tmpdir={0}".format(opts.tmpdir)
    if bedfile.endswith(".sorted.bed"):
        pf = bedfile.rsplit(".", 2)[0]
        sortedbedfile = bedfile
    else:
        pf = bedfile.rsplit(".", 1)[0]
        sortedbedfile = pf + ".sorted.bed"
        if need_update(bedfile, sortedbedfile):
            sort([bedfile, "-u", "--accn", sort_tmpdir])

    # Find reads that contain multiple matches
    ibedfile = pf + ".d.bed"
    if need_update(sortedbedfile, ibedfile):
        bed = Bed(sortedbedfile, sorted=False)
        with open(ibedfile, "w") as fw:
            logging.debug("Write deletions to `{0}`.".format(ibedfile))
            # Consecutive lines with the same accn are treated as one read.
            for accn, bb in groupby(bed, key=lambda x: x.accn):
                branges = [(x.seqid, x.start, x.end) for x in bb]
                iranges = range_interleave(branges)
                for seqid, start, end in iranges:
                    if end - start + 1 < opts.minspan:
                        continue
                    fields = (seqid, start - 1, end, accn + '-d')
                    fw.write("\t".join(str(x) for x in fields) + "\n")

    # Uniqify the insertions and count occurrences
    countbedfile = pf + ".uniq.bed"
    if need_update(ibedfile, countbedfile):
        bed = Bed(ibedfile)
        with open(countbedfile, "w") as fw:
            logging.debug("Write counts to `{0}`.".format(countbedfile))
            registry = Counter((x.seqid, x.start, x.end) for x in bed)
            ies_id = 1
            for (seqid, start, end), count in registry.items():
                if count < opts.mindepth:
                    continue
                # The name carries the supporting read count after "-r".
                ies_name = "{0:05d}-r{1}".format(ies_id, count)
                fields = (seqid, start - 1, end, ies_name)
                fw.write("\t".join(str(x) for x in fields) + "\n")
                ies_id += 1
        sort([countbedfile, "-i", sort_tmpdir])

    # Remove deletions that contain some read depth
    depthbedfile = pf + ".depth.bed"
    if need_update((sortedbedfile, countbedfile), depthbedfile):
        depth([sortedbedfile, countbedfile,
               "--outfile={0}".format(depthbedfile)])

    validbedfile = pf + ".valid.bed"
    if need_update(depthbedfile, validbedfile):
        with open(validbedfile, "w") as fw:
            logging.debug("Filter valid deletions to `{0}`.".format(validbedfile))
            bed = Bed(depthbedfile)
            all_scores = [float(b.score) for b in bed]
            lb, ub = outlier_cutoff(all_scores)
            logging.debug("Bounds for depths: LB={0:.2f} (ignored) UB={1:.2f}".format(lb, ub))
            for b in bed:
                if float(b.score) > ub:
                    continue
                fw.write(str(b) + "\n")

    # Remove deletions that contain sequencing gaps on its flanks
    selectedbedfile = pf + ".selected.bed"
    if need_update(validbedfile, selectedbedfile):
        flanksbedfile = pf + ".flanks.bed"
        bed = Bed(validbedfile)
        flank = 100
        with open(flanksbedfile, "w") as fw:
            logging.debug("Write deletion flanks to `{0}`.".format(flanksbedfile))
            for b in bed:
                start, end = b.start, b.end
                # Left flank, then right flank, each at most `flank` long
                # and never extending past the deletion itself.
                b.start, b.end = start, min(start + flank - 1, end)
                fw.write(str(b) + "\n")
                b.start, b.end = max(start, end - flank + 1), end
                fw.write(str(b) + "\n")

        intersectidsfile = pf + ".intersect.ids"
        cmd = "intersectBed -a {0} -b {1}".format(flanksbedfile, gapsbedfile)
        cmd += " | cut -f4 | sort -u"
        sh(cmd, outfile=intersectidsfile)
        some([validbedfile, intersectidsfile, "-v",
              "--outfile={0}".format(selectedbedfile)])

    # Find best-scoring non-overlapping set
    iesbedfile = pf + ".ies.bed"
    if need_update(selectedbedfile, iesbedfile):
        bed = Bed(selectedbedfile)
        with open(iesbedfile, "w") as fw:
            logging.debug("Write IES to `{0}`.".format(iesbedfile))
            # Score of each candidate is the read count parsed back out of
            # its name (the text after the last "r").
            branges = [Range(x.seqid, x.start, x.end,
                             int(x.accn.rsplit("r")[-1]), i)
                       for i, x in enumerate(bed)]
            iranges, iscore = range_chain(branges)
            logging.debug("Best chain score: {0} ({1} IES)".
                          format(iscore, len(iranges)))
            for ies_id, (seqid, start, end, score, _id) in enumerate(iranges, 1):
                ies_name = "IES-{0:05d}-r{1}".format(ies_id, score)
                span = end - start + 1
                fields = (seqid, start - 1, end, ies_name, span)
                fw.write("\t".join(str(x) for x in fields) + "\n")
def mcscan(args):
    """
    %prog mcscan bedfile anchorfile [options]

    Stack synteny blocks on a reference bed, MCSCAN style. The first column in
    the output is the reference order, given in the bedfile. Then each column
    next to it are separate 'tracks'.

    If --mergetandem=tandem_file is specified, tandem_file should have each
    tandem cluster as one line, tab separated.
    """
    # NOTE: the docstring above doubles as the command-line usage text handed
    # to OptionParser, so it is kept verbatim.
    p = OptionParser(mcscan.__doc__)
    p.add_option("--iter", default=100, type="int",
                 help="Max number of chains to output [default: %default]")
    p.add_option("--ascii", default=False, action="store_true",
                 help="Output symbols rather than gene names [default: %default]")
    p.add_option("--Nm", default=10, type="int",
                 help="Clip block ends to allow slight overlaps [default: %default]")
    p.add_option("--trackids", action="store_true",
                 help="Track block IDs in separate file [default: %default]")
    p.add_option("--mergetandem", default=None,
                 help="merge tandems genes in output acoording to PATH-TO-TANDEM_FILE, "
                 "cannot be used with --ascii")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, anchorfile = args
    use_ascii = opts.ascii
    clip = opts.Nm
    trackids = opts.trackids
    ofile = opts.outfile
    mergetandem = opts.mergetandem
    bed = Bed(bedfile)
    order = bed.order

    if trackids:
        olog = ofile + ".tracks"
        fwlog = must_open(olog, "w")

    if mergetandem:
        assert not use_ascii
        # Map every member of a tandem cluster to the ";"-joined cluster.
        tandems = {}
        # open() replaces the Python 2-only file() builtin; the context
        # manager closes the handle, which the original never did.
        with open(mergetandem) as fp:
            for row in fp:
                members = row.split()
                cluster = ";".join(members)
                for atom in members:
                    tandems[atom] = cluster

    ac = AnchorFile(anchorfile)
    ranges = []
    block_pairs = defaultdict(dict)
    blocks = ac.blocks
    for i, ib in enumerate(blocks):
        q, s, t = zip(*ib)
        # Orient the block so that `q` is on the reference bed.
        if q[0] not in order:
            q, s = s, q

        r = get_range(q, s, t, i, order, block_pairs, clip=clip)
        ranges.append(r)

        assert q[0] in order
        if s[0] not in order:
            continue

        # is_self comparison
        q, s = s, q
        r = get_range(q, s, t, i, order, block_pairs, clip=clip)
        ranges.append(r)

    # NOTE(review): fw / fwlog are left open as in the original; must_open may
    # hand back a shared stream such as stdout - confirm before closing them.
    fw = must_open(ofile, "w")

    tracks = []
    # stream.write() replaces the Python 2-only `print >> stream` statement.
    sys.stderr.write("Chain started: {0} blocks\n".format(len(ranges)))
    iteration = 0
    while ranges:
        if iteration >= opts.iter:
            break

        selected, score = range_chain(ranges)
        tracks.append(selected)
        selected = set(x.id for x in selected)
        if trackids:
            fwlog.write(",".join(str(x) for x in sorted(selected)) + "\n")

        # Blocks placed on this track are removed before the next round.
        ranges = [x for x in ranges if x.id not in selected]
        msg = "Chain {0}: score={1}".format(iteration, score)
        if ranges:
            msg += " {0} blocks remained..".format(len(ranges))
        else:
            msg += " done!"

        sys.stderr.write(msg + "\n")
        iteration += 1

    # Block ids per track do not depend on the bed row: compute them once
    # instead of once per gene.
    track_ids = [[x.id for x in track] for track in tracks]

    mbed = []
    for b in bed:
        gene = b.accn
        atoms = []
        for tids in track_ids:
            # Start from "no anchor" so an empty track cannot reuse the
            # value left over from the previous track.
            anchor = "."
            for tid in tids:
                anchor = block_pairs[tid].get(gene, ".")
                if anchor != ".":
                    break
            if use_ascii and anchor != ".":
                anchor = "x"
            atoms.append(anchor)
        mbed.append((gene, atoms))

    sep = "" if use_ascii else "\t"
    for gene, atoms in mbed:
        if mergetandem:
            atoms = [tandems.get(atom, atom) for atom in atoms]
        fw.write("\t".join((gene, sep.join(atoms))) + "\n")

    logging.debug("MCscan blocks written to `{0}`.".format(ofile))
    if trackids:
        logging.debug("Block IDs written to `{0}`.".format(olog))
def mcscan(args):
    """
    %prog mcscan bedfile anchorfile

    Stack synteny blocks on a reference bed, MCSCAN style. The first column in
    the output is the reference order, given in the bedfile. Then each column
    next to it are separate 'tracks'.
    """
    # NOTE: the docstring above doubles as the command-line usage text handed
    # to OptionParser, so it is kept verbatim.
    from jcvi.utils.range import Range, range_chain

    p = OptionParser(mcscan.__doc__)
    p.add_option("--iter", default=100, type="int",
                 help="Max number of chains to output [default: %default]")
    p.add_option("--ascii", default=False, action="store_true",
                 help="Output symbols rather than gene names [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, anchorfile = args
    use_ascii = opts.ascii
    bed = Bed(bedfile)
    order = bed.order

    ac = AnchorFile(anchorfile)
    ranges = []
    block_pairs = {}
    for i, (q, s) in enumerate(ac.iter_blocks()):
        # Orient the block so that `q` is on the reference bed.
        if q[0] not in order:
            q, s = s, q

        block_pairs[i] = dict(zip(q, s))

        # One Range per block, spanning its extent in reference order and
        # scored by the number of anchors.
        positions = sorted(order[x] for x in q)
        ranges.append(
            Range("0", positions[0], positions[-1], score=len(positions), id=i)
        )

    tracks = []
    # stream.write() replaces the Python 2-only `print >> stream` statement.
    sys.stderr.write("Chain started: {0} blocks\n".format(len(ranges)))
    iteration = 0
    while ranges:
        if iteration >= opts.iter:
            break

        selected, score = range_chain(ranges)
        tracks.append(selected)
        selected = set(x.id for x in selected)
        # Blocks placed on this track are removed before the next round.
        ranges = [x for x in ranges if x.id not in selected]
        msg = "Chain {0}: score={1}".format(iteration, score)
        if ranges:
            msg += " {0} blocks remained..".format(len(ranges))
        else:
            msg += " done!"

        sys.stderr.write(msg + "\n")
        iteration += 1

    # Block ids per track do not depend on the bed row: compute them once
    # instead of once per gene.
    track_ids = [[x.id for x in track] for track in tracks]
    sep = "" if use_ascii else "\t"

    for b in bed:
        gene = b.accn
        atoms = []
        for tids in track_ids:
            # Start from "no anchor" so an empty track cannot reuse the
            # value left over from the previous track.
            anchor = "."
            for tid in tids:
                anchor = block_pairs[tid].get(gene, ".")
                if anchor != ".":
                    break
            if use_ascii and anchor != ".":
                anchor = "x"
            atoms.append(anchor)

        # Replaces the Python 2-only `print` statement.
        sys.stdout.write("\t".join((gene, sep.join(atoms))) + "\n")
def deletion(args):
    """
    %prog deletion [mac.mic.bam|mac.mic.bed] mic.gaps.bed

    Find IES based on mapping MAC reads to MIC genome.
    """
    # NOTE: the docstring above doubles as the command-line usage text handed
    # to OptionParser, so it is kept verbatim.
    #
    # The pipeline is a chain of intermediate files that share the prefix
    # `pf`; each stage is redone only when need_update() says its output is
    # stale. Every output handle is managed by a `with` block so it is closed
    # even if a stage fails midway.
    p = OptionParser(deletion.__doc__)
    p.add_option("--mindepth", default=3, type="int",
                 help="Minimum depth to call a deletion")
    p.add_option("--minspan", default=30, type="int",
                 help="Minimum span to call a deletion")
    p.add_option("--split", default=False, action="store_true",
                 help="Break at cigar N into separate parts")
    p.set_tmpdir()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, gapsbedfile = args
    if bedfile.endswith(".bam"):
        bamfile = bedfile
        bedfile = bamfile.replace(".sorted.", ".").replace(".bam", ".bed")
        if need_update(bamfile, bedfile):
            cmd = "bamToBed -i {0}".format(bamfile)
            if opts.split:
                cmd += " -split"
            cmd += " | cut -f1-4"
            sh(cmd, outfile=bedfile)

    sort_tmpdir = "--tmpdir={0}".format(opts.tmpdir)
    if bedfile.endswith(".sorted.bed"):
        pf = bedfile.rsplit(".", 2)[0]
        sortedbedfile = bedfile
    else:
        pf = bedfile.rsplit(".", 1)[0]
        sortedbedfile = pf + ".sorted.bed"
        if need_update(bedfile, sortedbedfile):
            sort([bedfile, "-u", "--accn", sort_tmpdir])

    # Find reads that contain multiple matches
    ibedfile = pf + ".d.bed"
    if need_update(sortedbedfile, ibedfile):
        bed = Bed(sortedbedfile, sorted=False)
        with open(ibedfile, "w") as fw:
            logging.debug("Write deletions to `{0}`.".format(ibedfile))
            # Consecutive lines with the same accn are treated as one read.
            for accn, bb in groupby(bed, key=lambda x: x.accn):
                branges = [(x.seqid, x.start, x.end) for x in bb]
                iranges = range_interleave(branges)
                for seqid, start, end in iranges:
                    if end - start + 1 < opts.minspan:
                        continue
                    fields = (seqid, start - 1, end, accn + '-d')
                    print("\t".join(str(x) for x in fields), file=fw)

    # Uniqify the insertions and count occurrences
    countbedfile = pf + ".uniq.bed"
    if need_update(ibedfile, countbedfile):
        bed = Bed(ibedfile)
        with open(countbedfile, "w") as fw:
            logging.debug("Write counts to `{0}`.".format(countbedfile))
            registry = Counter((x.seqid, x.start, x.end) for x in bed)
            ies_id = 1
            for (seqid, start, end), count in registry.items():
                if count < opts.mindepth:
                    continue
                # The name carries the supporting read count after "-r".
                ies_name = "{0:05d}-r{1}".format(ies_id, count)
                fields = (seqid, start - 1, end, ies_name)
                print("\t".join(str(x) for x in fields), file=fw)
                ies_id += 1
        sort([countbedfile, "-i", sort_tmpdir])

    # Remove deletions that contain some read depth
    depthbedfile = pf + ".depth.bed"
    if need_update((sortedbedfile, countbedfile), depthbedfile):
        depth([
            sortedbedfile, countbedfile, "--outfile={0}".format(depthbedfile)
        ])

    validbedfile = pf + ".valid.bed"
    if need_update(depthbedfile, validbedfile):
        with open(validbedfile, "w") as fw:
            logging.debug("Filter valid deletions to `{0}`.".format(validbedfile))
            bed = Bed(depthbedfile)
            all_scores = [float(b.score) for b in bed]
            lb, ub = outlier_cutoff(all_scores)
            logging.debug(
                "Bounds for depths: LB={0:.2f} (ignored) UB={1:.2f}".format(
                    lb, ub))
            for b in bed:
                if float(b.score) > ub:
                    continue
                print(b, file=fw)

    # Remove deletions that contain sequencing gaps on its flanks
    selectedbedfile = pf + ".selected.bed"
    if need_update(validbedfile, selectedbedfile):
        flanksbedfile = pf + ".flanks.bed"
        bed = Bed(validbedfile)
        flank = 100
        with open(flanksbedfile, "w") as fw:
            logging.debug("Write deletion flanks to `{0}`.".format(flanksbedfile))
            for b in bed:
                start, end = b.start, b.end
                # Left flank, then right flank, each at most `flank` long
                # and never extending past the deletion itself.
                b.start, b.end = start, min(start + flank - 1, end)
                print(b, file=fw)
                b.start, b.end = max(start, end - flank + 1), end
                print(b, file=fw)

        intersectidsfile = pf + ".intersect.ids"
        cmd = "intersectBed -a {0} -b {1}".format(flanksbedfile, gapsbedfile)
        cmd += " | cut -f4 | sort -u"
        sh(cmd, outfile=intersectidsfile)
        some([
            validbedfile, intersectidsfile, "-v",
            "--outfile={0}".format(selectedbedfile)
        ])

    # Find best-scoring non-overlapping set
    iesbedfile = pf + ".ies.bed"
    if need_update(selectedbedfile, iesbedfile):
        bed = Bed(selectedbedfile)
        with open(iesbedfile, "w") as fw:
            logging.debug("Write IES to `{0}`.".format(iesbedfile))
            # Score of each candidate is the read count parsed back out of
            # its name (the text after the last "r").
            branges = [Range(x.seqid, x.start, x.end,
                             int(x.accn.rsplit("r")[-1]), i)
                       for i, x in enumerate(bed)]
            iranges, iscore = range_chain(branges)
            logging.debug("Best chain score: {0} ({1} IES)".
                          format(iscore, len(iranges)))
            for ies_id, (seqid, start, end, score, _id) in enumerate(iranges, 1):
                ies_name = "IES-{0:05d}-r{1}".format(ies_id, score)
                span = end - start + 1
                fields = (seqid, start - 1, end, ies_name, span)
                print("\t".join(str(x) for x in fields), file=fw)
def supermap(blast_file, filter="intersection", dialect="blast", clip=0):
    """
    Keep the lines of `blast_file` selected by range chaining.

    Hits are chained along the query axis, the reference axis, or both,
    depending on `filter`:

    - "query": lines kept by the query-axis chain
    - "ref": lines kept by the reference-axis chain
    - "intersection": lines kept by both chains
    - "union": lines kept by either chain

    `dialect` and `clip` are passed through to BlastOrCoordsLine. The kept
    lines are written to `<blast_file>[.<filter>].supermap` (no tag for
    "intersection"), the file is sorted with jcvi.formats.blast.sort, and its
    name is returned.

    Raises ValueError for an unknown `filter` and AssertionError when no line
    is selected.
    """
    # Fail early with a clear message; previously an unknown filter ran both
    # chains and then died with an UnboundLocalError.
    if filter not in ("ref", "query", "intersection", "union"):
        raise ValueError("unknown filter `{0}`".format(filter))

    # filter by query
    if filter != "ref":
        logging.debug("filter by query")
        ranges = list(
            BlastOrCoordsLine(blast_file,
                              filter="query",
                              dialect=dialect,
                              clip=clip))
        query_selected, _ = range_chain(ranges)
        query_idx = set(x.id for x in query_selected)

    # filter by ref
    if filter != "query":
        logging.debug("filter by ref")
        ranges = list(
            BlastOrCoordsLine(blast_file,
                              filter="ref",
                              dialect=dialect,
                              clip=clip))
        ref_selected, _ = range_chain(ranges)
        ref_idx = set(x.id for x in ref_selected)

    if filter == "ref":
        selected_idx = ref_idx
    elif filter == "query":
        selected_idx = query_idx
    elif filter == "intersection":
        logging.debug("perform intersection")
        selected_idx = ref_idx & query_idx
    else:  # "union"
        logging.debug("perform union")
        selected_idx = ref_idx | query_idx

    assert len(selected_idx) != 0

    # selected_idx is in fact the lineno in the BLAST file
    tag = "" if filter == "intersection" else "." + filter
    supermapfile = blast_file + tag + ".supermap"

    # Walk the file once, consuming the sorted line numbers in step with it.
    # Both handles are closed by the context manager; the original never
    # closed the input handle.
    selected_idx = iter(sorted(selected_idx))
    selected = next(selected_idx)
    with open(blast_file) as fp, open(supermapfile, "w") as fw:
        for i, row in enumerate(fp):
            if i < selected:
                continue
            print(row.rstrip(), file=fw)
            try:
                selected = next(selected_idx)
            except StopIteration:
                break

    logging.debug("Write output file to `{0}`".format(supermapfile))

    from jcvi.formats.blast import sort

    ofilter = "ref" if filter == "ref" else "query"
    args = [supermapfile, "--" + ofilter]
    if dialect == "coords":
        args += ["--coords"]

    sort(args)

    return supermapfile