def liftover(args):
    """
    %prog liftover agpfile bedfile

    Given coordinates in components, convert to the coordinates in chromosomes.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it must stay in the `%prog ...` form.
    #
    # args: list of command-line tokens (options plus `agpfile bedfile`).
    # Output: the lifted features are printed via `Bed.print_to_file`.
    # Exits through `sys.exit` when the positional argument count is wrong.
    p = OptionParser(liftover.__doc__)
    p.add_option("--prefix", default=False, action="store_true",
                 help="Prepend prefix to accn names [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None, so `not ...` turns a usage error into a
        # non-zero exit status, consistent with the other commands here.
        sys.exit(not p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile).order
    bed = Bed(bedfile)
    newbed = Bed()
    for b in bed:
        component = b.seqid
        if component not in agp:
            # Unknown component: the feature is passed through unchanged.
            newbed.append(b)
            continue

        # Only the AGP line is needed; the index stored with it is ignored.
        _, a = agp[component]

        assert a.component_beg < a.component_end
        arange = a.component_beg, a.component_end

        # NOTE(review): a feature with start == end fails this assertion;
        # confirm that single-base features cannot occur in the input.
        assert b.start < b.end
        brange = b.start, b.end

        # Restrict the feature to the part of the component used in the AGP;
        # features with no overlap are dropped.
        st = range_intersect(arange, brange)
        if not st:
            continue
        start, end = st
        assert start <= end

        if a.orientation == '-':
            # Reverse-oriented component: positions are mirrored, so the
            # feature end maps to the lifted start.
            d = a.object_end + a.component_beg
            s, t = d - end, d - start
        else:
            # Forward orientation: shift by the object/component offset.
            d = a.object_beg - a.component_beg
            s, t = d + start, d + end

        name = b.accn.replace(" ", "_")
        if opts.prefix:
            name = component + "_" + name
        # The start is written as `s - 1`; presumably this converts back to
        # a 0-based BED start -- confirm against BedLine.
        bline = "\t".join(str(x) for x in (a.object, s - 1, t, name))
        newbed.append(BedLine(bline))

    newbed.print_to_file(sorted=True)
def liftover(args):
    """
    %prog liftover agpfile bedfile

    Given coordinates in components, convert to the coordinates in chromosomes.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it must stay in the `%prog ...` form.
    #
    # args: list of command-line tokens (options plus `agpfile bedfile`).
    # Output: the lifted features are sorted with `Bed.nullkey` and printed
    # via `Bed.print_to_file`.
    # Exits through `sys.exit` when the positional argument count is wrong.
    p = OptionParser(liftover.__doc__)
    p.add_option("--prefix", default=False, action="store_true",
                 help="Prepend prefix to accn names [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None, so `not ...` turns a usage error into a
        # non-zero exit status, consistent with the other commands here.
        sys.exit(not p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile).order
    bed = Bed(bedfile)
    newbed = Bed()
    for b in bed:
        component = b.seqid
        if component not in agp:
            # Unknown component: the feature is passed through unchanged.
            newbed.append(b)
            continue

        # Only the AGP line is needed; the index stored with it is ignored.
        _, a = agp[component]

        assert a.component_beg < a.component_end
        arange = a.component_beg, a.component_end

        # NOTE(review): a feature with start == end fails this assertion;
        # confirm that single-base features cannot occur in the input.
        assert b.start < b.end
        brange = b.start, b.end

        # Restrict the feature to the part of the component used in the AGP;
        # features with no overlap are dropped.
        st = range_intersect(arange, brange)
        if not st:
            continue
        start, end = st
        assert start <= end

        if a.orientation == '-':
            # Reverse-oriented component: positions are mirrored, so the
            # feature end maps to the lifted start.
            d = a.object_end + a.component_beg
            s, t = d - end, d - start
        else:
            # Forward orientation: shift by the object/component offset.
            d = a.object_beg - a.component_beg
            s, t = d + start, d + end

        name = b.accn.replace(" ", "_")
        if opts.prefix:
            name = component + "_" + name
        # The start is written as `s - 1`; presumably this converts back to
        # a 0-based BED start -- confirm against BedLine.
        bline = "\t".join(str(x) for x in (a.object, s - 1, t, name))
        newbed.append(BedLine(bline))

    newbed.sort(key=newbed.nullkey)
    newbed.print_to_file()
def refine(args):
    """
    %prog refine bedfile1 bedfile2 refinedbed

    Refine bed file using a second bed file. The final bed is keeping all the
    intervals in bedfile1, but refined by bedfile2 whenever they have
    intersection.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    #
    # args: list of command-line tokens (`bedfile1 bedfile2 refinedbed`).
    # Side effects: writes `refinedbed`, reports counts on stderr, then runs
    # `summary` on the input and on the refined file.
    p = OptionParser(refine.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    abedfile, bbedfile, refinedbed = args
    intersected = refined = 0

    # The context manager closes the output even if a record fails midway.
    with open(refinedbed, "w") as fw:
        for a, b in intersectBed_wao(abedfile, bbedfile):
            if b is None:
                # No partner from bedfile2: keep the interval untouched.
                fw.write(str(a) + "\n")
                continue

            intersected += 1
            aspan_before = a.span
            # Shrink `a` to its intersection with `b`.
            a.start, a.end = range_intersect((a.start, a.end),
                                             (b.start, b.end))
            if aspan_before > a.span:
                refined += 1

            fw.write(str(a) + "\n")

    # Plain writes emit the same text as the old `print >> sys.stderr`
    # statements and work on both Python 2 and Python 3.
    sys.stderr.write("Total intersected: {0}\n".format(intersected))
    sys.stderr.write("Total refined: {0}\n".format(refined))
    summary([abedfile])
    summary([refinedbed])
def connect(args):
    """
    %prog connect assembly.fasta read_mapping.blast

    Connect contigs using long reads.
    """
    # The docstring is handed to OptionParser as the usage message.
    parser = OptionParser(connect.__doc__)
    parser.add_option(
        "--clip", default=2000, type="int", help="Only consider end of contigs"
    )
    opts, args = parser.parse_args(args)
    if len(args) != 2:
        sys.exit(not parser.print_help())

    fastafile, blastfile = args
    clip = opts.clip
    sizes = Sizes(fastafile).mapping

    def touches_contig_end(hit):
        # A hit is discarded only when it starts after the first `clip`
        # bases AND stops before the last `clip` bases of its subject.
        size = sizes[hit.subject]
        head_limit = min(size, clip)
        tail_limit = max(0, size - clip)
        return hit.sstart <= head_limit or hit.sstop >= tail_limit

    def by_query(hit):
        return hit.query

    def by_qstart(hit):
        return hit.qstart

    def arrow(hit):
        # ">" for "+"-oriented hits, "<" for everything else.
        return ">" if hit.orientation == "+" else "<"

    blasts = [hit for hit in Blast(blastfile) if touches_contig_end(hit)]
    blasts.sort(key=by_query)

    graph = BiGraph()
    for _, group in groupby(blasts, key=by_query):
        hits = sorted(group, key=by_qstart)
        # A read hitting a single contig cannot link anything.
        if len({hit.subject for hit in hits}) == 1:
            continue
        print("\n".join(str(hit) for hit in hits))

        for left, right in pairwise(hits):
            if left.subject == right.subject:
                continue

            left_span = left.qstart, left.qstop
            right_span = right.qstart, right.qstop
            left_len = left_span[1] - left_span[0] + 1
            right_len = right_span[1] - right_span[0] + 1

            # Turn the intersection (if any) into an overlap length.
            overlap = range_intersect(left_span, right_span)
            if overlap:
                ov_start, ov_stop = overlap
                overlap = ov_stop - ov_start + 1

            print(overlap, left_len, right_len)
            # Reject the pair when the overlap exceeds half of either hit.
            if overlap and (overlap > left_len / 2 or overlap > right_len / 2):
                print("Too much overlap ({0})".format(overlap))
                continue

            graph.add_edge(left.subject, right.subject,
                           arrow(left), arrow(right))

    graph_to_agp(graph, blastfile, fastafile, verbose=False)
def mask(args):
    """
    %prog mask agpfile bedfile

    Mask given ranges in components to gaps.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    #
    # args: list of command-line tokens (options plus `agpfile bedfile`).
    # Returns: the path of the masked AGP file that was written.
    # Exits through `sys.exit` when the positional argument count is wrong.
    p = OptionParser(mask.__doc__)
    p.add_option("--split", default=False, action="store_true",
                 help="Split object and create new names [default: %default]")
    p.add_option("--log", default=False, action="store_true",
                 help="Write verbose logs to .masklog file [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None; `not ...` yields a non-zero exit status.
        sys.exit(not p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile)
    bed = Bed(bedfile)
    simple_agp = agp.order
    # agp lines to replace original ones, keyed by the component
    agp_fixes = defaultdict(list)

    newagpfile = agpfile.replace(".agp", ".masked.agp")
    logfile = bedfile.replace(".bed", ".masklog")
    fwlog = open(logfile, "w") if opts.log else None

    try:
        for component, intervals in bed.sub_beds():
            if fwlog is not None:
                fwlog.write("\n".join(str(x) for x in intervals) + "\n")

            _, agpline = simple_agp[component]
            obj = agpline.object
            component_span = agpline.component_span
            orientation = agpline.orientation
            if fwlog is not None:
                fwlog.write(str(agpline) + "\n")

            # The previous `assert beg, end` only tested that `beg` was
            # truthy (the second operand is the assertion message); check
            # the ordering of the range instead.
            assert agpline.component_beg <= agpline.component_end
            arange = agpline.component_beg, agpline.component_end

            # Clip every interval to `arange`. The alternating endpoint
            # logic below assumes the clipped ranges are DISJOINT.
            ivs = []
            for interval in intervals:
                iv = range_intersect(arange, (interval.start, interval.end))
                if iv is not None:
                    ivs.append(iv)

            # Sort the ends of `ivs` together with the component range
            # padded by one on each side, so that consecutive endpoint pairs
            # alternate between kept sequence (even) and masked range (odd).
            arange = agpline.component_beg - 1, agpline.component_end + 1
            endpoints = sorted(flatten(ivs + [arange]))

            # Walk the endpoints backwards for reverse-oriented components.
            reverse = orientation == '-'
            if reverse:
                endpoints.reverse()
            # Any orientation other than '+' or '-' is written out as '+'.
            if orientation not in ('+', '-'):
                orientation = '+'

            sum_of_spans = 0
            # assign complements as sequence components
            for idx, (lo, hi) in enumerate(pairwise(endpoints)):
                if reverse:
                    lo, hi = hi, lo

                # Floor division keeps integer suffixes (_0, _1, ...) on
                # both Python 2 and Python 3.
                oid = obj + "_{0}".format(idx // 2) if opts.split else obj
                aline = [oid, 0, 0, 0]
                if idx % 2 == 0:
                    cspan = hi - lo - 1
                    aline += ['D', component, lo + 1, hi - 1, orientation]
                    is_gap = False
                else:
                    cspan = hi - lo + 1
                    aline += ["N", cspan, "fragment", "yes"]
                    is_gap = True
                if cspan <= 0:
                    continue

                sum_of_spans += cspan

                aline = "\t".join(str(x) for x in aline)
                # With --split the gap lines are dropped from the output.
                if not (opts.split and is_gap):
                    agp_fixes[component].append(aline)

                if fwlog is not None:
                    fwlog.write(aline + "\n")

            # Sequence pieces plus gaps must add up to the original span.
            assert component_span == sum_of_spans
            if fwlog is not None:
                fwlog.write("\n")
    finally:
        # The log handle was previously never closed.
        if fwlog is not None:
            fwlog.close()

    # Finally write the masked agp
    with open(newagpfile, "w") as fw:
        for a in agp:
            if not a.is_gap and a.component_id in agp_fixes:
                fw.write("\n".join(agp_fixes[a.component_id]) + "\n")
            else:
                fw.write(str(a) + "\n")

    # Reindex
    idxagpfile = reindex([newagpfile])
    shutil.move(idxagpfile, newagpfile)

    return newagpfile
def connect(args):
    """
    %prog connect assembly.fasta read_mapping.blast

    Connect contigs using long reads.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    #
    # args: list of command-line tokens (options plus the FASTA and BLAST
    # file names). Progress is printed to stdout and the resulting graph is
    # handed to `graph_to_agp`.
    from jcvi.formats.sizes import Sizes
    from jcvi.formats.blast import Blast
    from jcvi.utils.iter import pairwise
    from jcvi.utils.range import range_intersect
    from jcvi.algorithms.graph import BiGraph, BiEdge
    from jcvi.assembly.syntenypath import graph_to_agp

    p = OptionParser(connect.__doc__)
    p.add_option("--clip", default=2000, type="int",
                 help="Only consider end of contigs [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, blastfile = args
    clip = opts.clip

    sizes = Sizes(fastafile).mapping
    blast = Blast(blastfile)
    blasts = []
    for b in blast:
        seqid = b.subject
        size = sizes[seqid]
        start, end = b.sstart, b.sstop
        cstart, cend = min(size, clip), max(0, size - clip)
        # Skip hits that start after the first `clip` bases and stop before
        # the last `clip` bases of the subject.
        if start > cstart and end < cend:
            continue
        blasts.append(b)

    def by_query(x):
        return x.query

    # groupby() needs its input sorted by the same key.
    blasts.sort(key=by_query)
    g = BiGraph()
    for query, bb in groupby(blasts, key=by_query):
        bb = sorted(bb, key=lambda x: x.qstart)
        nsubjects = len(set(x.subject for x in bb))
        if nsubjects == 1:
            continue
        # Every print below passes a single pre-built string, so the output
        # is identical under the Python 2 print statement and the Python 3
        # print function.
        print("\n".join(str(x) for x in bb))
        for a, b in pairwise(bb):
            astart, astop = a.qstart, a.qstop
            bstart, bstop = b.qstart, b.qstop
            if a.subject == b.subject:
                continue

            arange = astart, astop
            brange = bstart, bstop
            ov = range_intersect(arange, brange)
            alen = astop - astart + 1
            blen = bstop - bstart + 1
            if ov:
                # Replace the intersecting range by its length.
                ostart, ostop = ov
                ov = ostop - ostart + 1

            print(" ".join(str(x) for x in (ov, alen, blen)))
            # Reject the pair when the overlap exceeds half of either hit.
            if ov and (ov > alen / 2 or ov > blen / 2):
                print("Too much overlap ({0})".format(ov))
                continue

            asub = a.subject
            bsub = b.subject
            atag = ">" if a.orientation == "+" else "<"
            btag = ">" if b.orientation == "+" else "<"
            e = BiEdge(asub, bsub, atag, btag)
            g.add_edge(e)
            print("=" * 5 + " " + str(e))

    graph_to_agp(g, blastfile, fastafile, verbose=False)
def mask(args):
    """
    %prog mask agpfile bedfile

    Mask given ranges in components to gaps.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    #
    # args: list of command-line tokens (options plus `agpfile bedfile`).
    # Returns: the path of the masked AGP file that was written.
    # Exits through `sys.exit` when the positional argument count is wrong.
    p = OptionParser(mask.__doc__)
    p.add_option("--split", default=False, action="store_true",
                 help="Split object and create new names [default: %default]")
    p.add_option("--log", default=False, action="store_true",
                 help="Write verbose logs to .masklog file [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None; `not ...` yields a non-zero exit status.
        sys.exit(not p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile)
    bed = Bed(bedfile)
    simple_agp = agp.order
    # agp lines to replace original ones, keyed by the component
    agp_fixes = defaultdict(list)

    newagpfile = agpfile.replace(".agp", ".masked.agp")
    logfile = bedfile.replace(".bed", ".masklog")
    fwlog = open(logfile, "w") if opts.log else None

    try:
        for component, intervals in bed.sub_beds():
            if fwlog is not None:
                fwlog.write("\n".join(str(x) for x in intervals) + "\n")

            _, agpline = simple_agp[component]
            obj = agpline.object
            component_span = agpline.component_span
            orientation = agpline.orientation
            if fwlog is not None:
                fwlog.write(str(agpline) + "\n")

            # The previous `assert beg, end` only tested that `beg` was
            # truthy (the second operand is the assertion message); check
            # the ordering of the range instead.
            assert agpline.component_beg <= agpline.component_end
            arange = agpline.component_beg, agpline.component_end

            # Clip every interval to `arange`. The alternating endpoint
            # logic below assumes the clipped ranges are DISJOINT.
            ivs = []
            for interval in intervals:
                iv = range_intersect(arange, (interval.start, interval.end))
                if iv is not None:
                    ivs.append(iv)

            # Sort the ends of `ivs` together with the component range
            # padded by one on each side, so that consecutive endpoint pairs
            # alternate between kept sequence (even) and masked range (odd).
            arange = agpline.component_beg - 1, agpline.component_end + 1
            endpoints = sorted(flatten(ivs + [arange]))

            # Walk the endpoints backwards for reverse-oriented components.
            reverse = orientation == '-'
            if reverse:
                endpoints.reverse()
            # Any orientation other than '+' or '-' is written out as '+'.
            if orientation not in ('+', '-'):
                orientation = '+'

            sum_of_spans = 0
            # assign complements as sequence components
            for idx, (lo, hi) in enumerate(pairwise(endpoints)):
                if reverse:
                    lo, hi = hi, lo

                # Floor division keeps integer suffixes (_0, _1, ...) on
                # both Python 2 and Python 3.
                oid = obj + "_{0}".format(idx // 2) if opts.split else obj
                aline = [oid, 0, 0, 0]
                if idx % 2 == 0:
                    cspan = hi - lo - 1
                    aline += ['D', component, lo + 1, hi - 1, orientation]
                    is_gap = False
                else:
                    cspan = hi - lo + 1
                    aline += ["N", cspan, "fragment", "yes"]
                    is_gap = True
                if cspan <= 0:
                    continue

                sum_of_spans += cspan

                aline = "\t".join(str(x) for x in aline)
                # With --split the gap lines are dropped from the output.
                if not (opts.split and is_gap):
                    agp_fixes[component].append(aline)

                if fwlog is not None:
                    fwlog.write(aline + "\n")

            # The span-conservation check is left disabled, as before:
            #assert component_span == sum_of_spans
            if fwlog is not None:
                fwlog.write("\n")
    finally:
        # The log handle was previously never closed.
        if fwlog is not None:
            fwlog.close()

    # Finally write the masked agp
    with open(newagpfile, "w") as fw:
        for a in agp:
            if not a.is_gap and a.component_id in agp_fixes:
                fw.write("\n".join(agp_fixes[a.component_id]) + "\n")
            else:
                fw.write(str(a) + "\n")

    # Reindex
    idxagpfile = reindex([newagpfile])
    shutil.move(idxagpfile, newagpfile)

    return newagpfile
def test_range_intersect(a, b, expected):
    """Check that ``range_intersect(a, b)`` equals ``expected``."""
    # Imported inside the test, as in the original.
    from jcvi.utils.range import range_intersect

    observed = range_intersect(a, b)
    assert observed == expected