def bed(args): """ %prog bed anchorsfile Convert ANCHORS file to BED format. """ from collections import defaultdict from jcvi.compara.synteny import AnchorFile, check_beds from jcvi.formats.bed import Bed, BedLine from jcvi.formats.base import get_number p = OptionParser(bed.__doc__) p.add_option("--switch", default=False, action="store_true", help="Switch reference and aligned map elements") p.add_option("--scale", type="float", help="Scale the aligned map distance by factor") p.set_beds() p.set_outfile() opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) anchorsfile, = args switch = opts.switch scale = opts.scale ac = AnchorFile(anchorsfile) pairs = defaultdict(list) for a, b, block_id in ac.iter_pairs(): pairs[a].append(b) qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts) bd = Bed() for q in qbed: qseqid, qstart, qend, qaccn = q.seqid, q.start, q.end, q.accn if qaccn not in pairs: continue for s in pairs[qaccn]: si, s = sorder[s] sseqid, sstart, send, saccn = s.seqid, s.start, s.end, s.accn if switch: qseqid, sseqid = sseqid, qseqid qstart, sstart = sstart, qstart qend, send = send, qend qaccn, saccn = saccn, qaccn if scale: sstart /= scale try: newsseqid = get_number(sseqid) except ValueError: raise ValueError, "`{0}` is on `{1}` with no number to extract".\ format(saccn, sseqid) bedline = "\t".join(str(x) for x in (qseqid, qstart - 1, qend, "{0}:{1}".format(newsseqid, sstart))) bd.append(BedLine(bedline)) bd.print_to_file(filename=opts.outfile, sorted=True)
def bed(args): """ %prog bed anchorsfile Convert ANCHORS file to BED format. """ from collections import defaultdict from jcvi.compara.synteny import AnchorFile, check_beds from jcvi.formats.bed import Bed, BedLine from jcvi.formats.base import get_number p = OptionParser(bed.__doc__) p.add_option("--switch", default=False, action="store_true", help="Switch reference and aligned map elements") p.add_option("--scale", type="float", help="Scale the aligned map distance by factor") p.set_beds() p.set_outfile() opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) anchorsfile, = args switch = opts.switch scale = opts.scale ac = AnchorFile(anchorsfile) pairs = defaultdict(list) for a, b, block_id in ac.iter_pairs(): pairs[a].append(b) qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts) bd = Bed() for q in qbed: qseqid, qstart, qend, qaccn = q.seqid, q.start, q.end, q.accn if qaccn not in pairs: continue for s in pairs[qaccn]: si, s = sorder[s] sseqid, sstart, send, saccn = s.seqid, s.start, s.end, s.accn if switch: qseqid, sseqid = sseqid, qseqid qstart, sstart = sstart, qstart qend, send = send, qend qaccn, saccn = saccn, qaccn if scale: sstart /= scale bedline = "\t".join( str(x) for x in (qseqid, qstart - 1, qend, "{0}:{1}".format(get_number(sseqid), sstart))) bd.append(BedLine(bedline)) bd.print_to_file(filename=opts.outfile, sorted=True)
def liftover(args): """ %prog liftover agpfile bedfile Given coordinates in components, convert to the coordinates in chromosomes. """ p = OptionParser(liftover.__doc__) p.add_option("--prefix", default=False, action="store_true", help="Prepend prefix to accn names [default: %default]") opts, args = p.parse_args(args) if len(args) != 2: sys.exit(p.print_help()) agpfile, bedfile = args agp = AGP(agpfile).order bed = Bed(bedfile) newbed = Bed() for b in bed: component = b.seqid if component not in agp: newbed.append(b) continue i, a = agp[component] assert a.component_beg < a.component_end arange = a.component_beg, a.component_end assert b.start < b.end brange = b.start, b.end st = range_intersect(arange, brange) if not st: continue start, end = st assert start <= end if a.orientation == '-': d = a.object_end + a.component_beg s, t = d - end, d - start else: d = a.object_beg - a.component_beg s, t = d + start, d + end name = b.accn.replace(" ", "_") if opts.prefix: name = component + "_" + name bline = "\t".join(str(x) for x in (a.object, s - 1, t, name)) newbed.append(BedLine(bline)) newbed.print_to_file(sorted=True)
def liftover(args): """ %prog liftover agpfile bedfile Given coordinates in components, convert to the coordinates in chromosomes. """ p = OptionParser(liftover.__doc__) p.add_option("--prefix", default=False, action="store_true", help="Prepend prefix to accn names [default: %default]") opts, args = p.parse_args(args) if len(args) != 2: sys.exit(p.print_help()) agpfile, bedfile = args agp = AGP(agpfile).order bed = Bed(bedfile) newbed = Bed() for b in bed: component = b.seqid if component not in agp: newbed.append(b) continue i, a = agp[component] assert a.component_beg < a.component_end arange = a.component_beg, a.component_end assert b.start < b.end brange = b.start, b.end st = range_intersect(arange, brange) if not st: continue start, end = st assert start <= end if a.orientation == '-': d = a.object_end + a.component_beg s, t = d - end, d - start else: d = a.object_beg - a.component_beg s, t = d + start, d + end name = b.accn.replace(" ", "_") if opts.prefix: name = component + "_" + name bline = "\t".join(str(x) for x in (a.object, s - 1, t, name)) newbed.append(BedLine(bline)) newbed.sort(key=newbed.nullkey) newbed.print_to_file()
def merge(args): """ %prog merge map1 map2 map3 ... Convert csv maps to bed format. Each input map is csv formatted, for example: ScaffoldID,ScaffoldPosition,LinkageGroup,GeneticPosition scaffold_2707,11508,1,0 scaffold_2707,11525,1,1.2 scaffold_759,81336,1,9.7 """ p = OptionParser(merge.__doc__) p.add_option("-w", "--weightsfile", default="weights.txt", help="Write weights to file") p.set_outfile("out.bed") opts, args = p.parse_args(args) if len(args) < 1: sys.exit(not p.print_help()) maps = args outfile = opts.outfile fp = must_open(maps) b = Bed() mapnames = set() for row in fp: mapname = fp.filename().split(".")[0] mapnames.add(mapname) try: m = CSVMapLine(row, mapname=mapname) if m.cm < 0: logging.error("Ignore marker with negative genetic distance") print >> sys.stderr, row.strip() else: b.append(BedLine(m.bedline)) except (IndexError, ValueError): # header or mal-formed line continue b.print_to_file(filename=outfile, sorted=True) logging.debug("A total of {0} markers written to `{1}`.".\ format(len(b), outfile)) assert len(maps) == len(mapnames), "You have a collision in map names" write_weightsfile(mapnames, weightsfile=opts.weightsfile)
def bed(args): ''' %prog bed gff_file [--options] Parses the start, stop locations of the selected features out of GFF and generate a bed file ''' p = OptionParser(bed.__doc__) p.add_option( "--type", dest="type", default="gene", help= "Feature type to extract, use comma for multiple [default: %default]") p.add_option("--key", dest="key", default="ID", help="Key in the attributes to extract [default: %default]") set_outfile(p) opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) gffile, = args key = opts.key if key == "None": key = None type = set(x.strip() for x in opts.type.split(",")) gff = Gff(gffile, key=key) b = Bed() for g in gff: if g.type not in type: continue b.append(g.bedline) b.sort(key=b.key) b.print_to_file(opts.outfile)
def mergebed(args): """ %prog mergebed map1.bed map2.bed map3.bed ... Combine bed maps to bed format, adding the map name. """ p = OptionParser(mergebed.__doc__) p.add_option("-w", "--weightsfile", default="weights.txt", help="Write weights to file") p.set_outfile("out.bed") opts, args = p.parse_args(args) if len(args) < 1: sys.exit(not p.print_help()) maps = args outfile = opts.outfile fp = must_open(maps) b = Bed() mapnames = set() for row in fp: mapname = fp.filename().split(".")[0] mapnames.add(mapname) try: m = BedLine(row) m.accn = "{0}-{1}".format(mapname, m.accn) m.extra = ["{0}:{1}".format(m.seqid, m.start)] b.append(m) except (IndexError, ValueError): # header or mal-formed line continue b.print_to_file(filename=outfile, sorted=True) logging.debug("A total of {0} markers written to `{1}`.".\ format(len(b), outfile)) assert len(maps) == len(mapnames), "You have a collision in map names" write_weightsfile(mapnames, weightsfile=opts.weightsfile)
def bed(args): ''' %prog bed gff_file [--options] Parses the start, stop locations of the selected features out of GFF and generate a bed file ''' p = OptionParser(bed.__doc__) p.add_option("--type", dest="type", default="gene", help="Feature type to extract, use comma for multiple [default: %default]") p.add_option("--key", dest="key", default="ID", help="Key in the attributes to extract [default: %default]") set_outfile(p) opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) gffile, = args key = opts.key if key == "None": key = None type = set(x.strip() for x in opts.type.split(",")) gff = Gff(gffile, key=key) b = Bed() for g in gff: if g.type not in type: continue b.append(g.bedline) b.sort(key=b.key) b.print_to_file(opts.outfile)
def simple(args): """ %prog simple anchorfile --qbed=qbedfile --sbed=sbedfile [options] Write the block ends for each block in the anchorfile. GeneA1 GeneA2 GeneB1 GeneB2 +/- score Optional additional columns: orderA1 orderA2 orderB1 orderB2 sizeA sizeB size block_id With base coordinates (--coords): block_id seqidA startA endA bpSpanA GeneA1 GeneA2 geneSpanA block_id seqidB startB endB bpSpanB GeneB1 GeneB2 geneSpanB """ p = OptionParser(simple.__doc__) p.add_option("--rich", default=False, action="store_true", \ help="Output additional columns [default: %default]") p.add_option( "--coords", default=False, action="store_true", help="Output columns with base coordinates [default: %default]") p.add_option("--bed", default=False, action="store_true", help="Generate BED file for the blocks") p.add_option("--noheader", default=False, action="store_true", help="Don't output header [default: %default]") p.set_beds() opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) anchorfile, = args additional = opts.rich coords = opts.coords header = not opts.noheader bed = opts.bed if bed: coords = True bbed = Bed() ac = AnchorFile(anchorfile) simplefile = anchorfile.rsplit(".", 1)[0] + ".simple" qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts) pf = "-".join(anchorfile.split(".", 2)[:2]) blocks = ac.blocks if coords: h = "Block|Chr|Start|End|Span|StartGene|EndGene|GeneSpan|Orientation" else: h = "StartGeneA|EndGeneA|StartGeneB|EndGeneB|Orientation|Score" if additional: h += "|StartOrderA|EndOrderA|StartOrderB|EndOrderB|"\ "SizeA|SizeB|Size|Block" fws = open(simplefile, "w") if header: print >> fws, "\t".join(h.split("|")) atotalbase = btotalbase = 0 for i, block in enumerate(blocks): a, b, scores = zip(*block) a = [qorder[x] for x in a] b = [sorder[x] for x in b] ia, oa = zip(*a) ib, ob = zip(*b) astarti, aendi = min(ia), max(ia) bstarti, bendi = min(ib), max(ib) astart, aend = min(a)[1].accn, max(a)[1].accn bstart, bend = min(b)[1].accn, max(b)[1].accn sizeA = len(set(ia)) sizeB = len(set(ib)) size = len(block) slope, intercept = np.polyfit(ia, ib, 1) orientation = "+" if slope >= 0 else '-' aspan = aendi - astarti + 1 bspan = bendi - bstarti + 1 score = int((aspan * bspan)**.5) score = str(score) block_id = pf + "-block-{0}".format(i) if coords: aseqid, astartbase, aendbase = \ get_boundary_bases(astart, aend, qorder) bseqid, bstartbase, bendbase = \ get_boundary_bases(bstart, bend, sorder) abase = aendbase - astartbase + 1 bbase = bendbase - bstartbase + 1 atotalbase += abase btotalbase += bbase # Write dual lines aargs = [ block_id, aseqid, astartbase, aendbase, abase, astart, aend, aspan, "+" ] bargs = [ block_id, bseqid, bstartbase, bendbase, bbase, bstart, bend, bspan, orientation ] if bed: bbed.append(BedLine("\t".join(str(x) for x in \ (bseqid, bstartbase - 1, bendbase, "{}:{}-{}".format(aseqid, astartbase, aendbase), size, orientation)))) for args in (aargs, bargs): print >> fws, "\t".join(str(x) for x in args) continue args = [astart, aend, bstart, bend, score, orientation] if additional: args += [ astarti, aendi, bstarti, bendi, sizeA, sizeB, size, block_id ] print >> fws, "\t".join(str(x) for x in args) fws.close() logging.debug("A total of {0} blocks written to `{1}`.".format( i + 1, simplefile)) if coords: print >> sys.stderr, "Total block span in {0}: {1}".format(qbed.filename, \ human_size(atotalbase, precision=2)) print >> sys.stderr, "Total block span in {0}: {1}".format(sbed.filename, \ human_size(btotalbase, precision=2)) print >> sys.stderr, "Ratio: {0:.1f}x".format(\ max(atotalbase, btotalbase) * 1. / min(atotalbase, btotalbase)) if bed: bedfile = simplefile + ".bed" bbed.print_to_file(filename=bedfile, sorted=True) logging.debug("Bed file written to `{}`".format(bedfile))
def refine(args): """ %prog refine breakpoints.bed gaps.bed Find gaps within or near breakpoint region. For breakpoint regions with no gaps, there are two options: - Break in the middle of the region - Break at the closest gap (--closest) """ p = OptionParser(refine.__doc__) p.add_option( "--closest", default=False, action="store_true", help="In case of no gaps, use closest", ) opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) breakpointsbed, gapsbed = args ncols = len(open(breakpointsbed).next().split()) logging.debug("File {0} contains {1} columns.".format(breakpointsbed, ncols)) cmd = "intersectBed -wao -a {0} -b {1}".format(breakpointsbed, gapsbed) pf = "{0}.{1}".format(breakpointsbed.split(".")[0], gapsbed.split(".")[0]) ingapsbed = pf + ".bed" sh(cmd, outfile=ingapsbed) fp = open(ingapsbed) data = [x.split() for x in fp] nogapsbed = pf + ".nogaps.bed" largestgapsbed = pf + ".largestgaps.bed" nogapsfw = open(nogapsbed, "w") largestgapsfw = open(largestgapsbed, "w") for b, gaps in groupby(data, key=lambda x: x[:ncols]): gaps = list(gaps) gap = gaps[0] if len(gaps) == 1 and gap[-1] == "0": assert gap[-3] == "." print("\t".join(b), file=nogapsfw) continue gaps = [(int(x[-1]), x) for x in gaps] maxgap = max(gaps)[1] print("\t".join(maxgap), file=largestgapsfw) nogapsfw.close() largestgapsfw.close() beds = [largestgapsbed] toclean = [nogapsbed, largestgapsbed] if opts.closest: closestgapsbed = pf + ".closestgaps.bed" cmd = "closestBed -a {0} -b {1} -d".format(nogapsbed, gapsbed) sh(cmd, outfile=closestgapsbed) beds += [closestgapsbed] toclean += [closestgapsbed] else: pointbed = pf + ".point.bed" pbed = Bed() bed = Bed(nogapsbed) for b in bed: pos = (b.start + b.end) / 2 b.start, b.end = pos, pos pbed.append(b) pbed.print_to_file(pointbed) beds += [pointbed] toclean += [pointbed] refinedbed = pf + ".refined.bed" FileMerger(beds, outfile=refinedbed).merge() # Clean-up FileShredder(toclean) return refinedbed
def rename(args): """ %prog rename genes.bed [gaps.bed] Rename genes for annotation release. For genes on chromosomes (e.g. the 12th gene on C1): Bo1g00120 For genes on scaffolds (e.g. the 12th gene on unplaced Scaffold00285): Bo00285s120 The genes identifiers will increment by 10. So assuming no gap, these are the consecutive genes: Bo1g00120, Bo1g00130, Bo1g00140... Bo00285s120, Bo00285s130, Bo00285s140... When we encounter gaps, we would like the increment to be larger. For example, Bo1g00120, <gap>, Bo1g01120... Gaps bed file is optional. """ import string p = OptionParser(rename.__doc__) p.add_option("-a", dest="gene_increment", default=10, type="int", help="Increment for continuous genes [default: %default]") p.add_option("-b", dest="gap_increment", default=1000, type="int", help="Increment for gaps [default: %default]") p.add_option("--pad0", default=6, type="int", help="Pad gene identifiers with 0 [default: %default]") p.add_option( "--spad0", default=4, type="int", help="Pad gene identifiers on small scaffolds [default: %default]") p.add_option("--prefix", default="Bo", help="Genome prefix [default: %default]") p.add_option("--jgi", default=False, action="store_true", help="Create JGI style identifier PREFIX.NN[G|TE]NNNNN.1" + \ " [default: %default]") opts, args = p.parse_args(args) if len(args) not in (1, 2): sys.exit(not p.print_help()) genebed = args[0] gapbed = args[1] if len(args) == 2 else None prefix = opts.prefix gene_increment = opts.gene_increment gap_increment = opts.gap_increment genes = Bed(genebed) if gapbed: fp = open(gapbed) for row in fp: genes.append(BedLine(row)) genes.sort(key=genes.key) idsfile = prefix + ".ids" newbedfile = prefix + ".bed" gap_increment -= gene_increment assert gap_increment >= 0 if opts.jgi: prefix += "." fw = open(idsfile, "w") for chr, lines in groupby(genes, key=lambda x: x.seqid): lines = list(lines) pad0 = opts.pad0 if len(lines) > 1000 else opts.spad0 isChr = chr[0].upper() == 'C' digits = "".join(x for x in chr if x in string.digits) gs = "g" if isChr else "s" pp = prefix + digits + gs idx = 0 if isChr: idx += gap_increment for r in lines: isGap = r.strand not in ("+", "-") if isGap: idx += gap_increment continue else: idx += gene_increment accn = pp + "{0:0{1}d}".format(idx, pad0) oldaccn = r.accn print >> fw, "\t".join((oldaccn, accn)) r.accn = accn genes.print_to_file(newbedfile) logging.debug("Converted IDs written to `{0}`.".format(idsfile)) logging.debug("Converted bed written to `{0}`.".format(newbedfile))
def insert(args): """ %prog insert candidates.bed gaps.bed chrs.fasta unplaced.fasta Insert scaffolds into assembly. """ from jcvi.formats.agp import mask, bed from jcvi.formats.sizes import agp p = OptionParser(insert.__doc__) opts, args = p.parse_args(args) if len(args) != 4: sys.exit(not p.print_help()) candidates, gapsbed, chrfasta, unplacedfasta = args refinedbed = refine([candidates, gapsbed]) sizes = Sizes(unplacedfasta).mapping cbed = Bed(candidates) corder = cbed.order gbed = Bed(gapsbed) gorder = gbed.order gpbed = Bed() gappositions = {} # (chr, start, end) => gapid fp = open(refinedbed) gap_to_scf = defaultdict(list) seen = set() for row in fp: atoms = row.split() unplaced = atoms[3] strand = atoms[5] gapid = atoms[9] if gapid not in seen: seen.add(gapid) gi, gb = gorder[gapid] gpbed.append(gb) gappositions[(gb.seqid, gb.start, gb.end)] = gapid gap_to_scf[gapid].append((unplaced, strand)) gpbedfile = "candidate.gaps.bed" gpbed.print_to_file(gpbedfile, sorted=True) agpfile = agp([chrfasta]) maskedagpfile = mask([agpfile, gpbedfile]) maskedbedfile = maskedagpfile.rsplit(".", 1)[0] + ".bed" bed([maskedagpfile, "--outfile={0}".format(maskedbedfile)]) mbed = Bed(maskedbedfile) beds = [] for b in mbed: sid = b.seqid key = (sid, b.start, b.end) if key not in gappositions: beds.append(b) continue gapid = gappositions[key] scfs = gap_to_scf[gapid] # For scaffolds placed in the same gap, sort according to positions scfs.sort(key=lambda x: corder[x[0]][1].start + corder[x[0]][1].end) for scf, strand in scfs: size = sizes[scf] beds.append(BedLine("\t".join(str(x) for x in \ (scf, 0, size, sid, 1000, strand)))) finalbed = Bed() finalbed.extend(beds) finalbedfile = "final.bed" finalbed.print_to_file(finalbedfile) # Clean-up toclean = [gpbedfile, agpfile, maskedagpfile, maskedbedfile] FileShredder(toclean)
def insert(args): """ %prog insert candidates.bed gaps.bed chrs.fasta unplaced.fasta Insert scaffolds into assembly. """ from jcvi.formats.agp import mask, bed from jcvi.formats.sizes import agp p = OptionParser(insert.__doc__) opts, args = p.parse_args(args) if len(args) != 4: sys.exit(not p.print_help()) candidates, gapsbed, chrfasta, unplacedfasta = args refinedbed = refine([candidates, gapsbed]) sizes = Sizes(unplacedfasta).mapping cbed = Bed(candidates) corder = cbed.order gbed = Bed(gapsbed) gorder = gbed.order gpbed = Bed() gappositions = {} # (chr, start, end) => gapid fp = open(refinedbed) gap_to_scf = defaultdict(list) seen = set() for row in fp: atoms = row.split() if len(atoms) <= 6: continue unplaced = atoms[3] strand = atoms[5] gapid = atoms[9] if gapid not in seen: seen.add(gapid) gi, gb = gorder[gapid] gpbed.append(gb) gappositions[(gb.seqid, gb.start, gb.end)] = gapid gap_to_scf[gapid].append((unplaced, strand)) gpbedfile = "candidate.gaps.bed" gpbed.print_to_file(gpbedfile, sorted=True) agpfile = agp([chrfasta]) maskedagpfile = mask([agpfile, gpbedfile]) maskedbedfile = maskedagpfile.rsplit(".", 1)[0] + ".bed" bed([maskedagpfile, "--outfile={0}".format(maskedbedfile)]) mbed = Bed(maskedbedfile) finalbed = Bed() for b in mbed: sid = b.seqid key = (sid, b.start, b.end) if key not in gappositions: finalbed.add("{0}\n".format(b)) continue gapid = gappositions[key] scfs = gap_to_scf[gapid] # For scaffolds placed in the same gap, sort according to positions scfs.sort(key=lambda x: corder[x[0]][1].start + corder[x[0]][1].end) for scf, strand in scfs: size = sizes[scf] finalbed.add("\t".join(str(x) for x in (scf, 0, size, sid, 1000, strand))) finalbedfile = "final.bed" finalbed.print_to_file(finalbedfile) # Clean-up toclean = [gpbedfile, agpfile, maskedagpfile, maskedbedfile] FileShredder(toclean)
def movie(args): """ %prog movie test.tour test.clm ref.contigs.last Plot optimization history. """ p = OptionParser(movie.__doc__) p.add_option("--frames", default=500, type="int", help="Only plot every N frames") p.add_option("--engine", default="ffmpeg", choices=("ffmpeg", "gifsicle"), help="Movie engine, output MP4 or GIF") p.set_beds() opts, args, iopts = p.set_image_options(args, figsize="16x8", style="white", cmap="coolwarm", format="png", dpi=300) if len(args) != 3: sys.exit(not p.print_help()) tourfile, clmfile, lastfile = args tourfile = op.abspath(tourfile) clmfile = op.abspath(clmfile) lastfile = op.abspath(lastfile) cwd = os.getcwd() odir = op.basename(tourfile).rsplit(".", 1)[0] + "-movie" anchorsfile, qbedfile, contig_to_beds = \ prepare_synteny(tourfile, lastfile, odir, p, opts) args = [] for i, label, tour, tour_o in iter_tours(tourfile, frames=opts.frames): padi = "{:06d}".format(i) # Make sure the anchorsfile and bedfile has the serial number in, # otherwise parallelization may fail a, b = op.basename(anchorsfile).split(".", 1) ianchorsfile = a + "_" + padi + "." + b symlink(anchorsfile, ianchorsfile) # Make BED file with new order qb = Bed() for contig, o in zip(tour, tour_o): if contig not in contig_to_beds: continue bedlines = contig_to_beds[contig][:] if o == '-': bedlines.reverse() for x in bedlines: qb.append(x) a, b = op.basename(qbedfile).split(".", 1) ibedfile = a + "_" + padi + "." + b qb.print_to_file(ibedfile) # Plot dot plot, but do not sort contigs by name (otherwise losing # order) image_name = padi + "." + iopts.format tour = ",".join(tour) args.append([[tour, clmfile, ianchorsfile, "--outfile", image_name, "--label", label]]) Jobs(movieframe, args).run() os.chdir(cwd) make_movie(odir, odir, engine=opts.engine, format=iopts.format)
def rename(args): """ %prog rename genes.bed [gaps.bed] Rename genes for annotation release. For genes on chromosomes (e.g. the 12th gene on C1): Bo1g00120 For genes on scaffolds (e.g. the 12th gene on unplaced Scaffold00285): Bo00285s120 The genes identifiers will increment by 10. So assuming no gap, these are the consecutive genes: Bo1g00120, Bo1g00130, Bo1g00140... Bo00285s120, Bo00285s130, Bo00285s140... When we encounter gaps, we would like the increment to be larger. For example, Bo1g00120, <gap>, Bo1g01120... Gaps bed file is optional. """ import string p = OptionParser(rename.__doc__) p.add_option("-a", dest="gene_increment", default=10, type="int", help="Increment for continuous genes [default: %default]") p.add_option("-b", dest="gap_increment", default=1000, type="int", help="Increment for gaps [default: %default]") p.add_option("--pad0", default=6, type="int", help="Pad gene identifiers with 0 [default: %default]") p.add_option("--spad0", default=4, type="int", help="Pad gene identifiers on small scaffolds [default: %default]") p.add_option("--prefix", default="Bo", help="Genome prefix [default: %default]") p.add_option("--jgi", default=False, action="store_true", help="Create JGI style identifier PREFIX.NN[G|TE]NNNNN.1" + \ " [default: %default]") opts, args = p.parse_args(args) if len(args) not in (1, 2): sys.exit(not p.print_help()) genebed = args[0] gapbed = args[1] if len(args) == 2 else None prefix = opts.prefix gene_increment = opts.gene_increment gap_increment = opts.gap_increment genes = Bed(genebed) if gapbed: fp = open(gapbed) for row in fp: genes.append(BedLine(row)) genes.sort(key=genes.key) idsfile = prefix + ".ids" newbedfile = prefix + ".bed" gap_increment -= gene_increment assert gap_increment >= 0 if opts.jgi: prefix += "." fw = open(idsfile, "w") for chr, lines in groupby(genes, key=lambda x: x.seqid): lines = list(lines) pad0 = opts.pad0 if len(lines) > 1000 else opts.spad0 isChr = chr[0].upper() == 'C' digits = "".join(x for x in chr if x in string.digits) gs = "g" if isChr else "s" pp = prefix + digits + gs idx = 0 if isChr: idx += gap_increment for r in lines: isGap = r.strand not in ("+", "-") if isGap: idx += gap_increment continue else: idx += gene_increment accn = pp + "{0:0{1}d}".format(idx, pad0) oldaccn = r.accn print >> fw, "\t".join((oldaccn, accn)) r.accn = accn genes.print_to_file(newbedfile) logging.debug("Converted IDs written to `{0}`.".format(idsfile)) logging.debug("Converted bed written to `{0}`.".format(newbedfile))
def simple(args): """ %prog simple anchorfile --qbed=qbedfile --sbed=sbedfile [options] Write the block ends for each block in the anchorfile. GeneA1 GeneA2 GeneB1 GeneB2 +/- score Optional additional columns: orderA1 orderA2 orderB1 orderB2 sizeA sizeB size block_id With base coordinates (--coords): block_id seqidA startA endA bpSpanA GeneA1 GeneA2 geneSpanA block_id seqidB startB endB bpSpanB GeneB1 GeneB2 geneSpanB """ p = OptionParser(simple.__doc__) p.add_option("--rich", default=False, action="store_true", \ help="Output additional columns [default: %default]") p.add_option("--coords", default=False, action="store_true", help="Output columns with base coordinates [default: %default]") p.add_option("--bed", default=False, action="store_true", help="Generate BED file for the blocks") p.add_option("--noheader", default=False, action="store_true", help="Don't output header [default: %default]") p.set_beds() opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) anchorfile, = args additional = opts.rich coords = opts.coords header = not opts.noheader bed = opts.bed if bed: coords = True bbed = Bed() ac = AnchorFile(anchorfile) simplefile = anchorfile.rsplit(".", 1)[0] + ".simple" qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts) pf = "-".join(anchorfile.split(".", 2)[:2]) blocks = ac.blocks if coords: h = "Block|Chr|Start|End|Span|StartGene|EndGene|GeneSpan|Orientation" else: h = "StartGeneA|EndGeneA|StartGeneB|EndGeneB|Orientation|Score" if additional: h += "|StartOrderA|EndOrderA|StartOrderB|EndOrderB|"\ "SizeA|SizeB|Size|Block" fws = open(simplefile, "w") if header: print >> fws, "\t".join(h.split("|")) atotalbase = btotalbase = 0 for i, block in enumerate(blocks): a, b, scores = zip(*block) a = [qorder[x] for x in a] b = [sorder[x] for x in b] ia, oa = zip(*a) ib, ob = zip(*b) astarti, aendi = min(ia), max(ia) bstarti, bendi = min(ib), max(ib) astart, aend = min(a)[1].accn, max(a)[1].accn bstart, bend = min(b)[1].accn, max(b)[1].accn sizeA = len(set(ia)) sizeB = len(set(ib)) size = len(block) slope, intercept = np.polyfit(ia, ib, 1) orientation = "+" if slope >= 0 else '-' aspan = aendi - astarti + 1 bspan = bendi - bstarti + 1 score = int((aspan * bspan) ** .5) score = str(score) block_id = pf + "-block-{0}".format(i) if coords: aseqid, astartbase, aendbase = \ get_boundary_bases(astart, aend, qorder) bseqid, bstartbase, bendbase = \ get_boundary_bases(bstart, bend, sorder) abase = aendbase - astartbase + 1 bbase = bendbase - bstartbase + 1 atotalbase += abase btotalbase += bbase # Write dual lines aargs = [block_id, aseqid, astartbase, aendbase, abase, astart, aend, aspan, "+"] bargs = [block_id, bseqid, bstartbase, bendbase, bbase, bstart, bend, bspan, orientation] if bed: bbed.append(BedLine("\t".join(str(x) for x in \ (bseqid, bstartbase - 1, bendbase, "{}:{}-{}".format(aseqid, astartbase, aendbase), size, orientation)))) for args in (aargs, bargs): print >> fws, "\t".join(str(x) for x in args) continue args = [astart, aend, bstart, bend, score, orientation] if additional: args += [astarti, aendi, bstarti, bendi, sizeA, sizeB, size, block_id] print >> fws, "\t".join(str(x) for x in args) fws.close() logging.debug("A total of {0} blocks written to `{1}`.".format(i + 1, simplefile)) if coords: print >> sys.stderr, "Total block span in {0}: {1}".format(qbed.filename, \ human_size(atotalbase, precision=2)) print >> sys.stderr, "Total block span in {0}: {1}".format(sbed.filename, \ human_size(btotalbase, precision=2)) print >> sys.stderr, "Ratio: {0:.1f}x".format(\ max(atotalbase, btotalbase) * 1. / min(atotalbase, btotalbase)) if bed: bedfile = simplefile + ".bed" bbed.print_to_file(filename=bedfile, sorted=True) logging.debug("Bed file written to `{}`".format(bedfile))
def variation(args): """ %prog variation P1.bed P2.bed F1.bed Associate IES in parents and progeny. """ p = OptionParser(variation.__doc__) p.add_option("--diversity", choices=("breakpoint", "variant"), default="variant", help="Plot diversity") opts, args, iopts = p.set_image_options(args, figsize="6x6") if len(args) != 3: sys.exit(not p.print_help()) pfs = [op.basename(x).split('-')[0] for x in args] P1, P2, F1 = pfs newbedfile = "-".join(pfs) + ".bed" if need_update(args, newbedfile): newbed = Bed() for pf, filename in zip(pfs, args): bed = Bed(filename) for b in bed: b.accn = "-".join((pf, b.accn)) b.score = None newbed.append(b) newbed.print_to_file(newbedfile, sorted=True) neworder = Bed(newbedfile).order mergedbedfile = mergeBed(newbedfile, nms=True) bed = Bed(mergedbedfile) valid = 0 total_counts = Counter() F1_counts = [] bp_diff = [] novelbedfile = "novel.bed" fw = open(novelbedfile, "w") for b in bed: accns = b.accn.split(',') pfs_accns = [x.split("-")[0] for x in accns] pfs_counts = Counter(pfs_accns) if len(pfs_counts) != 3: print(b, file=fw) continue valid += 1 total_counts += pfs_counts F1_counts.append(pfs_counts[F1]) # Collect breakpoint positions between P1 and F1 P1_accns = [x for x in accns if x.split("-")[0] == P1] F1_accns = [x for x in accns if x.split("-")[0] == F1] if len(P1_accns) != 1: continue ri, ref = neworder[P1_accns[0]] P1_accns = [neworder[x][-1] for x in F1_accns] bp_diff.extend(x.start - ref.start for x in P1_accns) bp_diff.extend(x.end - ref.end for x in P1_accns) print("A total of {0} sites show consistent deletions across samples.".\ format(percentage(valid, len(bed))), file=sys.stderr) for pf, count in total_counts.items(): print("{0:>9}: {1:.2f} deletions/site".\ format(pf, count * 1. / valid), file=sys.stderr) F1_counts = Counter(F1_counts) # Plot the IES variant number diversity from jcvi.graphics.base import plt, savefig, set_ticklabels_helvetica fig = plt.figure(1, (iopts.w, iopts.h)) if opts.diversity == "variant": left, height = zip(*sorted(F1_counts.items())) for l, h in zip(left, height): print("{0:>9} variants: {1}".format(l, h), file=sys.stderr) plt.text(l, h + 5, str(h), color="darkslategray", size=8, ha="center", va="bottom", rotation=90) plt.bar(left, height, align="center") plt.xlabel("Identified number of IES per site") plt.ylabel("Counts") plt.title("IES variation in progeny pool") ax = plt.gca() set_ticklabels_helvetica(ax) savefig(F1 + ".counts.pdf") # Plot the IES breakpoint position diversity else: bp_diff = Counter(bp_diff) bp_diff_abs = Counter() for k, v in bp_diff.items(): bp_diff_abs[abs(k)] += v plt.figure(1, (iopts.w, iopts.h)) left, height = zip(*sorted(bp_diff_abs.items())) for l, h in zip(left, height)[:21]: plt.text(l, h + 50, str(h), color="darkslategray", size=8, ha="center", va="bottom", rotation=90) plt.bar(left, height, align="center") plt.xlabel("Progeny breakpoint relative to SB210") plt.ylabel("Counts") plt.xlim(-.5, 20.5) ax = plt.gca() set_ticklabels_helvetica(ax) savefig(F1 + ".breaks.pdf") # Serialize the data to a file fw = open("Breakpoint-offset-histogram.csv", "w") for k, v in sorted(bp_diff.items()): print("{0},{1}".format(k, v), file=fw) fw.close() total = sum(height) zeros = bp_diff[0] within_20 = sum([v for i, v in bp_diff.items() if -20 <= i <= 20]) print("No deviation: {0}".format(percentage(zeros, total)), file=sys.stderr) print(" Within 20bp: {0}".format(percentage(within_20, total)), file=sys.stderr)
def movie(args): """ %prog movie test.tour test.clm ref.contigs.last Plot optimization history. """ p = OptionParser(movie.__doc__) p.add_option("--frames", default=500, type="int", help="Only plot every N frames") p.add_option("--engine", default="ffmpeg", choices=("ffmpeg", "gifsicle"), help="Movie engine, output MP4 or GIF") p.set_beds() opts, args, iopts = p.set_image_options(args, figsize="16x8", style="white", cmap="coolwarm", format="png", dpi=300) if len(args) != 3: sys.exit(not p.print_help()) tourfile, clmfile, lastfile = args tourfile = op.abspath(tourfile) clmfile = op.abspath(clmfile) lastfile = op.abspath(lastfile) cwd = os.getcwd() odir = op.basename(tourfile).rsplit(".", 1)[0] + "-movie" anchorsfile, qbedfile, contig_to_beds = \ prepare_synteny(tourfile, lastfile, odir, p, opts) args = [] for i, label, tour, tour_o in iter_tours(tourfile, frames=opts.frames): padi = "{:06d}".format(i) # Make sure the anchorsfile and bedfile has the serial number in, # otherwise parallelization may fail a, b = op.basename(anchorsfile).split(".", 1) ianchorsfile = a + "_" + padi + "." + b symlink(anchorsfile, ianchorsfile) # Make BED file with new order qb = Bed() for contig, o in zip(tour, tour_o): if contig not in contig_to_beds: continue bedlines = contig_to_beds[contig][:] if o == '-': bedlines.reverse() for x in bedlines: qb.append(x) a, b = op.basename(qbedfile).split(".", 1) ibedfile = a + "_" + padi + "." + b qb.print_to_file(ibedfile) # Plot dot plot, but do not sort contigs by name (otherwise losing # order) image_name = padi + "." + iopts.format tour = ",".join(tour) args.append([[ tour, clmfile, ianchorsfile, "--outfile", image_name, "--label", label ]]) Jobs(movieframe, args).run() os.chdir(cwd) make_movie(odir, odir, engine=opts.engine, format=iopts.format)
def refine(args): """ %prog refine breakpoints.bed gaps.bed Find gaps within or near breakpoint region. For breakpoint regions with no gaps, there are two options: - Break in the middle of the region - Break at the closest gap (--closest) """ p = OptionParser(refine.__doc__) p.add_option("--closest", default=False, action="store_true", help="In case of no gaps, use closest [default: %default]") opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) breakpointsbed, gapsbed = args ncols = len(open(breakpointsbed).next().split()) logging.debug("File {0} contains {1} columns.".format(breakpointsbed, ncols)) cmd = "intersectBed -wao -a {0} -b {1}".format(breakpointsbed, gapsbed) pf = "{0}.{1}".format(breakpointsbed.split(".")[0], gapsbed.split(".")[0]) ingapsbed = pf + ".bed" sh(cmd, outfile=ingapsbed) fp = open(ingapsbed) data = [x.split() for x in fp] nogapsbed = pf + ".nogaps.bed" largestgapsbed = pf + ".largestgaps.bed" nogapsfw = open(nogapsbed, "w") largestgapsfw = open(largestgapsbed, "w") for b, gaps in groupby(data, key=lambda x: x[:ncols]): gaps = list(gaps) gap = gaps[0] if len(gaps) == 1 and gap[-1] == "0": assert gap[-3] == "." print("\t".join(b), file=nogapsfw) continue gaps = [(int(x[-1]), x) for x in gaps] maxgap = max(gaps)[1] print("\t".join(maxgap), file=largestgapsfw) nogapsfw.close() largestgapsfw.close() beds = [largestgapsbed] toclean = [nogapsbed, largestgapsbed] if opts.closest: closestgapsbed = pf + ".closestgaps.bed" cmd = "closestBed -a {0} -b {1} -d".format(nogapsbed, gapsbed) sh(cmd, outfile=closestgapsbed) beds += [closestgapsbed] toclean += [closestgapsbed] else: pointbed = pf + ".point.bed" pbed = Bed() bed = Bed(nogapsbed) for b in bed: pos = (b.start + b.end) / 2 b.start, b.end = pos, pos pbed.append(b) pbed.print_to_file(pointbed) beds += [pointbed] toclean += [pointbed] refinedbed = pf + ".refined.bed" FileMerger(beds, outfile=refinedbed).merge() # Clean-up FileShredder(toclean) return refinedbed
def variation(args): """ %prog variation P1.bed P2.bed F1.bed Associate IES in parents and progeny. """ p = OptionParser(variation.__doc__) p.add_option("--diversity", choices=("breakpoint", "variant"), default="variant", help="Plot diversity") opts, args, iopts = p.set_image_options(args, figsize="6x6") if len(args) != 3: sys.exit(not p.print_help()) pfs = [op.basename(x).split('-')[0] for x in args] P1, P2, F1 = pfs newbedfile = "-".join(pfs) + ".bed" if need_update(args, newbedfile): newbed = Bed() for pf, filename in zip(pfs, args): bed = Bed(filename) for b in bed: b.accn = "-".join((pf, b.accn)) b.score = None newbed.append(b) newbed.print_to_file(newbedfile, sorted=True) neworder = Bed(newbedfile).order mergedbedfile = mergeBed(newbedfile, nms=True) bed = Bed(mergedbedfile) valid = 0 total_counts = Counter() F1_counts = [] bp_diff = [] novelbedfile = "novel.bed" fw = open(novelbedfile, "w") for b in bed: accns = b.accn.split(',') pfs_accns = [x.split("-")[0] for x in accns] pfs_counts = Counter(pfs_accns) if len(pfs_counts) != 3: print >> fw, b continue valid += 1 total_counts += pfs_counts F1_counts.append(pfs_counts[F1]) # Collect breakpoint positions between P1 and F1 P1_accns = [x for x in accns if x.split("-")[0] == P1] F1_accns = [x for x in accns if x.split("-")[0] == F1] if len(P1_accns) != 1: continue ri, ref = neworder[P1_accns[0]] P1_accns = [neworder[x][-1] for x in F1_accns] bp_diff.extend(x.start - ref.start for x in P1_accns) bp_diff.extend(x.end - ref.end for x in P1_accns) print >> sys.stderr, \ "A total of {0} sites show consistent deletions across samples.".\ format(percentage(valid, len(bed))) for pf, count in total_counts.items(): print >> sys.stderr, "{0:>9}: {1:.2f} deletions/site".\ format(pf, count * 1. / valid) F1_counts = Counter(F1_counts) # Plot the IES variant number diversity from jcvi.graphics.base import plt, savefig, set_ticklabels_helvetica fig = plt.figure(1, (iopts.w, iopts.h)) if opts.diversity == "variant": left, height = zip(*sorted(F1_counts.items())) for l, h in zip(left, height): print >> sys.stderr, "{0:>9} variants: {1}".format(l, h) plt.text(l, h + 5, str(h), color="darkslategray", size=8, ha="center", va="bottom", rotation=90) plt.bar(left, height, align="center") plt.xlabel("Identified number of IES per site") plt.ylabel("Counts") plt.title("IES variation in progeny pool") ax = plt.gca() set_ticklabels_helvetica(ax) savefig(F1 + ".counts.pdf") # Plot the IES breakpoint position diversity else: bp_diff = Counter(bp_diff) bp_diff_abs = Counter() for k, v in bp_diff.items(): bp_diff_abs[abs(k)] += v plt.figure(1, (iopts.w, iopts.h)) left, height = zip(*sorted(bp_diff_abs.items())) for l, h in zip(left, height)[:21]: plt.text(l, h + 50, str(h), color="darkslategray", size=8, ha="center", va="bottom", rotation=90) plt.bar(left, height, align="center") plt.xlabel("Progeny breakpoint relative to SB210") plt.ylabel("Counts") plt.xlim(-.5, 20.5) ax = plt.gca() set_ticklabels_helvetica(ax) savefig(F1 + ".breaks.pdf") # Serialize the data to a file fw = open("Breakpoint-offset-histogram.csv", "w") for k, v in sorted(bp_diff.items()): print >> fw, "{0},{1}".format(k, v) fw.close() total = sum(height) zeros = bp_diff[0] within_20 = sum([v for i, v in bp_diff.items() if -20 <= i <= 20]) print >> sys.stderr, "No deviation: {0}".format(percentage(zeros, total)) print >> sys.stderr, " Within 20bp: {0}".format(percentage(within_20, total))