def tips(args):
    """
    %prog tips patchers.bed complements.bed original.fasta backbone.fasta

    Append telomeric sequences based on patchers and complements.
    """
    # Writes `tips.bed`: for every patcher group (keyed by accn) an optional
    # leading complement, the whole backbone object, and an optional trailing
    # complement.
    parser = OptionParser(tips.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 4:
        sys.exit(not parser.print_help())

    pbedfile, cbedfile, sizesfile, bbfasta = args

    pbed = Bed(pbedfile, sorted=False)
    cbed = Bed(cbedfile, sorted=False)

    # seqid => complement features; a later run of the same seqid replaces
    # an earlier one, exactly as repeated dict assignment would.
    complements = {
        seqid: list(group) for seqid, group in groupby(cbed, key=lambda x: x.seqid)
    }

    sizes = Sizes(sizesfile).mapping
    bbsizes = Sizes(bbfasta).mapping

    tbeds = []
    for name, group in groupby(pbed, key=lambda x: x.accn):
        members = list(group)
        first, last = members[0], members[-1]

        # No tip is needed on a side where the patcher already reaches the
        # end of its sequence.
        start_id = None if first.start == 1 else first.seqid
        end_id = None if last.end == sizes[last.seqid] else last.seqid
        print(name, start_id, end_id, file=sys.stderr)

        if start_id:
            head = complements[start_id][0]
            head.accn = name
            tbeds.append(head)

        backbone = (name, 0, bbsizes[name], name, 1000, "+")
        tbeds.append(BedLine("\t".join(str(x) for x in backbone)))

        if end_id:
            tail = complements[end_id][-1]
            tail.accn = name
            tbeds.append(tail)

    tbed = Bed()
    tbed.extend(tbeds)
    tbed.print_to_file("tips.bed")
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    # Imports are function-local; `Bed`/`BedLine` here are the jcvi.formats.bed
    # classes regardless of what the enclosing module imported.
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed, BedLine
    from jcvi.formats.base import get_number

    p = OptionParser(bed.__doc__)
    p.add_option("--switch", default=False, action="store_true",
                 help="Switch reference and aligned map elements")
    p.add_option("--scale", type="float",
                 help="Scale the aligned map distance by factor")
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile, = args
    switch = opts.switch
    scale = opts.scale
    ac = AnchorFile(anchorsfile)
    # query accn => list of subject accns it is anchored to
    pairs = defaultdict(list)
    for a, b, block_id in ac.iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        qseqid, qstart, qend, qaccn = q.seqid, q.start, q.end, q.accn
        # Only query features that take part in at least one anchor are emitted
        if qaccn not in pairs:
            continue
        # NOTE(review): as laid out here the loop body only rebinds the subject
        # fields, so the line emitted below uses the LAST subject paired with
        # this query. Confirm this is intended; the alternative reading would
        # emit one line per pair.
        for s in pairs[qaccn]:
            si, s = sorder[s]
            sseqid, sstart, send, saccn = s.seqid, s.start, s.end, s.accn
        if switch:
            # Swap the roles of query and subject
            qseqid, sseqid = sseqid, qseqid
            qstart, sstart = sstart, qstart
            qend, send = send, qend
            qaccn, saccn = saccn, qaccn
        if scale:
            sstart /= scale
        # Name column is "<number extracted from subject seqid>:<subject start>"
        bedline = "\t".join(
            str(x) for x in (qseqid, qstart - 1, qend,
                             "{0}:{1}".format(get_number(sseqid), sstart)))
        bd.append(BedLine(bedline))

    bd.print_to_file(filename=opts.outfile, sorted=True)
def liftover(args):
    """
    %prog liftover agpfile bedfile

    Given coordinates in components, convert to the coordinates in chromosomes.
    """
    # Features on components that are absent from the AGP are passed through
    # unchanged; features that do not intersect the used part of their
    # component are dropped. The result is printed sorted.
    p = OptionParser(liftover.__doc__)
    p.add_option("--prefix", default=False, action="store_true",
                 help="Prepend prefix to accn names [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None, so `not ...` yields a non-zero exit
        # status on a usage error, matching the other sub-commands.
        sys.exit(not p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile).order
    bed = Bed(bedfile)
    newbed = Bed()
    for b in bed:
        component = b.seqid
        if component not in agp:
            newbed.append(b)
            continue

        i, a = agp[component]

        assert a.component_beg < a.component_end
        arange = a.component_beg, a.component_end
        assert b.start < b.end
        brange = b.start, b.end

        st = range_intersect(arange, brange)
        if not st:
            continue
        start, end = st
        assert start <= end

        if a.orientation == '-':
            # Reverse-oriented component: coordinates are mirrored around
            # object_end + component_beg
            d = a.object_end + a.component_beg
            s, t = d - end, d - start
        else:
            d = a.object_beg - a.component_beg
            s, t = d + start, d + end

        name = b.accn.replace(" ", "_")
        if opts.prefix:
            name = component + "_" + name
        bline = "\t".join(str(x) for x in (a.object, s - 1, t, name))
        newbed.append(BedLine(bline))

    newbed.print_to_file(sorted=True)
def mergebed(args):
    """
    %prog mergebed map1.bed map2.bed map3.bed ...

    Combine bed maps to bed format, adding the map name.
    """
    # Each marker's accn is prefixed with "<mapname>-" where mapname is the
    # part of the current input filename before the first dot.
    parser = OptionParser(mergebed.__doc__)
    parser.add_option("-w", "--weightsfile", default="weights.txt",
                      help="Write weights to file")
    parser.set_outfile("out.bed")
    opts, args = parser.parse_args(args)

    if len(args) < 1:
        sys.exit(not parser.print_help())

    maps = args
    outfile = opts.outfile
    handle = must_open(maps)
    merged = Bed()
    mapnames = set()
    for row in handle:
        mapname = handle.filename().split(".")[0]
        mapnames.add(mapname)
        try:
            marker = BedLine(row)
            marker.accn = "{0}-{1}".format(mapname, marker.accn)
            marker.extra = ["{0}:{1}".format(marker.seqid, marker.start)]
            merged.append(marker)
        except (IndexError, ValueError):  # header or mal-formed line
            continue

    merged.print_to_file(filename=outfile, sorted=True)
    summary = "A total of {0} markers written to `{1}`.".format(len(merged), outfile)
    logging.debug(summary)

    # Two inputs sharing a name stem would collapse into one map name
    assert len(maps) == len(mapnames), "You have a collision in map names"
    write_weightsfile(mapnames, weightsfile=opts.weightsfile)
def location(args):
    """
    %prog location bedfile fastafile

    Given SNP locations, summarize the locations in the sequences. For example,
    find out if there are more 3`-SNPs than 5`-SNPs.
    """
    # Counts are reported on stderr; the relative positions (0-100) are handed
    # to stem_leaf_plot.
    from jcvi.formats.bed import BedLine
    from jcvi.graphics.histogram import stem_leaf_plot

    p = OptionParser(location.__doc__)
    p.add_option(
        "--dist",
        default=100,
        type="int",
        help="Distance cutoff to call 5` and 3` [default: %default]",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, fastafile = args
    dist = opts.dist
    sizes = Sizes(fastafile).mapping
    fiveprime = threeprime = total = 0
    percentages = []
    # Context manager so the BED handle is closed even if a line fails to parse
    with open(bedfile) as fp:
        for row in fp:
            b = BedLine(row)
            pos = b.start
            size = sizes[b.seqid]
            if pos < dist:
                fiveprime += 1
            if size - pos < dist:
                threeprime += 1
            total += 1
            percentages.append(100 * pos / size)

    m = "Five prime (within {0}bp of start codon): {1}\n".format(
        dist, fiveprime)
    m += "Three prime (within {0}bp of stop codon): {1}\n".format(
        dist, threeprime)
    m += "Total: {0}".format(total)
    print(m, file=sys.stderr)

    bins = 10
    title = "Locations within the gene [0=Five-prime, 100=Three-prime]"
    stem_leaf_plot(percentages, 0, 100, bins, title=title)
def merge(args):
    """
    %prog merge map1 map2 map3 ...

    Convert csv maps to bed format.

    Each input map is csv formatted, for example:

    ScaffoldID,ScaffoldPosition,LinkageGroup,GeneticPosition
    scaffold_2707,11508,1,0
    scaffold_2707,11525,1,1.2
    scaffold_759,81336,1,9.7
    """
    # Markers with a negative genetic distance are reported on stderr and
    # skipped; rows that fail to parse (headers, malformed lines) are ignored.
    p = OptionParser(merge.__doc__)
    p.add_option("-w", "--weightsfile", default="weights.txt",
                 help="Write weights to file")
    p.set_outfile("out.bed")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    maps = args
    outfile = opts.outfile
    fp = must_open(maps)
    b = Bed()
    mapnames = set()
    for row in fp:
        # Map name is the part of the current input filename before the first dot
        mapname = fp.filename().split(".")[0]
        mapnames.add(mapname)
        try:
            m = CSVMapLine(row, mapname=mapname)
            if m.cm < 0:
                logging.error("Ignore marker with negative genetic distance")
                # print function, consistent with the rest of the file
                print(row.strip(), file=sys.stderr)
            else:
                b.append(BedLine(m.bedline))
        except (IndexError, ValueError):  # header or mal-formed line
            continue

    b.print_to_file(filename=outfile, sorted=True)
    logging.debug("A total of {0} markers written to `{1}`.".format(len(b), outfile))

    assert len(maps) == len(mapnames), "You have a collision in map names"
    write_weightsfile(mapnames, weightsfile=opts.weightsfile)
def bambus(args):
    """
    %prog bambus bambus.bed bambus.mates total.fasta

    Insert unplaced scaffolds based on mates.
    """
    # Outputs: `<mates prefix>.log` (per-scaffold diagnostics) and
    # `<mates prefix>.candidate.bed` (one placement per accepted scaffold).
    from jcvi.formats.bed import BedLine
    from jcvi.formats.posmap import MatesFile

    p = OptionParser(bambus.__doc__)
    p.add_option(
        "--prefix",
        default="scaffold",
        help="Prefix of the unplaced scaffolds",
    )
    p.add_option(
        "--minlinks",
        default=3,
        type="int",
        help="Minimum number of links to place",
    )
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, matesfile, fastafile = args
    pf = matesfile.rsplit(".", 1)[0]
    logfile = pf + ".log"
    # NOTE(review): closed explicitly near the end; an exception in between
    # leaves the handle open.
    log = open(logfile, "w")

    mf = MatesFile(matesfile)
    # Largest `max` over all libraries; used later to reject overly wide placements
    maxdist = max(x.max for x in mf.libraries.values())
    logging.debug("Max separation: {0}".format(maxdist))

    prefix = opts.prefix
    minlinks = opts.minlinks

    is_unplaced = lambda x: x.startswith(prefix)
    bed = Bed(bedfile, sorted=False)
    # NOTE(review): this list is filled below but the name is rebound by the
    # per-scaffold loop, so its contents are never read.
    beds = []
    unplaced = defaultdict(list)

    # Collect adjacent BED records that are mates of each other and where
    # exactly one of the two sits on an unplaced scaffold.
    for a, b in pairwise(bed):
        aname, bname = a.accn, b.accn
        aseqid, bseqid = a.seqid, b.seqid

        if aname not in mf:
            continue

        pa, la = mf[aname]
        if pa != bname:
            continue

        ia = is_unplaced(aseqid)
        ib = is_unplaced(bseqid)
        if ia == ib:
            continue

        # Normalise so that `b` is always the read on the unplaced scaffold
        if ia:
            a, b = b, a

        unplaced[b.seqid].append((a, b))
        beds.extend([a, b])

    sizes = Sizes(fastafile)
    candidatebed = Bed()
    cbeds = []
    # For each unplaced scaffold, find most likely placement and orientation
    for scf, beds in sorted(unplaced.items()):
        print(file=log)
        ranges = []
        for a, b in beds:
            aname, astrand = a.accn, a.strand
            bname, bstrand = b.accn, b.strand
            aseqid, bseqid = a.seqid, b.seqid
            pa, lib = mf[aname]

            print(a, file=log)
            print(b, file=log)

            # Same strand on both reads means the scaffold has to be flipped
            flip_b = astrand == bstrand
            fbstrand = "-" if flip_b else "+"
            if flip_b:
                b.reverse_complement(sizes)

            lmin, lmax = lib.min, lib.max

            L = sizes.get_size(scf)
            assert astrand in ("+", "-")
            # Project the library's (min, max) separation onto the placed
            # sequence to get the window where the scaffold may start
            if astrand == "+":
                offset = a.start - b.end
                sstart, sstop = offset + lmin, offset + lmax
            else:
                offset = a.end - b.start + L
                sstart, sstop = offset - lmax, offset - lmin

            # Prevent out of range error
            size = sizes.get_size(aseqid)
            sstart = max(0, sstart)
            sstop = max(0, sstop)
            sstart = min(size - 1, sstart)
            sstop = min(size - 1, sstop)

            start_range = (aseqid, sstart, sstop, scf, 1, fbstrand)
            print("*" + "\t".join(str(x) for x in start_range), file=log)
            ranges.append(start_range)

        mranges = [x[:3] for x in ranges]
        # Determine placement by finding the interval with the most support
        rd = ranges_depth(mranges, sizes.mapping, verbose=False)
        alldepths = []
        for depth in rd:
            alldepths.extend(depth)
        print(alldepths, file=log)

        # The last field of each depth record is compared against --minlinks
        maxdepth = max(alldepths, key=lambda x: x[-1])[-1]
        if maxdepth < minlinks:
            print("Insufficient links ({0} < {1})".format(maxdepth, minlinks), file=log)
            continue

        candidates = [x for x in alldepths if x[-1] == maxdepth]
        nseqids = len(set(x[0] for x in candidates))
        if nseqids != 1:
            msg = "Multiple conflicting candidates found"
            print(msg, file=log)
            continue

        seqid, mmin, mmax, depth = candidates[0]
        mmin, mmax = range_minmax([x[1:3] for x in candidates])

        if mmin >= mmax:
            msg = "Invalid (min, max) range"
            print("Invalid (min, max) range", file=log)
            continue

        if (mmax - mmin) > maxdist:
            msg = "(min, max) distance greater than library maxdist"
            print(msg, file=log)
            continue

        # Determine orientation by voting
        nplus, nminus = 0, 0
        arange = (seqid, mmin, mmax)
        for sid, start, end, sf, sc, fbstrand in ranges:
            brange = (sid, start, end)
            if range_overlap(arange, brange):
                if fbstrand == "+":
                    nplus += 1
                else:
                    nminus += 1

        # Ties go to the plus strand
        fbstrand = "+" if nplus >= nminus else "-"

        candidate = (seqid, mmin, mmax, scf, depth, fbstrand)
        bedline = BedLine("\t".join((str(x) for x in candidate)))
        cbeds.append(bedline)
        print("Plus: {0}, Minus: {1}".format(nplus, nminus), file=log)
        print(candidate, file=log)

    candidatebed.extend(cbeds)
    logging.debug("A total of {0} scaffolds can be placed.".format(len(candidatebed)))
    log.close()

    candidatebedfile = pf + ".candidate.bed"
    candidatebed.print_to_file(candidatebedfile, sorted=True)
def shuffle_twobeds(afbed, bfbed, bbfasta, prefix=None):
    """
    Shuffle the two bedfiles together.

    For every sequence in `afbed`, the stretches between its features
    (from range_interleave) are alternated with the matching features looked
    up in `bfbed.order`; sequences of `bbfasta` not seen in `afbed` are
    appended whole. The result is written to `shuffled.bed` and returned.
    """
    sz = Sizes(bbfasta)
    sizes = sz.mapping
    shuffled = "shuffled.bed"
    border = bfbed.order

    merged = []
    afbed.sort(key=afbed.nullkey)

    # Zero-pad the running counter to the width of the sequence count
    pad = int(math.log10(len(sizes))) + 1
    cj = 0
    seen = set()

    def accn(x):
        return "{0}{1:0{2}d}".format(prefix, x, pad)

    for seqid, aa in afbed.sub_beds():
        cj += 1
        abeds, bbeds = [], []
        size = sizes[seqid]
        ranges = [(x.seqid, x.start, x.end) for x in aa]
        cranges = range_interleave(ranges, sizes={seqid: size}, empty=True)
        for crange in cranges:
            if not crange:
                # Keep the slot so that the two lists stay in step
                abeds.append(None)
                continue
            seqid, start, end = crange
            bedline = "\t".join(str(x) for x in (seqid, start - 1, end))
            abeds.append(BedLine(bedline))

        for a in aa:
            _, patch = border[a.accn]
            if a.strand == "-":
                # Flip the strand, mirrored into the second extra column
                patch.extra[1] = patch.strand = "-" if patch.strand == "+" else "+"
            bbeds.append(patch)

        n_abeds, n_bbeds = len(abeds), len(bbeds)
        assert n_abeds - n_bbeds == 1, "abeds: {0}, bbeds: {1}".format(
            n_abeds, n_bbeds
        )

        beds = [x for x in roundrobin(abeds, bbeds) if x]
        if prefix:
            new_name = accn(cj)
            for b in beds:
                b.accn = new_name

        merged.extend(beds)
        seen.add(seqid)

    # Singletons
    for seqid, size in sz.iter_sizes():
        if seqid in seen:
            continue

        fields = (seqid, 0, size, accn(cj))
        b = BedLine("\t".join(str(x) for x in fields))

        cj += 1
        if prefix:
            b.accn = accn(cj)

        merged.append(b)

    shuffledbed = Bed()
    shuffledbed.extend(merged)
    shuffledbed.print_to_file(shuffled)

    return shuffledbed
def qc(args):
    """
    %prog qc prefix

    Expects data files including:
    1. `prefix.bedpe` draws Bezier curve between paired reads
    2. `prefix.sizes` draws length of the contig/scaffold
    3. `prefix.gaps.bed` mark the position of the gaps in sequence
    4. `prefix.bed.coverage` plots the base coverage
    5. `prefix.pairs.bed.coverage` plots the clone coverage

    See assembly.coverage.posmap() for the generation of these files.
    """
    # Output is `<prefix>.pdf`.
    from jcvi.graphics.glyph import Bezier

    p = OptionParser(qc.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # print_help() returns None; `not` turns a usage error into a
        # non-zero exit status, like the other sub-commands.
        sys.exit(not p.print_help())

    (prefix,) = args
    scf = prefix

    # All these files *must* be present in the current folder
    bedpefile = prefix + ".bedpe"
    fastafile = prefix + ".fasta"
    sizesfile = prefix + ".sizes"
    gapsbedfile = prefix + ".gaps.bed"
    bedfile = prefix + ".bed"
    pairsbedfile = prefix + ".pairs.bed"

    sizes = Sizes(fastafile).mapping
    size = sizes[scf]

    fig = plt.figure(1, (8, 5))
    root = fig.add_axes([0, 0, 1, 1])

    # the scaffold
    root.add_patch(Rectangle((.1, .15), .8, .03, fc='k'))

    # basecoverage and matecoverage
    ax = fig.add_axes([.1, .45, .8, .45])

    bins = 200  # Smooth the curve
    basecoverage = Coverage(bedfile, sizesfile)
    matecoverage = Coverage(pairsbedfile, sizesfile)

    x, y = basecoverage.get_plot_data(scf, bins=bins)
    baseline, = ax.plot(x, y, 'g-')
    x, y = matecoverage.get_plot_data(scf, bins=bins)
    mateline, = ax.plot(x, y, 'r-')
    legends = ("Base coverage", "Mate coverage")
    leg = ax.legend((baseline, mateline), legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(.5)
    ax.set_xlim(0, size)

    # draw the read pairs
    pairs = []
    with open(bedpefile) as fp:
        for row in fp:
            # `scf` is deliberately rebound from the row: the label drawn
            # further down uses the name found in the last bedpe line.
            scf, astart, aend, scf, bstart, bend, clonename = row.split()
            astart, bstart = int(astart), int(bstart)
            aend, bend = int(aend), int(bend)
            start = min(astart, bstart) + 1
            end = max(aend, bend)
            pairs.append((start, end))

    bpratio = .8 / size
    cutoff = 1000  # inserts smaller than this are not plotted

    # this convert from base => x-coordinate
    def pos(x):
        return .1 + x * bpratio

    ypos = .15 + .03
    for start, end in pairs:
        dist = end - start

        if dist < cutoff:
            continue

        dist = min(dist, 10000)
        # 10Kb == .25 canvas height
        height = .25 * dist / 10000
        xstart = pos(start)
        xend = pos(end)
        p0 = (xstart, ypos)
        p1 = (xstart, ypos + height)
        p2 = (xend, ypos + height)
        p3 = (xend, ypos)
        Bezier(root, p0, p1, p2, p3)

    # gaps on the scaffold
    with open(gapsbedfile) as fp:
        for row in fp:
            b = BedLine(row)
            start, end = b.start, b.end
            xstart = pos(start)
            xend = pos(end)
            root.add_patch(Rectangle((xstart, .15), xend - xstart, .03, fc='w'))

    # Scaffold label (drawn once) and the insert-size note
    warn_msg = "Only the inserts > {0}bp are shown".format(cutoff)
    root.text(.5, .1, scf, color='b', ha="center")
    root.text(.5, .05, warn_msg, color='gray', ha="center")

    # clean up and output
    set_human_base_axis(ax)
    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    figname = prefix + ".pdf"
    savefig(figname, dpi=300)
def bed_to_bedpe(bedfile, bedpefile, pairsbedfile=None, matesfile=None,
                 ca=False, strand=False):
    """
    This converts the bedfile to bedpefile, assuming the reads are from CA.

    Reads are grouped by `clone_name(accn, ca=ca)`. Clones with exactly two
    reads are written to `bedpefile` (plus `0, strandA, strandB` columns when
    `strand` is set). When `pairsbedfile` is given, the span of each clone whose
    reads share a seqid is written there as well; if `matesfile` is given the
    span must also fall within the library's [min, max] read from its first
    line. Clones with more than two reads are skipped.
    """
    fp = must_open(bedfile)
    fw = must_open(bedpefile, "w")
    if pairsbedfile:
        fwpairs = must_open(pairsbedfile, "w")

    clones = defaultdict(list)
    for row in fp:
        b = BedLine(row)
        name = b.accn
        clonename = clone_name(name, ca=ca)
        clones[clonename].append(b)

    if matesfile:
        # Only the first line is needed; close the handle right away
        with open(matesfile) as mates:
            libraryline = next(mates)
        # 'library bes     37896   126916'
        lib, name, smin, smax = libraryline.split()
        assert lib == "library"
        smin, smax = int(smin), int(smax)
        logging.debug("Happy mates for lib {0} fall between {1} - {2}".format(
            name, smin, smax))

    nbedpe = 0
    nspan = 0
    for clonename, blines in clones.items():
        nlines = len(blines)
        if nlines == 2:
            a, b = blines
            aseqid, astart, aend = a.seqid, a.start, a.end
            bseqid, bstart, bend = b.seqid, b.start, b.end
            outcols = [
                aseqid, astart - 1, aend, bseqid, bstart - 1, bend, clonename
            ]
            if strand:
                outcols.extend([0, a.strand, b.strand])
            print("\t".join(str(x) for x in outcols), file=fw)
            nbedpe += 1
        elif nlines == 1:
            (a, ) = blines
            aseqid, astart, aend = a.seqid, a.start, a.end
            bseqid, bstart, bend = 0, 0, 0
        else:
            # More than two lines per pair: nothing sensible to emit. Skip the
            # span block as well, otherwise it would reuse the coordinates of
            # the previously processed clone (or fail on the first clone).
            continue

        if pairsbedfile:
            start = min(astart, bstart) if bstart > 0 else astart
            end = max(aend, bend) if bend > 0 else aend
            if aseqid != bseqid:
                continue

            span = end - start + 1
            if (not matesfile) or (smin <= span <= smax):
                print(
                    "\t".join(
                        str(x) for x in (aseqid, start - 1, end, clonename)),
                    file=fwpairs,
                )
                nspan += 1

    fw.close()
    logging.debug("A total of {0} bedpe written to `{1}`.".format(
        nbedpe, bedpefile))
    if pairsbedfile:
        fwpairs.close()
        logging.debug("A total of {0} spans written to `{1}`.".format(
            nspan, pairsbedfile))
def link(args):
    """
    %prog link bedfile fastafile

    Construct contig links based on bed file.

    Use --prefix to limit the links between contigs that start with the same
    prefix_xxx.
    """
    # Output is `<bedfile stem>.links`, where bedfile is the file returned by
    # pairs(); each line is the read-pair id followed by the ContigLink.
    p = OptionParser(link.__doc__)
    p.add_option("--insert", type="int", default=0,
                 help="Mean insert size [default: estimate from data]")
    p.add_option("--cutoff", type="int", default=0,
                 help="Largest distance expected for linkage "
                      "[default: estimate from data]")
    p.add_option(
        "--prefix",
        default=False,
        action="store_true",
        help="Only keep links between IDs with same prefix [default: %default]"
    )
    p.add_option(
        "--debug",
        dest="debug",
        default=False,
        action="store_true",
        help="Print verbose info when checking mates [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, fastafile = args
    debug = opts.debug
    cutoff = opts.cutoff

    sizes = Sizes(fastafile)

    cutoffopt = "--cutoff={0}".format(cutoff)
    mateorientationopt = '--mateorientation=+-'
    bedfile, (meandist, stdev, p0, p1, p2) = \
        pairs([bedfile, cutoffopt, mateorientationopt])

    # Explicit options win; otherwise fall back to the values pairs() returned
    maxcutoff = cutoff or p2
    insert = opts.insert or p0
    logging.debug("Mate hangs must be <= {0}, --cutoff to override".format(
        maxcutoff))

    # Read-pair id: the read name minus its last character
    rs = lambda x: x.accn[:-1]

    linksfile = bedfile.rsplit(".", 1)[0] + ".links"

    # Context managers so that the links file is flushed and closed on return
    with open(bedfile) as fp, open(linksfile, "w") as fw:
        for a, b in pairwise(fp):
            # Criteria for valid contig edge
            # 1. for/rev do not mapping to the same scaffold (useful for linking)
            # 2. assuming innie (outie must be flipped first), order the contig pair
            # 3. calculate sequence hangs, valid hangs are smaller than insert size
            a, b = BedLine(a), BedLine(b)

            if rs(a) != rs(b):
                continue
            pe = rs(a)

            # Intra-contig links
            if a.seqid == b.seqid:
                continue

            # Use --prefix to limit the links between seqids with the same prefix
            # For example, contigs of the same BAC, mth2-23j10_001, mth-23j10_002
            if opts.prefix:
                aprefix = a.seqid.split("_")[0]
                bprefix = b.seqid.split("_")[0]
                if aprefix != bprefix:
                    continue

            cl = ContigLink(a, b, insert=insert, cutoff=maxcutoff)
            if cl.flip_innie(sizes, debug=debug):
                print("\t".join((pe, str(cl))), file=fw)
def rename(args):
    """
    %prog rename genes.bed [gaps.bed]

    Rename genes for annotation release.

    For genes on chromosomes (e.g. the 12th gene on C1):
    Bo1g00120

    For genes on scaffolds (e.g. the 12th gene on unplaced Scaffold00285):
    Bo00285s120

    The genes identifiers will increment by 10. So assuming no gap, these are
    the consecutive genes:
    Bo1g00120, Bo1g00130, Bo1g00140...
    Bo00285s120, Bo00285s130, Bo00285s140...

    When we encounter gaps, we would like the increment to be larger. For example,
    Bo1g00120, <gap>, Bo1g01120...

    Gaps bed file is optional.
    """
    # Outputs: `<prefix>.ids` (old accn <TAB> new accn) and `<prefix>.bed`.
    import string

    p = OptionParser(rename.__doc__)
    p.add_option("-a", dest="gene_increment", default=10, type="int",
                 help="Increment for continuous genes [default: %default]")
    p.add_option("-b", dest="gap_increment", default=1000, type="int",
                 help="Increment for gaps [default: %default]")
    p.add_option("--pad0", default=6, type="int",
                 help="Pad gene identifiers with 0 [default: %default]")
    p.add_option(
        "--spad0",
        default=4,
        type="int",
        help="Pad gene identifiers on small scaffolds [default: %default]")
    p.add_option("--prefix", default="Bo",
                 help="Genome prefix [default: %default]")
    p.add_option("--jgi", default=False, action="store_true",
                 help="Create JGI style identifier PREFIX.NN[G|TE]NNNNN.1"
                      " [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    genebed = args[0]
    gapbed = args[1] if len(args) == 2 else None
    prefix = opts.prefix
    gene_increment = opts.gene_increment
    gap_increment = opts.gap_increment

    genes = Bed(genebed)
    if gapbed:
        # Gaps are merged into the gene list so they sort into position
        with open(gapbed) as fp:
            for row in fp:
                genes.append(BedLine(row))

    genes.sort(key=genes.key)
    # File names are derived before the optional "." is appended to the prefix
    idsfile = prefix + ".ids"
    newbedfile = prefix + ".bed"
    # A gap is followed by a regular gene increment, so subtract it up front
    gap_increment -= gene_increment
    assert gap_increment >= 0

    if opts.jgi:
        prefix += "."

    with open(idsfile, "w") as fw:
        for seqid, lines in groupby(genes, key=lambda x: x.seqid):
            lines = list(lines)
            # Sequences with more than 1000 records get the wider padding
            pad0 = opts.pad0 if len(lines) > 1000 else opts.spad0
            isChr = seqid[0].upper() == 'C'
            digits = "".join(x for x in seqid if x in string.digits)
            gs = "g" if isChr else "s"
            pp = prefix + digits + gs
            idx = 0
            if isChr:
                idx += gap_increment

            for r in lines:
                # Records without a +/- strand are treated as gaps
                isGap = r.strand not in ("+", "-")
                if isGap:
                    idx += gap_increment
                    continue
                idx += gene_increment
                accn = pp + "{0:0{1}d}".format(idx, pad0)
                oldaccn = r.accn
                print("\t".join((oldaccn, accn)), file=fw)
                r.accn = accn

    genes.print_to_file(newbedfile)
    logging.debug("Converted IDs written to `{0}`.".format(idsfile))
    logging.debug("Converted bed written to `{0}`.".format(newbedfile))
def bedline(self):
    """
    Return this record as a six-column BedLine.

    The start is shifted down by one, and a score of '.' is replaced
    by "1000".
    """
    score = self.score
    if score == '.':
        score = "1000"
    fields = (
        self.seqid,
        str(self.start - 1),
        str(self.end),
        self.accn,
        score,
        self.strand,
    )
    return BedLine("\t".join(fields))
def __init__(self, fh):
    """
    Read one line from `fh` and store the accn of the BED record on it as
    `self.id`; `self.id` is None when the stripped line is empty.
    """
    line = fh.readline().strip()
    if line:
        self.id = BedLine(line).accn
    else:
        self.id = None
def shuffle_twobeds(afbed, bfbed, bbfasta, prefix=None):
    """
    Shuffle the two bedfiles together.

    For every sequence in `afbed`, the stretches between its features
    (from range_interleave) are alternated with the matching features looked
    up in `bfbed.order`; sequences of `bbfasta` not seen in `afbed` are
    appended whole. The result is written to `shuffled.bed` and returned.
    """
    sz = Sizes(bbfasta)
    sizes = sz.mapping
    shuffled = "shuffled.bed"
    border = bfbed.order

    merged = []
    afbed.sort(key=afbed.nullkey)

    # Zero-pad the running counter to the width of the sequence count
    pad = int(math.log10(len(sizes))) + 1
    cj = 0
    seen = set()

    def accn(x):
        return "{0}{1:0{2}d}".format(prefix, x, pad)

    for seqid, aa in afbed.sub_beds():
        cj += 1
        abeds, bbeds = [], []
        size = sizes[seqid]
        ranges = [(x.seqid, x.start, x.end) for x in aa]
        cranges = range_interleave(ranges, sizes={seqid: size}, empty=True)
        for crange in cranges:
            if not crange:
                # Keep the slot so that the two lists stay in step
                abeds.append(None)
                continue
            seqid, start, end = crange
            bedline = "\t".join(str(x) for x in (seqid, start - 1, end))
            abeds.append(BedLine(bedline))

        for a in aa:
            _, patch = border[a.accn]
            if a.strand == "-":
                # Flip the strand, mirrored into the second extra column
                patch.extra[1] = patch.strand = "-" if patch.strand == "+" else "+"
            bbeds.append(patch)

        n_abeds, n_bbeds = len(abeds), len(bbeds)
        assert n_abeds - n_bbeds == 1, "abeds: {0}, bbeds: {1}".format(
            n_abeds, n_bbeds
        )

        beds = [x for x in roundrobin(abeds, bbeds) if x]
        if prefix:
            new_name = accn(cj)
            for b in beds:
                b.accn = new_name

        merged.extend(beds)
        seen.add(seqid)

    # Singletons
    for seqid, size in sz.iter_sizes():
        if seqid in seen:
            continue

        fields = (seqid, 0, size, accn(cj))
        b = BedLine("\t".join(str(x) for x in fields))

        cj += 1
        if prefix:
            b.accn = accn(cj)

        merged.append(b)

    shuffledbed = Bed()
    shuffledbed.extend(merged)
    shuffledbed.print_to_file(shuffled)

    return shuffledbed
def allocate(self, info, chr, start_id, end_id, id_table):
    """
    Assign gene ids to the features in `info`, which sit between the ids
    `start_id` and `end_id` on `chr`.

    Gaps overlapping the region (found by running intersectBed against
    `self.gapfile`) reserve extra ids, one per 10Kb of gap score. Ids come
    from the best stride when Stride finds one, otherwise from a prefixed
    "spill" series. New names are stored in `id_table` keyed by accn and the
    used ranks are added to `self.black`. Progress is reported on stderr.
    """
    start_bp = info[0].start
    end_bp = info[-1].end

    current_chr = chr_number(chr)

    needed = info
    assert end_id > start_id, \
        "end ({0}) > start ({1})".format(end_id, start_id)

    spots = end_id - start_id - 1
    # Free ranks strictly between the two flanking ids
    available = [
        x for x in range(start_id + 1, end_id)
        if (current_chr, x) not in self.black
    ]

    message = "{0} need {1} ids, has {2} spots ({3} available)".format(
        chr, len(needed), spots, len(available))

    start_gene = gene_name(current_chr, start_id, prefix=self.prefix,
                           pad0=self.pad0, uc=self.uc)
    end_gene = gene_name(current_chr, end_id, prefix=self.prefix,
                         pad0=self.pad0, uc=self.uc)
    message += " between {0} - {1}\n".format(start_gene, end_gene)

    assert end_bp > start_bp

    # NOTE(review): the command line is assembled from `chr` and
    # `self.gapfile`; confirm both come from trusted input.
    b = "\t".join(str(x) for x in (chr, start_bp - 1, end_bp))
    cmd = "echo '{0}' |".format(b)
    cmd += " intersectBed -a {0} -b stdin".format(self.gapfile)
    gaps = list(BedLine(x) for x in popen(cmd, debug=False))
    ngaps = len(gaps)

    gapsexpanded = []
    GeneDensity = 10000.  # assume 10Kb per gene
    for gap in gaps:
        gap_bp = int(gap.score)
        gap_ids = int(round(gap_bp / GeneDensity))
        # Repeat the gap once per id it should reserve
        gapsexpanded += [gap] * gap_ids

    lines = sorted(info + gapsexpanded, key=lambda x: x.start)

    message += "between bp: {0} - {1}, there are {2} gaps (total {3} ids)".format(
        start_bp, end_bp, ngaps, len(lines))

    needed = lines
    stride = Stride(needed, available)
    conf = stride.conf
    message += " stride: {0}".format(conf)
    print(message, file=sys.stderr)

    nneeded = len(needed)
    if conf is None:  # prefix rule - prepend version number for spills
        magic = 400000  # version 4
        firstdigit = 100000
        step = 10  # stride for the prefixed ids
        rank = start_id + magic
        if rank > magic + firstdigit:
            rank -= firstdigit
        available = []
        while len(available) != nneeded:
            rank += step
            if (current_chr, rank) in self.black:  # avoid blacklisted ids
                continue
            available.append(rank)

    else:  # follow the best stride
        available = stride.available
        if start_id == 0:  # follow right flank at start of chr
            available = available[-nneeded:]
        else:  # follow left flank otherwise
            available = available[:nneeded]

    # Finally assign the ids
    assert len(needed) == len(available)
    for b, rank in zip(needed, available):
        name = gene_name(current_chr, rank, prefix=self.prefix,
                         pad0=self.pad0, uc=self.uc)
        print("\t".join((str(b), name)), file=sys.stderr)
        id_table[b.accn] = name
        self.black.add((current_chr, rank))
    # Blank separator line between regions
    print(file=sys.stderr)
def insert(args):
    """
    %prog insert candidates.bed gaps.bed chrs.fasta unplaced.fasta

    Insert scaffolds into assembly.
    """
    # Output is `final.bed`; the intermediate files are removed at the end.
    from jcvi.formats.agp import mask, bed
    from jcvi.formats.sizes import agp

    p = OptionParser(insert.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    candidates, gapsbed, chrfasta, unplacedfasta = args
    refinedbed = refine([candidates, gapsbed])
    sizes = Sizes(unplacedfasta).mapping
    cbed = Bed(candidates)
    corder = cbed.order
    gbed = Bed(gapsbed)
    gorder = gbed.order

    gpbed = Bed()
    gappositions = {}  # (chr, start, end) => gapid

    gap_to_scf = defaultdict(list)
    seen = set()
    # Context manager so the refined BED handle is always closed
    with open(refinedbed) as fp:
        for row in fp:
            atoms = row.split()
            unplaced = atoms[3]
            strand = atoms[5]
            gapid = atoms[9]
            if gapid not in seen:
                # First time this gap shows up: remember its position
                seen.add(gapid)
                gi, gb = gorder[gapid]
                gpbed.append(gb)
                gappositions[(gb.seqid, gb.start, gb.end)] = gapid
            gap_to_scf[gapid].append((unplaced, strand))

    gpbedfile = "candidate.gaps.bed"
    gpbed.print_to_file(gpbedfile, sorted=True)

    agpfile = agp([chrfasta])
    maskedagpfile = mask([agpfile, gpbedfile])
    maskedbedfile = maskedagpfile.rsplit(".", 1)[0] + ".bed"
    bed([maskedagpfile, "--outfile={0}".format(maskedbedfile)])

    mbed = Bed(maskedbedfile)
    beds = []
    for b in mbed:
        sid = b.seqid
        key = (sid, b.start, b.end)
        if key not in gappositions:
            beds.append(b)
            continue

        # This record is one of the candidate gaps: replace it with the
        # scaffolds assigned to it
        gapid = gappositions[key]
        scfs = gap_to_scf[gapid]

        # For scaffolds placed in the same gap, sort according to positions
        scfs.sort(key=lambda x: corder[x[0]][1].start + corder[x[0]][1].end)
        for scf, strand in scfs:
            size = sizes[scf]
            beds.append(BedLine("\t".join(
                str(x) for x in (scf, 0, size, sid, 1000, strand))))

    finalbed = Bed()
    finalbed.extend(beds)
    finalbedfile = "final.bed"
    finalbed.print_to_file(finalbedfile)

    # Clean-up
    toclean = [gpbedfile, agpfile, maskedagpfile, maskedbedfile]
    FileShredder(toclean)
def simple(args):
    """
    %prog simple anchorfile --qbed=qbedfile --sbed=sbedfile [options]

    Write the block ends for each block in the anchorfile.
    GeneA1   GeneA2  GeneB1  GeneB2  +/-      score

    Optional additional columns:
    orderA1  orderA2 orderB1 orderB2 sizeA    sizeB   size    block_id

    With base coordinates (--coords):
    block_id  seqidA    startA    endA     bpSpanA  GeneA1   GeneA2  geneSpanA
    block_id  seqidB    startB    endB     bpSpanB  GeneB1   GeneB2  geneSpanB
    """
    # Output is `<anchorfile stem>.simple`, plus `<...>.simple.bed` with --bed.
    p = OptionParser(simple.__doc__)
    p.add_option("--rich", default=False, action="store_true",
                 help="Output additional columns [default: %default]")
    p.add_option(
        "--coords",
        default=False,
        action="store_true",
        help="Output columns with base coordinates [default: %default]")
    p.add_option("--bed", default=False, action="store_true",
                 help="Generate BED file for the blocks")
    p.add_option("--noheader", default=False, action="store_true",
                 help="Don't output header [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    additional = opts.rich
    coords = opts.coords
    header = not opts.noheader
    bed = opts.bed
    if bed:
        # The BED output needs base coordinates
        coords = True
        bbed = Bed()

    ac = AnchorFile(anchorfile)
    simplefile = anchorfile.rsplit(".", 1)[0] + ".simple"

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    pf = "-".join(anchorfile.split(".", 2)[:2])
    blocks = ac.blocks

    if coords:
        h = "Block|Chr|Start|End|Span|StartGene|EndGene|GeneSpan|Orientation"
    else:
        # NOTE(review): the header names Orientation before Score, while the
        # rows below are written as score then orientation. Left as is because
        # downstream readers may depend on the current file layout.
        h = "StartGeneA|EndGeneA|StartGeneB|EndGeneB|Orientation|Score"
        if additional:
            h += "|StartOrderA|EndOrderA|StartOrderB|EndOrderB|" \
                 "SizeA|SizeB|Size|Block"

    atotalbase = btotalbase = 0
    nblocks = 0  # stays 0 when the anchor file has no blocks
    with open(simplefile, "w") as fws:
        if header:
            print("\t".join(h.split("|")), file=fws)

        for i, block in enumerate(blocks):
            nblocks = i + 1
            a, b, scores = zip(*block)
            a = [qorder[x] for x in a]
            b = [sorder[x] for x in b]
            ia, oa = zip(*a)
            ib, ob = zip(*b)

            astarti, aendi = min(ia), max(ia)
            bstarti, bendi = min(ib), max(ib)
            astart, aend = min(a)[1].accn, max(a)[1].accn
            bstart, bend = min(b)[1].accn, max(b)[1].accn

            sizeA = len(set(ia))
            sizeB = len(set(ib))
            size = len(block)

            # The sign of the fitted slope gives the block orientation
            slope, intercept = np.polyfit(ia, ib, 1)
            orientation = "+" if slope >= 0 else '-'
            aspan = aendi - astarti + 1
            bspan = bendi - bstarti + 1
            score = int((aspan * bspan)**.5)
            score = str(score)
            block_id = pf + "-block-{0}".format(i)

            if coords:
                aseqid, astartbase, aendbase = \
                    get_boundary_bases(astart, aend, qorder)
                bseqid, bstartbase, bendbase = \
                    get_boundary_bases(bstart, bend, sorder)
                abase = aendbase - astartbase + 1
                bbase = bendbase - bstartbase + 1
                atotalbase += abase
                btotalbase += bbase

                # Write dual lines
                aargs = [
                    block_id, aseqid, astartbase, aendbase, abase, astart,
                    aend, aspan, "+"
                ]
                bargs = [
                    block_id, bseqid, bstartbase, bendbase, bbase, bstart,
                    bend, bspan, orientation
                ]

                if bed:
                    bbed.append(BedLine("\t".join(
                        str(x) for x in
                        (bseqid, bstartbase - 1, bendbase,
                         "{}:{}-{}".format(aseqid, astartbase, aendbase),
                         size, orientation))))

                for row in (aargs, bargs):
                    print("\t".join(str(x) for x in row), file=fws)
                continue

            row = [astart, aend, bstart, bend, score, orientation]
            if additional:
                row += [
                    astarti, aendi, bstarti, bendi, sizeA, sizeB, size,
                    block_id
                ]
            print("\t".join(str(x) for x in row), file=fws)

    logging.debug("A total of {0} blocks written to `{1}`.".format(
        nblocks, simplefile))

    if coords:
        print("Total block span in {0}: {1}".format(
            qbed.filename, human_size(atotalbase, precision=2)),
            file=sys.stderr)
        print("Total block span in {0}: {1}".format(
            sbed.filename, human_size(btotalbase, precision=2)),
            file=sys.stderr)
        smaller = min(atotalbase, btotalbase)
        if smaller:
            # The ratio is undefined when either total is zero (e.g. no blocks)
            print("Ratio: {0:.1f}x".format(
                max(atotalbase, btotalbase) * 1. / smaller), file=sys.stderr)

    if bed:
        bedfile = simplefile + ".bed"
        bbed.print_to_file(filename=bedfile, sorted=True)
        logging.debug("Bed file written to `{}`".format(bedfile))
def install(args):
    """
    %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta

    Install patches into backbone, using sequences from alternative assembly.
    The patches sequences are generated via jcvi.assembly.patch.fill().

    The output is a bedfile that can be converted to AGP using
    jcvi.formats.agp.frombed().
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    #
    # Files written in the working directory: before.bed, after.bed,
    # before.filtered.bed, after.filtered.bed and shuffled.bed (plus whatever
    # blast() and fastaFromBed() produce).
    import math  # kept for parity with the original local import

    from jcvi.apps.base import blast
    from jcvi.formats.blast import BlastSlow
    from jcvi.formats.fasta import SeqIO
    from jcvi.utils.iter import roundrobin

    p = OptionParser(install.__doc__)
    p.add_option("--rclip", default=1, type="int",
                 help="Pair ID is derived from rstrip N chars [default: %default]")
    p.add_option("--maxsize", default=1000000, type="int",
                 help="Maximum size of patchers to be replaced [default: %default]")
    p.add_option("--prefix",
                 help="Prefix of the new object [default: %default]")
    p.add_option("--strict", default=False, action="store_true",
                 help="Only update if replacement has no gaps [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    pbed, pfasta, bbfasta, altfasta = args
    maxsize = opts.maxsize  # Max DNA size to replace gap
    rclip = opts.rclip
    prefix = opts.prefix

    blastfile = blast([altfasta, pfasta, "--wordsize=100", "--pctid=99"])
    order = Bed(pbed).order
    beforebed, afterbed = "before.bed", "after.bed"

    # Group hits by query name with the last `rclip` characters removed, so
    # the L and R flank of one patcher share a key.  With rclip == 0 the full
    # query name is the key.  (The conditional is chosen outside the key
    # function: written inline in a lambda it binds inside the lambda body
    # and would return the fallback *function object* as the key.)
    if rclip:
        def pair_key(hit):
            return hit.query[:-rclip]
    else:
        def pair_key(hit):
            return hit.query

    def tabbed(fields):
        # One tab-separated, newline-terminated BED row.
        return "\t".join(str(x) for x in fields) + "\n"

    data = BlastSlow(blastfile)
    # Context managers make sure both BED files are flushed and closed before
    # fastaFromBed() reads them, even if a record triggers an exception.
    with open(beforebed, "w") as fwa, open(afterbed, "w") as fwb:
        for _, lines in groupby(data, key=pair_key):
            lines = list(lines)
            # Only patchers with exactly one hit per flank are usable.
            if len(lines) != 2:
                continue

            a, b = lines

            aquery, bquery = a.query, b.query
            asubject, bsubject = a.subject, b.subject
            # Both flanks must land on the same alternative sequence.
            if asubject != bsubject:
                continue

            astrand, bstrand = a.orientation, b.orientation
            assert aquery[-1] == 'L' and bquery[-1] == 'R', \
                str((aquery, bquery))

            _, ax = order[aquery]
            _, bx = order[bquery]
            # Translate flank-local hit coordinates into backbone coordinates.
            qstart, qstop = ax.start + a.qstart - 1, bx.start + b.qstop - 1

            if astrand == '+' and bstrand == '+':
                sstart, sstop = a.sstart, b.sstop
            elif astrand == '-' and bstrand == '-':
                sstart, sstop = b.sstart, a.sstop
            else:
                # Flanks hit opposite strands: not a colinear replacement.
                continue

            if sstart > sstop:
                continue

            if sstop > sstart + maxsize:
                continue

            name = aquery[:-1] + "LR"
            fwa.write(tabbed(
                (ax.seqid, qstart - 1, qstop, name, 1000, "+")))
            fwb.write(tabbed(
                (asubject, sstart - 1, sstop, name, 1000, astrand)))

    beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True)
    afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True)

    # Exclude the replacements that contain more Ns than before
    ah = SeqIO.parse(beforefasta, "fasta")
    bh = SeqIO.parse(afterfasta, "fasta")

    def count_ns(rec):
        return rec.seq.count('n') + rec.seq.count('N')

    exclude = set()
    for arec, brec in zip(ah, bh):
        an = count_ns(arec)
        bn = count_ns(brec)
        if opts.strict:
            # --strict: accept only gap-free replacements.
            keep = bn == 0
        else:
            # Otherwise accept only replacements with strictly fewer Ns.
            keep = bn < an
        if not keep:
            exclude.add(arec.id)

    logging.debug("Ignore {0} updates because of decreasing quality."
                  .format(len(exclude)))

    abed = [x for x in Bed(beforebed, sorted=False) if x.accn not in exclude]
    bbed = [x for x in Bed(afterbed, sorted=False) if x.accn not in exclude]

    abedfile = "before.filtered.bed"
    bbedfile = "after.filtered.bed"
    afbed = Bed()
    afbed.extend(abed)
    bfbed = Bed()
    bfbed.extend(bbed)

    afbed.print_to_file(abedfile)
    bfbed.print_to_file(bbedfile)

    # Shuffle the two bedfiles together
    sz = Sizes(bbfasta)
    sizes = sz.mapping
    shuffled = "shuffled.bed"
    border = bfbed.order

    merged = []
    afbed.sort(key=afbed.nullkey)
    totalids = len(sizes)
    # Zero-pad width = number of decimal digits in the sequence count.  Same
    # value as int(math.log10(n)) + 1 for n >= 1, but also defined for n == 0
    # where log10 would raise ValueError.
    pad = len(str(totalids))
    cj = 0
    seen = set()

    def accn(idx):
        return "{0}{1:0{2}d}".format(prefix, idx, pad)

    for seqid, aa in afbed.sub_beds():
        cj += 1
        abeds, bbeds = [], []
        size = sizes[seqid]
        ranges = [(x.seqid, x.start, x.end) for x in aa]
        # Backbone stretches between (and around) the replaced regions.
        cranges = range_interleave(ranges, sizes={seqid: size})
        for cseqid, start, end in cranges:
            abeds.append(BedLine(tabbed((cseqid, start - 1, end)).rstrip("\n")))

        # Matching replacement interval (from the alternative assembly) for
        # every replaced region, looked up by shared name.
        for a in aa:
            _, b = border[a.accn]
            bbeds.append(b)

        head = abeds[0] if abeds else []
        assert abs(len(abeds) - len(bbeds)) <= 1
        # roundrobin() alternates starting with its first argument; when there
        # is no leading backbone stretch (none at all, or the first one does
        # not begin at position 1) the replacement has to come first.
        if (not head) or head.start > 1:
            abeds, bbeds = bbeds, abeds

        beds = list(roundrobin(abeds, bbeds))
        if prefix:
            for b in beds:
                b.accn = accn(cj)

        merged.extend(beds)
        seen.add(seqid)

    # Singletons
    for seqid, size in sz.iter_sizes():
        if seqid in seen:
            continue

        # NOTE(review): the placeholder name here is built from the counter
        # *before* it is incremented and is only overwritten when --prefix is
        # given; without --prefix the record keeps a "None..."-style name.
        # Behaviour preserved as-is -- confirm whether that is intended.
        b = BedLine(tabbed((seqid, 0, size, accn(cj))).rstrip("\n"))

        cj += 1
        if prefix:
            b.accn = accn(cj)

        merged.append(b)

    shuffledbed = Bed()
    shuffledbed.extend(merged)
    shuffledbed.print_to_file(shuffled)