def patcher(args): """ %prog patcher backbone.bed other.bed Given optical map alignment, prepare the patchers. Use --backbone to suggest which assembly is the major one, and the patchers will be extracted from another assembly. """ from jcvi.formats.bed import uniq p = OptionParser(patcher.__doc__) p.add_option("--backbone", default="OM", help="Prefix of the backbone assembly [default: %default]") p.add_option("--object", default="object", help="New object name [default: %default]") opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) backbonebed, otherbed = args backbonebed = uniq([backbonebed]) otherbed = uniq([otherbed]) bb = opts.backbone pf = backbonebed.split(".")[0] key = lambda x: (x.seqid, x.start, x.end) is_bb = lambda x: x.startswith(bb) # Make a uniq bed keeping backbone at redundant intervals cmd = "intersectBed -v -wa" cmd += " -a {0} -b {1}".format(otherbed, backbonebed) outfile = otherbed.rsplit(".", 1)[0] + ".not." + backbonebed sh(cmd, outfile=outfile) uniqbed = Bed() uniqbedfile = pf + ".merged.bed" uniqbed.extend(Bed(backbonebed)) uniqbed.extend(Bed(outfile)) uniqbed.print_to_file(uniqbedfile, sorted=True) # Condense adjacent intervals, allow some chaining bed = uniqbed key = lambda x: range_parse(x.accn).seqid bed_fn = pf + ".patchers.bed" bed_fw = open(bed_fn, "w") for k, sb in groupby(bed, key=key): sb = list(sb) chr, start, end, strand = merge_ranges(sb) id = "{0}:{1}-{2}".format(chr, start, end) print >> bed_fw, "\t".join(str(x) for x in \ (chr, start, end, opts.object, 1000, strand)) bed_fw.close()
def tips(args): """ %prog tips patchers.bed complements.bed original.fasta backbone.fasta Append telomeric sequences based on patchers and complements. """ p = OptionParser(tips.__doc__) opts, args = p.parse_args(args) if len(args) != 4: sys.exit(not p.print_help()) pbedfile, cbedfile, sizesfile, bbfasta = args pbed = Bed(pbedfile, sorted=False) cbed = Bed(cbedfile, sorted=False) complements = dict() for object, beds in groupby(cbed, key=lambda x: x.seqid): beds = list(beds) complements[object] = beds sizes = Sizes(sizesfile).mapping bbsizes = Sizes(bbfasta).mapping tbeds = [] for object, beds in groupby(pbed, key=lambda x: x.accn): beds = list(beds) startbed, endbed = beds[0], beds[-1] start_id, end_id = startbed.seqid, endbed.seqid if startbed.start == 1: start_id = None if endbed.end == sizes[end_id]: end_id = None print(object, start_id, end_id, file=sys.stderr) if start_id: b = complements[start_id][0] b.accn = object tbeds.append(b) tbeds.append( BedLine( "\t".join( str(x) for x in (object, 0, bbsizes[object], object, 1000, "+") ) ) ) if end_id: b = complements[end_id][-1] b.accn = object tbeds.append(b) tbed = Bed() tbed.extend(tbeds) tbedfile = "tips.bed" tbed.print_to_file(tbedfile)
def patcher(args): """ %prog patcher backbone.bed other.bed Given optical map alignment, prepare the patchers. Use --backbone to suggest which assembly is the major one, and the patchers will be extracted from another assembly. """ from jcvi.formats.bed import uniq p = OptionParser(patcher.__doc__) p.add_option("--backbone", default="OM", help="Prefix of the backbone assembly [default: %default]") p.add_option("--object", default="object", help="New object name [default: %default]") opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) backbonebed, otherbed = args backbonebed = uniq([backbonebed]) otherbed = uniq([otherbed]) pf = backbonebed.split(".")[0] key = lambda x: (x.seqid, x.start, x.end) # Make a uniq bed keeping backbone at redundant intervals cmd = "intersectBed -v -wa" cmd += " -a {0} -b {1}".format(otherbed, backbonebed) outfile = otherbed.rsplit(".", 1)[0] + ".not." + backbonebed sh(cmd, outfile=outfile) uniqbed = Bed() uniqbedfile = pf + ".merged.bed" uniqbed.extend(Bed(backbonebed)) uniqbed.extend(Bed(outfile)) uniqbed.print_to_file(uniqbedfile, sorted=True) # Condense adjacent intervals, allow some chaining bed = uniqbed key = lambda x: range_parse(x.accn).seqid bed_fn = pf + ".patchers.bed" bed_fw = open(bed_fn, "w") for k, sb in groupby(bed, key=key): sb = list(sb) chr, start, end, strand = merge_ranges(sb) print >> bed_fw, "\t".join(str(x) for x in \ (chr, start, end, opts.object, 1000, strand)) bed_fw.close()
def tips(args): """ %prog tips patchers.bed complements.bed original.fasta backbone.fasta Append telomeric sequences based on patchers and complements. """ p = OptionParser(tips.__doc__) opts, args = p.parse_args(args) if len(args) != 4: sys.exit(not p.print_help()) pbedfile, cbedfile, sizesfile, bbfasta = args pbed = Bed(pbedfile, sorted=False) cbed = Bed(cbedfile, sorted=False) complements = dict() for object, beds in groupby(cbed, key=lambda x: x.seqid): beds = list(beds) complements[object] = beds sizes = Sizes(sizesfile).mapping bbsizes = Sizes(bbfasta).mapping tbeds = [] for object, beds in groupby(pbed, key=lambda x: x.accn): beds = list(beds) startbed, endbed = beds[0], beds[-1] start_id, end_id = startbed.seqid, endbed.seqid if startbed.start == 1: start_id = None if endbed.end == sizes[end_id]: end_id = None print >> sys.stderr, object, start_id, end_id if start_id: b = complements[start_id][0] b.accn = object tbeds.append(b) tbeds.append(BedLine("\t".join(str(x) for x in \ (object, 0, bbsizes[object], object, 1000, "+")))) if end_id: b = complements[end_id][-1] b.accn = object tbeds.append(b) tbed = Bed() tbed.extend(tbeds) tbedfile = "tips.bed" tbed.print_to_file(tbedfile)
def install(args): """ %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta Install patches into backbone, using sequences from alternative assembly. The patches sequences are generated via jcvi.assembly.patch.fill(). The output is a bedfile that can be converted to AGP using jcvi.formats.agp.frombed(). """ from jcvi.apps.align import blast from jcvi.formats.fasta import SeqIO p = OptionParser(install.__doc__) p.set_rclip(rclip=1) p.add_option( "--maxsize", default=300000, type="int", help="Maximum size of patchers to be replaced", ) p.add_option("--prefix", help="Prefix of the new object") p.add_option( "--strict", default=False, action="store_true", help="Only update if replacement has no gaps", ) opts, args = p.parse_args(args) if len(args) != 4: sys.exit(not p.print_help()) pbed, pfasta, bbfasta, altfasta = args maxsize = opts.maxsize # Max DNA size to replace gap rclip = opts.rclip blastfile = blast([altfasta, pfasta, "--wordsize=100", "--pctid=99"]) order = Bed(pbed).order beforebed, afterbed = blast_to_twobeds( blastfile, order, rclip=rclip, maxsize=maxsize ) beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True) afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True) # Exclude the replacements that contain more Ns than before ah = SeqIO.parse(beforefasta, "fasta") bh = SeqIO.parse(afterfasta, "fasta") count_Ns = lambda x: x.seq.count("n") + x.seq.count("N") exclude = set() for arec, brec in zip(ah, bh): an = count_Ns(arec) bn = count_Ns(brec) if opts.strict: if bn == 0: continue elif bn < an: continue id = arec.id exclude.add(id) logging.debug( "Ignore {0} updates because of decreasing quality.".format(len(exclude)) ) abed = Bed(beforebed, sorted=False) bbed = Bed(afterbed, sorted=False) abed = [x for x in abed if x.accn not in exclude] bbed = [x for x in bbed if x.accn not in exclude] abedfile = "before.filtered.bed" bbedfile = "after.filtered.bed" afbed = Bed() afbed.extend(abed) bfbed = Bed() bfbed.extend(bbed) afbed.print_to_file(abedfile) bfbed.print_to_file(bbedfile) shuffle_twobeds(afbed, bfbed, bbfasta, prefix=opts.prefix)
def shuffle_twobeds(afbed, bfbed, bbfasta, prefix=None): # Shuffle the two bedfiles together sz = Sizes(bbfasta) sizes = sz.mapping shuffled = "shuffled.bed" border = bfbed.order all = [] afbed.sort(key=afbed.nullkey) totalids = len(sizes) pad = int(math.log10(totalids)) + 1 cj = 0 seen = set() accn = lambda x: "{0}{1:0{2}d}".format(prefix, x, pad) for seqid, aa in afbed.sub_beds(): cj += 1 abeds, bbeds, beds = [], [], [] size = sizes[seqid] ranges = [(x.seqid, x.start, x.end) for x in aa] cranges = range_interleave(ranges, sizes={seqid: size}, empty=True) for crange in cranges: if crange: seqid, start, end = crange bedline = "\t".join(str(x) for x in (seqid, start - 1, end)) abeds.append(BedLine(bedline)) else: abeds.append(None) for a in aa: gapid = a.accn bi, b = border[gapid] if a.strand == "-": b.extra[1] = b.strand = "-" if b.strand == "+" else "+" bbeds.append(b) n_abeds = len(abeds) n_bbeds = len(bbeds) assert n_abeds - n_bbeds == 1, "abeds: {0}, bbeds: {1}".format(n_abeds, n_bbeds) beds = [x for x in roundrobin(abeds, bbeds) if x] if prefix: for b in beds: b.accn = accn(cj) all.extend(beds) seen.add(seqid) # Singletons for seqid, size in sz.iter_sizes(): if seqid in seen: continue bedline = "\t".join(str(x) for x in (seqid, 0, size, accn(cj))) b = BedLine(bedline) cj += 1 if prefix: b.accn = accn(cj) all.append(b) shuffledbed = Bed() shuffledbed.extend(all) shuffledbed.print_to_file(shuffled) return shuffledbed
def bambus(args): """ %prog bambus bambus.bed bambus.mates total.fasta Insert unplaced scaffolds based on mates. """ from jcvi.formats.bed import BedLine from jcvi.formats.posmap import MatesFile p = OptionParser(bambus.__doc__) p.add_option( "--prefix", default="scaffold", help="Prefix of the unplaced scaffolds", ) p.add_option( "--minlinks", default=3, type="int", help="Minimum number of links to place", ) opts, args = p.parse_args(args) if len(args) != 3: sys.exit(not p.print_help()) bedfile, matesfile, fastafile = args pf = matesfile.rsplit(".", 1)[0] logfile = pf + ".log" log = open(logfile, "w") mf = MatesFile(matesfile) maxdist = max(x.max for x in mf.libraries.values()) logging.debug("Max separation: {0}".format(maxdist)) prefix = opts.prefix minlinks = opts.minlinks is_unplaced = lambda x: x.startswith(prefix) bed = Bed(bedfile, sorted=False) beds = [] unplaced = defaultdict(list) for a, b in pairwise(bed): aname, bname = a.accn, b.accn aseqid, bseqid = a.seqid, b.seqid if aname not in mf: continue pa, la = mf[aname] if pa != bname: continue ia = is_unplaced(aseqid) ib = is_unplaced(bseqid) if ia == ib: continue if ia: a, b = b, a unplaced[b.seqid].append((a, b)) beds.extend([a, b]) sizes = Sizes(fastafile) candidatebed = Bed() cbeds = [] # For each unplaced scaffold, find most likely placement and orientation for scf, beds in sorted(unplaced.items()): print(file=log) ranges = [] for a, b in beds: aname, astrand = a.accn, a.strand bname, bstrand = b.accn, b.strand aseqid, bseqid = a.seqid, b.seqid pa, lib = mf[aname] print(a, file=log) print(b, file=log) flip_b = astrand == bstrand fbstrand = "-" if flip_b else "+" if flip_b: b.reverse_complement(sizes) lmin, lmax = lib.min, lib.max L = sizes.get_size(scf) assert astrand in ("+", "-") if astrand == "+": offset = a.start - b.end sstart, sstop = offset + lmin, offset + lmax else: offset = a.end - b.start + L sstart, sstop = offset - lmax, offset - lmin # Prevent out of range error size = sizes.get_size(aseqid) sstart = max(0, sstart) sstop = max(0, sstop) sstart = min(size - 1, sstart) sstop = min(size - 1, sstop) start_range = (aseqid, sstart, sstop, scf, 1, fbstrand) print("*" + "\t".join(str(x) for x in start_range), file=log) ranges.append(start_range) mranges = [x[:3] for x in ranges] # Determine placement by finding the interval with the most support rd = ranges_depth(mranges, sizes.mapping, verbose=False) alldepths = [] for depth in rd: alldepths.extend(depth) print(alldepths, file=log) maxdepth = max(alldepths, key=lambda x: x[-1])[-1] if maxdepth < minlinks: print("Insufficient links ({0} < {1})".format(maxdepth, minlinks), file=log) continue candidates = [x for x in alldepths if x[-1] == maxdepth] nseqids = len(set(x[0] for x in candidates)) if nseqids != 1: msg = "Multiple conflicting candidates found" print(msg, file=log) continue seqid, mmin, mmax, depth = candidates[0] mmin, mmax = range_minmax([x[1:3] for x in candidates]) if mmin >= mmax: msg = "Invalid (min, max) range" print("Invalid (min, max) range", file=log) continue if (mmax - mmin) > maxdist: msg = "(min, max) distance greater than library maxdist" print(msg, file=log) continue # Determine orientation by voting nplus, nminus = 0, 0 arange = (seqid, mmin, mmax) for sid, start, end, sf, sc, fbstrand in ranges: brange = (sid, start, end) if range_overlap(arange, brange): if fbstrand == "+": nplus += 1 else: nminus += 1 fbstrand = "+" if nplus >= nminus else "-" candidate = (seqid, mmin, mmax, scf, depth, fbstrand) bedline = BedLine("\t".join((str(x) for x in candidate))) cbeds.append(bedline) print("Plus: {0}, Minus: {1}".format(nplus, nminus), file=log) print(candidate, file=log) candidatebed.extend(cbeds) logging.debug("A total of {0} scaffolds can be placed.".format(len(candidatebed))) log.close() candidatebedfile = pf + ".candidate.bed" candidatebed.print_to_file(candidatebedfile, sorted=True)
def install(args): """ %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta Install patches into backbone, using sequences from alternative assembly. The patches sequences are generated via jcvi.assembly.patch.fill(). The output is a bedfile that can be converted to AGP using jcvi.formats.agp.frombed(). """ from jcvi.apps.align import blast from jcvi.formats.fasta import SeqIO p = OptionParser(install.__doc__) p.set_rclip(rclip=1) p.add_option("--maxsize", default=300000, type="int", help="Maximum size of patchers to be replaced [default: %default]") p.add_option("--prefix", help="Prefix of the new object [default: %default]") p.add_option("--strict", default=False, action="store_true", help="Only update if replacement has no gaps [default: %default]") opts, args = p.parse_args(args) if len(args) != 4: sys.exit(not p.print_help()) pbed, pfasta, bbfasta, altfasta = args maxsize = opts.maxsize # Max DNA size to replace gap rclip = opts.rclip blastfile = blast([altfasta, pfasta,"--wordsize=100", "--pctid=99"]) order = Bed(pbed).order beforebed, afterbed = blast_to_twobeds(blastfile, order, rclip=rclip, maxsize=maxsize) beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True) afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True) # Exclude the replacements that contain more Ns than before ah = SeqIO.parse(beforefasta, "fasta") bh = SeqIO.parse(afterfasta, "fasta") count_Ns = lambda x: x.seq.count('n') + x.seq.count('N') exclude = set() for arec, brec in zip(ah, bh): an = count_Ns(arec) bn = count_Ns(brec) if opts.strict: if bn == 0: continue elif bn < an: continue id = arec.id exclude.add(id) logging.debug("Ignore {0} updates because of decreasing quality."\ .format(len(exclude))) abed = Bed(beforebed, sorted=False) bbed = Bed(afterbed, sorted=False) abed = [x for x in abed if x.accn not in exclude] bbed = [x for x in bbed if x.accn not in exclude] abedfile = "before.filtered.bed" bbedfile = "after.filtered.bed" afbed = Bed() afbed.extend(abed) bfbed = Bed() bfbed.extend(bbed) afbed.print_to_file(abedfile) bfbed.print_to_file(bbedfile) shuffle_twobeds(afbed, bfbed, bbfasta, prefix=opts.prefix)
def shuffle_twobeds(afbed, bfbed, bbfasta, prefix=None): # Shuffle the two bedfiles together sz = Sizes(bbfasta) sizes = sz.mapping shuffled = "shuffled.bed" border = bfbed.order all = [] afbed.sort(key=afbed.nullkey) totalids = len(sizes) pad = int(math.log10(totalids)) + 1 cj = 0 seen = set() accn = lambda x: "{0}{1:0{2}d}".format(prefix, x, pad) for seqid, aa in afbed.sub_beds(): cj += 1 abeds, bbeds, beds = [], [], [] size = sizes[seqid] ranges = [(x.seqid, x.start, x.end) for x in aa] cranges = range_interleave(ranges, sizes={seqid: size}, empty=True) for crange in cranges: if crange: seqid, start, end = crange bedline = "\t".join(str(x) for x in (seqid, start - 1, end)) abeds.append(BedLine(bedline)) else: abeds.append(None) for a in aa: gapid = a.accn bi, b = border[gapid] if a.strand == '-': b.extra[1] = b.strand = ('-' if b.strand == '+' else '+') bbeds.append(b) n_abeds = len(abeds) n_bbeds = len(bbeds) assert n_abeds - n_bbeds == 1, \ "abeds: {0}, bbeds: {1}".format(n_abeds, n_bbeds) beds = [x for x in roundrobin(abeds, bbeds) if x] if prefix: for b in beds: b.accn = accn(cj) all.extend(beds) seen.add(seqid) # Singletons for seqid, size in sz.iter_sizes(): if seqid in seen: continue bedline = "\t".join(str(x) for x in (seqid, 0, size, accn(cj))) b = BedLine(bedline) cj += 1 if prefix: b.accn = accn(cj) all.append(b) shuffledbed = Bed() shuffledbed.extend(all) shuffledbed.print_to_file(shuffled) return shuffledbed
def bambus(args): """ %prog bambus bambus.bed bambus.mates total.fasta Insert unplaced scaffolds based on mates. """ from jcvi.utils.iter import pairwise from jcvi.formats.posmap import MatesFile p = OptionParser(bambus.__doc__) p.add_option("--prefix", default="scaffold", help="Prefix of the unplaced scaffolds [default: %default]") p.add_option("--minlinks", default=3, type="int", help="Minimum number of links to place [default: %default]") opts, args = p.parse_args(args) if len(args) != 3: sys.exit(not p.print_help()) bedfile, matesfile, fastafile = args pf = matesfile.rsplit(".", 1)[0] logfile = pf + ".log" log = open(logfile, "w") mf = MatesFile(matesfile) maxdist = max(x.max for x in mf.libraries.values()) logging.debug("Max separation: {0}".format(maxdist)) prefix = opts.prefix minlinks = opts.minlinks is_unplaced = lambda x: x.startswith(prefix) bed = Bed(bedfile, sorted=False) beds = [] unplaced = defaultdict(list) for a, b in pairwise(bed): aname, bname = a.accn, b.accn aseqid, bseqid = a.seqid, b.seqid if aname not in mf: continue pa, la = mf[aname] if pa != bname: continue ia = is_unplaced(aseqid) ib = is_unplaced(bseqid) if ia == ib: continue if ia: a, b = b, a unplaced[b.seqid].append((a, b)) beds.extend([a, b]) sizes = Sizes(fastafile) candidatebed = Bed() cbeds = [] # For each unplaced scaffold, find most likely placement and orientation for scf, beds in sorted(unplaced.items()): print >> log ranges = [] for a, b in beds: aname, astrand = a.accn, a.strand bname, bstrand = b.accn, b.strand aseqid, bseqid = a.seqid, b.seqid pa, lib = mf[aname] print >> log, a print >> log, b flip_b = (astrand == bstrand) fbstrand = '-' if flip_b else '+' if flip_b: b.reverse_complement(sizes) lmin, lmax = lib.min, lib.max L = sizes.get_size(scf) assert astrand in ('+', '-') if astrand == '+': offset = a.start - b.end sstart, sstop = offset + lmin, offset + lmax else: offset = a.end - b.start + L sstart, sstop = offset - lmax, offset - lmin # Prevent out of range error size = sizes.get_size(aseqid) sstart = max(0, sstart) sstop = max(0, sstop) sstart = min(size - 1, sstart) sstop = min(size - 1, sstop) start_range = (aseqid, sstart, sstop, scf, 1, fbstrand) print >> log, "*" + "\t".join(str(x) for x in start_range) ranges.append(start_range) mranges = [x[:3] for x in ranges] # Determine placement by finding the interval with the most support rd = ranges_depth(mranges, sizes.mapping, verbose=False) alldepths = [] for depth in rd: alldepths.extend(depth) print >> log, alldepths maxdepth = max(alldepths, key=lambda x: x[-1])[-1] if maxdepth < minlinks: print >> log, "Insufficient links ({0} < {1})".format(maxdepth, minlinks) continue candidates = [x for x in alldepths if x[-1] == maxdepth] nseqids = len(set(x[0] for x in candidates)) msg = "Multiple conflicting candidates found" if nseqids != 1: print >> log, msg continue seqid, mmin, mmax, depth = candidates[0] mmin, mmax = range_minmax([x[1:3] for x in candidates]) if (mmax - mmin) > maxdist: print >> log, msg continue # Determine orientation by voting nplus, nminus = 0, 0 arange = (seqid, mmin, mmax) for sid, start, end, sf, sc, fbstrand in ranges: brange = (sid, start, end) if range_overlap(arange, brange): if fbstrand == '+': nplus += 1 else: nminus += 1 fbstrand = '+' if nplus >= nminus else '-' candidate = (seqid, mmin, mmax, scf, depth, fbstrand) bedline = BedLine("\t".join((str(x) for x in candidate))) cbeds.append(bedline) print >> log, "Plus: {0}, Minus: {1}".format(nplus, nminus) print >> log, candidate candidatebed.extend(cbeds) logging.debug("A total of {0} scaffolds can be placed.".\ format(len(candidatebed))) log.close() candidatebedfile = pf + ".candidate.bed" candidatebed.print_to_file(candidatebedfile, sorted=True)
def insert(args): """ %prog insert candidates.bed gaps.bed chrs.fasta unplaced.fasta Insert scaffolds into assembly. """ from jcvi.formats.agp import mask, bed from jcvi.formats.sizes import agp p = OptionParser(insert.__doc__) opts, args = p.parse_args(args) if len(args) != 4: sys.exit(not p.print_help()) candidates, gapsbed, chrfasta, unplacedfasta = args refinedbed = refine([candidates, gapsbed]) sizes = Sizes(unplacedfasta).mapping cbed = Bed(candidates) corder = cbed.order gbed = Bed(gapsbed) gorder = gbed.order gpbed = Bed() gappositions = {} # (chr, start, end) => gapid fp = open(refinedbed) gap_to_scf = defaultdict(list) seen = set() for row in fp: atoms = row.split() unplaced = atoms[3] strand = atoms[5] gapid = atoms[9] if gapid not in seen: seen.add(gapid) gi, gb = gorder[gapid] gpbed.append(gb) gappositions[(gb.seqid, gb.start, gb.end)] = gapid gap_to_scf[gapid].append((unplaced, strand)) gpbedfile = "candidate.gaps.bed" gpbed.print_to_file(gpbedfile, sorted=True) agpfile = agp([chrfasta]) maskedagpfile = mask([agpfile, gpbedfile]) maskedbedfile = maskedagpfile.rsplit(".", 1)[0] + ".bed" bed([maskedagpfile, "--outfile={0}".format(maskedbedfile)]) mbed = Bed(maskedbedfile) beds = [] for b in mbed: sid = b.seqid key = (sid, b.start, b.end) if key not in gappositions: beds.append(b) continue gapid = gappositions[key] scfs = gap_to_scf[gapid] # For scaffolds placed in the same gap, sort according to positions scfs.sort(key=lambda x: corder[x[0]][1].start + corder[x[0]][1].end) for scf, strand in scfs: size = sizes[scf] beds.append(BedLine("\t".join(str(x) for x in \ (scf, 0, size, sid, 1000, strand)))) finalbed = Bed() finalbed.extend(beds) finalbedfile = "final.bed" finalbed.print_to_file(finalbedfile) # Clean-up toclean = [gpbedfile, agpfile, maskedagpfile, maskedbedfile] FileShredder(toclean)
def install(args): """ %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta Install patches into backbone, using sequences from alternative assembly. The patches sequences are generated via jcvi.assembly.patch.fill(). The output is a bedfile that can be converted to AGP using jcvi.formats.agp.frombed(). """ from jcvi.apps.base import blast from jcvi.formats.blast import BlastSlow from jcvi.formats.fasta import SeqIO from jcvi.utils.iter import roundrobin p = OptionParser(install.__doc__) p.add_option( "--rclip", default=1, type="int", help="Pair ID is derived from rstrip N chars [default: %default]") p.add_option( "--maxsize", default=1000000, type="int", help="Maximum size of patchers to be replaced [default: %default]") p.add_option("--prefix", help="Prefix of the new object [default: %default]") p.add_option( "--strict", default=False, action="store_true", help="Only update if replacement has no gaps [default: %default]") opts, args = p.parse_args(args) if len(args) != 4: sys.exit(not p.print_help()) pbed, pfasta, bbfasta, altfasta = args Max = opts.maxsize # Max DNA size to replace gap rclip = opts.rclip prefix = opts.prefix blastfile = blast([altfasta, pfasta, "--wordsize=100", "--pctid=99"]) order = Bed(pbed).order beforebed, afterbed = "before.bed", "after.bed" fwa = open(beforebed, "w") fwb = open(afterbed, "w") key1 = lambda x: x.query key2 = lambda x: x.query[:-rclip] if rclip else key1 data = BlastSlow(blastfile) for pe, lines in groupby(data, key=key2): lines = list(lines) if len(lines) != 2: continue a, b = lines aquery, bquery = a.query, b.query asubject, bsubject = a.subject, b.subject if asubject != bsubject: continue astrand, bstrand = a.orientation, b.orientation assert aquery[-1] == 'L' and bquery[-1] == 'R', str((aquery, bquery)) ai, ax = order[aquery] bi, bx = order[bquery] qstart, qstop = ax.start + a.qstart - 1, bx.start + b.qstop - 1 if astrand == '+' and bstrand == '+': sstart, sstop = a.sstart, b.sstop elif astrand == '-' and bstrand == '-': sstart, sstop = b.sstart, a.sstop else: continue if sstart > sstop: continue if sstop > sstart + Max: continue name = aquery[:-1] + "LR" print >> fwa, "\t".join(str(x) for x in \ (ax.seqid, qstart - 1, qstop, name, 1000, "+")) print >> fwb, "\t".join(str(x) for x in \ (asubject, sstart - 1, sstop, name, 1000, astrand)) fwa.close() fwb.close() beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True) afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True) # Exclude the replacements that contain more Ns than before ah = SeqIO.parse(beforefasta, "fasta") bh = SeqIO.parse(afterfasta, "fasta") count_Ns = lambda x: x.seq.count('n') + x.seq.count('N') exclude = set() for arec, brec in zip(ah, bh): an = count_Ns(arec) bn = count_Ns(brec) if opts.strict: if bn == 0: continue elif bn < an: continue id = arec.id exclude.add(id) logging.debug("Ignore {0} updates because of decreasing quality."\ .format(len(exclude))) abed = Bed(beforebed, sorted=False) bbed = Bed(afterbed, sorted=False) abed = [x for x in abed if x.accn not in exclude] bbed = [x for x in bbed if x.accn not in exclude] abedfile = "before.filtered.bed" bbedfile = "after.filtered.bed" afbed = Bed() afbed.extend(abed) bfbed = Bed() bfbed.extend(bbed) afbed.print_to_file(abedfile) bfbed.print_to_file(bbedfile) # Shuffle the two bedfiles together sz = Sizes(bbfasta) sizes = sz.mapping shuffled = "shuffled.bed" border = bfbed.order all = [] afbed.sort(key=afbed.nullkey) totalids = len(sizes) import math pad = int(math.log10(totalids)) + 1 cj = 0 seen = set() accn = lambda x: "{0}{1:0{2}d}".format(prefix, x, pad) for seqid, aa in afbed.sub_beds(): cj += 1 abeds, bbeds, beds = [], [], [] size = sizes[seqid] ranges = [(x.seqid, x.start, x.end) for x in aa] cranges = range_interleave(ranges, sizes={seqid: size}) for seqid, start, end in cranges: bedline = "\t".join(str(x) for x in (seqid, start - 1, end)) abeds.append(BedLine(bedline)) for a in aa: gapid = a.accn bi, b = border[gapid] bbeds.append(b) a = abeds[0] if abeds else [] assert abs(len(abeds) - len(bbeds)) <= 1 if (not a) or a.start > 1: abeds, bbeds = bbeds, abeds beds = list(roundrobin(abeds, bbeds)) if prefix: for b in beds: b.accn = accn(cj) all.extend(beds) seen.add(seqid) # Singletons for seqid, size in sz.iter_sizes(): if seqid in seen: continue bedline = "\t".join(str(x) for x in (seqid, 0, size, accn(cj))) b = BedLine(bedline) cj += 1 if prefix: b.accn = accn(cj) all.append(b) shuffledbed = Bed() shuffledbed.extend(all) shuffledbed.print_to_file(shuffled)
def insert(args): """ %prog insert candidates.bed gaps.bed chrs.fasta unplaced.fasta Insert scaffolds into assembly. """ from jcvi.formats.agp import mask, bed from jcvi.formats.sizes import agp p = OptionParser(insert.__doc__) opts, args = p.parse_args(args) if len(args) != 4: sys.exit(not p.print_help()) candidates, gapsbed, chrfasta, unplacedfasta = args refinedbed = refine([candidates, gapsbed]) sizes = Sizes(unplacedfasta).mapping cbed = Bed(candidates) corder = cbed.order gbed = Bed(gapsbed) gorder = gbed.order gpbed = Bed() gappositions = {} # (chr, start, end) => gapid fp = open(refinedbed) gap_to_scf = defaultdict(list) seen = set() for row in fp: atoms = row.split() unplaced = atoms[3] strand = atoms[5] gapid = atoms[9] if gapid not in seen: seen.add(gapid) gi, gb = gorder[gapid] gpbed.append(gb) gappositions[(gb.seqid, gb.start, gb.end)] = gapid gap_to_scf[gapid].append((unplaced, strand)) gpbedfile = "candidate.gaps.bed" gpbed.print_to_file(gpbedfile, sorted=True) agpfile = agp([chrfasta]) maskedagpfile = mask([agpfile, gpbedfile]) maskedbedfile = maskedagpfile.rsplit(".", 1)[0] + ".bed" bed([maskedagpfile, "--outfile={0}".format(maskedbedfile)]) mbed = Bed(maskedbedfile) beds = [] for b in mbed: sid = b.seqid key = (sid, b.start, b.end) if key not in gappositions: beds.append(b) continue gapid = gappositions[key] scfs = gap_to_scf[gapid] # For scaffolds placed in the same gap, sort according to positions scfs.sort(key=lambda x: corder[x[0]][1].start + corder[x[0]][1].end) for scf, strand in scfs: size = sizes[scf] beds.append(BedLine("\t".join(str(x) for x in \ (scf, 0, size, sid, 1000, strand)))) finalbed = Bed() finalbed.extend(beds) finalbedfile = "final.bed" finalbed.print_to_file(finalbedfile) # Clean-up toclean = [gpbedfile, agpfile, maskedagpfile, maskedbedfile] FileShredder(toclean)