def nucmer(args):
    """
    %prog nucmer mappings.bed MTR.fasta assembly.fasta chr1 3

    Select specific chromosome region based on MTR mapping. The above command
    will extract chr1:2,000,001-3,000,000.
    """
    # The docstring above doubles as the usage text shown by the parser.
    p = OptionParser(nucmer.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 5:
        sys.exit(not p.print_help())

    mapbed, mtrfasta, asmfasta, seqid, window = args
    window = int(window)
    window_size = 1000000

    # Write a one-interval BED file describing the requested window.
    bedfile = "sample.bed"
    region = Bed()
    region.add(
        "\t".join(
            str(x)
            for x in (seqid, (window - 1) * window_size, window * window_size)
        )
    )
    region.print_to_file(bedfile)

    # Keep the fourth column of every mapping that intersects the window.
    idsfile = "query.ids"
    sh(
        "intersectBed -a {0} -b {1} -nonamecheck -sorted | cut -f4".format(
            mapbed, bedfile
        ),
        outfile=idsfile,
    )

    sfasta = fastaFromBed(bedfile, mtrfasta)

    # Pull the selected records out of the assembly, then align and plot.
    qfasta = "query.fasta"
    sh("faSomeRecords {0} {1} {2}".format(asmfasta, idsfile, qfasta))
    sh("nucmer {0} {1}".format(sfasta, qfasta))

    mummerplot_main(["out.delta", "--refcov=0"])
    sh("mv out.pdf {0}.{1}.pdf".format(seqid, window))
def patcher(args):
    """
    %prog patcher backbone.bed other.bed

    Given optical map alignment, prepare the patchers. Use --backbone to suggest
    which assembly is the major one, and the patchers will be extracted from
    another assembly.
    """
    from jcvi.formats.bed import uniq

    # The docstring above doubles as the usage text shown by the parser.
    p = OptionParser(patcher.__doc__)
    # NOTE(review): --backbone is accepted but its value is never consulted
    # in this function.
    p.add_option("--backbone", default="OM",
                 help="Prefix of the backbone assembly [default: %default]")
    p.add_option("--object", default="object",
                 help="New object name [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    backbonebed, otherbed = args
    backbonebed = uniq([backbonebed])
    otherbed = uniq([otherbed])

    pf = backbonebed.split(".")[0]

    # Make a uniq bed keeping backbone at redundant intervals
    cmd = "intersectBed -v -wa"
    cmd += " -a {0} -b {1}".format(otherbed, backbonebed)
    outfile = otherbed.rsplit(".", 1)[0] + ".not." + backbonebed
    sh(cmd, outfile=outfile)

    uniqbed = Bed()
    uniqbedfile = pf + ".merged.bed"
    uniqbed.extend(Bed(backbonebed))
    uniqbed.extend(Bed(outfile))
    uniqbed.print_to_file(uniqbedfile, sorted=True)

    # Condense adjacent intervals, allow some chaining
    bed = uniqbed
    key = lambda x: range_parse(x.accn).seqid

    bed_fn = pf + ".patchers.bed"
    # The context manager closes the output even if merge_ranges() raises.
    with open(bed_fn, "w") as bed_fw:
        for _, sb in groupby(bed, key=key):
            sb = list(sb)
            seqid, start, end, strand = merge_ranges(sb)

            print("\t".join(str(x) for x in
                            (seqid, start, end, opts.object, 1000, strand)),
                  file=bed_fw)
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed
    from jcvi.formats.base import get_number

    # The docstring above doubles as the usage text shown by the parser.
    p = OptionParser(bed.__doc__)
    p.add_option("--switch", default=False, action="store_true",
                 help="Switch reference and aligned map elements")
    p.add_option("--scale", type="float",
                 help="Scale the aligned map distance by factor")
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile, = args
    switch = opts.switch
    scale = opts.scale

    # Map each first-column anchor name to all of its partners.
    ac = AnchorFile(anchorsfile)
    pairs = defaultdict(list)
    for a, b, block_id in ac.iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        if q.accn not in pairs:
            continue

        for partner in pairs[q.accn]:
            _, s = sorder[partner]

            # Choose the roles afresh for every pair. Swapping shared
            # variables in place would leak the previous partner's values
            # into the next iteration when a query has several anchors.
            ref, aligned = (s, q) if switch else (q, s)

            pos = aligned.start
            if scale:
                pos /= scale

            try:
                newsseqid = get_number(aligned.seqid)
            except ValueError:
                # Call syntax is valid on both Python 2 and Python 3.
                raise ValueError(
                    "`{0}` is on `{1}` with no number to extract".
                    format(aligned.accn, aligned.seqid))

            bedline = "\t".join(str(x) for x in
                                (ref.seqid, ref.start - 1, ref.end,
                                 "{0}:{1}".format(newsseqid, pos)))
            bd.add(bedline)

    bd.print_to_file(filename=opts.outfile, sorted=True)
def tips(args):
    """
    %prog tips patchers.bed complements.bed original.fasta backbone.fasta

    Append telomeric sequences based on patchers and complements.
    """
    # The docstring above doubles as the usage text shown by the parser.
    p = OptionParser(tips.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    pbedfile, cbedfile, sizesfile, bbfasta = args

    pbed = Bed(pbedfile, sorted=False)
    cbed = Bed(cbedfile, sorted=False)

    # Complement intervals keyed by seqid, in file order (consecutive runs).
    complements = {
        seqid: list(group)
        for seqid, group in groupby(cbed, key=lambda x: x.seqid)
    }

    sizes = Sizes(sizesfile).mapping
    bbsizes = Sizes(bbfasta).mapping

    tbeds = []
    for obj, group in groupby(pbed, key=lambda x: x.accn):
        group = list(group)
        first, last = group[0], group[-1]

        # A side needs no tip when the patcher already reaches that end.
        start_id = None if first.start == 1 else first.seqid
        end_id = None if last.end == sizes[last.seqid] else last.seqid

        print(obj, start_id, end_id, file=sys.stderr)

        if start_id:
            head = complements[start_id][0]
            head.accn = obj
            tbeds.append(head)

        # The whole backbone object always sits between the two tips.
        whole = "\t".join(
            str(x) for x in (obj, 0, bbsizes[obj], obj, 1000, "+")
        )
        tbeds.append(BedLine(whole))

        if end_id:
            tail = complements[end_id][-1]
            tail.accn = obj
            tbeds.append(tail)

    tbed = Bed()
    tbed.extend(tbeds)

    tbedfile = "tips.bed"
    tbed.print_to_file(tbedfile)
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed, BedLine
    from jcvi.formats.base import get_number

    # The docstring above doubles as the usage text shown by the parser.
    p = OptionParser(bed.__doc__)
    p.add_option("--switch", default=False, action="store_true",
                 help="Switch reference and aligned map elements")
    p.add_option("--scale", type="float",
                 help="Scale the aligned map distance by factor")
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile, = args
    switch = opts.switch
    scale = opts.scale

    # Map each first-column anchor name to all of its partners.
    ac = AnchorFile(anchorsfile)
    pairs = defaultdict(list)
    for a, b, block_id in ac.iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        if q.accn not in pairs:
            continue

        for partner in pairs[q.accn]:
            _, s = sorder[partner]

            # Choose the roles afresh for every pair. Swapping shared
            # variables in place would leak the previous partner's values
            # into the next iteration when a query has several anchors.
            ref, aligned = (s, q) if switch else (q, s)

            pos = aligned.start
            if scale:
                pos /= scale

            bedline = "\t".join(
                str(x) for x in
                (ref.seqid, ref.start - 1, ref.end,
                 "{0}:{1}".format(get_number(aligned.seqid), pos)))
            bd.append(BedLine(bedline))

    bd.print_to_file(filename=opts.outfile, sorted=True)
def patcher(args):
    """
    %prog patcher backbone.bed other.bed

    Given optical map alignment, prepare the patchers. Use --backbone to suggest
    which assembly is the major one, and the patchers will be extracted from
    another assembly.
    """
    from jcvi.formats.bed import uniq

    # The docstring above doubles as the usage text shown by the parser.
    p = OptionParser(patcher.__doc__)
    # NOTE(review): --backbone is accepted but its value is never consulted
    # in this function.
    p.add_option("--backbone", default="OM",
                 help="Prefix of the backbone assembly [default: %default]")
    p.add_option("--object", default="object",
                 help="New object name [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    backbonebed, otherbed = args
    backbonebed = uniq([backbonebed])
    otherbed = uniq([otherbed])

    pf = backbonebed.split(".")[0]

    # Make a uniq bed keeping backbone at redundant intervals
    cmd = "intersectBed -v -wa"
    cmd += " -a {0} -b {1}".format(otherbed, backbonebed)
    outfile = otherbed.rsplit(".", 1)[0] + ".not." + backbonebed
    sh(cmd, outfile=outfile)

    uniqbed = Bed()
    uniqbedfile = pf + ".merged.bed"
    uniqbed.extend(Bed(backbonebed))
    uniqbed.extend(Bed(outfile))
    uniqbed.print_to_file(uniqbedfile, sorted=True)

    # Condense adjacent intervals, allow some chaining
    bed = uniqbed
    key = lambda x: range_parse(x.accn).seqid

    bed_fn = pf + ".patchers.bed"
    # The context manager closes the output even if merge_ranges() raises.
    with open(bed_fn, "w") as bed_fw:
        for _, sb in groupby(bed, key=key):
            sb = list(sb)
            seqid, start, end, strand = merge_ranges(sb)

            print("\t".join(str(x) for x in
                            (seqid, start, end, opts.object, 1000, strand)),
                  file=bed_fw)
def liftover(args):
    """
    %prog liftover agpfile bedfile

    Given coordinates in components, convert to the coordinates in chromosomes.
    """
    # The docstring above doubles as the usage text shown by the parser.
    p = OptionParser(liftover.__doc__)
    p.add_option("--prefix", default=False, action="store_true",
                 help="Prepend prefix to accn names [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        # Exit non-zero on a usage error, like the other commands here;
        # a bare sys.exit(p.print_help()) would report success.
        sys.exit(not p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile).order
    bed = Bed(bedfile)
    newbed = Bed()
    for b in bed:
        component = b.seqid
        if component not in agp:
            # No AGP entry for this seqid: pass the feature through as is.
            newbed.append(b)
            continue

        _, a = agp[component]

        assert a.component_beg < a.component_end
        arange = a.component_beg, a.component_end
        assert b.start < b.end
        brange = b.start, b.end

        # Features with no overlap with the component span are dropped.
        st = range_intersect(arange, brange)
        if not st:
            continue
        start, end = st
        assert start <= end

        if a.orientation == '-':
            # Reverse-oriented component: coordinates are mirrored.
            d = a.object_end + a.component_beg
            s, t = d - end, d - start
        else:
            d = a.object_beg - a.component_beg
            s, t = d + start, d + end

        name = b.accn.replace(" ", "_")
        if opts.prefix:
            name = component + "_" + name
        bline = "\t".join(str(x) for x in (a.object, s - 1, t, name))
        newbed.append(BedLine(bline))

    newbed.print_to_file(sorted=True)
def liftover(args):
    """
    %prog liftover agpfile bedfile

    Given coordinates in components, convert to the coordinates in chromosomes.
    """
    # The docstring above doubles as the usage text shown by the parser.
    p = OptionParser(liftover.__doc__)
    p.add_option("--prefix", default=False, action="store_true",
                 help="Prepend prefix to accn names [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        # Exit non-zero on a usage error, like the other commands here;
        # a bare sys.exit(p.print_help()) would report success.
        sys.exit(not p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile).order
    bed = Bed(bedfile)
    newbed = Bed()
    for b in bed:
        component = b.seqid
        if component not in agp:
            # No AGP entry for this seqid: pass the feature through as is.
            newbed.append(b)
            continue

        _, a = agp[component]

        assert a.component_beg < a.component_end
        arange = a.component_beg, a.component_end
        assert b.start < b.end
        brange = b.start, b.end

        # Features with no overlap with the component span are dropped.
        st = range_intersect(arange, brange)
        if not st:
            continue
        start, end = st
        assert start <= end

        if a.orientation == '-':
            # Reverse-oriented component: coordinates are mirrored.
            d = a.object_end + a.component_beg
            s, t = d - end, d - start
        else:
            d = a.object_beg - a.component_beg
            s, t = d + start, d + end

        name = b.accn.replace(" ", "_")
        if opts.prefix:
            name = component + "_" + name
        bline = "\t".join(str(x) for x in (a.object, s - 1, t, name))
        newbed.append(BedLine(bline))

    newbed.sort(key=newbed.nullkey)
    newbed.print_to_file()
def tips(args):
    """
    %prog tips patchers.bed complements.bed original.fasta backbone.fasta

    Append telomeric sequences based on patchers and complements.
    """
    # The docstring above doubles as the usage text shown by the parser.
    p = OptionParser(tips.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    pbedfile, cbedfile, sizesfile, bbfasta = args

    pbed = Bed(pbedfile, sorted=False)
    cbed = Bed(cbedfile, sorted=False)

    # Complement intervals keyed by seqid, in file order (consecutive runs).
    complements = dict()
    for seqid, beds in groupby(cbed, key=lambda x: x.seqid):
        complements[seqid] = list(beds)

    sizes = Sizes(sizesfile).mapping
    bbsizes = Sizes(bbfasta).mapping
    tbeds = []

    for obj, beds in groupby(pbed, key=lambda x: x.accn):
        beds = list(beds)
        startbed, endbed = beds[0], beds[-1]
        start_id, end_id = startbed.seqid, endbed.seqid
        # A side needs no tip when the patcher already reaches that end.
        if startbed.start == 1:
            start_id = None
        if endbed.end == sizes[end_id]:
            end_id = None
        # print() with file= replaces the Python 2 only `print >>` form.
        print(obj, start_id, end_id, file=sys.stderr)
        if start_id:
            b = complements[start_id][0]
            b.accn = obj
            tbeds.append(b)
        # The whole backbone object always sits between the two tips.
        tbeds.append(BedLine("\t".join(str(x) for x in
                     (obj, 0, bbsizes[obj], obj, 1000, "+"))))
        if end_id:
            b = complements[end_id][-1]
            b.accn = obj
            tbeds.append(b)

    tbed = Bed()
    tbed.extend(tbeds)

    tbedfile = "tips.bed"
    tbed.print_to_file(tbedfile)
def merge(args):
    """
    %prog merge map1 map2 map3 ...

    Convert csv maps to bed format.

    Each input map is csv formatted, for example:

    ScaffoldID,ScaffoldPosition,LinkageGroup,GeneticPosition
    scaffold_2707,11508,1,0
    scaffold_2707,11525,1,1.2
    scaffold_759,81336,1,9.7
    """
    # The docstring above doubles as the usage text shown by the parser.
    p = OptionParser(merge.__doc__)
    p.add_option("-w", "--weightsfile", default="weights.txt",
                 help="Write weights to file")
    p.set_outfile("out.bed")

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    maps = args
    outfile = opts.outfile
    fp = must_open(maps)
    b = Bed()
    mapnames = set()
    for row in fp:
        # The map name is the current input file name up to the first dot.
        mapname = fp.filename().split(".")[0]
        mapnames.add(mapname)
        try:
            m = CSVMapLine(row, mapname=mapname)
            if m.cm < 0:
                logging.error("Ignore marker with negative genetic distance")
                # print() with file= replaces the Python 2 only `print >>`.
                print(row.strip(), file=sys.stderr)
            else:
                b.append(BedLine(m.bedline))
        except (IndexError, ValueError):  # header or mal-formed line
            continue

    b.print_to_file(filename=outfile, sorted=True)
    logging.debug("A total of {0} markers written to `{1}`.".\
                    format(len(b), outfile))

    # Two inputs sharing a prefix before the first dot would be conflated.
    assert len(maps) == len(mapnames), "You have a collision in map names"
    write_weightsfile(mapnames, weightsfile=opts.weightsfile)
def bed(args):
    '''
    %prog bed gff_file [--options]

    Parses the start, stop locations of the selected features out of GFF and
    generate a bed file
    '''
    # The docstring above doubles as the usage text shown by the parser.
    p = OptionParser(bed.__doc__)
    p.add_option(
        "--type",
        dest="type",
        default="gene",
        help="Feature type to extract, use comma for multiple [default: %default]",
    )
    p.add_option(
        "--key",
        dest="key",
        default="ID",
        help="Key in the attributes to extract [default: %default]",
    )
    set_outfile(p)

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args

    # The literal string "None" on the command line disables the key.
    key = None if opts.key == "None" else opts.key
    wanted = {name.strip() for name in opts.type.split(",")}

    gff = Gff(gffile, key=key)
    records = Bed()
    for feature in gff:
        if feature.type in wanted:
            records.append(feature.bedline)

    records.sort(key=records.key)
    records.print_to_file(opts.outfile)
def mergebed(args):
    """
    %prog mergebed map1.bed map2.bed map3.bed ...

    Combine bed maps to bed format, adding the map name.
    """
    # The docstring above doubles as the usage text shown by the parser.
    p = OptionParser(mergebed.__doc__)
    p.add_option(
        "-w",
        "--weightsfile",
        default="weights.txt",
        help="Write weights to file",
    )
    p.set_outfile("out.bed")

    opts, args = p.parse_args(args)
    if not args:
        sys.exit(not p.print_help())

    maps = args
    outfile = opts.outfile
    fp = must_open(maps)
    markers = Bed()
    mapnames = set()
    for row in fp:
        # The map name is the current input file name up to the first dot.
        mapname = fp.filename().split(".")[0]
        mapnames.add(mapname)
        try:
            marker = BedLine(row)
            marker.accn = "{0}-{1}".format(mapname, marker.accn)
            marker.extra = ["{0}:{1}".format(marker.seqid, marker.start)]
            markers.append(marker)
        except (IndexError, ValueError):
            # header or mal-formed line
            continue

    markers.print_to_file(filename=outfile, sorted=True)
    logging.debug(
        "A total of {0} markers written to `{1}`.".format(len(markers), outfile)
    )

    # Two inputs sharing a prefix before the first dot would be conflated.
    assert len(maps) == len(mapnames), "You have a collision in map names"
    write_weightsfile(mapnames, weightsfile=opts.weightsfile)
def eject(args):
    """
    %prog eject candidates.bed chr.fasta

    Eject scaffolds from assembly, using the range identified by closest().
    """
    # The docstring above doubles as the usage text shown by the parser.
    p = OptionParser(eject.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    candidates, chrfasta = args
    sizesfile = Sizes(chrfasta).filename

    # Work on the complement of the candidate ranges.
    kept = Bed(complementBed(candidates, sizesfile))
    for interval in kept:
        # Name every interval after its own sequence, forward strand.
        interval.accn = interval.seqid
        interval.score = 1000
        interval.strand = '+'

    kept.print_to_file()
def eject(args):
    """
    %prog eject candidates.bed chr.fasta

    Eject scaffolds from assembly, using the range identified by closest().
    """
    # The docstring above doubles as the usage text shown by the parser.
    p = OptionParser(eject.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    candidates, chrfasta = args
    sizesfile = Sizes(chrfasta).filename

    # Work on the complement of the candidate ranges.
    complement_file = complementBed(candidates, sizesfile)
    remaining = Bed(complement_file)
    for rec in remaining:
        # Name every interval after its own sequence, forward strand.
        rec.accn = rec.seqid
        rec.score = 1000
        rec.strand = "+"

    remaining.print_to_file()
def bed(args):
    '''
    %prog bed gff_file [--options]

    Parses the start, stop locations of the selected features out of GFF and
    generate a bed file
    '''
    # The docstring above doubles as the usage text shown by the parser.
    p = OptionParser(bed.__doc__)
    p.add_option("--type", dest="type", default="gene",
                 help="Feature type to extract, use comma for multiple [default: %default]")
    p.add_option("--key", dest="key", default="ID",
                 help="Key in the attributes to extract [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args

    # The literal string "None" on the command line disables the key.
    attr_key = opts.key
    if attr_key == "None":
        attr_key = None

    selected_types = set(t.strip() for t in opts.type.split(","))

    gff = Gff(gffile, key=attr_key)
    out = Bed()
    for g in gff:
        if g.type in selected_types:
            out.append(g.bedline)

    out.sort(key=out.key)
    out.print_to_file(opts.outfile)
def nucmer(args):
    """
    %prog nucmer mappings.bed MTR.fasta assembly.fasta chr1 3

    Select specific chromosome region based on MTR mapping. The above command
    will extract chr1:2,000,001-3,000,000.
    """
    # The docstring above doubles as the usage text shown by the parser.
    p = OptionParser(nucmer.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 5:
        sys.exit(not p.print_help())

    mapbed, mtrfasta, asmfasta, seqid, idx = args
    idx = int(idx)
    step = 1000000
    lo, hi = (idx - 1) * step, idx * step

    # Write a one-interval BED file describing the requested window.
    bedfile = "sample.bed"
    window = Bed()
    window.add("\t".join(str(x) for x in (seqid, lo, hi)))
    window.print_to_file(bedfile)

    # Keep the fourth column of every mapping that intersects the window.
    intersect_cmd = "intersectBed -a {0} -b {1} -nonamecheck -sorted | cut -f4"
    idsfile = "query.ids"
    sh(intersect_cmd.format(mapbed, bedfile), outfile=idsfile)

    sfasta = fastaFromBed(bedfile, mtrfasta)

    # Pull the selected records out of the assembly, then align and plot.
    qfasta = "query.fasta"
    sh("faSomeRecords {0} {1} {2}".format(asmfasta, idsfile, qfasta))
    sh("nucmer {0} {1}".format(sfasta, qfasta))

    mummerplot_main(["out.delta", "--refcov=0"])
    sh("mv out.pdf {0}.{1}.pdf".format(seqid, idx))
def insert(args):
    """
    %prog insert candidates.bed gaps.bed chrs.fasta unplaced.fasta

    Insert scaffolds into assembly.
    """
    from jcvi.formats.agp import mask, bed
    from jcvi.formats.sizes import agp

    # The docstring above doubles as the usage text shown by the parser.
    p = OptionParser(insert.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    candidates, gapsbed, chrfasta, unplacedfasta = args
    refinedbed = refine([candidates, gapsbed])
    sizes = Sizes(unplacedfasta).mapping
    cbed = Bed(candidates)
    corder = cbed.order
    gbed = Bed(gapsbed)
    gorder = gbed.order

    gpbed = Bed()
    gappositions = {}  # (chr, start, end) => gapid

    gap_to_scf = defaultdict(list)
    seen = set()
    # Read the refined file inside a context manager so it is closed.
    with open(refinedbed) as fp:
        for row in fp:
            atoms = row.split()
            # Rows with at most six columns carry no gap assignment.
            if len(atoms) <= 6:
                continue
            unplaced = atoms[3]
            strand = atoms[5]
            gapid = atoms[9]
            if gapid not in seen:
                seen.add(gapid)
                _, gb = gorder[gapid]
                gpbed.append(gb)
                gappositions[(gb.seqid, gb.start, gb.end)] = gapid
            gap_to_scf[gapid].append((unplaced, strand))

    gpbedfile = "candidate.gaps.bed"
    gpbed.print_to_file(gpbedfile, sorted=True)

    agpfile = agp([chrfasta])
    maskedagpfile = mask([agpfile, gpbedfile])
    maskedbedfile = maskedagpfile.rsplit(".", 1)[0] + ".bed"
    bed([maskedagpfile, "--outfile={0}".format(maskedbedfile)])

    mbed = Bed(maskedbedfile)
    finalbed = Bed()
    for b in mbed:
        sid = b.seqid
        key = (sid, b.start, b.end)
        if key not in gappositions:
            # Not one of the candidate gaps: keep the record unchanged.
            finalbed.add("{0}\n".format(b))
            continue

        gapid = gappositions[key]
        scfs = gap_to_scf[gapid]

        # For scaffolds placed in the same gap, sort according to positions
        scfs.sort(key=lambda x: corder[x[0]][1].start + corder[x[0]][1].end)
        for scf, strand in scfs:
            size = sizes[scf]
            finalbed.add("\t".join(str(x) for x in
                         (scf, 0, size, sid, 1000, strand)))

    finalbedfile = "final.bed"
    finalbed.print_to_file(finalbedfile)

    # Clean-up
    toclean = [gpbedfile, agpfile, maskedagpfile, maskedbedfile]
    FileShredder(toclean)
def install(args):
    """
    %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta

    Install patches into backbone, using sequences from alternative assembly.
    The patches sequences are generated via jcvi.assembly.patch.fill().

    The output is a bedfile that can be converted to AGP using
    jcvi.formats.agp.frombed().
    """
    from jcvi.apps.align import blast
    from jcvi.formats.fasta import SeqIO

    # The docstring above doubles as the usage text shown by the parser.
    p = OptionParser(install.__doc__)
    p.set_rclip(rclip=1)
    p.add_option(
        "--maxsize",
        default=300000,
        type="int",
        help="Maximum size of patchers to be replaced [default: %default]",
    )
    p.add_option("--prefix", help="Prefix of the new object [default: %default]")
    p.add_option(
        "--strict",
        default=False,
        action="store_true",
        help="Only update if replacement has no gaps [default: %default]",
    )
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    pbed, pfasta, bbfasta, altfasta = args
    maxsize = opts.maxsize  # Max DNA size to replace gap
    rclip = opts.rclip

    blastfile = blast([altfasta, pfasta, "--wordsize=100", "--pctid=99"])
    order = Bed(pbed).order
    beforebed, afterbed = blast_to_twobeds(
        blastfile, order, rclip=rclip, maxsize=maxsize
    )

    beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True)
    afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True)

    def count_ns(rec):
        return rec.seq.count('n') + rec.seq.count('N')

    # A replacement is kept only when it has fewer Ns than the original,
    # or, with --strict, when it has none at all.
    before_records = SeqIO.parse(beforefasta, "fasta")
    after_records = SeqIO.parse(afterfasta, "fasta")
    exclude = set()
    for arec, brec in zip(before_records, after_records):
        an = count_ns(arec)
        bn = count_ns(brec)
        keep = (bn == 0) if opts.strict else (bn < an)
        if not keep:
            exclude.add(arec.id)

    logging.debug(
        "Ignore {0} updates because of decreasing quality.".format(len(exclude))
    )

    abedfile = "before.filtered.bed"
    bbedfile = "after.filtered.bed"

    kept_before = [x for x in Bed(beforebed, sorted=False)
                   if x.accn not in exclude]
    kept_after = [x for x in Bed(afterbed, sorted=False)
                  if x.accn not in exclude]

    afbed = Bed()
    afbed.extend(kept_before)
    bfbed = Bed()
    bfbed.extend(kept_after)

    afbed.print_to_file(abedfile)
    bfbed.print_to_file(bbedfile)

    shuffle_twobeds(afbed, bfbed, bbfasta, prefix=opts.prefix)
def shuffle_twobeds(afbed, bfbed, bbfasta, prefix=None):
    """
    Shuffle the two bedfiles together.

    Writes the interleaved records to `shuffled.bed` and returns them as a
    Bed. Note that `afbed` is sorted in place and records looked up in
    `bfbed` may have their strand flipped.
    """
    sz = Sizes(bbfasta)
    sizes = sz.mapping
    shuffled = "shuffled.bed"
    border = bfbed.order

    merged = []
    afbed.sort(key=afbed.nullkey)
    # Width of the zero-padded counter, from the number of sequences.
    pad = int(math.log10(len(sizes))) + 1
    cj = 0
    seen = set()

    def accn(n):
        return "{0}{1:0{2}d}".format(prefix, n, pad)

    for seqid, aa in afbed.sub_beds():
        cj += 1
        size = sizes[seqid]
        ranges = [(x.seqid, x.start, x.end) for x in aa]

        # Stretches between the `afbed` ranges; an empty stretch is kept
        # as a None placeholder so the alternation below stays aligned.
        abeds = []
        for crange in range_interleave(ranges, sizes={seqid: size}, empty=True):
            if not crange:
                abeds.append(None)
                continue
            seqid, start, end = crange
            line = "\t".join(str(x) for x in (seqid, start - 1, end))
            abeds.append(BedLine(line))

        # Matching `bfbed` record for each range, flipped when the `afbed`
        # record is on the minus strand.
        bbeds = []
        for a in aa:
            _, b = border[a.accn]
            if a.strand == '-':
                flipped = '-' if b.strand == '+' else '+'
                b.extra[1] = flipped
                b.strand = flipped
            bbeds.append(b)

        n_abeds, n_bbeds = len(abeds), len(bbeds)
        assert n_abeds - n_bbeds == 1, \
            "abeds: {0}, bbeds: {1}".format(n_abeds, n_bbeds)

        beds = [x for x in roundrobin(abeds, bbeds) if x]
        if prefix:
            for b in beds:
                b.accn = accn(cj)

        merged.extend(beds)
        seen.add(seqid)

    # Singletons
    for seqid, size in sz.iter_sizes():
        if seqid in seen:
            continue

        line = "\t".join(str(x) for x in (seqid, 0, size, accn(cj)))
        b = BedLine(line)

        cj += 1
        if prefix:
            b.accn = accn(cj)

        merged.append(b)

    shuffledbed = Bed()
    shuffledbed.extend(merged)
    shuffledbed.print_to_file(shuffled)

    return shuffledbed
def install(args):
    """
    %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta

    Install patches into backbone, using sequences from alternative assembly.
    The patches sequences are generated via jcvi.assembly.patch.fill().

    The output is a bedfile that can be converted to AGP using
    jcvi.formats.agp.frombed().
    """
    import math

    from jcvi.apps.base import blast
    from jcvi.formats.blast import BlastSlow
    from jcvi.formats.fasta import SeqIO
    from jcvi.utils.iter import roundrobin

    # The docstring above doubles as the usage text shown by the parser.
    p = OptionParser(install.__doc__)
    p.add_option(
        "--rclip", default=1, type="int",
        help="Pair ID is derived from rstrip N chars [default: %default]")
    p.add_option(
        "--maxsize", default=1000000, type="int",
        help="Maximum size of patchers to be replaced [default: %default]")
    p.add_option("--prefix",
                 help="Prefix of the new object [default: %default]")
    p.add_option(
        "--strict", default=False, action="store_true",
        help="Only update if replacement has no gaps [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    pbed, pfasta, bbfasta, altfasta = args
    maxsize = opts.maxsize  # Max DNA size to replace gap
    rclip = opts.rclip
    prefix = opts.prefix

    blastfile = blast([altfasta, pfasta, "--wordsize=100", "--pctid=99"])
    order = Bed(pbed).order
    beforebed, afterbed = "before.bed", "after.bed"

    def pair_key(hit):
        # Pair ID is the query name minus its last `rclip` characters.
        # With rclip == 0 the whole query name is the key; the previous
        # lambda returned the key *function* itself in that case.
        return hit.query[:-rclip] if rclip else hit.query

    data = BlastSlow(blastfile)

    # Both outputs are closed before fastaFromBed() reads them back.
    with open(beforebed, "w") as fwa, open(afterbed, "w") as fwb:
        for pe, lines in groupby(data, key=pair_key):
            lines = list(lines)
            # Only pairs with exactly two hits are considered.
            if len(lines) != 2:
                continue

            a, b = lines

            aquery, bquery = a.query, b.query
            asubject, bsubject = a.subject, b.subject
            if asubject != bsubject:
                continue

            astrand, bstrand = a.orientation, b.orientation
            assert aquery[-1] == 'L' and bquery[-1] == 'R', \
                str((aquery, bquery))

            _, ax = order[aquery]
            _, bx = order[bquery]
            qstart, qstop = ax.start + a.qstart - 1, bx.start + b.qstop - 1

            # Both flanks must hit the same strand of the subject.
            if astrand == '+' and bstrand == '+':
                sstart, sstop = a.sstart, b.sstop
            elif astrand == '-' and bstrand == '-':
                sstart, sstop = b.sstart, a.sstop
            else:
                continue

            if sstart > sstop:
                continue

            if sstop > sstart + maxsize:
                continue

            name = aquery[:-1] + "LR"
            # print() with file= replaces the Python 2 only `print >>`.
            print("\t".join(str(x) for x in
                            (ax.seqid, qstart - 1, qstop, name, 1000, "+")),
                  file=fwa)
            print("\t".join(str(x) for x in
                            (asubject, sstart - 1, sstop, name, 1000,
                             astrand)),
                  file=fwb)

    beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True)
    afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True)

    # A replacement is kept only when it has fewer Ns than the original,
    # or, with --strict, when it has none at all.
    ah = SeqIO.parse(beforefasta, "fasta")
    bh = SeqIO.parse(afterfasta, "fasta")
    count_Ns = lambda x: x.seq.count('n') + x.seq.count('N')
    exclude = set()
    for arec, brec in zip(ah, bh):
        an = count_Ns(arec)
        bn = count_Ns(brec)
        if opts.strict:
            if bn == 0:
                continue

        elif bn < an:
            continue

        exclude.add(arec.id)

    logging.debug("Ignore {0} updates because of decreasing quality."\
                    .format(len(exclude)))

    abed = Bed(beforebed, sorted=False)
    bbed = Bed(afterbed, sorted=False)
    abed = [x for x in abed if x.accn not in exclude]
    bbed = [x for x in bbed if x.accn not in exclude]

    abedfile = "before.filtered.bed"
    bbedfile = "after.filtered.bed"
    afbed = Bed()
    afbed.extend(abed)
    bfbed = Bed()
    bfbed.extend(bbed)

    afbed.print_to_file(abedfile)
    bfbed.print_to_file(bbedfile)

    # Shuffle the two bedfiles together
    sz = Sizes(bbfasta)
    sizes = sz.mapping
    shuffled = "shuffled.bed"
    border = bfbed.order

    merged = []
    afbed.sort(key=afbed.nullkey)
    totalids = len(sizes)
    # Width of the zero-padded counter, from the number of sequences.
    pad = int(math.log10(totalids)) + 1
    cj = 0
    seen = set()
    accn = lambda x: "{0}{1:0{2}d}".format(prefix, x, pad)

    for seqid, aa in afbed.sub_beds():
        cj += 1
        abeds, bbeds, beds = [], [], []
        size = sizes[seqid]
        ranges = [(x.seqid, x.start, x.end) for x in aa]
        cranges = range_interleave(ranges, sizes={seqid: size})
        for seqid, start, end in cranges:
            bedline = "\t".join(str(x) for x in (seqid, start - 1, end))
            abeds.append(BedLine(bedline))

        for a in aa:
            gapid = a.accn
            _, b = border[gapid]
            bbeds.append(b)

        a = abeds[0] if abeds else []
        assert abs(len(abeds) - len(bbeds)) <= 1
        # Start the alternation with the replacement when the first
        # interleaved stretch does not begin at position 1.
        if (not a) or a.start > 1:
            abeds, bbeds = bbeds, abeds

        beds = list(roundrobin(abeds, bbeds))
        if prefix:
            for b in beds:
                b.accn = accn(cj)

        merged.extend(beds)
        seen.add(seqid)

    # Singletons
    for seqid, size in sz.iter_sizes():
        if seqid in seen:
            continue

        bedline = "\t".join(str(x) for x in (seqid, 0, size, accn(cj)))
        b = BedLine(bedline)

        cj += 1
        if prefix:
            b.accn = accn(cj)

        merged.append(b)

    shuffledbed = Bed()
    shuffledbed.extend(merged)
    shuffledbed.print_to_file(shuffled)
def shuffle_twobeds(afbed, bfbed, bbfasta, prefix=None):
    """
    Shuffle the two bedfiles together.

    Writes the interleaved records to `shuffled.bed` and returns them as a
    Bed. Note that `afbed` is sorted in place and records looked up in
    `bfbed` may have their strand flipped.
    """
    sz = Sizes(bbfasta)
    sizes = sz.mapping
    shuffled = "shuffled.bed"
    border = bfbed.order

    collected = []
    afbed.sort(key=afbed.nullkey)
    # Width of the zero-padded counter, from the number of sequences.
    totalids = len(sizes)
    pad = int(math.log10(totalids)) + 1
    cj = 0
    seen = set()

    def accn(number):
        return "{0}{1:0{2}d}".format(prefix, number, pad)

    for seqid, aa in afbed.sub_beds():
        cj += 1
        ranges = [(x.seqid, x.start, x.end) for x in aa]
        cranges = range_interleave(
            ranges, sizes={seqid: sizes[seqid]}, empty=True
        )

        # Stretches between the `afbed` ranges; an empty stretch is kept
        # as a None placeholder so the alternation below stays aligned.
        abeds = []
        for crange in cranges:
            if crange:
                seqid, start, end = crange
                fields = (seqid, start - 1, end)
                abeds.append(BedLine("\t".join(str(x) for x in fields)))
            else:
                abeds.append(None)

        # Matching `bfbed` record for each range, flipped when the `afbed`
        # record is on the minus strand.
        bbeds = []
        for a in aa:
            _, b = border[a.accn]
            if a.strand == "-":
                opposite = "-" if b.strand == "+" else "+"
                b.extra[1] = opposite
                b.strand = opposite
            bbeds.append(b)

        n_abeds = len(abeds)
        n_bbeds = len(bbeds)
        assert n_abeds - n_bbeds == 1, "abeds: {0}, bbeds: {1}".format(
            n_abeds, n_bbeds
        )

        beds = [x for x in roundrobin(abeds, bbeds) if x]
        if prefix:
            for b in beds:
                b.accn = accn(cj)

        collected.extend(beds)
        seen.add(seqid)

    # Singletons
    for seqid, size in sz.iter_sizes():
        if seqid in seen:
            continue

        fields = (seqid, 0, size, accn(cj))
        b = BedLine("\t".join(str(x) for x in fields))

        cj += 1
        if prefix:
            b.accn = accn(cj)

        collected.append(b)

    shuffledbed = Bed()
    shuffledbed.extend(collected)
    shuffledbed.print_to_file(shuffled)

    return shuffledbed
def rename(args):
    """
    %prog rename genes.bed [gaps.bed]

    Rename genes for annotation release.

    For genes on chromosomes (e.g. the 12th gene on C1):
    Bo1g00120

    For genes on scaffolds (e.g. the 12th gene on unplaced Scaffold00285):
    Bo00285s120

    The genes identifiers will increment by 10. So assuming no gap, these are
    the consecutive genes:
    Bo1g00120, Bo1g00130, Bo1g00140...
    Bo00285s120, Bo00285s130, Bo00285s140...

    When we encounter gaps, we would like the increment to be larger. For example,
    Bo1g00120, <gap>, Bo1g01120...

    Gaps bed file is optional.
    """
    import string

    # The docstring above doubles as the usage text shown by the parser.
    p = OptionParser(rename.__doc__)
    p.add_option("-a", dest="gene_increment", default=10, type="int",
                 help="Increment for continuous genes [default: %default]")
    p.add_option("-b", dest="gap_increment", default=1000, type="int",
                 help="Increment for gaps [default: %default]")
    p.add_option("--pad0", default=6, type="int",
                 help="Pad gene identifiers with 0 [default: %default]")
    p.add_option(
        "--spad0", default=4, type="int",
        help="Pad gene identifiers on small scaffolds [default: %default]")
    p.add_option("--prefix", default="Bo",
                 help="Genome prefix [default: %default]")
    p.add_option("--jgi", default=False, action="store_true",
                 help="Create JGI style identifier PREFIX.NN[G|TE]NNNNN.1" + \
                      " [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    genebed = args[0]
    gapbed = args[1] if len(args) == 2 else None
    prefix = opts.prefix
    gene_increment = opts.gene_increment
    gap_increment = opts.gap_increment

    genes = Bed(genebed)
    if gapbed:
        # Gaps are mixed in with the genes so they are met in order below.
        with open(gapbed) as fp:
            for row in fp:
                genes.append(BedLine(row))

    genes.sort(key=genes.key)
    # Output names use the prefix as given, before any "." is appended.
    idsfile = prefix + ".ids"
    newbedfile = prefix + ".bed"
    # A gap is followed by a gene increment, so only add the difference.
    gap_increment -= gene_increment
    assert gap_increment >= 0

    if opts.jgi:
        prefix += "."

    # The context manager makes sure the ID table is flushed and closed.
    with open(idsfile, "w") as fw:
        for seqid, lines in groupby(genes, key=lambda x: x.seqid):
            lines = list(lines)
            pad0 = opts.pad0 if len(lines) > 1000 else opts.spad0
            # Sequences whose name starts with "C" count as chromosomes.
            isChr = seqid[0].upper() == 'C'
            digits = "".join(x for x in seqid if x in string.digits)
            gs = "g" if isChr else "s"
            pp = prefix + digits + gs
            idx = 0
            if isChr:
                idx += gap_increment

            for r in lines:
                # Records without a +/- strand are treated as gaps.
                isGap = r.strand not in ("+", "-")
                if isGap:
                    idx += gap_increment
                    continue
                else:
                    idx += gene_increment
                accn = pp + "{0:0{1}d}".format(idx, pad0)
                oldaccn = r.accn
                # print() with file= replaces the Python 2 only `print >>`.
                print("\t".join((oldaccn, accn)), file=fw)
                r.accn = accn

    genes.print_to_file(newbedfile)
    logging.debug("Converted IDs written to `{0}`.".format(idsfile))
    logging.debug("Converted bed written to `{0}`.".format(newbedfile))
def refine(args):
    """
    %prog refine breakpoints.bed gaps.bed

    Find gaps within or near breakpoint region.

    For breakpoint regions with no gaps, there are two options:
    - Break in the middle of the region
    - Break at the closest gap (--closest)
    """
    # The docstring above doubles as the usage text shown by the parser.
    p = OptionParser(refine.__doc__)
    p.add_option(
        "--closest",
        default=False,
        action="store_true",
        help="In case of no gaps, use closest",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    breakpointsbed, gapsbed = args

    # Column count of the breakpoints file, taken from its first line.
    # next(fp) works on Python 2 and 3, unlike the file.next() method.
    with open(breakpointsbed) as fp:
        ncols = len(next(fp).split())
    logging.debug("File {0} contains {1} columns.".format(breakpointsbed, ncols))
    cmd = "intersectBed -wao -a {0} -b {1}".format(breakpointsbed, gapsbed)

    pf = "{0}.{1}".format(breakpointsbed.split(".")[0], gapsbed.split(".")[0])
    ingapsbed = pf + ".bed"
    sh(cmd, outfile=ingapsbed)

    with open(ingapsbed) as fp:
        data = [x.split() for x in fp]

    nogapsbed = pf + ".nogaps.bed"
    largestgapsbed = pf + ".largestgaps.bed"
    with open(nogapsbed, "w") as nogapsfw, \
            open(largestgapsbed, "w") as largestgapsfw:
        # Group the rows by their leading breakpoint columns.
        for b, gaps in groupby(data, key=lambda x: x[:ncols]):
            gaps = list(gaps)
            gap = gaps[0]
            # A lone row whose last column is "0" means no gap overlapped.
            if len(gaps) == 1 and gap[-1] == "0":
                assert gap[-3] == "."
                print("\t".join(b), file=nogapsfw)
                continue

            # Otherwise keep the row with the largest last-column value.
            gaps = [(int(x[-1]), x) for x in gaps]
            maxgap = max(gaps)[1]
            print("\t".join(maxgap), file=largestgapsfw)

    beds = [largestgapsbed]
    toclean = [nogapsbed, largestgapsbed]

    if opts.closest:
        closestgapsbed = pf + ".closestgaps.bed"
        cmd = "closestBed -a {0} -b {1} -d".format(nogapsbed, gapsbed)
        sh(cmd, outfile=closestgapsbed)
        beds += [closestgapsbed]
        toclean += [closestgapsbed]
    else:
        pointbed = pf + ".point.bed"
        pbed = Bed()
        bed = Bed(nogapsbed)
        for b in bed:
            # Floor division keeps the midpoint an integer on Python 3.
            pos = (b.start + b.end) // 2
            b.start, b.end = pos, pos
            pbed.append(b)
        pbed.print_to_file(pointbed)
        beds += [pointbed]
        toclean += [pointbed]

    refinedbed = pf + ".refined.bed"
    FileMerger(beds, outfile=refinedbed).merge()

    # Clean-up
    FileShredder(toclean)

    return refinedbed
def rename(args):
    """
    %prog rename genes.bed [gaps.bed]

    Rename genes for annotation release.

    For genes on chromosomes (e.g. the 12th gene on C1):
    Bo1g00120

    For genes on scaffolds (e.g. the 12th gene on unplaced Scaffold00285):
    Bo00285s120

    The genes identifiers will increment by 10. So assuming no gap, these are
    the consecutive genes:
    Bo1g00120, Bo1g00130, Bo1g00140...
    Bo00285s120, Bo00285s130, Bo00285s140...

    When we encounter gaps, we would like the increment to be larger. For example,
    Bo1g00120, <gap>, Bo1g01120...

    Gaps bed file is optional.
    """
    # NOTE: the docstring above is also the command-line usage text, so
    # implementation notes are kept in comments.
    #
    # args: list of CLI tokens (genes BED, optional gaps BED, options).
    # Writes `<prefix>.ids` (old -> new accession, tab separated) and
    # `<prefix>.bed` (records with the new accessions). Returns None.
    import string

    p = OptionParser(rename.__doc__)
    p.add_option("-a", dest="gene_increment", default=10, type="int",
                 help="Increment for continuous genes [default: %default]")
    p.add_option("-b", dest="gap_increment", default=1000, type="int",
                 help="Increment for gaps [default: %default]")
    p.add_option("--pad0", default=6, type="int",
                 help="Pad gene identifiers with 0 [default: %default]")
    p.add_option("--spad0", default=4, type="int",
                 help="Pad gene identifiers on small scaffolds [default: %default]")
    p.add_option("--prefix", default="Bo",
                 help="Genome prefix [default: %default]")
    p.add_option("--jgi", default=False, action="store_true",
                 help="Create JGI style identifier PREFIX.NN[G|TE]NNNNN.1"
                      " [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    genebed = args[0]
    gapbed = args[1] if len(args) == 2 else None
    prefix = opts.prefix
    gene_increment = opts.gene_increment
    gap_increment = opts.gap_increment

    genes = Bed(genebed)
    if gapbed:
        # Gap records are mixed in with the genes so that, after sorting, they
        # appear in positional order and can bump the running index.
        with open(gapbed) as fp:
            for row in fp:
                genes.append(BedLine(row))

    genes.sort(key=genes.key)
    idsfile = prefix + ".ids"
    newbedfile = prefix + ".bed"
    # Every gene adds gene_increment on its own, so a gap only needs to add
    # the difference.
    gap_increment -= gene_increment
    assert gap_increment >= 0

    if opts.jgi:
        prefix += "."

    with open(idsfile, "w") as fw:
        for seqid, lines in groupby(genes, key=lambda x: x.seqid):
            lines = list(lines)
            # Sequences with more than 1000 records get the wider padding.
            pad0 = opts.pad0 if len(lines) > 1000 else opts.spad0
            # Sequence names starting with 'C'/'c' are treated as chromosomes.
            isChr = seqid[0].upper() == 'C'
            digits = "".join(x for x in seqid if x in string.digits)
            gs = "g" if isChr else "s"
            pp = prefix + digits + gs
            idx = 0
            if isChr:
                idx += gap_increment

            for r in lines:
                # Records without a +/- strand are the gap records.
                isGap = r.strand not in ("+", "-")
                if isGap:
                    idx += gap_increment
                    continue
                idx += gene_increment
                accn = pp + "{0:0{1}d}".format(idx, pad0)
                oldaccn = r.accn
                print("\t".join((oldaccn, accn)), file=fw)
                r.accn = accn

    genes.print_to_file(newbedfile)
    logging.debug("Converted IDs written to `{0}`.".format(idsfile))
    logging.debug("Converted bed written to `{0}`.".format(newbedfile))
def simple(args):
    """
    %prog simple anchorfile --qbed=qbedfile --sbed=sbedfile [options]

    Write the block ends for each block in the anchorfile.
    GeneA1 GeneA2 GeneB1 GeneB2 +/- score

    Optional additional columns:
    orderA1 orderA2 orderB1 orderB2 sizeA sizeB size block_id

    With base coordinates (--coords):
    block_id seqidA startA endA bpSpanA GeneA1 GeneA2 geneSpanA
    block_id seqidB startB endB bpSpanB GeneB1 GeneB2 geneSpanB
    """
    # NOTE: the docstring above is also the command-line usage text, so
    # implementation notes are kept in comments.
    #
    # args: list of CLI tokens (one anchor file plus options).
    # Writes `<anchorfile stem>.simple`, and `<...>.simple.bed` with --bed.
    # Returns None.
    p = OptionParser(simple.__doc__)
    p.add_option("--rich", default=False, action="store_true",
                 help="Output additional columns [default: %default]")
    p.add_option("--coords", default=False, action="store_true",
                 help="Output columns with base coordinates [default: %default]")
    p.add_option("--bed", default=False, action="store_true",
                 help="Generate BED file for the blocks")
    p.add_option("--noheader", default=False, action="store_true",
                 help="Don't output header [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    additional = opts.rich
    coords = opts.coords
    header = not opts.noheader
    bed = opts.bed
    if bed:
        # BED output is built from base coordinates, so --bed implies --coords.
        coords = True
        bbed = Bed()

    ac = AnchorFile(anchorfile)
    simplefile = anchorfile.rsplit(".", 1)[0] + ".simple"

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    pf = "-".join(anchorfile.split(".", 2)[:2])
    blocks = ac.blocks

    if coords:
        h = "Block|Chr|Start|End|Span|StartGene|EndGene|GeneSpan|Orientation"
    else:
        # NOTE(review): the data rows below are written as score then
        # orientation, the reverse of this header -- confirm with consumers
        # before changing either.
        h = "StartGeneA|EndGeneA|StartGeneB|EndGeneB|Orientation|Score"
        if additional:
            h += "|StartOrderA|EndOrderA|StartOrderB|EndOrderB|"\
                 "SizeA|SizeB|Size|Block"

    nblocks = 0  # counted explicitly so an empty anchor file does not crash
    atotalbase = btotalbase = 0
    with open(simplefile, "w") as fws:
        if header:
            print("\t".join(h.split("|")), file=fws)

        for i, block in enumerate(blocks):
            nblocks += 1
            a, b, scores = zip(*block)
            # Each lookup yields an (order index, bed record) pair.
            a = [qorder[x] for x in a]
            b = [sorder[x] for x in b]
            ia, oa = zip(*a)
            ib, ob = zip(*b)

            astarti, aendi = min(ia), max(ia)
            bstarti, bendi = min(ib), max(ib)
            astart, aend = min(a)[1].accn, max(a)[1].accn
            bstart, bend = min(b)[1].accn, max(b)[1].accn

            sizeA = len(set(ia))
            sizeB = len(set(ib))
            size = len(block)

            # Sign of the fitted slope of B order against A order gives the
            # block orientation.
            slope, intercept = np.polyfit(ia, ib, 1)
            orientation = "+" if slope >= 0 else '-'
            aspan = aendi - astarti + 1
            bspan = bendi - bstarti + 1
            score = int((aspan * bspan) ** .5)
            score = str(score)
            block_id = pf + "-block-{0}".format(i)

            if coords:
                aseqid, astartbase, aendbase = \
                    get_boundary_bases(astart, aend, qorder)
                bseqid, bstartbase, bendbase = \
                    get_boundary_bases(bstart, bend, sorder)
                abase = aendbase - astartbase + 1
                bbase = bendbase - bstartbase + 1
                atotalbase += abase
                btotalbase += bbase

                # Write dual lines
                aargs = [block_id, aseqid, astartbase, aendbase,
                         abase, astart, aend, aspan, "+"]
                bargs = [block_id, bseqid, bstartbase, bendbase,
                         bbase, bstart, bend, bspan, orientation]

                if bed:
                    bbed.append(BedLine("\t".join(str(x) for x in
                        (bseqid, bstartbase - 1, bendbase,
                         "{}:{}-{}".format(aseqid, astartbase, aendbase),
                         size, orientation))))

                for row in (aargs, bargs):
                    print("\t".join(str(x) for x in row), file=fws)
                continue

            row = [astart, aend, bstart, bend, score, orientation]
            if additional:
                row += [astarti, aendi, bstarti, bendi,
                        sizeA, sizeB, size, block_id]
            print("\t".join(str(x) for x in row), file=fws)

    logging.debug("A total of {0} blocks written to `{1}`.".format(nblocks,
                  simplefile))

    if coords:
        print("Total block span in {0}: {1}".format(qbed.filename,
              human_size(atotalbase, precision=2)), file=sys.stderr)
        print("Total block span in {0}: {1}".format(sbed.filename,
              human_size(btotalbase, precision=2)), file=sys.stderr)
        smaller = min(atotalbase, btotalbase)
        # The ratio is undefined when either total is zero (e.g. no blocks).
        if smaller:
            print("Ratio: {0:.1f}x".format(
                  max(atotalbase, btotalbase) * 1. / smaller), file=sys.stderr)

    if bed:
        bedfile = simplefile + ".bed"
        bbed.print_to_file(filename=bedfile, sorted=True)
        logging.debug("Bed file written to `{}`".format(bedfile))
def variation(args):
    """
    %prog variation P1.bed P2.bed F1.bed

    Associate IES in parents and progeny.
    """
    # NOTE: the docstring above is also the command-line usage text, so
    # implementation notes are kept in comments.
    #
    # args: list of CLI tokens (three BED paths plus image options).
    # Side effects: writes the combined BED, `novel.bed`, one PDF plot and,
    # for --diversity=breakpoint, `Breakpoint-offset-histogram.csv`.
    p = OptionParser(variation.__doc__)
    p.add_option("--diversity", choices=("breakpoint", "variant"),
                 default="variant", help="Plot diversity")
    opts, args, iopts = p.set_image_options(args, figsize="6x6")

    if len(args) != 3:
        sys.exit(not p.print_help())

    # Sample prefix = part of each file's basename before the first '-'.
    pfs = [op.basename(x).split('-')[0] for x in args]
    P1, P2, F1 = pfs
    newbedfile = "-".join(pfs) + ".bed"
    if need_update(args, newbedfile):
        newbed = Bed()
        for pf, filename in zip(pfs, args):
            bed = Bed(filename)
            for b in bed:
                # Tag every accession with its sample prefix.
                b.accn = "-".join((pf, b.accn))
                b.score = None
                newbed.append(b)
        newbed.print_to_file(newbedfile, sorted=True)

    neworder = Bed(newbedfile).order
    mergedbedfile = mergeBed(newbedfile, nms=True)
    bed = Bed(mergedbedfile)
    valid = 0
    total_counts = Counter()
    F1_counts = []
    bp_diff = []
    novelbedfile = "novel.bed"
    with open(novelbedfile, "w") as fw:
        for b in bed:
            accns = b.accn.split(',')
            pfs_accns = [x.split("-")[0] for x in accns]
            pfs_counts = Counter(pfs_accns)
            # Merged sites not seen in all three samples go to novel.bed.
            if len(pfs_counts) != 3:
                print(b, file=fw)
                continue

            valid += 1
            total_counts += pfs_counts
            F1_counts.append(pfs_counts[F1])

            # Collect breakpoint positions between P1 and F1
            P1_accns = [x for x in accns if x.split("-")[0] == P1]
            F1_accns = [x for x in accns if x.split("-")[0] == F1]
            if len(P1_accns) != 1:
                continue

            ri, ref = neworder[P1_accns[0]]
            F1_beds = [neworder[x][-1] for x in F1_accns]
            bp_diff.extend(x.start - ref.start for x in F1_beds)
            bp_diff.extend(x.end - ref.end for x in F1_beds)

    print("A total of {0} sites show consistent deletions across samples.".
          format(percentage(valid, len(bed))), file=sys.stderr)
    for pf, count in total_counts.items():
        print("{0:>9}: {1:.2f} deletions/site".
              format(pf, count * 1. / valid), file=sys.stderr)

    F1_counts = Counter(F1_counts)

    # Plot the IES variant number diversity
    from jcvi.graphics.base import plt, savefig, set_ticklabels_helvetica

    plt.figure(1, (iopts.w, iopts.h))
    if opts.diversity == "variant":
        left, height = zip(*sorted(F1_counts.items()))
        for x, y in zip(left, height):
            print("{0:>9} variants: {1}".format(x, y), file=sys.stderr)
            plt.text(x, y + 5, str(y), color="darkslategray", size=8,
                     ha="center", va="bottom", rotation=90)

        plt.bar(left, height, align="center")
        plt.xlabel("Identified number of IES per site")
        plt.ylabel("Counts")
        plt.title("IES variation in progeny pool")
        ax = plt.gca()
        set_ticklabels_helvetica(ax)
        savefig(F1 + ".counts.pdf")

    # Plot the IES breakpoint position diversity
    else:
        bp_diff = Counter(bp_diff)
        bp_diff_abs = Counter()
        for k, v in bp_diff.items():
            bp_diff_abs[abs(k)] += v
        plt.figure(1, (iopts.w, iopts.h))
        left, height = zip(*sorted(bp_diff_abs.items()))
        # zip() is an iterator on Python 3 and cannot be sliced directly.
        for x, y in list(zip(left, height))[:21]:
            plt.text(x, y + 50, str(y), color="darkslategray", size=8,
                     ha="center", va="bottom", rotation=90)

        plt.bar(left, height, align="center")
        plt.xlabel("Progeny breakpoint relative to SB210")
        plt.ylabel("Counts")
        plt.xlim(-.5, 20.5)
        ax = plt.gca()
        set_ticklabels_helvetica(ax)
        savefig(F1 + ".breaks.pdf")

        # Serialize the data to a file
        with open("Breakpoint-offset-histogram.csv", "w") as fw:
            for k, v in sorted(bp_diff.items()):
                print("{0},{1}".format(k, v), file=fw)

        total = sum(height)
        zeros = bp_diff[0]
        within_20 = sum([v for i, v in bp_diff.items() if -20 <= i <= 20])
        print("No deviation: {0}".format(percentage(zeros, total)),
              file=sys.stderr)
        print(" Within 20bp: {0}".format(percentage(within_20, total)),
              file=sys.stderr)
def bambus(args):
    """
    %prog bambus bambus.bed bambus.mates total.fasta

    Insert unplaced scaffolds based on mates.
    """
    # NOTE: the docstring above is also the command-line usage text, so
    # implementation notes are kept in comments.
    #
    # args: list of CLI tokens (BED of mapped reads, mates file, FASTA).
    # Writes `<mates stem>.log` (per-scaffold trace) and
    # `<mates stem>.candidate.bed` (one line per placeable scaffold).
    from jcvi.formats.bed import BedLine
    from jcvi.formats.posmap import MatesFile

    p = OptionParser(bambus.__doc__)
    p.add_option(
        "--prefix",
        default="scaffold",
        help="Prefix of the unplaced scaffolds",
    )
    p.add_option(
        "--minlinks",
        default=3,
        type="int",
        help="Minimum number of links to place",
    )
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, matesfile, fastafile = args
    pf = matesfile.rsplit(".", 1)[0]
    logfile = pf + ".log"
    log = open(logfile, "w")

    mf = MatesFile(matesfile)
    # Largest `max` over all libraries; used later to reject placements whose
    # interval is wider than any library allows.
    maxdist = max(x.max for x in mf.libraries.values())
    logging.debug("Max separation: {0}".format(maxdist))

    prefix = opts.prefix
    minlinks = opts.minlinks

    # A sequence counts as unplaced when its name starts with --prefix.
    is_unplaced = lambda x: x.startswith(prefix)
    bed = Bed(bedfile, sorted=False)
    beds = []
    unplaced = defaultdict(list)

    # presumably pairwise() yields consecutive records of the unsorted BED --
    # TODO confirm against its definition.
    for a, b in pairwise(bed):
        aname, bname = a.accn, b.accn
        aseqid, bseqid = a.seqid, b.seqid

        if aname not in mf:
            continue

        # mf[name] gives (mate name, library); keep only true mate pairs.
        pa, la = mf[aname]
        if pa != bname:
            continue

        # Keep only pairs that link exactly one unplaced sequence.
        ia = is_unplaced(aseqid)
        ib = is_unplaced(bseqid)
        if ia == ib:
            continue

        # Normalise so that `b` is always the read on the unplaced scaffold.
        if ia:
            a, b = b, a

        unplaced[b.seqid].append((a, b))
        beds.extend([a, b])

    sizes = Sizes(fastafile)
    candidatebed = Bed()
    cbeds = []
    # For each unplaced scaffold, find most likely placement and orientation
    for scf, beds in sorted(unplaced.items()):
        print(file=log)
        ranges = []
        for a, b in beds:
            aname, astrand = a.accn, a.strand
            bname, bstrand = b.accn, b.strand
            aseqid, bseqid = a.seqid, b.seqid
            pa, lib = mf[aname]

            print(a, file=log)
            print(b, file=log)

            # Same-strand mates mean the scaffold has to be flipped; `b` is
            # reverse-complemented in place before its coordinates are used.
            flip_b = astrand == bstrand
            fbstrand = "-" if flip_b else "+"
            if flip_b:
                b.reverse_complement(sizes)

            lmin, lmax = lib.min, lib.max

            L = sizes.get_size(scf)
            assert astrand in ("+", "-")
            # Candidate interval for the scaffold start on the placed
            # sequence, derived from the library's min/max separation.
            if astrand == "+":
                offset = a.start - b.end
                sstart, sstop = offset + lmin, offset + lmax
            else:
                offset = a.end - b.start + L
                sstart, sstop = offset - lmax, offset - lmin

            # Prevent out of range error
            size = sizes.get_size(aseqid)
            sstart = max(0, sstart)
            sstop = max(0, sstop)
            sstart = min(size - 1, sstart)
            sstop = min(size - 1, sstop)

            start_range = (aseqid, sstart, sstop, scf, 1, fbstrand)
            print("*" + "\t".join(str(x) for x in start_range), file=log)
            ranges.append(start_range)

        mranges = [x[:3] for x in ranges]
        # Determine placement by finding the interval with the most support
        rd = ranges_depth(mranges, sizes.mapping, verbose=False)
        alldepths = []
        for depth in rd:
            alldepths.extend(depth)
        print(alldepths, file=log)

        # The last field of each depth record is the supporting-link count.
        maxdepth = max(alldepths, key=lambda x: x[-1])[-1]
        if maxdepth < minlinks:
            print("Insufficient links ({0} < {1})".format(maxdepth, minlinks), file=log)
            continue

        candidates = [x for x in alldepths if x[-1] == maxdepth]
        nseqids = len(set(x[0] for x in candidates))
        if nseqids != 1:
            msg = "Multiple conflicting candidates found"
            print(msg, file=log)
            continue

        seqid, mmin, mmax, depth = candidates[0]
        mmin, mmax = range_minmax([x[1:3] for x in candidates])

        if mmin >= mmax:
            msg = "Invalid (min, max) range"
            print("Invalid (min, max) range", file=log)
            continue

        if (mmax - mmin) > maxdist:
            msg = "(min, max) distance greater than library maxdist"
            print(msg, file=log)
            continue

        # Determine orientation by voting
        nplus, nminus = 0, 0
        arange = (seqid, mmin, mmax)
        for sid, start, end, sf, sc, fbstrand in ranges:
            brange = (sid, start, end)
            if range_overlap(arange, brange):
                if fbstrand == "+":
                    nplus += 1
                else:
                    nminus += 1

        # Ties go to the plus strand.
        fbstrand = "+" if nplus >= nminus else "-"

        candidate = (seqid, mmin, mmax, scf, depth, fbstrand)
        bedline = BedLine("\t".join((str(x) for x in candidate)))
        cbeds.append(bedline)
        print("Plus: {0}, Minus: {1}".format(nplus, nminus), file=log)
        print(candidate, file=log)

    candidatebed.extend(cbeds)
    logging.debug("A total of {0} scaffolds can be placed.".format(len(candidatebed)))
    log.close()

    candidatebedfile = pf + ".candidate.bed"
    candidatebed.print_to_file(candidatebedfile, sorted=True)
def ancestral(args):
    """
    %prog ancestral vplanifoliaA.vplanifoliaA.anchors > vplanifoliaA_blocks.bed

    Paint 14 chromosomes following alpha WGD.
    """
    # NOTE: the docstring above is also the command-line usage text, so
    # implementation notes are kept in comments.
    #
    # args: list of CLI tokens (one anchors file plus bed options).
    # Prints a sorted BED of block extents, labelled by ancestral group.
    parser = OptionParser(ancestral.__doc__)
    parser.set_beds()

    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    anchorsfile = args[0]
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, parser, opts)

    # We focus on the following chromosome pairs
    target_pairs = {
        (1, 1), (1, 6), (1, 8), (1, 13),
        (2, 4),
        (3, 12), (3, 14),
        (5, 6), (5, 8),
        (7, 9), (7, 11),
        (9, 10),
        (10, 11),
    }
    # Chromosome numbers allowed to name a group; the first one found in a
    # pair becomes the label.
    label_numbers = (1, 2, 3, 5, 7, 10)

    def get_target(achr, bchr):
        # Blocks where neither side is a "chr" sequence are ignored.
        if not ("chr" in achr or "chr" in bchr):
            return None
        first, second = get_number(achr), get_number(bchr)
        pair = (second, first) if first > second else (first, second)
        return pair if pair in target_pairs else None

    def build_bedline(astart, aend, target_pair):
        # target_name = "{:02d}-{:02d}".format(*target_pair)
        labels = [str(x) for x in target_pair if x in label_numbers]
        target_name = labels[0]
        fields = (astart.seqid, astart.start, aend.end, target_name)
        return "\t".join(str(x) for x in fields)

    # Iterate through the blocks, store any regions that has hits to one of the
    # target_pairs
    outbed = Bed()
    for block in AnchorFile(anchorsfile).blocks:
        qgenes, sgenes, scores = zip(*block)
        qhits = [qorder[x] for x in qgenes]
        shits = [sorder[x] for x in sgenes]
        astart, aend = min(qhits)[1], max(qhits)[1]
        bstart, bend = min(shits)[1], max(shits)[1]

        # Now convert to BED lines with new accn
        target = get_target(astart.seqid, bstart.seqid)
        if target is not None:
            outbed.add(build_bedline(astart, aend, target))
            outbed.add(build_bedline(bstart, bend, target))

    outbed.print_to_file(sorted=True)
def movie(args):
    """
    %prog movie test.tour test.clm ref.contigs.last

    Plot optimization history.
    """
    # NOTE: the docstring above is also the command-line usage text, so
    # implementation notes are kept in comments.
    #
    # args: list of CLI tokens (tour, clm and last files plus options).
    # Renders one frame per sampled tour, then assembles the frames into a
    # movie in the `<tour stem>-movie` directory.
    parser = OptionParser(movie.__doc__)
    parser.add_option("--frames", default=500, type="int",
                      help="Only plot every N frames")
    parser.add_option("--engine", default="ffmpeg",
                      choices=("ffmpeg", "gifsicle"),
                      help="Movie engine, output MP4 or GIF")
    parser.set_beds()
    opts, args, iopts = parser.set_image_options(
        args, figsize="16x8", style="white", cmap="coolwarm",
        format="png", dpi=300)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    tourfile, clmfile, lastfile = [op.abspath(x) for x in args]

    # Remember where we started; the working directory is restored below.
    cwd = os.getcwd()
    odir = op.basename(tourfile).rsplit(".", 1)[0] + "-movie"
    anchorsfile, qbedfile, contig_to_beds = \
        prepare_synteny(tourfile, lastfile, odir, parser, opts)

    frame_args = []
    for i, label, tour, tour_o in iter_tours(tourfile, frames=opts.frames):
        serial = "{:06d}".format(i)
        # Make sure the anchorsfile and bedfile has the serial number in,
        # otherwise parallelization may fail
        stem, ext = op.basename(anchorsfile).split(".", 1)
        ianchorsfile = "{0}_{1}.{2}".format(stem, serial, ext)
        symlink(anchorsfile, ianchorsfile)

        # Make BED file with new order
        qb = Bed()
        for contig, orientation in zip(tour, tour_o):
            if contig not in contig_to_beds:
                continue
            records = list(contig_to_beds[contig])
            if orientation == '-':
                records = records[::-1]
            for record in records:
                qb.append(record)

        stem, ext = op.basename(qbedfile).split(".", 1)
        ibedfile = "{0}_{1}.{2}".format(stem, serial, ext)
        qb.print_to_file(ibedfile)
        # Plot dot plot, but do not sort contigs by name (otherwise losing
        # order)
        image_name = serial + "." + iopts.format

        frame = [",".join(tour), clmfile, ianchorsfile,
                 "--outfile", image_name, "--label", label]
        # Each job takes a single positional argument: the frame's arg list.
        frame_args.append([frame])

    Jobs(movieframe, frame_args).run()

    os.chdir(cwd)
    make_movie(odir, odir, engine=opts.engine, format=iopts.format)
def movie(args):
    """
    %prog movie test.tour test.clm ref.contigs.last

    Plot optimization history.
    """
    # NOTE: the docstring above is also the command-line usage text, so
    # implementation notes are kept in comments.
    #
    # args: list of CLI tokens (tour, clm and last files plus options).
    # Renders one frame per sampled tour, then assembles the frames into a
    # movie in the `<tour stem>-movie` directory.
    parser = OptionParser(movie.__doc__)
    parser.add_option("--frames", default=500, type="int",
                      help="Only plot every N frames")
    parser.add_option("--engine", default="ffmpeg",
                      choices=("ffmpeg", "gifsicle"),
                      help="Movie engine, output MP4 or GIF")
    parser.set_beds()
    opts, args, iopts = parser.set_image_options(
        args, figsize="16x8", style="white", cmap="coolwarm",
        format="png", dpi=300)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    tourfile, clmfile, lastfile = [op.abspath(x) for x in args]

    # Remember where we started; the working directory is restored below.
    cwd = os.getcwd()
    odir = op.basename(tourfile).rsplit(".", 1)[0] + "-movie"
    anchorsfile, qbedfile, contig_to_beds = \
        prepare_synteny(tourfile, lastfile, odir, parser, opts)

    frame_args = []
    for i, label, tour, tour_o in iter_tours(tourfile, frames=opts.frames):
        serial = "{:06d}".format(i)
        # Make sure the anchorsfile and bedfile has the serial number in,
        # otherwise parallelization may fail
        stem, ext = op.basename(anchorsfile).split(".", 1)
        ianchorsfile = "{0}_{1}.{2}".format(stem, serial, ext)
        symlink(anchorsfile, ianchorsfile)

        # Make BED file with new order
        qb = Bed()
        for contig, orientation in zip(tour, tour_o):
            if contig not in contig_to_beds:
                continue
            records = list(contig_to_beds[contig])
            if orientation == '-':
                records = records[::-1]
            for record in records:
                qb.append(record)

        stem, ext = op.basename(qbedfile).split(".", 1)
        ibedfile = "{0}_{1}.{2}".format(stem, serial, ext)
        qb.print_to_file(ibedfile)
        # Plot dot plot, but do not sort contigs by name (otherwise losing
        # order)
        image_name = serial + "." + iopts.format

        frame = [",".join(tour), clmfile, ianchorsfile,
                 "--outfile", image_name, "--label", label]
        # Each job takes a single positional argument: the frame's arg list.
        frame_args.append([frame])

    Jobs(movieframe, frame_args).run()

    os.chdir(cwd)
    make_movie(odir, odir, engine=opts.engine, format=iopts.format)
def install(args):
    """
    %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta

    Install patches into backbone, using sequences from alternative assembly.
    The patches sequences are generated via jcvi.assembly.patch.fill().

    The output is a bedfile that can be converted to AGP using
    jcvi.formats.agp.frombed().
    """
    # NOTE: the docstring above is also the command-line usage text, so
    # implementation notes are kept in comments.
    #
    # args: list of CLI tokens (patchers BED/FASTA, backbone FASTA,
    # alternative FASTA, options). Writes `before.filtered.bed` and
    # `after.filtered.bed`, then hands both to shuffle_twobeds().
    from jcvi.apps.align import blast
    from jcvi.formats.fasta import SeqIO

    parser = OptionParser(install.__doc__)
    parser.set_rclip(rclip=1)
    parser.add_option(
        "--maxsize",
        default=300000,
        type="int",
        help="Maximum size of patchers to be replaced",
    )
    parser.add_option("--prefix", help="Prefix of the new object")
    parser.add_option(
        "--strict",
        default=False,
        action="store_true",
        help="Only update if replacement has no gaps",
    )

    opts, args = parser.parse_args(args)

    if len(args) != 4:
        sys.exit(not parser.print_help())

    pbed, pfasta, bbfasta, altfasta = args

    blastfile = blast([altfasta, pfasta, "--wordsize=100", "--pctid=99"])
    order = Bed(pbed).order
    # opts.maxsize is the max DNA size allowed to replace a gap.
    beforebed, afterbed = blast_to_twobeds(
        blastfile, order, rclip=opts.rclip, maxsize=opts.maxsize
    )

    beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True)
    afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True)

    def count_ns(record):
        # Number of N bases in the record, either case.
        return record.seq.count("n") + record.seq.count("N")

    # Exclude the replacements that contain more Ns than before
    before_records = SeqIO.parse(beforefasta, "fasta")
    after_records = SeqIO.parse(afterfasta, "fasta")
    exclude = set()
    for before_rec, after_rec in zip(before_records, after_records):
        before_ns = count_ns(before_rec)
        after_ns = count_ns(after_rec)
        # --strict accepts only N-free replacements; otherwise the
        # replacement must carry strictly fewer Ns than the original.
        if opts.strict:
            acceptable = after_ns == 0
        else:
            acceptable = after_ns < before_ns
        if not acceptable:
            exclude.add(before_rec.id)

    logging.debug(
        "Ignore {0} updates because of decreasing quality.".format(len(exclude))
    )

    kept_before = [x for x in Bed(beforebed, sorted=False)
                   if x.accn not in exclude]
    kept_after = [x for x in Bed(afterbed, sorted=False)
                  if x.accn not in exclude]

    abedfile = "before.filtered.bed"
    bbedfile = "after.filtered.bed"
    afbed = Bed()
    afbed.extend(kept_before)
    bfbed = Bed()
    bfbed.extend(kept_after)

    afbed.print_to_file(abedfile)
    bfbed.print_to_file(bbedfile)

    shuffle_twobeds(afbed, bfbed, bbfasta, prefix=opts.prefix)
def variation(args):
    """
    %prog variation P1.bed P2.bed F1.bed

    Associate IES in parents and progeny.
    """
    # NOTE: the docstring above is also the command-line usage text, so
    # implementation notes are kept in comments.
    #
    # args: list of CLI tokens (three BED paths plus image options).
    # Side effects: writes the combined BED, `novel.bed`, one PDF plot and,
    # for --diversity=breakpoint, `Breakpoint-offset-histogram.csv`.
    p = OptionParser(variation.__doc__)
    p.add_option("--diversity", choices=("breakpoint", "variant"),
                 default="variant", help="Plot diversity")
    opts, args, iopts = p.set_image_options(args, figsize="6x6")

    if len(args) != 3:
        sys.exit(not p.print_help())

    # Sample prefix = part of each file's basename before the first '-'.
    pfs = [op.basename(x).split('-')[0] for x in args]
    P1, P2, F1 = pfs
    newbedfile = "-".join(pfs) + ".bed"
    if need_update(args, newbedfile):
        newbed = Bed()
        for pf, filename in zip(pfs, args):
            bed = Bed(filename)
            for b in bed:
                # Tag every accession with its sample prefix.
                b.accn = "-".join((pf, b.accn))
                b.score = None
                newbed.append(b)
        newbed.print_to_file(newbedfile, sorted=True)

    neworder = Bed(newbedfile).order
    mergedbedfile = mergeBed(newbedfile, nms=True)
    bed = Bed(mergedbedfile)
    valid = 0
    total_counts = Counter()
    F1_counts = []
    bp_diff = []
    novelbedfile = "novel.bed"
    with open(novelbedfile, "w") as fw:
        for b in bed:
            accns = b.accn.split(',')
            pfs_accns = [x.split("-")[0] for x in accns]
            pfs_counts = Counter(pfs_accns)
            # Merged sites not seen in all three samples go to novel.bed.
            if len(pfs_counts) != 3:
                print(b, file=fw)
                continue

            valid += 1
            total_counts += pfs_counts
            F1_counts.append(pfs_counts[F1])

            # Collect breakpoint positions between P1 and F1
            P1_accns = [x for x in accns if x.split("-")[0] == P1]
            F1_accns = [x for x in accns if x.split("-")[0] == F1]
            if len(P1_accns) != 1:
                continue

            ri, ref = neworder[P1_accns[0]]
            F1_beds = [neworder[x][-1] for x in F1_accns]
            bp_diff.extend(x.start - ref.start for x in F1_beds)
            bp_diff.extend(x.end - ref.end for x in F1_beds)

    print("A total of {0} sites show consistent deletions across samples.".
          format(percentage(valid, len(bed))), file=sys.stderr)
    for pf, count in total_counts.items():
        print("{0:>9}: {1:.2f} deletions/site".
              format(pf, count * 1. / valid), file=sys.stderr)

    F1_counts = Counter(F1_counts)

    # Plot the IES variant number diversity
    from jcvi.graphics.base import plt, savefig, set_ticklabels_helvetica

    plt.figure(1, (iopts.w, iopts.h))
    if opts.diversity == "variant":
        left, height = zip(*sorted(F1_counts.items()))
        for x, y in zip(left, height):
            print("{0:>9} variants: {1}".format(x, y), file=sys.stderr)
            plt.text(x, y + 5, str(y), color="darkslategray", size=8,
                     ha="center", va="bottom", rotation=90)

        plt.bar(left, height, align="center")
        plt.xlabel("Identified number of IES per site")
        plt.ylabel("Counts")
        plt.title("IES variation in progeny pool")
        ax = plt.gca()
        set_ticklabels_helvetica(ax)
        savefig(F1 + ".counts.pdf")

    # Plot the IES breakpoint position diversity
    else:
        bp_diff = Counter(bp_diff)
        bp_diff_abs = Counter()
        for k, v in bp_diff.items():
            bp_diff_abs[abs(k)] += v
        plt.figure(1, (iopts.w, iopts.h))
        left, height = zip(*sorted(bp_diff_abs.items()))
        # zip() is an iterator on Python 3 and cannot be sliced directly.
        for x, y in list(zip(left, height))[:21]:
            plt.text(x, y + 50, str(y), color="darkslategray", size=8,
                     ha="center", va="bottom", rotation=90)

        plt.bar(left, height, align="center")
        plt.xlabel("Progeny breakpoint relative to SB210")
        plt.ylabel("Counts")
        plt.xlim(-.5, 20.5)
        ax = plt.gca()
        set_ticklabels_helvetica(ax)
        savefig(F1 + ".breaks.pdf")

        # Serialize the data to a file
        with open("Breakpoint-offset-histogram.csv", "w") as fw:
            for k, v in sorted(bp_diff.items()):
                print("{0},{1}".format(k, v), file=fw)

        total = sum(height)
        zeros = bp_diff[0]
        within_20 = sum([v for i, v in bp_diff.items() if -20 <= i <= 20])
        print("No deviation: {0}".format(percentage(zeros, total)),
              file=sys.stderr)
        print(" Within 20bp: {0}".format(percentage(within_20, total)),
              file=sys.stderr)
def refine(args):
    """
    %prog refine breakpoints.bed gaps.bed

    Find gaps within or near breakpoint region.

    For breakpoint regions with no gaps, there are two options:
    - Break in the middle of the region
    - Break at the closest gap (--closest)
    """
    # NOTE: the docstring above is also the command-line usage text, so
    # implementation notes are kept in comments.
    #
    # args: list of CLI tokens (two positional BED paths plus options).
    # Returns the path of the `<pf>.refined.bed` file that is written.
    # Exits via sys.exit() when the positional argument count is not 2.
    p = OptionParser(refine.__doc__)
    p.add_option("--closest", default=False, action="store_true",
                 help="In case of no gaps, use closest [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    breakpointsbed, gapsbed = args
    # Column count of the breakpoints file; used below to tell breakpoint
    # columns apart from the gap columns that intersectBed appends.
    # next(fp) works on both Python 2 and 3 (file objects have no .next()
    # method on Python 3).
    with open(breakpointsbed) as fp:
        ncols = len(next(fp).split())
    logging.debug("File {0} contains {1} columns.".format(breakpointsbed, ncols))
    cmd = "intersectBed -wao -a {0} -b {1}".format(breakpointsbed, gapsbed)

    pf = "{0}.{1}".format(breakpointsbed.split(".")[0], gapsbed.split(".")[0])
    ingapsbed = pf + ".bed"
    sh(cmd, outfile=ingapsbed)

    with open(ingapsbed) as fp:
        data = [x.split() for x in fp]

    nogapsbed = pf + ".nogaps.bed"
    largestgapsbed = pf + ".largestgaps.bed"
    with open(nogapsbed, "w") as nogapsfw, open(largestgapsbed, "w") as largestgapsfw:
        # Rows sharing the same leading `ncols` fields describe one breakpoint.
        for b, gaps in groupby(data, key=lambda x: x[:ncols]):
            gaps = list(gaps)
            gap = gaps[0]
            # A single row whose last field (overlap size) is "0" means the
            # breakpoint region hit no gap at all.
            if len(gaps) == 1 and gap[-1] == "0":
                assert gap[-3] == "."
                print("\t".join(b), file=nogapsfw)
                continue

            # Otherwise keep the row with the largest overlap.
            gaps = [(int(x[-1]), x) for x in gaps]
            maxgap = max(gaps)[1]
            print("\t".join(maxgap), file=largestgapsfw)

    beds = [largestgapsbed]
    toclean = [nogapsbed, largestgapsbed]

    if opts.closest:
        closestgapsbed = pf + ".closestgaps.bed"
        cmd = "closestBed -a {0} -b {1} -d".format(nogapsbed, gapsbed)
        sh(cmd, outfile=closestgapsbed)
        beds += [closestgapsbed]
        toclean += [closestgapsbed]
    else:
        pointbed = pf + ".point.bed"
        pbed = Bed()
        bed = Bed(nogapsbed)
        for b in bed:
            # Floor division keeps the midpoint an integer coordinate; true
            # division would write floats into the BED file on Python 3.
            pos = (b.start + b.end) // 2
            b.start, b.end = pos, pos
            pbed.append(b)
        pbed.print_to_file(pointbed)
        beds += [pointbed]
        toclean += [pointbed]

    refinedbed = pf + ".refined.bed"
    FileMerger(beds, outfile=refinedbed).merge()

    # Clean-up
    FileShredder(toclean)

    return refinedbed
def insert(args):
    """
    %prog insert candidates.bed gaps.bed chrs.fasta unplaced.fasta

    Insert scaffolds into assembly.
    """
    # NOTE: the docstring above is also the command-line usage text, so
    # implementation notes are kept in comments.
    #
    # args: list of CLI tokens (candidates BED, gaps BED, chromosome FASTA,
    # unplaced FASTA). Writes `final.bed`; intermediate files are removed.
    from jcvi.formats.agp import mask, bed
    from jcvi.formats.sizes import agp

    p = OptionParser(insert.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    candidates, gapsbed, chrfasta, unplacedfasta = args
    refinedbed = refine([candidates, gapsbed])
    sizes = Sizes(unplacedfasta).mapping
    cbed = Bed(candidates)
    corder = cbed.order
    gbed = Bed(gapsbed)
    gorder = gbed.order

    gpbed = Bed()
    gappositions = {}  # (chr, start, end) => gapid

    gap_to_scf = defaultdict(list)
    seen = set()
    # The refined BED is read inside a context manager so the handle is
    # closed as soon as parsing is done.
    with open(refinedbed) as fp:
        for row in fp:
            atoms = row.split()
            # Columns used: 4th = unplaced scaffold, 6th = strand,
            # 10th = gap id.
            unplaced = atoms[3]
            strand = atoms[5]
            gapid = atoms[9]
            if gapid not in seen:
                seen.add(gapid)
                gi, gb = gorder[gapid]
                gpbed.append(gb)
                gappositions[(gb.seqid, gb.start, gb.end)] = gapid
            gap_to_scf[gapid].append((unplaced, strand))

    gpbedfile = "candidate.gaps.bed"
    gpbed.print_to_file(gpbedfile, sorted=True)

    agpfile = agp([chrfasta])
    maskedagpfile = mask([agpfile, gpbedfile])
    maskedbedfile = maskedagpfile.rsplit(".", 1)[0] + ".bed"
    bed([maskedagpfile, "--outfile={0}".format(maskedbedfile)])

    mbed = Bed(maskedbedfile)
    beds = []
    for b in mbed:
        sid = b.seqid
        key = (sid, b.start, b.end)
        # Intervals that are not candidate gaps pass through unchanged.
        if key not in gappositions:
            beds.append(b)
            continue

        gapid = gappositions[key]
        scfs = gap_to_scf[gapid]

        # For scaffolds placed in the same gap, sort according to positions
        scfs.sort(key=lambda x: corder[x[0]][1].start + corder[x[0]][1].end)
        for scf, strand in scfs:
            size = sizes[scf]
            beds.append(BedLine("\t".join(str(x) for x in
                        (scf, 0, size, sid, 1000, strand))))

    finalbed = Bed()
    finalbed.extend(beds)
    finalbedfile = "final.bed"
    finalbed.print_to_file(finalbedfile)

    # Clean-up
    toclean = [gpbedfile, agpfile, maskedagpfile, maskedbedfile]
    FileShredder(toclean)
def simple(args):
    """
    %prog simple anchorfile --qbed=qbedfile --sbed=sbedfile [options]

    Write the block ends for each block in the anchorfile.
    GeneA1 GeneA2 GeneB1 GeneB2 +/- score

    Optional additional columns:
    orderA1 orderA2 orderB1 orderB2 sizeA sizeB size block_id

    With base coordinates (--coords):
    block_id seqidA startA endA bpSpanA GeneA1 GeneA2 geneSpanA
    block_id seqidB startB endB bpSpanB GeneB1 GeneB2 geneSpanB
    """
    # NOTE: the docstring above is also the command-line usage text, so
    # implementation notes are kept in comments.
    #
    # args: list of CLI tokens (one anchor file plus options).
    # Writes `<anchorfile stem>.simple`, and `<...>.simple.bed` with --bed.
    # Returns None.
    p = OptionParser(simple.__doc__)
    p.add_option("--rich", default=False, action="store_true",
                 help="Output additional columns [default: %default]")
    p.add_option(
        "--coords",
        default=False,
        action="store_true",
        help="Output columns with base coordinates [default: %default]")
    p.add_option("--bed", default=False, action="store_true",
                 help="Generate BED file for the blocks")
    p.add_option("--noheader", default=False, action="store_true",
                 help="Don't output header [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    additional = opts.rich
    coords = opts.coords
    header = not opts.noheader
    bed = opts.bed
    if bed:
        # BED output is built from base coordinates, so --bed implies --coords.
        coords = True
        bbed = Bed()

    ac = AnchorFile(anchorfile)
    simplefile = anchorfile.rsplit(".", 1)[0] + ".simple"

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    pf = "-".join(anchorfile.split(".", 2)[:2])
    blocks = ac.blocks

    if coords:
        h = "Block|Chr|Start|End|Span|StartGene|EndGene|GeneSpan|Orientation"
    else:
        # NOTE(review): the data rows below are written as score then
        # orientation, the reverse of this header -- confirm with consumers
        # before changing either.
        h = "StartGeneA|EndGeneA|StartGeneB|EndGeneB|Orientation|Score"
        if additional:
            h += "|StartOrderA|EndOrderA|StartOrderB|EndOrderB|"\
                 "SizeA|SizeB|Size|Block"

    nblocks = 0  # counted explicitly so an empty anchor file does not crash
    atotalbase = btotalbase = 0
    with open(simplefile, "w") as fws:
        if header:
            print("\t".join(h.split("|")), file=fws)

        for i, block in enumerate(blocks):
            nblocks += 1
            a, b, scores = zip(*block)
            # Each lookup yields an (order index, bed record) pair.
            a = [qorder[x] for x in a]
            b = [sorder[x] for x in b]
            ia, oa = zip(*a)
            ib, ob = zip(*b)

            astarti, aendi = min(ia), max(ia)
            bstarti, bendi = min(ib), max(ib)
            astart, aend = min(a)[1].accn, max(a)[1].accn
            bstart, bend = min(b)[1].accn, max(b)[1].accn

            sizeA = len(set(ia))
            sizeB = len(set(ib))
            size = len(block)

            # Sign of the fitted slope of B order against A order gives the
            # block orientation.
            slope, intercept = np.polyfit(ia, ib, 1)
            orientation = "+" if slope >= 0 else '-'
            aspan = aendi - astarti + 1
            bspan = bendi - bstarti + 1
            score = int((aspan * bspan)**.5)
            score = str(score)
            block_id = pf + "-block-{0}".format(i)

            if coords:
                aseqid, astartbase, aendbase = \
                    get_boundary_bases(astart, aend, qorder)
                bseqid, bstartbase, bendbase = \
                    get_boundary_bases(bstart, bend, sorder)
                abase = aendbase - astartbase + 1
                bbase = bendbase - bstartbase + 1
                atotalbase += abase
                btotalbase += bbase

                # Write dual lines
                aargs = [
                    block_id, aseqid, astartbase, aendbase, abase, astart,
                    aend, aspan, "+"
                ]
                bargs = [
                    block_id, bseqid, bstartbase, bendbase, bbase, bstart,
                    bend, bspan, orientation
                ]

                if bed:
                    bbed.append(BedLine("\t".join(str(x) for x in
                        (bseqid, bstartbase - 1, bendbase,
                         "{}:{}-{}".format(aseqid, astartbase, aendbase),
                         size, orientation))))

                for row in (aargs, bargs):
                    print("\t".join(str(x) for x in row), file=fws)
                continue

            row = [astart, aend, bstart, bend, score, orientation]
            if additional:
                row += [
                    astarti, aendi, bstarti, bendi, sizeA, sizeB, size,
                    block_id
                ]
            print("\t".join(str(x) for x in row), file=fws)

    logging.debug("A total of {0} blocks written to `{1}`.".format(
        nblocks, simplefile))

    if coords:
        print("Total block span in {0}: {1}".format(qbed.filename,
              human_size(atotalbase, precision=2)), file=sys.stderr)
        print("Total block span in {0}: {1}".format(sbed.filename,
              human_size(btotalbase, precision=2)), file=sys.stderr)
        smaller = min(atotalbase, btotalbase)
        # The ratio is undefined when either total is zero (e.g. no blocks).
        if smaller:
            print("Ratio: {0:.1f}x".format(
                  max(atotalbase, btotalbase) * 1. / smaller), file=sys.stderr)

    if bed:
        bedfile = simplefile + ".bed"
        bbed.print_to_file(filename=bedfile, sorted=True)
        logging.debug("Bed file written to `{}`".format(bedfile))
def bambus(args):
    """
    %prog bambus bambus.bed bambus.mates total.fasta

    Insert unplaced scaffolds based on mates.
    """
    # The docstring above is handed to OptionParser as the usage text.
    #
    # args: list of command-line tokens (bed file, mates file, fasta file).
    # Side effects: writes `<mates stem>.log` with the per-scaffold reasoning
    # and `<mates stem>.candidate.bed` with one record per placed scaffold.
    # Exits through sys.exit() on a wrong argument count.
    from jcvi.utils.iter import pairwise
    from jcvi.formats.posmap import MatesFile

    p = OptionParser(bambus.__doc__)
    p.add_option("--prefix", default="scaffold",
                 help="Prefix of the unplaced scaffolds [default: %default]")
    p.add_option("--minlinks", default=3, type="int",
                 help="Minimum number of links to place [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, matesfile, fastafile = args
    pf = matesfile.rsplit(".", 1)[0]
    logfile = pf + ".log"

    cbeds = []
    # The context manager closes the log even when a scaffold fails midway.
    with open(logfile, "w") as log:
        mf = MatesFile(matesfile)
        maxdist = max(x.max for x in mf.libraries.values())
        logging.debug("Max separation: {0}".format(maxdist))

        prefix = opts.prefix
        minlinks = opts.minlinks

        bed = Bed(bedfile, sorted=False)
        unplaced = defaultdict(list)

        # Collect adjacent bed records that are mates of each other and link
        # exactly one sequence whose name starts with `prefix` to one that
        # does not.
        for a, b in pairwise(bed):
            aname = a.accn
            if aname not in mf:
                continue

            mate, _ = mf[aname]
            if mate != b.accn:
                continue

            a_unplaced = a.seqid.startswith(prefix)
            b_unplaced = b.seqid.startswith(prefix)
            if a_unplaced == b_unplaced:
                continue

            # Normalise so that `b` is always the read on the unplaced side.
            if a_unplaced:
                a, b = b, a

            unplaced[b.seqid].append((a, b))

        sizes = Sizes(fastafile)

        # For each unplaced scaffold, find most likely placement and orientation
        for scf, pairs in sorted(unplaced.items()):
            print >> log
            scf_size = sizes.get_size(scf)  # invariant across this scaffold
            ranges = []
            for a, b in pairs:
                astrand, bstrand = a.strand, b.strand
                aseqid = a.seqid
                _, lib = mf[a.accn]

                print >> log, a
                print >> log, b

                # Mates on the same strand mean the scaffold is flipped
                # before the offset is computed.
                flip_b = (astrand == bstrand)
                fbstrand = '-' if flip_b else '+'
                if flip_b:
                    b.reverse_complement(sizes)

                lmin, lmax = lib.min, lib.max

                assert astrand in ('+', '-')
                if astrand == '+':
                    offset = a.start - b.end
                    sstart, sstop = offset + lmin, offset + lmax
                else:
                    offset = a.end - b.start + scf_size
                    sstart, sstop = offset - lmax, offset - lmin

                # Prevent out of range error
                placed_size = sizes.get_size(aseqid)
                sstart = min(placed_size - 1, max(0, sstart))
                sstop = min(placed_size - 1, max(0, sstop))

                start_range = (aseqid, sstart, sstop, scf, 1, fbstrand)
                print >> log, "*" + "\t".join(str(x) for x in start_range)
                ranges.append(start_range)

            mranges = [x[:3] for x in ranges]
            # Determine placement by finding the interval with the most support
            rd = ranges_depth(mranges, sizes.mapping, verbose=False)
            alldepths = []
            for depth in rd:
                alldepths.extend(depth)
            print >> log, alldepths

            maxdepth = max(alldepths, key=lambda x: x[-1])[-1]
            if maxdepth < minlinks:
                print >> log, "Insufficient links ({0} < {1})".format(maxdepth, minlinks)
                continue

            candidates = [x for x in alldepths if x[-1] == maxdepth]
            nseqids = len(set(x[0] for x in candidates))
            msg = "Multiple conflicting candidates found"
            if nseqids != 1:
                print >> log, msg
                continue

            seqid, _, _, depth = candidates[0]
            mmin, mmax = range_minmax([x[1:3] for x in candidates])

            # Best-supported intervals spread wider than the largest library
            # maximum are rejected as well.
            if (mmax - mmin) > maxdist:
                print >> log, msg
                continue

            # Determine orientation by voting
            nplus, nminus = 0, 0
            arange = (seqid, mmin, mmax)
            for sid, start, end, _, _, strand in ranges:
                if range_overlap(arange, (sid, start, end)):
                    if strand == '+':
                        nplus += 1
                    else:
                        nminus += 1

            # Ties go to the plus strand.
            fbstrand = '+' if nplus >= nminus else '-'

            candidate = (seqid, mmin, mmax, scf, depth, fbstrand)
            cbeds.append(BedLine("\t".join(str(x) for x in candidate)))
            print >> log, "Plus: {0}, Minus: {1}".format(nplus, nminus)
            print >> log, candidate

    candidatebed = Bed()
    candidatebed.extend(cbeds)
    logging.debug("A total of {0} scaffolds can be placed.".
                  format(len(candidatebed)))

    candidatebedfile = pf + ".candidate.bed"
    candidatebed.print_to_file(candidatebedfile, sorted=True)