def pasteprepare(args): """ %prog pasteprepare bacs.fasta Prepare sequences for paste. """ p = OptionParser(pasteprepare.__doc__) p.add_option("--flank", default=5000, type="int", help="Get the seq of size on two ends [default: %default]") opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) goodfasta, = args flank = opts.flank pf = goodfasta.rsplit(".", 1)[0] extbed = pf + ".ext.bed" sizes = Sizes(goodfasta) fw = open(extbed, "w") for bac, size in sizes.iter_sizes(): print >> fw, "\t".join(str(x) for x in \ (bac, 0, min(flank, size), bac + "L")) print >> fw, "\t".join(str(x) for x in \ (bac, max(size - flank, 0), size, bac + "R")) fw.close() fastaFromBed(extbed, goodfasta, name=True)
def pasteprepare(args): """ %prog pasteprepare bacs.fasta Prepare sequences for paste. """ p = OptionParser(pasteprepare.__doc__) p.add_option( "--flank", default=5000, type="int", help="Get the seq of size on two ends", ) opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) (goodfasta,) = args flank = opts.flank pf = goodfasta.rsplit(".", 1)[0] extbed = pf + ".ext.bed" sizes = Sizes(goodfasta) fw = open(extbed, "w") for bac, size in sizes.iter_sizes(): print("\t".join(str(x) for x in (bac, 0, min(flank, size), bac + "L")), file=fw) print( "\t".join(str(x) for x in (bac, max(size - flank, 0), size, bac + "R")), file=fw, ) fw.close() fastaFromBed(extbed, goodfasta, name=True)
def nucmer(args): """ %prog nucmer mappings.bed MTR.fasta assembly.fasta chr1 3 Select specific chromosome region based on MTR mapping. The above command will extract chr1:2,000,001-3,000,000. """ p = OptionParser(nucmer.__doc__) opts, args = p.parse_args(args) if len(args) != 5: sys.exit(not p.print_help()) mapbed, mtrfasta, asmfasta, chr, idx = args idx = int(idx) m1 = 1000000 bedfile = "sample.bed" bed = Bed() bed.add("\t".join(str(x) for x in (chr, (idx - 1) * m1, idx * m1))) bed.print_to_file(bedfile) cmd = "intersectBed -a {0} -b {1} -nonamecheck -sorted | cut -f4".format(mapbed, bedfile) idsfile = "query.ids" sh(cmd, outfile=idsfile) sfasta = fastaFromBed(bedfile, mtrfasta) qfasta = "query.fasta" cmd = "faSomeRecords {0} {1} {2}".format(asmfasta, idsfile, qfasta) sh(cmd) cmd = "nucmer {0} {1}".format(sfasta, qfasta) sh(cmd) mummerplot_main(["out.delta", "--refcov=0"]) sh("mv out.pdf {0}.{1}.pdf".format(chr, idx))
def fasta(args): """ %prog fasta map.out scaffolds.fasta Extract marker sequences based on map. """ from jcvi.formats.sizes import Sizes p = OptionParser(fasta.__doc__) p.add_option( "--extend", default=1000, type="int", help="Extend seq flanking the gaps", ) opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) mapout, sfasta = args Flank = opts.extend pf = mapout.split(".")[0] mapbed = pf + ".bed" bm = BinMap(mapout) bm.print_to_bed(mapbed) bed = Bed(mapbed, sorted=False) markersbed = pf + ".markers.bed" fw = open(markersbed, "w") sizes = Sizes(sfasta).mapping for b in bed: accn = b.accn scf, pos = accn.split(".") pos = int(pos) start = max(0, pos - Flank) end = min(pos + Flank, sizes[scf]) print("\t".join(str(x) for x in (scf, start, end, accn)), file=fw) fw.close() fastaFromBed(markersbed, sfasta, name=True)
def flanks(args): """ %prog flanks gaps.bed fastafile Create sequences flanking the gaps. """ p = OptionParser(flanks.__doc__) p.add_option( "--extend", default=2000, type="int", help="Extend seq flanking the gaps", ) opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) gapsbed, fastafile = args Ext = opts.extend sizes = Sizes(fastafile).mapping bed = Bed(gapsbed) pf = gapsbed.rsplit(".", 1)[0] extbed = pf + ".ext.bed" fw = open(extbed, "w") for i, b in enumerate(bed): seqid = b.seqid gapname = b.accn size = sizes[seqid] prev_b = bed[i - 1] if i > 0 else None next_b = bed[i + 1] if i + 1 < len(bed) else None if prev_b and prev_b.seqid != seqid: prev_b = None if next_b and next_b.seqid != seqid: next_b = None start = prev_b.end + 1 if prev_b else 1 start, end = max(start, b.start - Ext), b.start - 1 print("\t".join( str(x) for x in (b.seqid, start - 1, end, gapname + "L")), file=fw) end = next_b.start - 1 if next_b else size start, end = b.end + 1, min(end, b.end + Ext) print("\t".join( str(x) for x in (b.seqid, start - 1, end, gapname + "R")), file=fw) fw.close() extfasta = fastaFromBed(extbed, fastafile, name=True) return extbed, extfasta
def fill(args): """ %prog fill gaps.bed bad.fasta Perform gap filling of one assembly (bad) using sequences from another. """ p = OptionParser(fill.__doc__) p.add_option( "--extend", default=2000, type="int", help="Extend seq flanking the gaps", ) opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) gapsbed, badfasta = args Ext = opts.extend gapdist = 2 * Ext + 1 # This is to prevent to replacement ranges intersect gapsbed = mergeBed(gapsbed, d=gapdist, nms=True) bed = Bed(gapsbed) sizes = Sizes(badfasta).mapping pf = gapsbed.rsplit(".", 1)[0] extbed = pf + ".ext.bed" fw = open(extbed, "w") for b in bed: gapname = b.accn start, end = max(0, b.start - Ext - 1), b.start - 1 print("\t".join(str(x) for x in (b.seqid, start, end, gapname + "L")), file=fw) start, end = b.end, min(sizes[b.seqid], b.end + Ext) print("\t".join(str(x) for x in (b.seqid, start, end, gapname + "R")), file=fw) fw.close() fastaFromBed(extbed, badfasta, name=True)
def fasta(args): """ %prog fasta map.out scaffolds.fasta Extract marker sequences based on map. """ from jcvi.formats.sizes import Sizes p = OptionParser(fasta.__doc__) p.add_option("--extend", default=1000, type="int", help="Extend seq flanking the gaps [default: %default]") opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) mapout, sfasta = args Flank = opts.extend pf = mapout.split(".")[0] mapbed = pf + ".bed" bm = BinMap(mapout) bm.print_to_bed(mapbed) bed = Bed(mapbed, sorted=False) markersbed = pf + ".markers.bed" fw = open(markersbed, "w") sizes = Sizes(sfasta).mapping for b in bed: accn = b.accn scf, pos = accn.split(".") pos = int(pos) start = max(0, pos - Flank) end = min(pos + Flank, sizes[scf]) print >> fw, "\t".join(str(x) for x in \ (scf, start, end, accn)) fw.close() fastaFromBed(markersbed, sfasta, name=True)
def fill(args): """ %prog fill gaps.bed bad.fasta Perform gap filling of one assembly (bad) using sequences from another. """ p = OptionParser(fill.__doc__) p.add_option("--extend", default=2000, type="int", help="Extend seq flanking the gaps [default: %default]") opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) gapsbed, badfasta = args Ext = opts.extend gapdist = 2 * Ext + 1 # This is to prevent to replacement ranges intersect gapsbed = mergeBed(gapsbed, d=gapdist, nms=True) bed = Bed(gapsbed) sizes = Sizes(badfasta).mapping pf = gapsbed.rsplit(".", 1)[0] extbed = pf + ".ext.bed" fw = open(extbed, "w") for b in bed: gapname = b.accn start, end = max(0, b.start - Ext - 1), b.start - 1 print >> fw, "\t".join(str(x) for x in \ (b.seqid, start, end, gapname + "L")) start, end = b.end, min(sizes[b.seqid], b.end + Ext) print >> fw, "\t".join(str(x) for x in \ (b.seqid, start, end, gapname + "R")) fw.close() fastaFromBed(extbed, badfasta, name=True)
def flanks(args): """ %prog flanks gaps.bed fastafile Create sequences flanking the gaps. """ p = OptionParser(flanks.__doc__) p.add_option("--extend", default=2000, type="int", help="Extend seq flanking the gaps [default: %default]") opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) gapsbed, fastafile = args Ext = opts.extend sizes = Sizes(fastafile).mapping bed = Bed(gapsbed) pf = gapsbed.rsplit(".", 1)[0] extbed = pf + ".ext.bed" fw = open(extbed, "w") for i, b in enumerate(bed): seqid = b.seqid gapname = b.accn size = sizes[seqid] prev_b = bed[i - 1] if i > 0 else None next_b = bed[i + 1] if i + 1 < len(bed) else None if prev_b and prev_b.seqid != seqid: prev_b = None if next_b and next_b.seqid != seqid: next_b = None start = prev_b.end + 1 if prev_b else 1 start, end = max(start, b.start - Ext), b.start - 1 print >> fw, "\t".join(str(x) for x in \ (b.seqid, start - 1, end, gapname + "L")) end = next_b.start - 1 if next_b else size start, end = b.end + 1, min(end, b.end + Ext) print >> fw, "\t".join(str(x) for x in \ (b.seqid, start - 1, end, gapname + "R")) fw.close() extfasta = fastaFromBed(extbed, fastafile, name=True) return extbed, extfasta
def nucmer(args): """ %prog nucmer mappings.bed MTR.fasta assembly.fasta chr1 3 Select specific chromosome region based on MTR mapping. The above command will extract chr1:2,000,001-3,000,000. """ p = OptionParser(nucmer.__doc__) opts, args = p.parse_args(args) if len(args) != 5: sys.exit(not p.print_help()) mapbed, mtrfasta, asmfasta, chr, idx = args idx = int(idx) m1 = 1000000 bedfile = "sample.bed" bed = Bed() bed.add("\t".join(str(x) for x in (chr, (idx - 1) * m1, idx * m1))) bed.print_to_file(bedfile) cmd = "intersectBed -a {0} -b {1} -nonamecheck -sorted | cut -f4".format( mapbed, bedfile) idsfile = "query.ids" sh(cmd, outfile=idsfile) sfasta = fastaFromBed(bedfile, mtrfasta) qfasta = "query.fasta" cmd = "faSomeRecords {0} {1} {2}".format(asmfasta, idsfile, qfasta) sh(cmd) cmd = "nucmer {0} {1}".format(sfasta, qfasta) sh(cmd) mummerplot_main(["out.delta", "--refcov=0"]) sh("mv out.pdf {0}.{1}.pdf".format(chr, idx))
def install(args): """ %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta Install patches into backbone, using sequences from alternative assembly. The patches sequences are generated via jcvi.assembly.patch.fill(). The output is a bedfile that can be converted to AGP using jcvi.formats.agp.frombed(). """ from jcvi.apps.align import blast from jcvi.formats.fasta import SeqIO p = OptionParser(install.__doc__) p.set_rclip(rclip=1) p.add_option( "--maxsize", default=300000, type="int", help="Maximum size of patchers to be replaced", ) p.add_option("--prefix", help="Prefix of the new object") p.add_option( "--strict", default=False, action="store_true", help="Only update if replacement has no gaps", ) opts, args = p.parse_args(args) if len(args) != 4: sys.exit(not p.print_help()) pbed, pfasta, bbfasta, altfasta = args maxsize = opts.maxsize # Max DNA size to replace gap rclip = opts.rclip blastfile = blast([altfasta, pfasta, "--wordsize=100", "--pctid=99"]) order = Bed(pbed).order beforebed, afterbed = blast_to_twobeds( blastfile, order, rclip=rclip, maxsize=maxsize ) beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True) afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True) # Exclude the replacements that contain more Ns than before ah = SeqIO.parse(beforefasta, "fasta") bh = SeqIO.parse(afterfasta, "fasta") count_Ns = lambda x: x.seq.count("n") + x.seq.count("N") exclude = set() for arec, brec in zip(ah, bh): an = count_Ns(arec) bn = count_Ns(brec) if opts.strict: if bn == 0: continue elif bn < an: continue id = arec.id exclude.add(id) logging.debug( "Ignore {0} updates because of decreasing quality.".format(len(exclude)) ) abed = Bed(beforebed, sorted=False) bbed = Bed(afterbed, sorted=False) abed = [x for x in abed if x.accn not in exclude] bbed = [x for x in bbed if x.accn not in exclude] abedfile = "before.filtered.bed" bbedfile = "after.filtered.bed" afbed = Bed() afbed.extend(abed) bfbed = Bed() bfbed.extend(bbed) afbed.print_to_file(abedfile) bfbed.print_to_file(bbedfile) shuffle_twobeds(afbed, bfbed, bbfasta, prefix=opts.prefix)
def pastegenes(args): """ %prog pastegenes coverage.list old.genes.bed new.genes.bed old.assembly Paste in zero or low coverage genes. For a set of neighboring genes missing, add the whole cassette as unplaced scaffolds. For singletons the program will try to make a patch. """ from jcvi.formats.base import DictFile from jcvi.utils.cbook import gene_name p = OptionParser(pastegenes.__doc__) p.add_option( "--cutoff", default=90, type="int", help="Coverage cutoff to call gene missing", ) p.add_option( "--flank", default=2000, type="int", help="Get the seq of size on two ends", ) p.add_option( "--maxsize", default=50000, type="int", help="Maximum size of patchers to be replaced", ) opts, args = p.parse_args(args) if len(args) != 4: sys.exit(not p.print_help()) coveragefile, oldbed, newbed, oldassembly = args cutoff = opts.cutoff flank = opts.flank maxsize = opts.maxsize coverage = DictFile(coveragefile, valuepos=2, cast=float) obed = Bed(oldbed) order = obed.order bed = [x for x in obed if x.accn in coverage] key = lambda x: coverage[x.accn] >= cutoff extrabed = "extra.bed" extendbed = "extend.bed" pastebed = "paste.bed" fw = open(extrabed, "w") fwe = open(extendbed, "w") fwp = open(pastebed, "w") fw_ids = open(extendbed + ".ids", "w") singletons, large, large_genes = 0, 0, 0 for chr, chrbed in groupby(bed, key=lambda x: x.seqid): chrbed = list(chrbed) for good, beds in groupby(chrbed, key=key): if good: continue beds = list(beds) blocksize = len(set([gene_name(x.accn) for x in beds])) if blocksize == 1: singletons += 1 accn = beds[0].accn gi, gb = order[accn] leftb = obed[gi - 1] rightb = obed[gi + 1] leftr = leftb.range rightr = rightb.range cur = gb.range distance_to_left, oo = range_distance(leftr, cur) distance_to_right, oo = range_distance(cur, rightr) span, oo = range_distance(leftr, rightr) if distance_to_left <= distance_to_right and distance_to_left > 0: label = "LEFT" else: label = "RIGHT" if 0 < span <= maxsize: print( "\t".join( str(x) for x in (chr, leftb.start, rightb.end, gb.accn) ), file=fwp, ) print(leftb, file=fwe) print(gb, file=fwe) print(rightb, file=fwe) print( "L:{0} R:{1} [{2}]".format( distance_to_left, distance_to_right, label ), file=fwe, ) print(gb.accn, file=fw_ids) continue large += 1 large_genes += blocksize ranges = [(x.start, x.end) for x in beds] rmin, rmax = range_minmax(ranges) rmin -= flank rmax += flank name = "-".join((beds[0].accn, beds[-1].accn)) print("\t".join(str(x) for x in (chr, rmin - 1, rmax, name)), file=fw) fw.close() fwe.close() extrabed = mergeBed(extrabed, d=flank, nms=True) fastaFromBed(extrabed, oldassembly, name=True) summary([extrabed]) logging.debug("Singleton blocks : {0}".format(singletons)) logging.debug("Large blocks : {0} ({1} genes)".format(large, large_genes))
def install(args): """ %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta Install patches into backbone, using sequences from alternative assembly. The patches sequences are generated via jcvi.assembly.patch.fill(). The output is a bedfile that can be converted to AGP using jcvi.formats.agp.frombed(). """ from jcvi.apps.align import blast from jcvi.formats.fasta import SeqIO p = OptionParser(install.__doc__) p.set_rclip(rclip=1) p.add_option("--maxsize", default=300000, type="int", help="Maximum size of patchers to be replaced [default: %default]") p.add_option("--prefix", help="Prefix of the new object [default: %default]") p.add_option("--strict", default=False, action="store_true", help="Only update if replacement has no gaps [default: %default]") opts, args = p.parse_args(args) if len(args) != 4: sys.exit(not p.print_help()) pbed, pfasta, bbfasta, altfasta = args maxsize = opts.maxsize # Max DNA size to replace gap rclip = opts.rclip blastfile = blast([altfasta, pfasta,"--wordsize=100", "--pctid=99"]) order = Bed(pbed).order beforebed, afterbed = blast_to_twobeds(blastfile, order, rclip=rclip, maxsize=maxsize) beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True) afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True) # Exclude the replacements that contain more Ns than before ah = SeqIO.parse(beforefasta, "fasta") bh = SeqIO.parse(afterfasta, "fasta") count_Ns = lambda x: x.seq.count('n') + x.seq.count('N') exclude = set() for arec, brec in zip(ah, bh): an = count_Ns(arec) bn = count_Ns(brec) if opts.strict: if bn == 0: continue elif bn < an: continue id = arec.id exclude.add(id) logging.debug("Ignore {0} updates because of decreasing quality."\ .format(len(exclude))) abed = Bed(beforebed, sorted=False) bbed = Bed(afterbed, sorted=False) abed = [x for x in abed if x.accn not in exclude] bbed = [x for x in bbed if x.accn not in exclude] abedfile = "before.filtered.bed" bbedfile = "after.filtered.bed" afbed = Bed() afbed.extend(abed) bfbed = Bed() bfbed.extend(bbed) afbed.print_to_file(abedfile) bfbed.print_to_file(bbedfile) shuffle_twobeds(afbed, bfbed, bbfasta, prefix=opts.prefix)
def pastegenes(args): """ %prog pastegenes coverage.list old.genes.bed new.genes.bed old.assembly Paste in zero or low coverage genes. For a set of neighboring genes missing, add the whole cassette as unplaced scaffolds. For singletons the program will try to make a patch. """ from jcvi.formats.base import DictFile from jcvi.utils.cbook import gene_name p = OptionParser(pastegenes.__doc__) p.add_option("--cutoff", default=90, type="int", help="Coverage cutoff to call gene missing [default: %default]") p.add_option("--flank", default=2000, type="int", help="Get the seq of size on two ends [default: %default]") p.add_option("--maxsize", default=50000, type="int", help="Maximum size of patchers to be replaced [default: %default]") opts, args = p.parse_args(args) if len(args) != 4: sys.exit(not p.print_help()) coveragefile, oldbed, newbed, oldassembly = args cutoff = opts.cutoff flank = opts.flank maxsize = opts.maxsize coverage = DictFile(coveragefile, valuepos=2, cast=float) obed = Bed(oldbed) order = obed.order bed = [x for x in obed if x.accn in coverage] key = lambda x: coverage[x.accn] >= cutoff extrabed = "extra.bed" extendbed = "extend.bed" pastebed = "paste.bed" fw = open(extrabed, "w") fwe = open(extendbed, "w") fwp = open(pastebed, "w") fw_ids = open(extendbed + ".ids", "w") singletons, large, large_genes = 0, 0, 0 for chr, chrbed in groupby(bed, key=lambda x: x.seqid): chrbed = list(chrbed) for good, beds in groupby(chrbed, key=key): if good: continue beds = list(beds) blocksize = len(set([gene_name(x.accn) for x in beds])) if blocksize == 1: singletons += 1 accn = beds[0].accn gi, gb = order[accn] leftb = obed[gi - 1] rightb = obed[gi + 1] leftr = leftb.range rightr = rightb.range cur = gb.range distance_to_left, oo = range_distance(leftr, cur) distance_to_right, oo = range_distance(cur, rightr) span, oo = range_distance(leftr, rightr) if distance_to_left <= distance_to_right and \ distance_to_left > 0: label = "LEFT" else: label = "RIGHT" if 0 < span <= maxsize: print >> fwp, "\t".join(str(x) for x in \ (chr, leftb.start, rightb.end, gb.accn)) print >> fwe, leftb print >> fwe, gb print >> fwe, rightb print >> fwe, "L:{0} R:{1} [{2}]".format(distance_to_left, \ distance_to_right, label) print >> fw_ids, gb.accn continue large += 1 large_genes += blocksize ranges = [(x.start, x.end) for x in beds] rmin, rmax = range_minmax(ranges) rmin -= flank rmax += flank name = "-".join((beds[0].accn, beds[-1].accn)) print >> fw, "\t".join(str(x) for x in (chr, rmin - 1, rmax, name)) fw.close() fwe.close() extrabed = mergeBed(extrabed, d=flank, nms=True) fastaFromBed(extrabed, oldassembly, name=True) summary([extrabed]) logging.debug("Singleton blocks : {0}".format(singletons)) logging.debug("Large blocks : {0} ({1} genes)".format(large, large_genes))
def annotate(args): """ %prog annotate agpfile gaps.linkage.bed assembly.fasta Annotate AGP file with linkage info of `paired-end` or `map`. File `gaps.linkage.bed` is generated by assembly.gaps.estimate(). """ from jcvi.formats.agp import AGP, bed, tidy p = OptionParser(annotate.__doc__) p.add_option("--minsize", default=200, help="Smallest component size [default: %default]") opts, args = p.parse_args(args) if len(args) != 3: sys.exit(not p.print_help()) agpfile, linkagebed, assemblyfasta = args linkagebed = Bed(linkagebed) spannedgaps = set() for b in linkagebed: score = int(b.score) if score == 0: spannedgaps.add((b.accn, b.start, b.end)) agp = AGP(agpfile) newagpfile = agpfile.rsplit(".", 1)[0] + ".linkage.agp" newagp = open(newagpfile, "w") contig_id = 0 minsize = opts.minsize for a in agp: if not a.is_gap: cs = a.component_span if cs < minsize: a.is_gap = True a.component_type = "N" a.gap_length = cs a.gap_type = "scaffold" a.linkage = "yes" a.linkage_evidence = [] else: contig_id += 1 a.component_id = "contig{0:04d}".format(contig_id) a.component_beg = 1 a.component_end = cs a.component_type = "W" print >> newagp, a continue gapinfo = (a.object, a.object_beg, a.object_end) gaplen = a.gap_length if gaplen == 100 and gapinfo not in spannedgaps: a.component_type = "U" tag = "map" else: tag = "paired-ends" a.linkage_evidence.append(tag) print >> newagp, a newagp.close() logging.debug("Annotated AGP written to `{0}`.".format(newagpfile)) contigbed = assemblyfasta.rsplit(".", 1)[0] + ".contigs.bed" bedfile = bed([newagpfile, "--nogaps", "--outfile=" + contigbed]) contigfasta = fastaFromBed(bedfile, assemblyfasta, name=True, stranded=True) tidy([newagpfile, contigfasta])
def install(args): """ %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta Install patches into backbone, using sequences from alternative assembly. The patches sequences are generated via jcvi.assembly.patch.fill(). The output is a bedfile that can be converted to AGP using jcvi.formats.agp.frombed(). """ from jcvi.apps.base import blast from jcvi.formats.blast import BlastSlow from jcvi.formats.fasta import SeqIO from jcvi.utils.iter import roundrobin p = OptionParser(install.__doc__) p.add_option( "--rclip", default=1, type="int", help="Pair ID is derived from rstrip N chars [default: %default]") p.add_option( "--maxsize", default=1000000, type="int", help="Maximum size of patchers to be replaced [default: %default]") p.add_option("--prefix", help="Prefix of the new object [default: %default]") p.add_option( "--strict", default=False, action="store_true", help="Only update if replacement has no gaps [default: %default]") opts, args = p.parse_args(args) if len(args) != 4: sys.exit(not p.print_help()) pbed, pfasta, bbfasta, altfasta = args Max = opts.maxsize # Max DNA size to replace gap rclip = opts.rclip prefix = opts.prefix blastfile = blast([altfasta, pfasta, "--wordsize=100", "--pctid=99"]) order = Bed(pbed).order beforebed, afterbed = "before.bed", "after.bed" fwa = open(beforebed, "w") fwb = open(afterbed, "w") key1 = lambda x: x.query key2 = lambda x: x.query[:-rclip] if rclip else key1 data = BlastSlow(blastfile) for pe, lines in groupby(data, key=key2): lines = list(lines) if len(lines) != 2: continue a, b = lines aquery, bquery = a.query, b.query asubject, bsubject = a.subject, b.subject if asubject != bsubject: continue astrand, bstrand = a.orientation, b.orientation assert aquery[-1] == 'L' and bquery[-1] == 'R', str((aquery, bquery)) ai, ax = order[aquery] bi, bx = order[bquery] qstart, qstop = ax.start + a.qstart - 1, bx.start + b.qstop - 1 if astrand == '+' and bstrand == '+': sstart, sstop = a.sstart, b.sstop elif astrand == '-' and bstrand == '-': sstart, sstop = b.sstart, a.sstop else: continue if sstart > sstop: continue if sstop > sstart + Max: continue name = aquery[:-1] + "LR" print >> fwa, "\t".join(str(x) for x in \ (ax.seqid, qstart - 1, qstop, name, 1000, "+")) print >> fwb, "\t".join(str(x) for x in \ (asubject, sstart - 1, sstop, name, 1000, astrand)) fwa.close() fwb.close() beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True) afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True) # Exclude the replacements that contain more Ns than before ah = SeqIO.parse(beforefasta, "fasta") bh = SeqIO.parse(afterfasta, "fasta") count_Ns = lambda x: x.seq.count('n') + x.seq.count('N') exclude = set() for arec, brec in zip(ah, bh): an = count_Ns(arec) bn = count_Ns(brec) if opts.strict: if bn == 0: continue elif bn < an: continue id = arec.id exclude.add(id) logging.debug("Ignore {0} updates because of decreasing quality."\ .format(len(exclude))) abed = Bed(beforebed, sorted=False) bbed = Bed(afterbed, sorted=False) abed = [x for x in abed if x.accn not in exclude] bbed = [x for x in bbed if x.accn not in exclude] abedfile = "before.filtered.bed" bbedfile = "after.filtered.bed" afbed = Bed() afbed.extend(abed) bfbed = Bed() bfbed.extend(bbed) afbed.print_to_file(abedfile) bfbed.print_to_file(bbedfile) # Shuffle the two bedfiles together sz = Sizes(bbfasta) sizes = sz.mapping shuffled = "shuffled.bed" border = bfbed.order all = [] afbed.sort(key=afbed.nullkey) totalids = len(sizes) import math pad = int(math.log10(totalids)) + 1 cj = 0 seen = set() accn = lambda x: "{0}{1:0{2}d}".format(prefix, x, pad) for seqid, aa in afbed.sub_beds(): cj += 1 abeds, bbeds, beds = [], [], [] size = sizes[seqid] ranges = [(x.seqid, x.start, x.end) for x in aa] cranges = range_interleave(ranges, sizes={seqid: size}) for seqid, start, end in cranges: bedline = "\t".join(str(x) for x in (seqid, start - 1, end)) abeds.append(BedLine(bedline)) for a in aa: gapid = a.accn bi, b = border[gapid] bbeds.append(b) a = abeds[0] if abeds else [] assert abs(len(abeds) - len(bbeds)) <= 1 if (not a) or a.start > 1: abeds, bbeds = bbeds, abeds beds = list(roundrobin(abeds, bbeds)) if prefix: for b in beds: b.accn = accn(cj) all.extend(beds) seen.add(seqid) # Singletons for seqid, size in sz.iter_sizes(): if seqid in seen: continue bedline = "\t".join(str(x) for x in (seqid, 0, size, accn(cj))) b = BedLine(bedline) cj += 1 if prefix: b.accn = accn(cj) all.append(b) shuffledbed = Bed() shuffledbed.extend(all) shuffledbed.print_to_file(shuffled)