def bincount(args):
    """
    %prog bincount fastafile binfile

    Count K-mers in the bin.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so it is kept in `%prog ...` form.
    from bitarray import bitarray
    from jcvi.formats.sizes import Sizes

    p = OptionParser(bincount.__doc__)
    p.add_option("-K", default=23, type="int",
                 help="K-mer size [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, binfile = args
    K = opts.K

    # bitarray.fromfile() consumes raw bytes: open the bin file in binary mode
    # and release the handle as soon as the array has been filled.
    a = bitarray()
    with open(binfile, "rb") as fp:
        a.fromfile(fp)

    f = Sizes(fastafile)
    tsize = 0  # running offset into the bit array
    fw = must_open(opts.outfile, "w")
    for name, seqlen in f.iter_sizes():
        # Each sequence takes the next `seqlen - K + 1` bits of the array
        # (presumably one bit per K-mer start position -- confirm against the
        # code that writes the bin file).
        ksize = seqlen - K + 1
        b = a[tsize:tsize + ksize]
        bcount = b.count()
        # fw.write() works on Python 2 and 3; `print >> fw` is Python-2 only.
        fw.write("\t".join(str(x) for x in (name, bcount)) + "\n")
        tsize += ksize
def pasteprepare(args):
    """
    %prog pasteprepare bacs.fasta

    Prepare sequences for paste.
    """
    # NOTE: the docstring above is also the usage text given to OptionParser.
    p = OptionParser(pasteprepare.__doc__)
    p.add_option(
        "--flank",
        default=5000,
        type="int",
        help="Get the seq of size on two ends",
    )
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (goodfasta,) = args
    flank = opts.flank
    pf = goodfasta.rsplit(".", 1)[0]
    extbed = pf + ".ext.bed"

    sizes = Sizes(goodfasta)
    # The context manager guarantees the BED file is flushed and closed before
    # fastaFromBed() reads it, even if iterating the sizes raises.
    with open(extbed, "w") as fw:
        for bac, size in sizes.iter_sizes():
            # Two BED rows per sequence: the left flank (suffix "L") and the
            # right flank (suffix "R"), each clipped to the sequence bounds.
            left = (bac, 0, min(flank, size), bac + "L")
            right = (bac, max(size - flank, 0), size, bac + "R")
            for row in (left, right):
                print("\t".join(str(x) for x in row), file=fw)

    fastaFromBed(extbed, goodfasta, name=True)
def pasteprepare(args):
    """
    %prog pasteprepare bacs.fasta

    Prepare sequences for paste.
    """
    # NOTE: the docstring above is also the usage text given to OptionParser.
    p = OptionParser(pasteprepare.__doc__)
    p.add_option("--flank", default=5000, type="int",
                 help="Get the seq of size on two ends [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    goodfasta, = args
    flank = opts.flank
    pf = goodfasta.rsplit(".", 1)[0]
    extbed = pf + ".ext.bed"

    sizes = Sizes(goodfasta)
    # The context manager guarantees the BED file is flushed and closed before
    # fastaFromBed() reads it, even if iterating the sizes raises.
    with open(extbed, "w") as fw:
        for bac, size in sizes.iter_sizes():
            # Two BED rows per sequence: the left flank (suffix "L") and the
            # right flank (suffix "R"), each clipped to the sequence bounds.
            left = (bac, 0, min(flank, size), bac + "L")
            right = (bac, max(size - flank, 0), size, bac + "R")
            for row in (left, right):
                # fw.write() behaves the same on Python 2 and 3, unlike the
                # Python-2-only `print >> fw` chevron form.
                fw.write("\t".join(str(x) for x in row) + "\n")

    fastaFromBed(extbed, goodfasta, name=True)
def bincount(args):
    """
    %prog bincount fastafile binfile

    Count K-mers in the bin.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so it is kept in `%prog ...` form.
    from bitarray import bitarray
    from jcvi.formats.sizes import Sizes

    p = OptionParser(bincount.__doc__)
    p.add_option("-K", default=23, type="int", help="K-mer size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, binfile = args
    K = opts.K

    # bitarray.fromfile() consumes raw bytes, so a text-mode handle fails on
    # Python 3. Open in binary mode and close the handle once it is read.
    a = bitarray()
    with open(binfile, "rb") as fp:
        a.fromfile(fp)

    f = Sizes(fastafile)
    tsize = 0  # running offset into the bit array
    fw = must_open(opts.outfile, "w")
    for name, seqlen in f.iter_sizes():
        # Each sequence takes the next `seqlen - K + 1` bits of the array
        # (presumably one bit per K-mer start position -- confirm against the
        # code that writes the bin file).
        ksize = seqlen - K + 1
        b = a[tsize:tsize + ksize]
        bcount = b.count()
        print("\t".join(str(x) for x in (name, bcount)), file=fw)
        tsize += ksize
def bed(args):
    """
    %prog bed binfile fastafile

    Write bed files where the bases have at least certain depth.
    """
    # NOTE: the docstring above is also the usage text given to OptionParser.
    p = OptionParser(bed.__doc__)
    p.add_option("-o", dest="output", default="stdout", help="Output file name")
    p.add_option("--cutoff", dest="cutoff", default=10, type="int",
                 help="Minimum read depth to report intervals")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    binfile, fastafile = args
    fw = must_open(opts.output, "w")
    cutoff = opts.cutoff
    assert cutoff >= 0, "Need non-negative cutoff"

    depth_array = BinFile(binfile).array
    # Only the per-contig offsets are needed from the three returned values.
    _fastasize, _sizes, offsets = get_offsets(fastafile)

    def deep_enough(indexed_depth):
        """True when the (position, depth) pair meets the cutoff."""
        return indexed_depth[1] >= cutoff

    for ctg, ctglen in Sizes(fastafile).iter_sizes():
        begin = offsets[ctg]
        depths = depth_array[begin:begin + ctglen]

        # groupby yields maximal runs of consecutive positions that are all
        # above (or all below) the cutoff; only the former are reported.
        for passing, run in groupby(enumerate(depths), key=deep_enough):
            run = list(run)
            if not passing:
                continue

            # enumerate() positions are 0-based, so the first index is already
            # the BED start and last index + 1 is the exclusive BED end.
            first = run[0][0]
            last = run[-1][0]
            mean_depth = int(sum([depth for _, depth in run]) / len(run))

            fields = (ctg, first, last + 1, "na", mean_depth)
            print("\t".join(str(x) for x in fields), file=fw)
def __init__(self, filename, fastafile):
    # The parent class receives `filename`; `fastafile` only supplies sizes.
    super(EvidenceFile, self).__init__(filename)

    sz = Sizes(fastafile)
    # Slot 0 is a placeholder because the tig-list starts at 1.
    self.sizes = [None] + [(name, size) for name, size in sz.iter_sizes()]
    self.sz = sz.mapping
    self.scf = {}
def scaffold(args):
    """
    %prog scaffold scaffold.fasta synteny.blast synteny.sizes synteny.bed
                         physicalmap.blast physicalmap.sizes physicalmap.bed

    As evaluation of scaffolding, visualize external line of evidences:
    * Plot synteny to an external genome
    * Plot alignments to physical map
    * Plot alignments to genetic map (TODO)

    Each trio defines one panel to be plotted. blastfile defines the matchings
    between the evidences vs scaffolds. Then the evidence sizes, and evidence
    bed to plot dot plots.

    This script will plot a dot in the dot plot in the corresponding location
    the plots are one contig/scaffold per plot.
    """
    # NOTE: the docstring above is also the usage text given to OptionParser.
    from jcvi.graphics.base import set_image_options
    from jcvi.utils.iter import grouper

    p = OptionParser(scaffold.__doc__)
    p.add_option("--cutoff", type="int", default=1000000,
                 help="Plot scaffolds with size larger than [default: %default]")
    p.add_option("--highlights",
                 help="A set of regions in BED format to highlight [default: %default]")
    opts, args, iopts = set_image_options(p, args, figsize="14x8", dpi=150)

    # One scaffold FASTA followed by any number of (blast, sizes, bed) trios.
    if len(args) < 4 or len(args) % 3 != 1:
        sys.exit(not p.print_help())

    highlights = opts.highlights
    scafsizes = Sizes(args[0])
    trios = list(grouper(3, args[1:]))
    trios = [(a, Sizes(b), Bed(c)) for a, b, c in trios]
    hlbed = Bed(highlights) if highlights else None

    for scaffoldID, scafsize in scafsizes.iter_sizes():
        if scafsize < opts.cutoff:
            continue
        logging.debug("Loading {0} (size={1})".format(scaffoldID,
                      thousands(scafsize)))

        # Sizes() is built from a file, so write a one-line sizes file for
        # this scaffold, load it, then let Sizes remove it again.
        tmpname = scaffoldID + ".sizes"
        with open(tmpname, "w") as tmp:
            tmp.write("{0}\t{1}".format(scaffoldID, scafsize))

        tmpsizes = Sizes(tmpname)
        tmpsizes.close(clean=True)

        # Previously `subhighlights` was only bound when --highlights was
        # given, so running without it raised NameError on the call below.
        # NOTE(review): None is assumed to mean "no highlights" for
        # plot_one_scaffold -- confirm against its signature.
        subhighlights = None
        if highlights:
            subhighlights = list(hlbed.sub_bed(scaffoldID))

        imagename = ".".join((scaffoldID, opts.format))
        plot_one_scaffold(scaffoldID, tmpsizes, None, trios, imagename, iopts,
                          highlights=subhighlights)
def graph_to_agp(g, blastfile, subjectfasta, exclude=[], verbose=False):
    """
    Write the paths of graph `g` to the AGP file `<blastfile>.agp`.

    The graph is first dumped to `graph.txt`. Every path with more than one
    component becomes an object named `pmol_NNNN`. Every sequence in
    `subjectfasta` that is neither part of such a path nor listed in
    `exclude` is written as its own object in "+" orientation.

    Args:
        g: graph providing `write()`, `iter_paths()` and `path()`.
        blastfile: name used to derive the output AGP file name.
        subjectfasta: passed to `Sizes` to obtain contig names and lengths.
        exclude: contig ids to leave out of the singleton section. The
            default list is never mutated here.
        verbose: when true, print each retained path to stdout.
    """
    from jcvi.formats.agp import order_to_agp

    logging.debug(str(g))
    g.write("graph.txt")
    # g.draw("graph.pdf")

    paths = []
    for path in g.iter_paths():
        m, oo = g.path(path)
        if len(oo) == 1:  # Singleton path
            continue
        paths.append(oo)
        if verbose:
            print(m)
            print(oo)

    npaths = len(paths)
    ntigs = sum(len(x) for x in paths)
    logging.debug(
        "Graph decomposed to {0} paths with {1} components.".format(npaths, ntigs)
    )

    agpfile = blastfile + ".agp"
    sizes = Sizes(subjectfasta)
    # Build the lookup once so each membership test below is O(1).
    excluded = set(exclude)
    scaffolded = set()
    nsingletons = nscaffolded = nexcluded = 0

    # The context manager closes the AGP file even if order_to_agp() raises.
    with open(agpfile, "w") as fwagp:
        for i, oo in enumerate(paths):
            ctgorder = [(str(ctg), ("+" if strand else "-")) for ctg, strand in oo]
            scaffolded.update(ctg for ctg, strand in ctgorder)
            name = "pmol_{0:04d}".format(i)
            order_to_agp(name, ctgorder, sizes.mapping, fwagp)

        # Get the singletons as well
        for ctg, size in sizes.iter_sizes():
            if ctg in scaffolded:
                nscaffolded += 1
                continue
            if ctg in excluded:
                nexcluded += 1
                continue

            order_to_agp(ctg, [(ctg, "+")], sizes.mapping, fwagp)
            nsingletons += 1

    logging.debug(
        "scaffolded={} excluded={} singletons={}".format(
            nscaffolded, nexcluded, nsingletons
        )
    )
    logging.debug("AGP file written to `{0}`.".format(agpfile))
def graph_to_agp(g, blastfile, subjectfasta, exclude=[], verbose=False):
    """
    Write the paths of graph `g` to the AGP file `<blastfile>.agp`.

    The graph is first dumped to `graph.txt`. Every path with more than one
    component becomes an object named `pmol_NNNN`. Every sequence in
    `subjectfasta` that is neither part of such a path nor listed in
    `exclude` is written as its own object in "+" orientation.

    Args:
        g: graph providing `write()`, `iter_paths()` and `path()`.
        blastfile: name used to derive the output AGP file name.
        subjectfasta: passed to `Sizes` to obtain contig names and lengths.
        exclude: contig ids to leave out of the singleton section. The
            default list is never mutated here.
        verbose: when true, print each retained path to stdout.
    """
    from jcvi.formats.agp import order_to_agp

    logging.debug(str(g))
    g.write("graph.txt")
    #g.draw("graph.pdf")

    paths = []
    for path in g.iter_paths():
        m, oo = g.path(path)
        if len(oo) == 1:  # Singleton path
            continue
        paths.append(oo)
        if verbose:
            # Single-argument print(...) behaves identically on Python 2
            # and 3; the bare `print m` statement is a SyntaxError on 3.
            print(m)
            print(oo)

    npaths = len(paths)
    ntigs = sum(len(x) for x in paths)
    logging.debug("Graph decomposed to {0} paths with {1} components.".\
                  format(npaths, ntigs))

    agpfile = blastfile + ".agp"
    sizes = Sizes(subjectfasta)
    # Build the lookup once so each membership test below is O(1).
    excluded = set(exclude)
    scaffolded = set()
    nsingletons = nscaffolded = nexcluded = 0

    # The context manager closes the AGP file even if order_to_agp() raises.
    with open(agpfile, "w") as fwagp:
        for i, oo in enumerate(paths):
            ctgorder = [(str(ctg), ("+" if strand else "-")) \
                        for ctg, strand in oo]
            scaffolded.update(ctg for ctg, strand in ctgorder)
            name = "pmol_{0:04d}".format(i)
            order_to_agp(name, ctgorder, sizes.mapping, fwagp)

        # Get the singletons as well
        for ctg, size in sizes.iter_sizes():
            if ctg in scaffolded:
                nscaffolded += 1
                continue
            if ctg in excluded:
                nexcluded += 1
                continue

            order_to_agp(ctg, [(ctg, "+")], sizes.mapping, fwagp)
            nsingletons += 1

    logging.debug("scaffolded={} excluded={} singletons={}".\
                  format(nscaffolded, nexcluded, nsingletons))
    logging.debug("AGP file written to `{0}`.".format(agpfile))
def covlen(args):
    """
    %prog covlen covfile fastafile

    Plot coverage vs length. `covfile` is two-column listing contig id and
    depth of coverage.
    """
    # NOTE: the docstring above is also the usage text given to OptionParser.
    import numpy as np
    import pandas as pd
    import seaborn as sns
    from jcvi.formats.base import DictFile

    p = OptionParser(covlen.__doc__)
    p.add_option("--maxsize", default=1000000, type="int",
                 help="Max contig size")
    # The help text used to repeat "Max contig size"; this option caps the
    # depth of coverage, not the contig length.
    p.add_option("--maxcov", default=100, type="int",
                 help="Max depth of coverage")
    p.add_option("--color", default='m', help="Color of the data points")
    p.add_option("--kind", default="scatter",
                 choices=("scatter", "reg", "resid", "kde", "hex"),
                 help="Kind of plot to draw")
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 2:
        sys.exit(not p.print_help())

    covfile, fastafile = args
    cov = DictFile(covfile, cast=float)
    s = Sizes(fastafile)
    data = []
    maxsize, maxcov = opts.maxsize, opts.maxcov
    for ctg, size in s.iter_sizes():
        # Contigs missing from the covfile are plotted with zero coverage.
        c = cov.get(ctg, 0)
        if size > maxsize:
            continue
        if c > maxcov:
            continue
        data.append((size, c))

    # NOTE(review): zip(*data) cannot be unpacked when every contig was
    # filtered out, so an empty `data` raises ValueError here.
    x, y = zip(*data)
    x = np.array(x)
    y = np.array(y)
    logging.debug("X size {0}, Y size {1}".format(x.size, y.size))

    df = pd.DataFrame()
    xlab, ylab = "Length", "Coverage of depth (X)"
    df[xlab] = x
    df[ylab] = y
    sns.jointplot(xlab, ylab, kind=opts.kind, data=df,
                  xlim=(0, maxsize), ylim=(0, maxcov),
                  stat_func=None, edgecolor="w", color=opts.color)

    figname = covfile + ".pdf"
    savefig(figname, dpi=iopts.dpi, iopts=iopts)
def bed(args):
    """
    %prog bed binfile fastafile

    Write bed files where the bases have at least certain depth.
    """
    # NOTE: the docstring above is also the usage text given to OptionParser.
    p = OptionParser(bed.__doc__)
    p.add_option("-o", dest="output", default="stdout",
                 help="Output file name [default: %default]")
    p.add_option("--cutoff", dest="cutoff", default=10, type="int",
                 help="Minimum read depth to report intervals [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    binfile, fastafile = args
    fw = must_open(opts.output, "w")
    cutoff = opts.cutoff
    assert cutoff >= 0, "Need non-negative cutoff"

    b = BinFile(binfile)
    ar = b.array
    # Only the per-contig offsets are needed from the three returned values.
    _fastasize, _sizes, offsets = get_offsets(fastafile)
    s = Sizes(fastafile)

    # Loop-invariant: build the grouping key once instead of per contig.
    key = lambda x: x[1] >= cutoff

    for ctg, ctglen in s.iter_sizes():
        offset = offsets[ctg]
        subarray = ar[offset:offset + ctglen]
        # groupby yields maximal runs of consecutive positions that are all
        # above (or all below) the cutoff; only the former are reported.
        for tf, array_elements in groupby(enumerate(subarray), key=key):
            array_elements = list(array_elements)
            if not tf:
                continue

            # 0-based system => 1-based system
            start = array_elements[0][0] + 1
            end = array_elements[-1][0] + 1

            mean_depth = sum([x[1] for x in array_elements]) / \
                len(array_elements)
            mean_depth = int(mean_depth)

            name = "na"
            # fw.write() works on Python 2 and 3; `print >> fw` is
            # Python-2 only and writes nothing under Python 3.
            fields = (ctg, start - 1, end, name, mean_depth)
            fw.write("\t".join(str(x) for x in fields) + "\n")
def covlen(args):
    """
    %prog covlen covfile fastafile

    Plot coverage vs length. `covfile` is two-column listing contig id and
    depth of coverage.
    """
    # NOTE: the docstring above is also the usage text given to OptionParser
    # (the "lenght" typo in it has been corrected).
    import numpy as np
    import seaborn as sns
    from jcvi.formats.base import DictFile

    p = OptionParser(covlen.__doc__)
    p.add_option("--maxsize", default=100000, type="int",
                 help="Max contig size")
    # The help text used to repeat "Max contig size"; this option caps the
    # depth of coverage, not the contig length.
    p.add_option("--maxcov", default=100, type="int",
                 help="Max depth of coverage")
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 2:
        sys.exit(not p.print_help())

    covfile, fastafile = args
    cov = DictFile(covfile, cast=float)
    s = Sizes(fastafile)
    data = []
    maxsize, maxcov = opts.maxsize, opts.maxcov
    for ctg, size in s.iter_sizes():
        # A contig absent from the covfile used to abort the run with
        # KeyError; treat it as zero coverage instead.
        # NOTE(review): assumes DictFile offers dict-style .get() -- confirm.
        c = cov.get(ctg, 0)
        if size > maxsize:
            continue
        if c > maxcov:
            continue
        data.append((size, c))

    # NOTE(review): zip(*data) cannot be unpacked when every contig was
    # filtered out, so an empty `data` raises ValueError here.
    x, y = zip(*data)
    x = np.array(x)
    y = np.array(y)
    logging.debug("X size {0}, Y size {1}".format(x.size, y.size))
    sns.jointplot(x, y, kind="kde")

    figname = covfile + ".pdf"
    savefig(figname, dpi=iopts.dpi, iopts=iopts)
def shuffle_twobeds(afbed, bfbed, bbfasta, prefix=None):
    """
    Shuffle the two bedfiles together.

    For each sequence in `afbed`, the intervals produced by
    `range_interleave` around its features are alternated with the matching
    `bfbed` features, looked up by accession through `bfbed.order`.
    Sequences listed in `bbfasta` that never appear in `afbed` are appended
    as whole-sequence features. The result is written to `shuffled.bed`
    and returned.

    Args:
        afbed: bed collection of the regions being replaced (sorted in place).
        bfbed: bed collection of the replacements, keyed by the same accns.
        bbfasta: passed to `Sizes` for sequence names and lengths.
        prefix: when truthy, every feature of a sequence is renamed to
            `prefix` followed by a zero-padded running counter.

    Returns:
        The combined `Bed` object.
    """
    sz = Sizes(bbfasta)
    sizes = sz.mapping
    shuffled = "shuffled.bed"
    border = bfbed.order

    merged = []
    afbed.sort(key=afbed.nullkey)
    totalids = len(sizes)
    # Width of the zero-padded counter = number of decimal digits in the
    # sequence count. len(str(n)) matches int(log10(n)) + 1 for n >= 1 and
    # still works for n == 0, where math.log10 raises ValueError.
    pad = len(str(totalids))
    cj = 0
    seen = set()
    accn = lambda x: "{0}{1:0{2}d}".format(prefix, x, pad)

    for seqid, aa in afbed.sub_beds():
        cj += 1
        abeds, bbeds, beds = [], [], []
        size = sizes[seqid]
        ranges = [(x.seqid, x.start, x.end) for x in aa]
        cranges = range_interleave(ranges, sizes={seqid: size}, empty=True)
        for crange in cranges:
            if crange:
                seqid, start, end = crange
                bedline = "\t".join(str(x) for x in (seqid, start - 1, end))
                abeds.append(BedLine(bedline))
            else:
                # Keep a placeholder so abeds and bbeds stay in lockstep.
                abeds.append(None)

        for a in aa:
            gapid = a.accn
            bi, b = border[gapid]
            if a.strand == '-':
                # Flip the replacement's strand, in both places it is stored.
                b.extra[1] = b.strand = ('-' if b.strand == '+' else '+')

            bbeds.append(b)

        n_abeds = len(abeds)
        n_bbeds = len(bbeds)
        assert n_abeds - n_bbeds == 1, \
            "abeds: {0}, bbeds: {1}".format(n_abeds, n_bbeds)

        # Alternate a, b, a, b, ..., a and drop the empty placeholders.
        beds = [x for x in roundrobin(abeds, bbeds) if x]
        if prefix:
            for b in beds:
                b.accn = accn(cj)

        merged.extend(beds)
        seen.add(seqid)

    # Singletons
    for seqid, size in sz.iter_sizes():
        if seqid in seen:
            continue

        bedline = "\t".join(str(x) for x in (seqid, 0, size, accn(cj)))
        b = BedLine(bedline)

        cj += 1
        if prefix:
            b.accn = accn(cj)

        merged.append(b)

    shuffledbed = Bed()
    shuffledbed.extend(merged)
    shuffledbed.print_to_file(shuffled)

    return shuffledbed
def install(args):
    """
    %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta

    Install patches into backbone, using sequences from alternative assembly.
    The patches sequences are generated via jcvi.assembly.patch.fill().

    The output is a bedfile that can be converted to AGP using
    jcvi.formats.agp.frombed().
    """
    # NOTE: the docstring above is also the usage text given to OptionParser.
    from jcvi.apps.base import blast
    from jcvi.formats.blast import BlastSlow
    from jcvi.formats.fasta import SeqIO
    from jcvi.utils.iter import roundrobin

    p = OptionParser(install.__doc__)
    p.add_option("--rclip", default=1, type="int",
                 help="Pair ID is derived from rstrip N chars [default: %default]")
    p.add_option("--maxsize", default=1000000, type="int",
                 help="Maximum size of patchers to be replaced [default: %default]")
    p.add_option("--prefix",
                 help="Prefix of the new object [default: %default]")
    p.add_option("--strict", default=False, action="store_true",
                 help="Only update if replacement has no gaps [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    pbed, pfasta, bbfasta, altfasta = args
    Max = opts.maxsize  # Max DNA size to replace gap
    rclip = opts.rclip
    prefix = opts.prefix

    blastfile = blast([altfasta, pfasta, "--wordsize=100", "--pctid=99"])
    order = Bed(pbed).order
    beforebed, afterbed = "before.bed", "after.bed"

    def key1(x):
        return x.query

    def key2(x):
        # With rclip == 0 the old lambda returned the *function* key1 rather
        # than key1(x), so every record collapsed into one group.
        return x.query[:-rclip] if rclip else key1(x)

    data = BlastSlow(blastfile)

    # Both BED files are closed (and flushed) before fastaFromBed reads them.
    with open(beforebed, "w") as fwa, open(afterbed, "w") as fwb:
        for pe, lines in groupby(data, key=key2):
            lines = list(lines)
            # Only pairs (left + right flank of one patcher) are usable.
            if len(lines) != 2:
                continue

            a, b = lines

            aquery, bquery = a.query, b.query
            asubject, bsubject = a.subject, b.subject
            if asubject != bsubject:
                continue

            astrand, bstrand = a.orientation, b.orientation
            assert aquery[-1] == 'L' and bquery[-1] == 'R', \
                str((aquery, bquery))

            ai, ax = order[aquery]
            bi, bx = order[bquery]
            qstart, qstop = ax.start + a.qstart - 1, bx.start + b.qstop - 1

            # Both flanks must hit the subject in the same orientation.
            if astrand == '+' and bstrand == '+':
                sstart, sstop = a.sstart, b.sstop
            elif astrand == '-' and bstrand == '-':
                sstart, sstop = b.sstart, a.sstop
            else:
                continue

            if sstart > sstop:
                continue

            if sstop > sstart + Max:
                continue

            name = aquery[:-1] + "LR"
            # fw.write() works on Python 2 and 3, unlike `print >> fw`.
            fwa.write("\t".join(str(x) for x in
                      (ax.seqid, qstart - 1, qstop, name, 1000, "+")) + "\n")
            fwb.write("\t".join(str(x) for x in
                      (asubject, sstart - 1, sstop, name, 1000, astrand)) + "\n")

    beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True)
    afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True)

    # Exclude the replacements that contain more Ns than before
    ah = SeqIO.parse(beforefasta, "fasta")
    bh = SeqIO.parse(afterfasta, "fasta")
    count_Ns = lambda x: x.seq.count('n') + x.seq.count('N')
    exclude = set()
    for arec, brec in zip(ah, bh):
        an = count_Ns(arec)
        bn = count_Ns(brec)
        if opts.strict:
            # Strict mode keeps only replacements that are gap-free.
            if bn == 0:
                continue

        elif bn < an:
            continue

        exclude.add(arec.id)

    logging.debug("Ignore {0} updates because of decreasing quality."\
                    .format(len(exclude)))

    abed = Bed(beforebed, sorted=False)
    bbed = Bed(afterbed, sorted=False)
    abed = [x for x in abed if x.accn not in exclude]
    bbed = [x for x in bbed if x.accn not in exclude]

    abedfile = "before.filtered.bed"
    bbedfile = "after.filtered.bed"
    afbed = Bed()
    afbed.extend(abed)
    bfbed = Bed()
    bfbed.extend(bbed)

    afbed.print_to_file(abedfile)
    bfbed.print_to_file(bbedfile)

    # Shuffle the two bedfiles together
    sz = Sizes(bbfasta)
    sizes = sz.mapping
    shuffled = "shuffled.bed"
    border = bfbed.order

    merged = []
    afbed.sort(key=afbed.nullkey)
    totalids = len(sizes)
    import math
    pad = int(math.log10(totalids)) + 1
    cj = 0
    seen = set()
    accn = lambda x: "{0}{1:0{2}d}".format(prefix, x, pad)

    for seqid, aa in afbed.sub_beds():
        cj += 1
        abeds, bbeds, beds = [], [], []
        size = sizes[seqid]
        ranges = [(x.seqid, x.start, x.end) for x in aa]
        cranges = range_interleave(ranges, sizes={seqid: size})
        for seqid, start, end in cranges:
            bedline = "\t".join(str(x) for x in (seqid, start - 1, end))
            abeds.append(BedLine(bedline))

        for a in aa:
            gapid = a.accn
            bi, b = border[gapid]
            bbeds.append(b)

        a = abeds[0] if abeds else []
        assert abs(len(abeds) - len(bbeds)) <= 1
        # If the sequence does not open with a kept interval, the replacement
        # must come first in the alternation.
        if (not a) or a.start > 1:
            abeds, bbeds = bbeds, abeds

        beds = list(roundrobin(abeds, bbeds))
        if prefix:
            for b in beds:
                b.accn = accn(cj)

        merged.extend(beds)
        seen.add(seqid)

    # Singletons
    for seqid, size in sz.iter_sizes():
        if seqid in seen:
            continue

        bedline = "\t".join(str(x) for x in (seqid, 0, size, accn(cj)))
        b = BedLine(bedline)

        cj += 1
        if prefix:
            b.accn = accn(cj)

        merged.append(b)

    shuffledbed = Bed()
    shuffledbed.extend(merged)
    shuffledbed.print_to_file(shuffled)
def stack(args):
    """
    %prog stack fastafile

    Create landscape plots that show the amounts of genic sequences, and repetitive
    sequences along the chromosomes.
    """
    # NOTE: the docstring above is also the usage text given to OptionParser.
    p = OptionParser(stack.__doc__)
    p.add_option("--top", default=10, type="int", help="Draw the first N chromosomes")
    p.add_option(
        "--stacks",
        default="Exons,Introns,DNA_transposons,Retrotransposons",
        help="Features to plot in stackplot",
    )
    p.add_option("--switch", help="Change chr names based on two-column file")
    add_window_options(p)
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile,) = args
    top = opts.top
    window, shift, subtract, merge = check_window_options(opts)
    switch = opts.switch
    if switch:
        switch = DictFile(opts.switch)

    stacks = opts.stacks.split(",")
    bedfiles = get_beds(stacks)
    binfiles = get_binfiles(bedfiles, fastafile, shift, subtract=subtract, merge=merge)

    sizes = Sizes(fastafile)
    s = list(sizes.iter_sizes())[:top]
    maxl = max(x[1] for x in s)
    margin = 0.08
    inner = 0.02  # y distance between tracks

    pf = fastafile.rsplit(".", 1)[0]
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])

    # Gauge
    ratio = draw_gauge(root, margin, maxl)

    # Per chromosome
    yinterval = (1 - 2 * margin) / (top + 1)
    xx = margin
    yy = 1 - margin
    for seqid, clen in s:
        yy -= yinterval
        xlen = clen / ratio
        cc = seqid
        if "_" in seqid:
            # e.g. "abc_12" is labelled "A12"
            ca, cb = seqid.split("_")
            cc = ca[0].upper() + cb

        if switch and cc in switch:
            cc = "\n".join((cc, "({0})".format(switch[cc])))

        root.add_patch(Rectangle((xx, yy), xlen, yinterval - inner, color=gray))
        ax = fig.add_axes([xx, yy, xlen, yinterval - inner])

        # Number of windows = ceil(clen / shift). Floor division keeps the
        # count an integer; plain `/` is true division on Python 3 and gave
        # a fractional bin count.
        nbins = clen // shift
        if clen % shift:
            nbins += 1

        stackplot(ax, binfiles, nbins, palette, seqid, window, shift)
        root.text(
            xx - 0.04, yy + 0.5 * (yinterval - inner), cc, ha="center", va="center"
        )

        ax.set_xlim(0, nbins)
        ax.set_ylim(0, 1)
        ax.set_axis_off()

    # Legends
    yy -= yinterval
    xx = margin
    for bedfile, color in zip(bedfiles, palette):
        label = bedfile.rsplit(".", 1)[0].replace("_", " ")
        label = Registration.get(label, label)

        root.add_patch(Rectangle((xx, yy), inner, inner, color=color, lw=0))
        xx += 2 * inner
        root.text(xx, yy, label, size=13)
        # Advance by a rough per-character width so labels do not overlap.
        xx += len(label) * 0.012 + inner

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    image_name = pf + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def shuffle_twobeds(afbed, bfbed, bbfasta, prefix=None):
    """
    Shuffle the two bedfiles together.

    For each sequence in `afbed`, the intervals produced by
    `range_interleave` around its features are alternated with the matching
    `bfbed` features, looked up by accession through `bfbed.order`.
    Sequences listed in `bbfasta` that never appear in `afbed` are appended
    as whole-sequence features. The result is written to `shuffled.bed`
    and returned.

    Args:
        afbed: bed collection of the regions being replaced (sorted in place).
        bfbed: bed collection of the replacements, keyed by the same accns.
        bbfasta: passed to `Sizes` for sequence names and lengths.
        prefix: when truthy, every feature of a sequence is renamed to
            `prefix` followed by a zero-padded running counter.

    Returns:
        The combined `Bed` object.
    """
    sz = Sizes(bbfasta)
    sizes = sz.mapping
    shuffled = "shuffled.bed"
    border = bfbed.order

    merged = []
    afbed.sort(key=afbed.nullkey)
    totalids = len(sizes)
    # Width of the zero-padded counter = number of decimal digits in the
    # sequence count. len(str(n)) matches int(log10(n)) + 1 for n >= 1 and
    # still works for n == 0, where math.log10 raises ValueError.
    pad = len(str(totalids))
    cj = 0
    seen = set()
    accn = lambda x: "{0}{1:0{2}d}".format(prefix, x, pad)

    for seqid, aa in afbed.sub_beds():
        cj += 1
        abeds, bbeds, beds = [], [], []
        size = sizes[seqid]
        ranges = [(x.seqid, x.start, x.end) for x in aa]
        cranges = range_interleave(ranges, sizes={seqid: size}, empty=True)
        for crange in cranges:
            if crange:
                seqid, start, end = crange
                bedline = "\t".join(str(x) for x in (seqid, start - 1, end))
                abeds.append(BedLine(bedline))
            else:
                # Keep a placeholder so abeds and bbeds stay in lockstep.
                abeds.append(None)

        for a in aa:
            gapid = a.accn
            bi, b = border[gapid]
            if a.strand == "-":
                # Flip the replacement's strand, in both places it is stored.
                b.extra[1] = b.strand = "-" if b.strand == "+" else "+"

            bbeds.append(b)

        n_abeds = len(abeds)
        n_bbeds = len(bbeds)
        assert n_abeds - n_bbeds == 1, "abeds: {0}, bbeds: {1}".format(n_abeds, n_bbeds)

        # Alternate a, b, a, b, ..., a and drop the empty placeholders.
        beds = [x for x in roundrobin(abeds, bbeds) if x]
        if prefix:
            for b in beds:
                b.accn = accn(cj)

        merged.extend(beds)
        seen.add(seqid)

    # Singletons
    for seqid, size in sz.iter_sizes():
        if seqid in seen:
            continue

        bedline = "\t".join(str(x) for x in (seqid, 0, size, accn(cj)))
        b = BedLine(bedline)

        cj += 1
        if prefix:
            b.accn = accn(cj)

        merged.append(b)

    shuffledbed = Bed()
    shuffledbed.extend(merged)
    shuffledbed.print_to_file(shuffled)

    return shuffledbed
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    # NOTE: the docstring above is also the usage text given to OptionParser.
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    p = OptionParser(fromblast.__doc__)
    p.add_option("--clique", default=False, action="store_true",
                 help="Populate clique instead of linear path [default: %default]")
    p.add_option("--maxdist", default=100000, type="int",
                 help="Create edge within certain distance [default: %default]")
    p.add_option("--verbose", default=False, action="store_true",
                 help="Print verbose reports to stdout [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, subjectfasta = args
    clique = opts.clique
    maxdist = opts.maxdist
    # Sort by query so that groupby() below sees each query contiguously.
    sort([blastfile, "--query"])
    blast = BlastSlow(blastfile, sorted=True)
    g = BiGraph()
    for query, blines in groupby(blast, key=lambda x: x.query):
        blines = list(blines)
        # Either link every pair of hits (clique) or only neighbouring hits.
        iterator = combinations(blines, 2) if clique else pairwise(blines)
        for a, b in iterator:
            asub, bsub = a.subject, b.subject
            if asub == bsub:
                continue

            arange = (a.query, a.qstart, a.qstop, "+")
            brange = (b.query, b.qstart, b.qstop, "+")
            dist, oo = range_distance(arange, brange, distmode="ee")
            if dist > maxdist:
                continue

            atag = ">" if a.orientation == "+" else "<"
            btag = ">" if b.orientation == "+" else "<"
            g.add_edge(BiEdge(asub, bsub, atag, btag))

    g.write("graph.txt")
    #g.draw("graph.pdf")

    logging.debug(str(g))
    paths = []
    for path in g.iter_paths():
        m, oo = g.path(path)
        if len(oo) == 1:  # Singleton path
            continue
        paths.append(oo)
        if opts.verbose:
            # Single-argument print(...) behaves identically on Python 2
            # and 3; the bare `print m` statement is a SyntaxError on 3.
            print(m)
            print(oo)

    npaths = len(paths)
    ntigs = sum(len(x) for x in paths)
    logging.debug("Graph decomposed to {0} paths with {1} components.".\
                  format(npaths, ntigs))

    agpfile = blastfile + ".agp"
    sizes = Sizes(subjectfasta)
    scaffolded = set()
    nsingletons = 0

    # The context manager closes the AGP file even if order_to_agp() raises.
    with open(agpfile, "w") as fwagp:
        for i, oo in enumerate(paths):
            ctgorder = [(str(ctg), ("+" if strand else "-")) \
                        for ctg, strand in oo]
            scaffolded.update(ctg for ctg, strand in ctgorder)
            name = "pmol_{0:04d}".format(i)
            order_to_agp(name, ctgorder, sizes.mapping, fwagp)

        # Get the singletons as well
        for ctg, size in sizes.iter_sizes():
            if ctg in scaffolded:
                continue

            order_to_agp(ctg, [(ctg, "+")], sizes.mapping, fwagp)
            nsingletons += 1

    logging.debug("Written {0} unscaffolded singletons.".format(nsingletons))
    logging.debug("AGP file written to `{0}`.".format(agpfile))
def stack(args):
    """
    %prog stack fastafile

    Create landscape plots that show the amounts of genic sequences, and repetitive
    sequences along the chromosomes.
    """
    # NOTE: the docstring above is also the usage text given to OptionParser.
    p = OptionParser(stack.__doc__)
    p.add_option("--top", default=10, type="int",
                 help="Draw the first N chromosomes [default: %default]")
    p.add_option("--stacks",
                 default="Exons,Introns,DNA_transposons,Retrotransposons",
                 help="Features to plot in stackplot [default: %default]")
    p.add_option("--switch",
                 help="Change chr names based on two-column file [default: %default]")
    add_window_options(p)
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    top = opts.top
    window, shift, subtract = check_window_options(opts)
    switch = opts.switch
    if switch:
        switch = DictFile(opts.switch)

    stacks = opts.stacks.split(",")
    bedfiles = get_beds(stacks)
    binfiles = get_binfiles(bedfiles, fastafile, shift, subtract=subtract)

    sizes = Sizes(fastafile)
    s = list(sizes.iter_sizes())[:top]
    maxl = max(x[1] for x in s)
    margin = .08
    inner = .02   # y distance between tracks

    pf = fastafile.rsplit(".", 1)[0]
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])

    # Gauge
    ratio = draw_gauge(root, margin, maxl)

    # Per chromosome
    yinterval = (1 - 2 * margin) / (top + 1)
    xx = margin
    yy = 1 - margin
    for seqid, clen in s:
        yy -= yinterval
        xlen = clen / ratio
        cc = seqid
        if "_" in seqid:
            # e.g. "abc_12" is labelled "A12"
            ca, cb = seqid.split("_")
            cc = ca[0].upper() + cb

        if switch and cc in switch:
            cc = "\n".join((cc, "({0})".format(switch[cc])))

        root.add_patch(Rectangle((xx, yy), xlen, yinterval - inner, color=gray))
        ax = fig.add_axes([xx, yy, xlen, yinterval - inner])

        # Number of windows = ceil(clen / shift). `//` is integer division on
        # both Python 2 and 3, whereas `/` turns into true division (and a
        # fractional bin count) on Python 3.
        nbins = clen // shift
        if clen % shift:
            nbins += 1

        stackplot(ax, binfiles, nbins, palette, seqid, window, shift)
        root.text(xx - .04, yy + .5 * (yinterval - inner), cc,
                  ha="center", va="center")

        ax.set_xlim(0, nbins)
        ax.set_ylim(0, 1)
        ax.set_axis_off()

    # Legends
    yy -= yinterval
    xx = margin
    for bedfile, color in zip(bedfiles, palette):
        label = bedfile.rsplit(".", 1)[0].replace("_", " ")
        label = Registration.get(label, label)

        root.add_patch(Rectangle((xx, yy), inner, inner, color=color, lw=0))
        xx += 2 * inner
        root.text(xx, yy, label, size=13)
        # Advance by a rough per-character width so labels do not overlap.
        xx += len(label) * .012 + inner

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    image_name = pf + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied, the id% and cov%.
    """
    # NOTE: the docstring above is also the usage text given to OptionParser.
    from jcvi.algorithms.supermap import supermap
    from jcvi.utils.range import range_union

    allowed_iterby = ("query", "query_sbjct")

    p = OptionParser(covfilter.__doc__)
    p.set_align(pctid=95, pctcov=50)
    p.add_option("--scov", default=False, action="store_true",
                 help="Subject coverage instead of query [default: %default]")
    p.add_option("--supermap", action="store_true",
                 help="Use supermap instead of union")
    p.add_option("--ids", dest="ids", default=None,
                 help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list", dest="list", default=False, action="store_true",
                 help="List the id% and cov% per gene [default: %default]")
    p.add_option("--iterby", dest="iterby", default="query",
                 choices=allowed_iterby,
                 help="Choose how to iterate through BLAST [default: %default]")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    pctid = opts.pctid
    pctcov = opts.pctcov
    union = not opts.supermap
    scov = opts.scov
    sz = Sizes(fastafile)
    sizes = sz.mapping
    iterby = opts.iterby
    qspair = iterby == "query_sbjct"

    if not union:
        # Supermap mode works on the pre-filtered BLAST file, built on demand.
        querysupermap = blastfile + ".query.supermap"
        if not op.exists(querysupermap):
            supermap(blastfile, filter="query")

        blastfile = querysupermap

    assert op.exists(blastfile)

    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()
    valid = set()
    blast = BlastSlow(blastfile)
    iterator = blast.iter_hits_pair if qspair else blast.iter_hits

    covidstore = {}
    for query, blines in iterator():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0
        this_identity = 0

        ranges = []
        for b in blines:
            if scov:
                s, start, stop = b.subject, b.sstart, b.sstop
            else:
                s, start, stop = b.query, b.qstart, b.qstop
            cov_id = s

            # Hits below the identity cutoff do not contribute at all.
            if b.pctid < pctid:
                continue

            if start > stop:
                start, stop = stop, start
            this_covered += stop - start + 1
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps
            ranges.append(("1", start, stop))

        if ranges:
            this_identity = 100. - (this_mismatches + this_gaps) * 100. / this_alignlen

        if union:
            # Overlapping hits are counted once.
            this_covered = range_union(ranges)

        this_coverage = this_covered * 100. / sizes[cov_id]
        covidstore[query] = (this_identity, this_coverage)
        if this_identity >= pctid and this_coverage >= pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    # Single-argument print(...) is used below because it behaves the same on
    # Python 2 and 3; the print statement is a SyntaxError on Python 3.
    if opts.list:
        if qspair:
            allpairs = defaultdict(list)
            for (q, s) in covidstore:
                allpairs[q].append((q, s))
                allpairs[s].append((q, s))

            for id, size in sz.iter_sizes():
                if id not in allpairs:
                    print("\t".join((id, "na", "0", "0")))
                else:
                    for qs in allpairs[id]:
                        this_identity, this_coverage = covidstore[qs]
                        print("{0}\t{1:.1f}\t{2:.1f}".format("\t".join(qs),
                              this_identity, this_coverage))
        else:
            for query, size in sz.iter_sizes():
                this_identity, this_coverage = covidstore.get(query, (0, 0))
                print("{0}\t{1:.1f}\t{2:.1f}".format(query,
                      this_identity, this_coverage))

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    # NOTE(review): the percentages below divide by `total`, `alignlen` and
    # `queries_combined`; any of them being zero raises ZeroDivisionError.
    m = "Identity: {0} mismatches, {1} gaps, {2} alignlen\n".\
            format(mismatches, gaps, alignlen)
    total = len(sizes)
    m += "Total mapped: {0} ({1:.1f}% of {2})\n".\
            format(mapped_count, mapped_count * 100. / total, total)
    m += "Total valid {0}: {1} ({2:.1f}% of {3})\n".\
            format(cutoff_message, valid_count, valid_count * 100. / total, total)
    m += "Average id = {0:.2f}%\n".\
            format(100 - (mismatches + gaps) * 100. / alignlen)

    queries_combined = sz.totalsize
    m += "Coverage: {0} covered, {1} total\n".\
            format(covered, queries_combined)
    m += "Average coverage = {0:.2f}%".\
            format(covered * 100. / queries_combined)

    # The summary goes to stderr and to a log file next to the BLAST file.
    logfile = blastfile + ".covfilter.log"
    sys.stderr.write(m + "\n")
    with open(logfile, "w") as fw:
        fw.write(m + "\n")

    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for id in valid:
            # str() mirrors what the print statement wrote (ids are tuples
            # when iterating by query/subject pair).
            fw.write(str(id) + "\n")

        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".\
                format(cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    # Re-read the BLAST file and keep only lines whose query (or
    # query/subject pair) passed both cutoffs.
    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast:
        query = (b.query, b.subject) if qspair else b.query
        if query in valid:
            fw.write(str(b) + "\n")
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied, the id% and cov%.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it stays short and user-facing. Developer notes live in comments.
    #
    # args: list of command-line tokens (options + blastfile + fastafile).
    # Side effects:
    #   * writes a summary to stderr and to `<blastfile>.covfilter.log`
    #   * with --list, prints per-sequence identity/coverage to stdout
    #   * with --ids, writes the ids passing both cutoffs to that file
    #   * with an outfile set, writes the BLAST lines of passing ids to it
    # Exits through sys.exit() when the positional argument count is not 2.
    from jcvi.algorithms.supermap import supermap
    from jcvi.utils.range import range_union

    allowed_iterby = ("query", "query_sbjct")

    p = OptionParser(covfilter.__doc__)
    p.set_align(pctid=95, pctcov=50)
    p.add_option("--scov", default=False, action="store_true",
                 help="Subject coverage instead of query [default: %default]")
    p.add_option("--supermap", action="store_true",
                 help="Use supermap instead of union")
    p.add_option("--ids", dest="ids", default=None,
                 help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list", dest="list", default=False, action="store_true",
                 help="List the id% and cov% per gene [default: %default]")
    p.add_option("--iterby", dest="iterby", default="query",
                 choices=allowed_iterby,
                 help="Choose how to iterate through BLAST [default: %default]")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    pctid = opts.pctid
    pctcov = opts.pctcov
    union = not opts.supermap
    scov = opts.scov
    sz = Sizes(fastafile)
    sizes = sz.mapping
    iterby = opts.iterby
    qspair = iterby == "query_sbjct"

    def percent(numer, denom):
        # Percentage that degrades to 0 rather than raising when there is
        # nothing to divide by (empty BLAST file, empty FASTA, ...).
        return numer * 100. / denom if denom else 0.

    if not union:
        # Presumably supermap() writes `<blastfile>.query.supermap`; the
        # assert below verifies that the file is really there.
        querysupermap = blastfile + ".query.supermap"
        if not op.exists(querysupermap):
            supermap(blastfile, filter="query")

        blastfile = querysupermap

    assert op.exists(blastfile)

    # Totals accumulated over every query (or query/subject pair)
    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0

    queries = set()
    valid = set()
    blast = BlastSlow(blastfile)
    iterator = blast.iter_hits_pair if qspair else blast.iter_hits

    # key (query, or (query, subject) pair) => (identity%, coverage%)
    covidstore = {}

    for query, blines in iterator():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0
        this_identity = 0

        ranges = []
        for b in blines:
            if scov:
                s, start, stop = b.subject, b.sstart, b.sstop
            else:
                s, start, stop = b.query, b.qstart, b.qstop
            # Recorded before the identity filter so that the size lookup
            # below works even when every hit of this group is skipped.
            cov_id = s

            if b.pctid < pctid:
                continue

            if start > stop:
                start, stop = stop, start
            this_covered += stop - start + 1
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps
            ranges.append(("1", start, stop))

        if ranges and this_alignlen:
            this_identity = 100. - \
                (this_mismatches + this_gaps) * 100. / this_alignlen

        if union:
            # Overlapping hits must not be counted twice
            this_covered = range_union(ranges)

        this_coverage = this_covered * 100. / sizes[cov_id]
        covidstore[query] = (this_identity, this_coverage)
        if this_identity >= pctid and this_coverage >= pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    if opts.list:
        if qspair:
            # Index every pair under both of its members so that each
            # sequence of the FASTA file can list the pairs it is part of.
            allpairs = defaultdict(list)
            for (q, s) in covidstore:
                allpairs[q].append((q, s))
                allpairs[s].append((q, s))

            for seqid, size in sz.iter_sizes():
                if seqid not in allpairs:
                    print("\t".join((seqid, "na", "0", "0")))
                else:
                    for qs in allpairs[seqid]:
                        this_identity, this_coverage = covidstore[qs]
                        print("{0}\t{1:.1f}\t{2:.1f}".format(
                            "\t".join(qs), this_identity, this_coverage))
        else:
            for query, size in sz.iter_sizes():
                this_identity, this_coverage = covidstore.get(query, (0, 0))
                print("{0}\t{1:.1f}\t{2:.1f}".format(
                    query, this_identity, this_coverage))

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    total = len(sizes.keys())
    queries_combined = sz.totalsize
    # Without any retained alignment the average identity is reported as 0,
    # mirroring the per-query default used above.
    average_id = 100 - percent(mismatches + gaps, alignlen) if alignlen else 0.

    m = "Identity: {0} mismatches, {1} gaps, {2} alignlen\n".format(
        mismatches, gaps, alignlen)
    m += "Total mapped: {0} ({1:.1f}% of {2})\n".format(
        mapped_count, percent(mapped_count, total), total)
    m += "Total valid {0}: {1} ({2:.1f}% of {3})\n".format(
        cutoff_message, valid_count, percent(valid_count, total), total)
    m += "Average id = {0:.2f}%\n".format(average_id)
    m += "Coverage: {0} covered, {1} total\n".format(
        covered, queries_combined)
    m += "Average coverage = {0:.2f}%".format(
        percent(covered, queries_combined))

    logfile = blastfile + ".covfilter.log"
    with open(logfile, "w") as fw:
        for f in (sys.stderr, fw):
            f.write(m + "\n")

    if opts.ids:
        filename = opts.ids
        # NOTE(review): must_open() is defined elsewhere and may return a
        # shared stream, so the handle is deliberately not closed here.
        fw = must_open(filename, "w")
        for seqid in valid:
            fw.write(str(seqid) + "\n")

        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".format(
            cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    # Second pass over the BLAST file: keep only the lines whose key passed
    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast:
        query = (b.query, b.subject) if qspair else b.query
        if query in valid:
            fw.write(str(b) + "\n")
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # args: list of command-line tokens (options + blastfile + subject.fasta).
    # Side effects: writes `graph.txt` in the working directory and
    # `<blastfile>.agp`; with --verbose, prints each path to stdout.
    # Exits through sys.exit() when the positional argument count is not 2.
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    p = OptionParser(fromblast.__doc__)
    p.add_option("--clique", default=False, action="store_true",
                 help="Populate clique instead of linear path [default: %default]")
    p.add_option("--maxdist", default=100000, type="int",
                 help="Create edge within certain distance [default: %default]")
    p.add_option("--verbose", default=False, action="store_true",
                 help="Print verbose reports to stdout [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, subjectfasta = args
    clique = opts.clique
    maxdist = opts.maxdist

    # groupby() below only merges adjacent records, hence the sort by query
    sort([blastfile, "--query"])
    blast = BlastSlow(blastfile, sorted=True)
    g = BiGraph()
    for query, blines in groupby(blast, key=lambda x: x.query):
        blines = list(blines)
        # Either link every pair of hits (clique) or only neighbouring hits
        iterator = combinations(blines, 2) if clique else pairwise(blines)
        for a, b in iterator:
            asub, bsub = a.subject, b.subject
            if asub == bsub:
                continue

            arange = (a.query, a.qstart, a.qstop, "+")
            brange = (b.query, b.qstart, b.qstop, "+")
            dist, _ = range_distance(arange, brange, distmode="ee")
            if dist > maxdist:
                continue

            atag = ">" if a.orientation == "+" else "<"
            btag = ">" if b.orientation == "+" else "<"
            g.add_edge(BiEdge(asub, bsub, atag, btag))

    g.write("graph.txt")

    logging.debug(str(g))
    paths = []
    for path in g.iter_paths():
        m, oo = g.path(path)
        if len(oo) == 1:  # Singleton path
            continue
        paths.append(oo)
        if opts.verbose:
            print(m)
            print(oo)

    npaths = len(paths)
    ntigs = sum(len(x) for x in paths)
    logging.debug("Graph decomposed to {0} paths with {1} components.".format(
        npaths, ntigs))

    agpfile = blastfile + ".agp"
    sizes = Sizes(subjectfasta)
    scaffolded = set()
    # The context manager closes the AGP file even if order_to_agp() raises
    with open(agpfile, "w") as fwagp:
        for i, oo in enumerate(paths):
            ctgorder = [(str(ctg), ("+" if strand else "-"))
                        for ctg, strand in oo]
            scaffolded.update(ctg for ctg, _ in ctgorder)
            scaffold_id = "pmol_{0:04d}".format(i)
            order_to_agp(scaffold_id, ctgorder, sizes.mapping, fwagp)

        # Get the singletons as well
        nsingletons = 0
        for ctg, size in sizes.iter_sizes():
            if ctg in scaffolded:
                continue

            ctgorder = [(ctg, "+")]
            order_to_agp(ctg, ctgorder, sizes.mapping, fwagp)
            nsingletons += 1
        logging.debug(
            "Written {0} unscaffolded singletons.".format(nsingletons))

    logging.debug("AGP file written to `{0}`.".format(agpfile))