def fromagp(args):
    """
    %prog fromagp agpfile componentfasta objectfasta

    Generate chain file from AGP format. The components represent the old
    genome (target) and the objects represent new genome (query).
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so its wording is kept as-is.
    from jcvi.formats.agp import AGP
    from jcvi.formats.sizes import Sizes

    p = OptionParser(fromagp.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    agpfile, componentfasta, objectfasta = args
    chainfile = agpfile.rsplit(".", 1)[0] + ".chain"

    # Load all inputs before touching the output so that a bad input does not
    # leave an empty chain file behind.
    agp = AGP(agpfile)
    componentsizes = Sizes(componentfasta).mapping
    objectsizes = Sizes(objectfasta).mapping

    chain = "chain"
    score = 1000
    tStrand = "+"
    chain_id = 0  # running chain identifier, incremented per component line

    # `with` guarantees the handle is closed even if a size lookup fails.
    with open(chainfile, "w") as fw:
        for a in agp:
            if a.is_gap:
                continue

            # Target (component) side; the start is shifted down by one.
            tName = a.component_id
            tSize = componentsizes[tName]
            tStart = a.component_beg - 1
            tEnd = a.component_end

            # Query (object) side.
            qName = a.object
            qSize = objectsizes[qName]
            qStrand = "-" if a.orientation == "-" else "+"
            qStart = a.object_beg
            qEnd = a.object_end
            if qStrand == "-":
                # Mirror the interval onto the reverse strand of the object.
                qStart, qEnd = qSize - qEnd + 1, qSize - qStart + 1
            qStart -= 1

            chain_id += 1
            size = a.object_span
            headerline = "\t".join(str(x) for x in (
                chain, score, tName, tSize, tStrand, tStart,
                tEnd, qName, qSize, qStrand, qStart, qEnd, chain_id
            ))
            # Header, one ungapped block of `size`, then a blank separator.
            # Written with fw.write() so it works whether or not `print` is a
            # function in this module.
            fw.write("{0}\n{1}\n\n".format(headerline, size))

    logging.debug("File written to `{0}`.".format(chainfile))
def dedup(args):
    """
    %prog dedup scaffolds.fasta

    Remove redundant contigs with CD-HIT. This is run prior to
    assembly.sspace.embed().
    """
    # NOTE: the docstring above is also the usage text given to OptionParser.
    from jcvi.formats.fasta import gaps
    from jcvi.apps.cdhit import deduplicate, ids

    p = OptionParser(dedup.__doc__)
    p.set_align(pctid=GoodPct)
    p.set_mingap(default=10)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (scaffolds,) = args
    mingap = opts.mingap
    splitfile, oagpfile, cagpfile = gaps(
        [scaffolds, "--split", "--mingap={0}".format(mingap)])

    dd = splitfile + ".cdhit"
    clstrfile = dd + ".clstr"
    idsfile = dd + ".ids"
    # Only re-run the expensive steps when their outputs are stale.
    if need_update(splitfile, clstrfile):
        deduplicate([splitfile, "--pctid={0}".format(opts.pctid)])
    if need_update(clstrfile, idsfile):
        ids([clstrfile])

    agp = AGP(cagpfile)
    # The last whitespace-separated token of each ids line is kept as a
    # representative; blank lines are skipped rather than raising IndexError.
    with open(idsfile) as fp:
        reps = set(row.split()[-1] for row in fp if row.strip())

    pf = scaffolds.rsplit(".", 1)[0]
    dedupagp = pf + ".dedup.agp"
    ndropped = ndroppedbases = 0
    with open(dedupagp, "w") as fw:
        for a in agp:
            if not a.is_gap and a.component_id not in reps:
                span = a.component_span
                logging.debug("Drop component {0} ({1})".format(
                    a.component_id, span))
                ndropped += 1
                ndroppedbases += span
                continue
            # fw.write() works regardless of whether `print` is a statement
            # or a function in this module.
            fw.write("{0}\n".format(a))

    logging.debug("Dropped components: {0}, Dropped bases: {1}".format(
        ndropped, ndroppedbases))
    logging.debug("Deduplicated file written to `{0}`.".format(dedupagp))

    tidyagp = tidy([dedupagp, splitfile])
    dedupfasta = pf + ".dedup.fasta"
    build([tidyagp, dd, dedupfasta])

    return dedupfasta
def write_unplaced_agp(agpfile, scaffolds, unplaced_agp):
    """Write one AGP entry for every scaffold that `agpfile` does not mention.

    `agpfile` is read to collect the component ids already present,
    `scaffolds` supplies the name-to-size mapping, and each remaining
    scaffold is emitted (in sorted name order, orientation "?") to
    `unplaced_agp` through order_to_agp().
    """
    placed = set()
    for line in AGP(agpfile):
        placed.add(line.component_id)

    sizes = Sizes(scaffolds).mapping
    # NOTE(review): this handle is never closed here, as in the original;
    # confirm whether must_open() may hand back a shared stream.
    fwagp = must_open(unplaced_agp, "w")

    leftovers = [name for name in sorted(sizes.keys()) if name not in placed]
    for name in leftovers:
        order_to_agp(name, [(name, "?")], sizes, fwagp)

    logging.debug("Write unplaced AGP to `{0}`.".format(unplaced_agp))
def summary(args):
    """
    %prog summary input.bed scaffolds.fasta

    Print out summary statistics per map, followed by consensus summary of
    scaffold anchoring based on multiple maps.
    """
    # NOTE: the docstring above is also the usage text given to OptionParser.
    p = OptionParser(summary.__doc__)
    p.set_table(sep="|", align=True)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, scaffolds = args
    pf = inputbed.rsplit(".", 1)[0]
    mapbed = pf + ".bed"
    chr_agp = pf + ".chr.agp"
    sep = opts.sep
    align = opts.align
    cc = Map(mapbed)
    mapnames = cc.mapnames
    s = Sizes(scaffolds)
    total, l50, n50 = s.summary

    # NOTE(review): fw is not closed here, as in the original; presumably
    # opts.outfile may resolve to stdout -- confirm before adding a close().
    fw = must_open(opts.outfile, "w")

    # Part 1: one table column per individual map.
    r = {}
    fw.write("*** Summary for each individual map ***\n")
    for mapname in mapnames:
        markers = [x for x in cc if x.mapname == mapname]
        ms = MapSummary(markers, l50, s)
        r["Linkage Groups", mapname] = ms.num_lgs
        ms.export_table(r, mapname, total)
    # fw.write() is used instead of the print chevron so the output works
    # whether or not `print` is a function in this module.
    fw.write("{0}\n".format(tabulate(r, sep=sep, align=align)))

    # Part 2: anchored vs. unplaced scaffolds according to the consensus AGP.
    r = {}
    agp = AGP(chr_agp)
    fw.write("*** Summary for consensus map ***\n")
    consensus_scaffolds = set(x.component_id for x in agp if not x.is_gap)
    unplaced_scaffolds = set(s.mapping.keys()) - consensus_scaffolds

    for mapname, sc in (("Anchored", consensus_scaffolds),
                        ("Unplaced", unplaced_scaffolds)):
        markers = [x for x in cc if x.seqid in sc]
        ms = MapSummary(markers, l50, s, scaffolds=sc)
        ms.export_table(r, mapname, total)
    fw.write("{0}\n".format(tabulate(r, sep=sep, align=align)))
def neighbor(args):
    """
    %prog neighbor agpfile componentID

    Check overlaps of a particular component in agpfile.
    """
    # NOTE: the docstring above is also the usage text given to OptionParser.
    parser = OptionParser(neighbor.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    agpfile, componentID = args
    fastadir = "fasta"
    dir_opt = "--dir=" + fastadir

    # Show the AGP lines around the component (two lines of context).
    sh("grep --color -C2 {0} {1}".format(componentID, agpfile))

    agp = AGP(agpfile)
    aorder = agp.order
    if componentID not in aorder:
        print(
            "Record {0} not present in `{1}`.".format(componentID, agpfile),
            file=sys.stderr,
        )
        return

    index, current = aorder[componentID]
    north, south = agp.getNorthSouthClone(index)

    # Overlap with the preceding neighbor; the neighbor is the first argument.
    if not north.isCloneGap:
        north_args = [north.component_id, componentID, dir_opt]
        if north.orientation == "-":
            north_args.append("--qreverse")
        overlap(north_args)

    # Overlap with the following neighbor; this component is the first
    # argument, so its own orientation decides --qreverse.
    if not south.isCloneGap:
        south_args = [componentID, south.component_id, dir_opt]
        if current.orientation == "-":
            south_args.append("--qreverse")
        overlap(south_args)
def plotall(xargs):
    """
    %prog plotall input.bed

    Plot the matchings between the reconstructed pseudomolecules and the maps.
    This command will plot each reconstructed object (non-singleton).
    """
    # NOTE: the docstring above is also the usage text given to OptionParser.
    parser = OptionParser(plotall.__doc__)
    add_allmaps_plot_options(parser)
    opts, args, iopts = parser.set_image_options(xargs, figsize="10x6")

    if len(args) != 1:
        sys.exit(not parser.print_help())

    (inputbed,) = args
    prefix = inputbed.rsplit(".", 1)[0]
    agp = AGP(prefix + ".agp")

    # Keep only objects made of more than one AGP line.
    multi_line_objects = []
    for ob, lines in agp.iter_object():
        if len(lines) > 1:
            multi_line_objects.append(ob)
    multi_line_objects.sort()

    # Each plot() call gets the original argument list plus the object name.
    for seqid in multi_line_objects:
        plot(xargs + [seqid])
def anneal(args):
    """
    %prog anneal agpfile contigs.fasta

    Merge adjacent overlapping contigs and make new AGP file.

    By default it will also anneal lines like these together (unless
    --nozipshreds):

    scaffold4 1 1608 1 W ca-bacs.5638.frag11.22000-23608 1 1608 -
    scaffold4 1609 1771 2 N 163 scaffold yes paired-ends
    scaffold4 1772 3771 3 W ca-bacs.5638.frag10.20000-22000 1 2000 -

    These are most likely shreds, which we look for based on names.
    """
    # NOTE: the docstring above is also the usage text given to OptionParser.
    p = OptionParser(anneal.__doc__)
    p.set_align(pctid=GoodPct, hitlen=GoodOverlap)
    p.add_option("--hang", default=GoodOverhang, type="int",
                 help="Maximum overhang length [default: %default]")
    p.set_outdir(outdir="outdir")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    agpfile, contigs = args
    outdir = opts.outdir
    if not op.exists(outdir):
        # Split the contigs into one FASTA per record, only on first run.
        mkdir(outdir)
        cmd = "faSplit byname {0} {1}/".format(contigs, outdir)
        sh(cmd)

    cutoff = Cutoff(opts.pctid, opts.hitlen, opts.hang)
    logging.debug(str(cutoff))

    agp = AGP(agpfile)
    # Swap only the final extension. str.replace(".agp", ".blast") would also
    # rewrite ".agp" inside directory names, and would return the AGP path
    # itself when the file name has no ".agp" in it.
    blastfile = op.splitext(agpfile)[0] + ".blast"
    if not op.exists(blastfile):
        populate_blastfile(blastfile, agp, outdir, opts)

    assert op.exists(blastfile)
    logging.debug("File `{0}` found. Start loading.".format(blastfile))
    blast = BlastSlow(blastfile).to_dict()

    annealedagp = "annealed.agp"
    annealedfasta = "annealed.fasta"

    newagp = deepcopy(agp)
    clrstore = {}  # component id -> CLR, shared across all pairs it is in
    for a, b, qreverse in agp.iter_paired_components():
        aid = a.component_id
        bid = b.component_id

        pair = (aid, bid)
        if pair in blast:
            bl = blast[pair]
        else:
            # No cached hit for this pair: compute the overlap on the fly.
            oopts = get_overlap_opts(aid, bid, qreverse, outdir, opts)
            o = overlap(oopts)
            if not o:
                continue
            bl = o.blastline

        o = Overlap(bl, a.component_span, b.component_span,
                    cutoff, qreverse=qreverse)

        if aid not in clrstore:
            clrstore[aid] = CLR.from_agpline(a)
        if bid not in clrstore:
            clrstore[bid] = CLR.from_agpline(b)

        aclr, bclr = clrstore[aid], clrstore[bid]

        o.print_graphic()
        if o.anneal(aclr, bclr):
            newagp.delete_between(aid, bid, verbose=True)

        if o.otype == 2:  # b ~ a
            o = o.swapped
            o.print_graphic()
            if o.anneal(bclr, aclr):
                newagp.switch_between(bid, aid, verbose=True)
                newagp.delete_between(bid, aid, verbose=True)

    logging.debug("A total of {0} components with modified CLR.".format(
        len(clrstore)))

    # Components whose CLR is no longer valid are turned into gaps.
    for cid, c in clrstore.items():
        if c.is_valid:
            continue
        # sys.stderr.write() works whether or not `print` is a function here.
        sys.stderr.write("Remove {0}\n".format(c))
        newagp.convert_to_gap(cid, verbose=True)

    # Update all ranges that has modified clr
    for a in newagp:
        if a.is_gap:
            continue
        aid = a.component_id
        if aid in clrstore:
            c = clrstore[aid]
            a.component_beg = c.start
            a.component_end = c.end

    newagp.print_to_file(annealedagp)
    tidyagp = tidy([annealedagp, contigs])

    build([tidyagp, contigs, annealedfasta])
    return annealedfasta
def annotate(args):
    """
    %prog annotate agpfile gaps.linkage.bed assembly.fasta

    Annotate AGP file with linkage info of `paired-end` or `map`.
    File `gaps.linkage.bed` is generated by assembly.gaps.estimate().
    """
    # NOTE: the docstring above is also the usage text given to OptionParser.
    from jcvi.formats.agp import AGP, bed, tidy

    p = OptionParser(annotate.__doc__)
    # type="int" is required: without it a user-supplied --minsize arrives as
    # a string and the `cs < minsize` comparison below is str-vs-int.
    p.add_option("--minsize", default=200, type="int",
                 help="Smallest component size [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    agpfile, linkagebed, assemblyfasta = args
    linkagebed = Bed(linkagebed)
    # Gaps whose BED score is 0, keyed by (accn, start, end).
    spannedgaps = set()
    for b in linkagebed:
        score = int(b.score)
        if score == 0:
            spannedgaps.add((b.accn, b.start, b.end))

    agp = AGP(agpfile)
    newagpfile = agpfile.rsplit(".", 1)[0] + ".linkage.agp"
    contig_id = 0
    minsize = opts.minsize
    # `with` closes the output even if a record raises part-way through.
    with open(newagpfile, "w") as newagp:
        for a in agp:
            if not a.is_gap:
                cs = a.component_span
                if cs < minsize:
                    # Too small to keep: rewrite the component as a gap and
                    # fall through to the gap annotation below.
                    a.is_gap = True
                    a.component_type = "N"
                    a.gap_length = cs
                    a.gap_type = "scaffold"
                    a.linkage = "yes"
                    a.linkage_evidence = []
                else:
                    # Regular component: renumber it as a fresh contig.
                    contig_id += 1
                    a.component_id = "contig{0:04d}".format(contig_id)
                    a.component_beg = 1
                    a.component_end = cs
                    a.component_type = "W"

                    newagp.write("{0}\n".format(a))
                    continue

            gapinfo = (a.object, a.object_beg, a.object_end)
            gaplen = a.gap_length

            # A 100-long gap that is not in `spannedgaps` is tagged as
            # map-derived with type "U"; everything else is paired-ends.
            if gaplen == 100 and gapinfo not in spannedgaps:
                a.component_type = "U"
                tag = "map"
            else:
                tag = "paired-ends"

            a.linkage_evidence.append(tag)
            # write() instead of the print chevron so this works whether or
            # not `print` is a function in this module.
            newagp.write("{0}\n".format(a))

    logging.debug("Annotated AGP written to `{0}`.".format(newagpfile))

    contigbed = assemblyfasta.rsplit(".", 1)[0] + ".contigs.bed"
    bedfile = bed([newagpfile, "--nogaps", "--outfile=" + contigbed])

    contigfasta = fastaFromBed(bedfile, assemblyfasta,
                               name=True, stranded=True)

    tidy([newagpfile, contigfasta])
def embed(args):
    """
    %prog embed evidencefile scaffolds.fasta contigs.fasta

    Use SSPACE evidencefile to scaffold contigs into existing scaffold
    structure, as in `scaffolds.fasta`. Contigs.fasta were used by SSPACE
    directly to scaffold.

    Rules:
    1. Only update existing structure by embedding contigs small enough to fit.
    2. Promote singleton contigs only if they are big (>= min_length).
    """
    # NOTE: the docstring above is also the usage text given to OptionParser.
    p = OptionParser(embed.__doc__)
    p.set_mingap(default=10)
    p.add_option("--min_length", default=200, type="int",
                 help="Minimum length to consider [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    def report(*fields):
        # Space-joined diagnostic line on stderr; works whether or not
        # `print` is a function in this module.
        sys.stderr.write(" ".join(str(x) for x in fields) + "\n")

    evidencefile, scaffolds, contigs = args
    min_length = opts.min_length
    splitfasta, oagp, cagp = gaps(
        [scaffolds, "--split", "--mingap={0}".format(opts.mingap)])

    agp = AGP(cagp)
    ref_graph = agp.graph  # graph of the existing scaffold structure

    ef = EvidenceFile(evidencefile, contigs)
    sizes = ef.sz
    patch_graph = ef.graph  # graph built from the evidence file

    logging.debug("Reference graph: {0}".format(ref_graph))
    logging.debug("Patch graph: {0}".format(patch_graph))

    newagp = deepcopy(agp)

    seen = set()     # components already visited or consumed by a patch
    deleted = set()  # components removed by an earlier insertion
    for a in agp:
        if a.is_gap:
            continue

        name = a.component_id
        object_id = a.object
        if name in deleted:
            report("* Skip {0}, already embedded".format(name))
            continue

        seen.add(name)

        target_name, tag = get_target(ref_graph, name)
        path = patch_graph.get_path(name, target_name, tag=tag)
        path_size = sum(sizes[x.v] for x, t in path) if path else None
        status = NO_UPDATE

        # Heuristic, the patch must not be too long
        if path and path_size > min_length and len(path) > 3:
            path = None

        if not path:
            report(name, target_name, path, path_size, status)
            continue

        # Reject patches that would revisit a component already processed.
        backward = False
        for x, t in path:
            if x.v in seen:
                report("* Does not allow backward"
                       " patch on {0}".format(x.v))
                backward = True
                break

        if backward:
            continue

        # Build the path plus the ends
        vv = patch_graph.get_node(name)
        path.appendleft((vv, tag))
        if tag == ">":
            path.reverse()
            status = INSERT_BEFORE
        elif target_name is None:
            status = INSERT_AFTER
        else:
            target = patch_graph.get_node(target_name)
            path.append((target, tag))
            status = INSERT_BETWEEN

        report(name, target_name, path, path_size, status)

        # Trim the ends off from the constructed AGPLines
        lines = path_to_agp(patch_graph, path, object_id, sizes, status)
        if status == INSERT_BEFORE:
            lines = lines[:-1]
            td = newagp.insert_lines(name, lines,
                                     delete=True, verbose=True)
        elif status == INSERT_AFTER:
            lines = lines[1:]
            td = newagp.insert_lines(name, lines, after=True,
                                     delete=True, verbose=True)
        else:
            lines = lines[1:-1]
            td = newagp.update_between(name, target_name, lines,
                                       delete=True, verbose=True)
        deleted |= td
        seen |= td

    # Recruit big singleton contigs: anything not yet seen and at least
    # min_length long becomes its own object.
    for ctg, size in sizes.items():
        if ctg in seen:
            continue
        if size < min_length:
            continue
        newagp.append(AGPLine.cline(ctg, ctg, sizes, "?"))

    # Write a new AGP file
    newagpfile = "embedded.agp"
    newagp.print_to_file(newagpfile, index=True)
    tidy([newagpfile, contigs])
def estimategaps(args):
    """
    %prog estimategaps input.bed

    Estimate sizes of inter-scaffold gaps. The AGP file generated by path()
    command has unknown gap sizes with a generic number of Ns (often 100 Ns).
    The AGP file `input.chr.agp` will be modified in-place.
    """
    # NOTE: the docstring above is also the usage text given to OptionParser.
    # NOTE(review): the code below reads `<prefix>.chr.agp` but writes its
    # result to `<prefix>.estimategaps.agp`; the "in-place" wording above
    # presumably relies on the caller -- confirm.
    p = OptionParser(estimategaps.__doc__)
    p.add_option("--minsize", default=100, type="int",
                 help="Minimum gap size")
    p.add_option("--maxsize", default=500000, type="int",
                 help="Maximum gap size")
    p.add_option("--links", default=10, type="int",
                 help="Only use linkage grounds with matchings more than")
    p.set_verbose(help="Print details for each gap calculation")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (inputbed,) = args
    pf = inputbed.rsplit(".", 1)[0]
    agpfile = pf + ".chr.agp"
    bedfile = pf + ".lifted.bed"

    cc = Map(bedfile, scaffold_info=True)
    agp = AGP(agpfile)
    minsize, maxsize = opts.minsize, opts.maxsize
    links = opts.links
    verbose = opts.verbose

    outagpfile = pf + ".estimategaps.agp"
    fw = must_open(outagpfile, "w")
    # try/finally so the output is closed even if an estimator raises.
    try:
        for ob, components in agp.iter_object():
            components = list(components)
            s = Scaffold(ob, cc)
            mlg_counts = s.mlg_counts
            gap_lines = [x for x in components if x.is_gap]
            gapsizes = [None] * len(gap_lines)  # master
            for mlg, count in mlg_counts.items():
                if count < links:
                    continue
                g = GapEstimator(cc, agp, ob, mlg)
                g.compute_all_gaps(minsize=minsize, maxsize=maxsize,
                                   verbose=verbose)
                # Merge evidence from this mlg into master: the first
                # estimate is taken as-is, later truthy ones keep the minimum.
                assert len(g.gapsizes) == len(gap_lines)
                for i, gs in enumerate(gapsizes):
                    gg = g.gapsizes[i]
                    if gs is None:
                        gapsizes[i] = gg
                    elif gg:
                        gapsizes[i] = min(gs, gg)

            # Single-argument print(...) gives the same output whether print
            # is a statement or a function.
            print(gapsizes)

            # Modify AGP
            i = 0
            for x in components:
                if x.is_gap:
                    # Fall back to minsize when no estimate is available.
                    x.gap_length = gapsizes[i] or minsize
                    x.component_type = 'U' if x.gap_length == 100 else 'N'
                    i += 1
                fw.write("{0}\n".format(x))
    finally:
        fw.close()

    reindex([outagpfile, "--inplace"])
def plot(args):
    """
    %prog plot input.bed seqid

    Plot the matchings between the reconstructed pseudomolecules and the maps.
    Two types of visualizations are available in one canvas:

    1. Parallel axes, and matching markers are shown in connecting lines;
    2. Scatter plot.
    """
    # NOTE: the docstring above is also the usage text given to OptionParser,
    # so its wording is left untouched.
    from jcvi.graphics.base import plt, savefig, normalize_axes, \
                set2, panel_labels
    from jcvi.graphics.chromosome import Chromosome, GeneticMap, \
                HorizontalChromosome

    p = OptionParser(plot.__doc__)
    add_allmaps_plot_options(p)
    opts, args, iopts = p.set_image_options(args, figsize="10x6")

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, seqid = args
    pf = inputbed.rsplit(".", 1)[0]
    bedfile = pf + ".lifted.bed"
    agpfile = pf + ".agp"
    weightsfile = opts.weightsfile
    links = opts.links

    # `function` maps a marker to its position on the map axis.
    function = get_function(opts.distance)
    cc = Map(bedfile, function)
    allseqids = cc.seqids
    mapnames = cc.mapnames
    weights = Weights(weightsfile, mapnames)

    assert seqid in allseqids, "{0} not in {1}".format(seqid, allseqids)

    s = Scaffold(seqid, cc)
    # Keep only linkage groups with at least `links` markers on this seqid.
    mlgs = [k for k, v in s.mlg_counts.items() if v >= links]
    # Size of each linkage group = largest marker value under `function`.
    mlgsizes = {}
    for mlg in mlgs:
        mm = cc.extract_mlg(mlg)
        mlgsize = max(function(x) for x in mm)
        mlgsizes[mlg] = mlgsize

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    # Left half: parallel axes; right half: scatter plots.
    ax1 = fig.add_axes([0, 0, .5, 1])
    ax2 = fig.add_axes([.5, 0, .5, 1])

    # Find the layout first
    ystart, ystop = .9, .1
    L = Layout(mlgsizes)
    coords = L.coords

    tip = .02
    marker_pos = {}
    # Palette
    # First keyed by map name, then re-keyed by linkage group using the part
    # of the group name before the first "-".
    colors = dict((mapname, set2[i]) for i, mapname in enumerate(mapnames))
    colors = dict((mlg, colors[mlg.split("-")[0]]) for mlg in mlgs)

    rhos = {}
    # Parallel coordinates
    for mlg, (x, y1, y2) in coords.items():
        mm = cc.extract_mlg(mlg)
        markers = [(m.accn, function(m)) for m in mm]  # exhaustive marker list
        # Only markers on this seqid enter the correlation.
        xy = [(m.pos, function(m)) for m in mm if m.seqid == seqid]
        mx, my = zip(*xy)
        rho = spearmanr(mx, my)
        rhos[mlg] = rho
        # A negative correlation draws the genetic map flipped.
        flip = rho < 0

        g = GeneticMap(ax1, x, y1, y2, markers, tip=tip, flip=flip)
        # Put the label on the outer side of the axis.
        extra = -3 * tip if x < .5 else 3 * tip
        ha = "right" if x < .5 else "left"
        mapname = mlg.split("-")[0]
        tlg = mlg.replace("_", ".")  # Latex does not like underscore char
        label = "{0} (w={1})".format(tlg, weights[mapname])
        ax1.text(x + extra, (y1 + y2) / 2, label, color=colors[mlg],
                 ha=ha, va="center", rotation=90)
        marker_pos.update(g.marker_pos)

    agp = AGP(agpfile)
    # From here on `agp` is a plain list of the lines belonging to seqid.
    agp = [x for x in agp if x.object == seqid]
    chrsize = max(x.object_end for x in agp)

    # Pseudomolecules in the center
    r = ystart - ystop
    ratio = r / chrsize
    # NOTE: this lambda reads `ystart` and `ratio` at call time; it is only
    # called before either name is rebound further down.
    f = lambda x: (ystart - ratio * x)
    patchstart = [f(x.object_beg) for x in agp if not x.is_gap]
    Chromosome(ax1, .5, ystart, ystop, width=2 * tip, patch=patchstart, lw=2)

    label = "{0} ({1})".format(seqid, human_size(chrsize, precision=0))
    ax1.text(.5, ystart + tip, label, ha="center")

    scatter_data = defaultdict(list)
    # Connecting lines
    for b in s.markers:
        marker_name = b.accn
        # Skip markers that were not drawn on any genetic map above.
        if marker_name not in marker_pos:
            continue

        cx = .5
        cy = f(b.pos)
        mx = coords[b.mlg][0]
        my = marker_pos[marker_name]

        extra = -tip if mx < cx else tip
        extra *= 1.25  # leave boundaries for aesthetic reasons
        cx += extra
        mx -= extra
        ax1.plot((cx, mx), (cy, my), "-", color=colors[b.mlg])
        scatter_data[b.mlg].append((b.pos, function(b)))

    # Scatter plot, same data as parallel coordinates
    xstart, xstop = sorted((ystart, ystop))
    # `f` is redefined for the horizontal axis and used immediately, while
    # `ratio` still holds the per-base scale computed above.
    f = lambda x: (xstart + ratio * x)
    pp = [x.object_beg for x in agp if not x.is_gap]
    patchstart = [f(x) for x in pp]
    HorizontalChromosome(ax2, xstart, xstop, ystop,
                         height=2 * tip, patch=patchstart, lw=2)

    gap = .03
    # `ratio` is rebound here: it now scales linkage-group size to axes
    # height, leaving room for one `gap` per group plus `tip`.
    ratio = (r - gap * len(mlgs) - tip) / sum(mlgsizes.values())

    tlgs = []
    for mlg, mlgsize in sorted(mlgsizes.items()):
        height = ratio * mlgsize
        # `ystart` is consumed downward: one panel height, then one gap.
        ystart -= height
        xx = .5 + xstart / 2
        width = r / 2
        color = colors[mlg]
        ax = fig.add_axes([xx, ystart, width, height])
        ypos = ystart + height / 2
        ystart -= gap
        sd = scatter_data[mlg]
        # `xx` is reused: from here it holds the scatter x-values.
        xx, yy = zip(*sd)
        ax.vlines(pp, 0, mlgsize, colors="beige")
        ax.plot(xx, yy, ".", color=color)
        rho = rhos[mlg]
        ax.text(.5, 1 - .4 * gap / height, r"$\rho$={0:.3f}".format(rho),
                ha="center", va="top", transform=ax.transAxes, color="gray")
        tlg = mlg.replace("_", ".")
        tlgs.append((tlg, ypos, color))
        ax.set_xlim(0, chrsize)
        ax.set_ylim(0, mlgsize)
        ax.set_xticks([])
        while height / len(ax.get_yticks()) < .03 and len(
                ax.get_yticks()) >= 2:
            ax.set_yticks(ax.get_yticks()[::2])  # Sparsify the ticks
        yticklabels = [int(x) for x in ax.get_yticks()]
        ax.set_yticklabels(yticklabels, family='Helvetica')
        # Match the flipped genetic map drawn in the parallel-axes panel.
        if rho < 0:
            ax.invert_yaxis()

    # Linkage-group labels along the center line; with more than four groups
    # the alignment alternates between left and right.
    for i, (tlg, ypos, color) in enumerate(tlgs):
        ha = "center"
        if len(tlgs) > 4:
            ha = "right" if i % 2 else "left"
        root.text(.5, ypos, tlg, color=color, rotation=90,
                  ha=ha, va="center")

    if opts.panels:
        labels = ((.04, .96, 'A'), (.48, .96, 'B'))
        panel_labels(root, labels)

    normalize_axes((ax1, ax2, root))
    image_name = seqid + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    plt.close(fig)