def fromagp(args):
    """
    %prog fromagp agpfile componentfasta objectfasta

    Generate chain file from AGP format. The components represent the old
    genome (target) and the objects represent new genome (query).
    """
    # NOTE: the docstring above is handed to OptionParser and therefore doubles
    # as the command-line usage text; keep it in sync with the arguments.
    from jcvi.formats.agp import AGP
    from jcvi.formats.sizes import Sizes

    p = OptionParser(fromagp.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    agpfile, componentfasta, objectfasta = args
    chainfile = agpfile.rsplit(".", 1)[0] + ".chain"

    # Load all inputs before creating the output so that a bad input does not
    # leave an empty .chain file behind.
    agp = AGP(agpfile)
    componentsizes = Sizes(componentfasta).mapping
    objectsizes = Sizes(objectfasta).mapping

    chain = "chain"
    score = 1000
    tStrand = "+"  # the target (component) side is always reported on '+'
    chain_id = 0
    with open(chainfile, "w") as fw:
        for a in agp:
            if a.is_gap:
                continue

            tName = a.component_id
            tSize = componentsizes[tName]
            # Starts are shifted down by one (1-based -> 0-based start,
            # presumably to match the chain format convention).
            tStart = a.component_beg - 1
            tEnd = a.component_end

            qName = a.object
            qSize = objectsizes[qName]
            qStrand = "-" if a.orientation == "-" else "+"
            qStart = a.object_beg
            qEnd = a.object_end
            if qStrand == "-":
                # Express the interval on the reverse-complemented object.
                qStart, qEnd = qSize - qEnd + 1, qSize - qStart + 1
            qStart -= 1

            chain_id += 1
            size = a.object_span
            headerline = "\t".join(str(x) for x in (
                chain, score, tName, tSize, tStrand, tStart, tEnd,
                qName, qSize, qStrand, qStart, qEnd, chain_id
            ))
            # One chain per component: header, a single ungapped block size,
            # then a blank separator line.
            print(headerline, file=fw)
            print(size, file=fw)
            print(file=fw)

    logging.debug("File written to `{0}`.".format(chainfile))
def dedup(args):
    """
    %prog dedup scaffolds.fasta

    Remove redundant contigs with CD-HIT. This is run prior to
    assembly.sspace.embed().
    """
    # NOTE: the docstring above is used verbatim as the CLI usage text.
    from jcvi.formats.fasta import gaps
    from jcvi.apps.cdhit import deduplicate, ids

    p = OptionParser(dedup.__doc__)
    p.set_align(pctid=GoodPct)
    p.set_mingap(default=10)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    scaffolds, = args
    mingap = opts.mingap
    splitfile, oagpfile, cagpfile = gaps(
        [scaffolds, "--split", "--mingap={0}".format(mingap)])

    dd = splitfile + ".cdhit"
    clstrfile = dd + ".clstr"
    idsfile = dd + ".ids"
    # Only re-run the expensive steps when their outputs are stale.
    if need_update(splitfile, clstrfile):
        deduplicate([splitfile, "--pctid={0}".format(opts.pctid)])
    if need_update(clstrfile, idsfile):
        ids([clstrfile])

    agp = AGP(cagpfile)
    # The last whitespace-separated field of each ids line is taken as the ID
    # of a component to keep.
    with open(idsfile) as fp:
        reps = set(x.split()[-1] for x in fp)

    pf = scaffolds.rsplit(".", 1)[0]
    dedupagp = pf + ".dedup.agp"
    ndropped = ndroppedbases = 0
    with open(dedupagp, "w") as fw:
        for a in agp:
            if not a.is_gap and a.component_id not in reps:
                span = a.component_span
                logging.debug("Drop component {0} ({1})".format(
                    a.component_id, span))
                ndropped += 1
                ndroppedbases += span
                continue
            print(a, file=fw)

    logging.debug("Dropped components: {0}, Dropped bases: {1}".format(
        ndropped, ndroppedbases))
    logging.debug("Deduplicated file written to `{0}`.".format(dedupagp))

    tidyagp = tidy([dedupagp, splitfile])
    dedupfasta = pf + ".dedup.fasta"
    build([tidyagp, dd, dedupfasta])

    return dedupfasta
def write_unplaced_agp(agpfile, scaffolds, unplaced_agp):
    """
    Emit AGP records for every sequence in `scaffolds` that is not referenced
    as a component in `agpfile`.

    Each such sequence is passed to order_to_agp() as its own object with a
    single-element tour and orientation "?". Output goes to `unplaced_agp`.
    """
    placed = {rec.component_id for rec in AGP(agpfile)}
    sizes = Sizes(scaffolds).mapping
    fwagp = must_open(unplaced_agp, "w")

    # Sorted so the output order is deterministic
    for name in sorted(sizes.keys()):
        if name not in placed:
            order_to_agp(name, [(name, "?")], sizes, fwagp)

    logging.debug("Write unplaced AGP to `{0}`.".format(unplaced_agp))
def neighbor(args):
    """
    %prog neighbor agpfile componentID

    Check overlaps of a particular component in agpfile.
    """
    parser = OptionParser(neighbor.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    agpfile, componentID = args
    fastadir = "fasta"

    # Show the matching AGP line(s) with two lines of context on each side
    sh("grep" + " --color -C2 {0} {1}".format(componentID, agpfile))

    agp = AGP(agpfile)
    aorder = agp.order
    if componentID not in aorder:
        message = "Record {0} not present in `{1}`.".format(componentID, agpfile)
        print(message, file=sys.stderr)
        return

    index, record = aorder[componentID]
    north, south = agp.getNorthSouthClone(index)
    dir_option = "--dir=" + fastadir

    # Upstream neighbour versus this component
    if not north.isCloneGap:
        overlap_args = [north.component_id, componentID, dir_option]
        if north.orientation == "-":
            overlap_args.append("--qreverse")
        overlap(overlap_args)

    # This component versus its downstream neighbour
    if not south.isCloneGap:
        overlap_args = [componentID, south.component_id, dir_option]
        if record.orientation == "-":
            overlap_args.append("--qreverse")
        overlap(overlap_args)
def plotall(xargs):
    """
    %prog plotall input.bed

    Plot the matchings between the reconstructed pseudomolecules and the maps.
    This command will plot each reconstructed object (non-singleton).
    """
    parser = OptionParser(plotall.__doc__)
    add_allmaps_plot_options(parser)
    opts, args, iopts = parser.set_image_options(xargs, figsize="10x6")

    if len(args) != 1:
        sys.exit(not parser.print_help())

    inputbed = args[0]
    prefix = inputbed.rsplit(".", 1)[0]
    agp = AGP(prefix + ".agp")

    # Keep only objects described by more than one AGP line
    seqids = []
    for ob, lines in agp.iter_object():
        if len(lines) > 1:
            seqids.append(ob)
    seqids.sort()

    # plot() receives the untouched option list plus the seqid to draw
    for seqid in seqids:
        plot(xargs + [seqid])
def neighbor(args):
    """
    %prog neighbor agpfile componentID

    Check overlaps of a particular component in agpfile.
    """
    # NOTE: the docstring above is used verbatim as the CLI usage text.
    p = OptionParser(neighbor.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    agpfile, componentID = args
    fastadir = "fasta"

    # Show the matching AGP line(s) with two lines of context on each side.
    cmd = "grep"
    cmd += " --color -C2 {0} {1}".format(componentID, agpfile)
    sh(cmd)

    agp = AGP(agpfile)
    aorder = agp.order
    if componentID not in aorder:
        print(
            "Record {0} not present in `{1}`.".format(componentID, agpfile),
            file=sys.stderr,
        )
        return

    # `i` is passed to getNorthSouthClone to find the neighbours; `c` is the
    # record for componentID itself.
    i, c = aorder[componentID]
    north, south = agp.getNorthSouthClone(i)

    # In each call the first ID is the one whose orientation decides
    # whether --qreverse is passed.
    if not north.isCloneGap:
        ar = [north.component_id, componentID, "--dir=" + fastadir]
        if north.orientation == "-":
            ar += ["--qreverse"]
        overlap(ar)

    if not south.isCloneGap:
        ar = [componentID, south.component_id, "--dir=" + fastadir]
        if c.orientation == "-":
            ar += ["--qreverse"]
        overlap(ar)
def summary(args):
    """
    %prog summary input.bed scaffolds.fasta

    Print out summary statistics per map, followed by consensus summary of
    scaffold anchoring based on multiple maps.
    """
    # NOTE: the docstring above is used verbatim as the CLI usage text.
    p = OptionParser(summary.__doc__)
    p.set_table(sep="|", align=True)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, scaffolds = args
    pf = inputbed.rsplit(".", 1)[0]
    mapbed = pf + ".bed"
    chr_agp = pf + ".chr.agp"
    sep = opts.sep
    align = opts.align
    cc = Map(mapbed)
    mapnames = cc.mapnames
    s = Sizes(scaffolds)
    total, l50, n50 = s.summary
    r = {}

    fw = must_open(opts.outfile, "w")

    # Part 1: one table column per input map.
    print("*** Summary for each individual map ***", file=fw)
    for mapname in mapnames:
        markers = [x for x in cc if x.mapname == mapname]
        ms = MapSummary(markers, l50, s)
        r["Linkage Groups", mapname] = ms.num_lgs
        ms.export_table(r, mapname, total)
    print(tabulate(r, sep=sep, align=align), file=fw)

    # Part 2: anchored vs. unplaced scaffolds according to the consensus AGP.
    r = {}
    agp = AGP(chr_agp)
    print("*** Summary for consensus map ***", file=fw)
    consensus_scaffolds = set(x.component_id for x in agp if not x.is_gap)
    unplaced_scaffolds = set(s.mapping.keys()) - consensus_scaffolds

    for mapname, sc in (("Anchored", consensus_scaffolds),
                        ("Unplaced", unplaced_scaffolds)):
        markers = [x for x in cc if x.seqid in sc]
        ms = MapSummary(markers, l50, s, scaffolds=sc)
        ms.export_table(r, mapname, total)
    print(tabulate(r, sep=sep, align=align), file=fw)

    # The handle is flushed rather than closed: must_open() may hand back a
    # shared stream such as stdout (TODO confirm), and flushing is safe either
    # way while still making the output visible to later readers.
    fw.flush()
def anneal(args):
    """
    %prog anneal agpfile contigs.fasta

    Merge adjacent overlapping contigs and make new AGP file.

    By default it will also anneal lines like these together (unless --nozipshreds):
    scaffold4 1 1608 1 W ca-bacs.5638.frag11.22000-23608 1 1608 -
    scaffold4 1609 1771 2 N 163 scaffold yes paired-ends
    scaffold4 1772 3771 3 W ca-bacs.5638.frag10.20000-22000 1 2000 -

    These are most likely shreds, which we look for based on names.
    """
    # NOTE: the docstring above is used verbatim as the CLI usage text.
    p = OptionParser(anneal.__doc__)
    p.set_align(pctid=GoodPct, hitlen=GoodOverlap)
    p.add_option("--hang", default=GoodOverhang, type="int",
                 help="Maximum overhang length [default: %default]")
    p.set_outdir(outdir="outdir")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    agpfile, contigs = args
    outdir = opts.outdir
    # Split the contigs into one FASTA per record, only on the first run.
    if not op.exists(outdir):
        mkdir(outdir)
        cmd = "faSplit byname {0} {1}/".format(contigs, outdir)
        sh(cmd)

    cutoff = Cutoff(opts.pctid, opts.hitlen, opts.hang)
    logging.debug(str(cutoff))

    agp = AGP(agpfile)
    blastfile = agpfile.replace(".agp", ".blast")
    if not op.exists(blastfile):
        populate_blastfile(blastfile, agp, outdir, opts)

    assert op.exists(blastfile)
    logging.debug("File `{0}` found. Start loading.".format(blastfile))
    blast = BlastSlow(blastfile).to_dict()

    annealedagp = "annealed.agp"
    annealedfasta = "annealed.fasta"

    # Edits are applied to a copy so that iteration over `agp` stays stable.
    newagp = deepcopy(agp)
    clrstore = {}
    for a, b, qreverse in agp.iter_paired_components():
        aid = a.component_id
        bid = b.component_id

        pair = (aid, bid)
        if pair in blast:
            bl = blast[pair]
        else:
            # No precomputed hit: align this pair on demand.
            oopts = get_overlap_opts(aid, bid, qreverse, outdir, opts)
            o = overlap(oopts)
            if not o:
                continue
            bl = o.blastline

        o = Overlap(bl, a.component_span, b.component_span,
                    cutoff, qreverse=qreverse)

        # One CLR object per component, shared across all pairs it is part of.
        if aid not in clrstore:
            clrstore[aid] = CLR.from_agpline(a)
        if bid not in clrstore:
            clrstore[bid] = CLR.from_agpline(b)

        aclr, bclr = clrstore[aid], clrstore[bid]

        o.print_graphic()
        if o.anneal(aclr, bclr):
            newagp.delete_between(aid, bid, verbose=True)

        if o.otype == 2:  # b ~ a
            o = o.swapped
            o.print_graphic()
            if o.anneal(bclr, aclr):
                newagp.switch_between(bid, aid, verbose=True)
                newagp.delete_between(bid, aid, verbose=True)

    logging.debug("A total of {0} components with modified CLR.".format(
        len(clrstore)))

    # Components whose CLR is no longer valid are turned into gaps.
    for cid, c in clrstore.items():
        if c.is_valid:
            continue
        print("Remove {0}".format(c), file=sys.stderr)
        newagp.convert_to_gap(cid, verbose=True)

    # Update all ranges that have a modified CLR
    for a in newagp:
        if a.is_gap:
            continue
        aid = a.component_id
        if aid in clrstore:
            c = clrstore[aid]
            a.component_beg = c.start
            a.component_end = c.end

    newagp.print_to_file(annealedagp)
    tidyagp = tidy([annealedagp, contigs])

    build([tidyagp, contigs, annealedfasta])
    return annealedfasta
def annotate(args):
    """
    %prog annotate agpfile gaps.linkage.bed assembly.fasta

    Annotate AGP file with linkage info of `paired-end` or `map`.
    File `gaps.linkage.bed` is generated by assembly.gaps.estimate().
    """
    # NOTE: the docstring above is used verbatim as the CLI usage text.
    from jcvi.formats.agp import AGP, bed, tidy

    p = OptionParser(annotate.__doc__)
    # type="int" is required: without it a user-supplied value arrives as a
    # string and the `cs < minsize` comparison below is int-vs-str.
    p.add_option("--minsize", default=200, type="int",
                 help="Smallest component size [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    agpfile, linkagebed, assemblyfasta = args
    linkagebed = Bed(linkagebed)
    # Gaps whose BED score is 0 are recorded by (seqid, start, end).
    spannedgaps = set()
    for b in linkagebed:
        score = int(b.score)
        if score == 0:
            spannedgaps.add((b.accn, b.start, b.end))

    agp = AGP(agpfile)
    newagpfile = agpfile.rsplit(".", 1)[0] + ".linkage.agp"
    contig_id = 0
    minsize = opts.minsize
    with open(newagpfile, "w") as newagp:
        for a in agp:
            if not a.is_gap:
                cs = a.component_span
                if cs < minsize:
                    # Too small to keep: turn the component into a gap and
                    # fall through to the gap annotation below.
                    a.is_gap = True
                    a.component_type = "N"
                    a.gap_length = cs
                    a.gap_type = "scaffold"
                    a.linkage = "yes"
                    a.linkage_evidence = []
                else:
                    # Regular component: rename and rebase onto a new contig.
                    contig_id += 1
                    a.component_id = "contig{0:04d}".format(contig_id)
                    a.component_beg = 1
                    a.component_end = cs
                    a.component_type = "W"

                    print(a, file=newagp)
                    continue

            gapinfo = (a.object, a.object_beg, a.object_end)
            gaplen = a.gap_length

            # A 100 bp gap that is not in `spannedgaps` is marked as
            # unknown-size ("U") with "map" evidence; everything else is
            # tagged "paired-ends".
            if gaplen == 100 and gapinfo not in spannedgaps:
                a.component_type = "U"
                tag = "map"
            else:
                tag = "paired-ends"

            a.linkage_evidence.append(tag)
            print(a, file=newagp)

    logging.debug("Annotated AGP written to `{0}`.".format(newagpfile))

    contigbed = assemblyfasta.rsplit(".", 1)[0] + ".contigs.bed"
    bedfile = bed([newagpfile, "--nogaps", "--outfile=" + contigbed])

    contigfasta = fastaFromBed(bedfile, assemblyfasta, name=True, stranded=True)

    tidy([newagpfile, contigfasta])
def embed(args):
    """
    %prog embed evidencefile scaffolds.fasta contigs.fasta

    Use SSPACE evidencefile to scaffold contigs into existing scaffold
    structure, as in `scaffolds.fasta`. Contigs.fasta were used by SSPACE
    directly to scaffold.

    Rules:
    1. Only update existing structure by embedding contigs small enough to fit.
    2. Promote singleton contigs only if they are big (>= min_length).
    """
    # NOTE: the docstring above is used verbatim as the CLI usage text.
    p = OptionParser(embed.__doc__)
    p.set_mingap(default=10)
    p.add_option("--min_length", default=200, type="int",
                 help="Minimum length to consider [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    evidencefile, scaffolds, contigs = args
    min_length = opts.min_length

    splitfasta, oagp, cagp = gaps(
        [scaffolds, "--split", "--mingap={0}".format(opts.mingap)])

    agp = AGP(cagp)
    ref_graph = agp.graph

    ef = EvidenceFile(evidencefile, contigs)
    sizes = ef.sz
    patch_graph = ef.graph

    logging.debug("Reference graph: {0}".format(ref_graph))
    logging.debug("Patch graph: {0}".format(patch_graph))

    # Edits are applied to a copy so that iteration over `agp` stays stable.
    newagp = deepcopy(agp)

    seen = set()      # components visited or embedded so far
    deleted = set()   # components reported as deleted by earlier insertions
    for a in agp:
        if a.is_gap:
            continue

        name = a.component_id
        object_id = a.object
        if name in deleted:
            print("* Skip {0}, already embedded".format(name),
                  file=sys.stderr)
            continue

        seen.add(name)

        target_name, tag = get_target(ref_graph, name)
        path = patch_graph.get_path(name, target_name, tag=tag)
        path_size = sum(sizes[x.v] for x, t in path) if path else None
        status = NO_UPDATE

        # Heuristic, the patch must not be too long
        if path and path_size > min_length and len(path) > 3:
            path = None

        if not path:
            print(name, target_name, path, path_size, status, file=sys.stderr)
            continue

        # Reject a patch that would revisit an already-placed component.
        backward = False
        for x, t in path:
            if x.v in seen:
                print("* Does not allow backward"
                      " patch on {0}".format(x.v), file=sys.stderr)
                backward = True
                break

        if backward:
            continue

        # Build the path plus the ends
        vv = patch_graph.get_node(name)
        path.appendleft((vv, tag))
        if tag == ">":
            path.reverse()
            status = INSERT_BEFORE
        elif target_name is None:
            status = INSERT_AFTER
        else:
            target = patch_graph.get_node(target_name)
            path.append((target, tag))
            status = INSERT_BETWEEN

        print(name, target_name, path, path_size, status, file=sys.stderr)

        # Trim the ends off from the constructed AGPLines
        lines = path_to_agp(patch_graph, path, object_id, sizes, status)
        if status == INSERT_BEFORE:
            lines = lines[:-1]
            td = newagp.insert_lines(name, lines,
                                     delete=True, verbose=True)
        elif status == INSERT_AFTER:
            lines = lines[1:]
            td = newagp.insert_lines(name, lines, after=True,
                                     delete=True, verbose=True)
        else:
            lines = lines[1:-1]
            td = newagp.update_between(name, target_name, lines,
                                       delete=True, verbose=True)
        deleted |= td
        seen |= td

    # Recruit big singleton contigs that were never placed
    for ctg, size in sizes.items():
        if ctg in seen:
            continue
        if size < min_length:
            continue
        newagp.append(AGPLine.cline(ctg, ctg, sizes, "?"))

    # Write a new AGP file
    newagpfile = "embedded.agp"
    newagp.print_to_file(newagpfile, index=True)
    tidy([newagpfile, contigs])
def path(args):
    """
    %prog path input.bed scaffolds.fasta

    Construct golden path given a set of genetic maps. The respective weight for
    each map is given in file `weights.txt`. The map with the highest weight is
    considered the pivot map. The final output is an AGP file that contains
    ordered scaffolds.
    """
    # NOTE: the docstring above is used verbatim as the CLI usage text.
    oargs = args  # kept untouched for the "# COMMAND:" line in the AGP header
    p = OptionParser(path.__doc__)
    p.add_option("-w", "--weightsfile", default="weights.txt",
                 help="Use weights from file")
    p.add_option("--distance", default="rank", choices=distance_choices,
                 help="Distance function when building initial consensus")
    p.add_option("--linkage", default="double", choices=linkage_choices,
                 help="Linkage function when building initial consensus")
    p.add_option("--gapsize", default=100, type="int",
                 help="Insert gaps of size between scaffolds")
    p.add_option("--ngen", default=500, type="int",
                 help="Iterations in GA, more ~ slower")
    p.add_option("--npop", default=100, type="int",
                 help="Population size in GA, more ~ slower")
    p.add_option("--seqid", help="Only run partition with this seqid")
    p.add_option("--links", default=10, type="int",
                 help="Only plot matchings more than")
    p.add_option("--noplot", default=False, action="store_true",
                 help="Do not visualize the alignments")
    p.set_cpus(cpus=16)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, fastafile = args
    pf = inputbed.rsplit(".", 1)[0]
    bedfile = pf + ".bed"
    weightsfile = opts.weightsfile
    gapsize = opts.gapsize
    ngen = opts.ngen
    npop = opts.npop
    cpus = opts.cpus
    if sys.version_info[:2] < (2, 7):
        logging.debug("Python version: {0}. CPUs set to 1.".format(
            sys.version.splitlines()[0].strip()))
        cpus = 1

    function = get_function(opts.distance)
    cc = Map(bedfile, function)
    mapnames = cc.mapnames
    allseqids = cc.seqids
    weights = Weights(weightsfile, mapnames)
    pivot = weights.pivot
    ref = weights.ref
    linkage = opts.linkage
    oseqid = opts.seqid
    logging.debug("Linkage function: {0}-linkage".format(linkage))
    # Resolve the option name to the callable used for linkage.
    linkage = {
        "single": min,
        "double": double_linkage,
        "complete": max,
        "average": np.mean,
        "median": np.median,
    }[linkage]

    # Partition the linkage groups into consensus clusters
    C = Grouper()
    # Initialize the partitions
    for mlg in cc.mlgs:
        C.join(mlg)

    logging.debug("Partition LGs based on {0}".format(ref))
    for mapname in mapnames:
        if mapname == ref:
            continue
        # Compute co-occurrence between LG pairs
        G = defaultdict(int)
        for s in allseqids:
            s = Scaffold(s, cc)
            s.add_LG_pairs(G, (ref, mapname))
        # Convert edge list to adj list
        nodes = defaultdict(list)
        for (a, b), w in G.items():
            nodes[a].append((b, w))
        # Find the best ref LG every non-ref LG matches to
        for n, neighbors in nodes.items():
            if n.split("-")[0] == ref:
                continue
            neighbors = dict(neighbors)
            best_neighbor, best_value = best_no_ambiguous(neighbors, n)
            if best_neighbor is None:
                continue
            C.join(n, best_neighbor)

    partitions = defaultdict(list)
    # Partition the scaffolds and assign them to one consensus
    for s in allseqids:
        s = Scaffold(s, cc)
        seqid = s.seqid
        counts = {}
        for mlg, count in s.mlg_counts.items():
            consensus = C[mlg]
            mapname = mlg.split("-")[0]
            mw = weights[mapname]
            if consensus not in counts:
                counts[consensus] = 0
            counts[consensus] += count * mw
        best_consensus, best_value = best_no_ambiguous(counts, seqid)
        if best_consensus is None:
            continue
        partitions[best_consensus].append(seqid)

    # Perform OO within each partition
    agpfile = pf + ".chr.agp"
    tourfile = pf + ".tour"
    sizes = Sizes(fastafile).mapping
    fwagp = must_open(agpfile, "w")
    fwtour = must_open(tourfile, "w")
    solutions = []
    for lgs, scaffolds in sorted(partitions.items()):
        if oseqid and oseqid not in lgs:
            continue
        tag = "|".join(lgs)
        lgs_maps = set(x.split("-")[0] for x in lgs)
        # Partitions that do not involve the pivot map are skipped.
        if pivot not in lgs_maps:
            logging.debug("Skipping {0} ...".format(tag))
            continue
        logging.debug("Working on {0} ...".format(tag))
        s = ScaffoldOO(lgs, scaffolds, cc, pivot, weights, sizes,
                       function=function, linkage=linkage,
                       ngen=ngen, npop=npop, cpus=cpus)

        # Echo each tour to stderr as well as to the tour file.
        for fw in (sys.stderr, fwtour):
            print(">{0} ({1})".format(s.object, tag), file=fw)
            print(" ".join("".join(x) for x in s.tour), file=fw)
        solutions.append(s)
    fwtour.close()

    # meta-data about the run parameters
    command = "# COMMAND: python -m jcvi.assembly.allmaps path {0}".format(
        " ".join(oargs))
    comment = "Generated by ALLMAPS v{0} ({1})\n{2}".format(
        version, get_today(), command)
    AGP.print_header(fwagp, comment=comment)

    for s in sorted(solutions, key=lambda x: x.object):
        order_to_agp(s.object, s.tour, sizes, fwagp, gapsize=gapsize,
                     gaptype="map")
    fwagp.close()

    logging.debug("AGP file written to `{0}`.".format(agpfile))
    logging.debug("Tour file written to `{0}`.".format(tourfile))

    build([inputbed, fastafile])

    summaryfile = pf + ".summary.txt"
    summary([inputbed, fastafile, "--outfile={0}".format(summaryfile)])

    if not opts.noplot:
        plotall([inputbed, "--links={0}".format(opts.links)])
def plot(args):
    """
    %prog plot input.bed seqid

    Plot the matchings between the reconstructed pseudomolecules and the maps.
    Two types of visualizations are available in one canvas:

    1. Parallel axes, and matching markers are shown in connecting lines;
    2. Scatter plot.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    # NOTE(review): this function is order-sensitive. `ystart`, `ratio`, `f`,
    # `xx` and `mx` are rebound part-way through, and the lambdas read their
    # free variables when called. Do not reorder statements without re-checking.
    from jcvi.graphics.base import plt, savefig, normalize_axes, \
                set2, panel_labels
    from jcvi.graphics.chromosome import Chromosome, GeneticMap, \
                HorizontalChromosome

    p = OptionParser(plot.__doc__)
    add_allmaps_plot_options(p)
    opts, args, iopts = p.set_image_options(args, figsize="10x6")

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, seqid = args
    pf = inputbed.rsplit(".", 1)[0]
    bedfile = pf + ".lifted.bed"
    agpfile = pf + ".agp"
    weightsfile = opts.weightsfile
    links = opts.links

    function = get_function(opts.distance)
    cc = Map(bedfile, function)
    allseqids = cc.seqids
    mapnames = cc.mapnames
    weights = Weights(weightsfile, mapnames)
    assert seqid in allseqids, "{0} not in {1}".format(seqid, allseqids)

    s = Scaffold(seqid, cc)
    # Keep only linkage groups with at least `links` markers on this seqid
    mlgs = [k for k, v in s.mlg_counts.items() if v >= links]

    # Size of each linkage group = largest distance value among its markers
    mlgsizes = {}
    for mlg in mlgs:
        mm = cc.extract_mlg(mlg)
        mlgsize = max(function(x) for x in mm)
        mlgsizes[mlg] = mlgsize

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])       # full canvas, used for labels
    ax1 = fig.add_axes([0, 0, .5, 1])       # left half: parallel axes
    ax2 = fig.add_axes([.5, 0, .5, 1])      # right half: scatter plots

    # Find the layout first
    ystart, ystop = .9, .1
    L = Layout(mlgsizes)
    coords = L.coords

    tip = .02
    marker_pos = {}
    # Palette
    # First keyed by map name, then re-keyed by linkage group (same name reused)
    colors = dict((mapname, set2[i]) for i, mapname in enumerate(mapnames))
    colors = dict((mlg, colors[mlg.split("-")[0]]) for mlg in mlgs)

    rhos = {}
    # Parallel coordinates
    for mlg, (x, y1, y2) in coords.items():
        mm = cc.extract_mlg(mlg)
        markers = [(m.accn, function(m)) for m in mm]  # exhaustive marker list
        xy = [(m.pos, function(m)) for m in mm if m.seqid == seqid]
        # NOTE(review): zip(*xy) fails to unpack if xy is empty
        mx, my = zip(*xy)
        # presumably a scalar correlation (it is compared with 0 and formatted
        # with :.3f below) -- confirm against spearmanr's definition
        rho = spearmanr(mx, my)
        rhos[mlg] = rho
        flip = rho < 0

        g = GeneticMap(ax1, x, y1, y2, markers, tip=tip, flip=flip)
        # Put the label on the outer side of the map axis
        extra = -3 * tip if x < .5 else 3 * tip
        ha = "right" if x < .5 else "left"
        mapname = mlg.split("-")[0]
        tlg = mlg.replace("_", ".")  # Latex does not like underscore char
        label = "{0} (w={1})".format(tlg, weights[mapname])
        ax1.text(x + extra, (y1 + y2) / 2, label, color=colors[mlg],
                 ha=ha, va="center", rotation=90)
        marker_pos.update(g.marker_pos)

    agp = AGP(agpfile)
    # Restrict to the AGP lines of the requested object
    agp = [x for x in agp if x.object == seqid]
    chrsize = max(x.object_end for x in agp)

    # Pseudomolecules in the center
    r = ystart - ystop
    ratio = r / chrsize
    # Maps a base position to a y coordinate; reads ystart/ratio at call time,
    # and is only called before either of them is rebound further down.
    f = lambda x: (ystart - ratio * x)
    patchstart = [f(x.object_beg) for x in agp if not x.is_gap]
    Chromosome(ax1, .5, ystart, ystop, width=2 * tip, patch=patchstart, lw=2)

    label = "{0} ({1})".format(seqid, human_size(chrsize, precision=0))
    ax1.text(.5, ystart + tip, label, ha="center")

    scatter_data = defaultdict(list)
    # Connecting lines
    for b in s.markers:
        marker_name = b.accn
        if marker_name not in marker_pos:
            continue

        cx = .5
        cy = f(b.pos)
        mx = coords[b.mlg][0]
        my = marker_pos[marker_name]

        extra = -tip if mx < cx else tip
        extra *= 1.25  # leave boundaries for aesthetic reasons
        cx += extra
        mx -= extra
        ax1.plot((cx, mx), (cy, my), "-", color=colors[b.mlg])
        scatter_data[b.mlg].append((b.pos, function(b)))

    # Scatter plot, same data as parallel coordinates
    xstart, xstop = sorted((ystart, ystop))
    # `f` is rebound here: base position -> x coordinate for the right panel.
    # It is evaluated immediately below, before `ratio` is reassigned.
    f = lambda x: (xstart + ratio * x)
    pp = [x.object_beg for x in agp if not x.is_gap]
    patchstart = [f(x) for x in pp]
    HorizontalChromosome(ax2, xstart, xstop, ystop,
                         height=2 * tip, patch=patchstart, lw=2)

    gap = .03
    # `ratio` now means axes-height per map unit, shared by all linkage groups
    ratio = (r - gap * len(mlgs) - tip) / sum(mlgsizes.values())

    tlgs = []
    for mlg, mlgsize in sorted(mlgsizes.items()):
        height = ratio * mlgsize
        # `ystart` is consumed top-down: each panel takes `height` plus `gap`
        ystart -= height
        xx = .5 + xstart / 2
        width = r / 2
        color = colors[mlg]
        ax = fig.add_axes([xx, ystart, width, height])
        ypos = ystart + height / 2
        ystart -= gap
        sd = scatter_data[mlg]
        # `xx` is reused: from here on it holds the scatter x values
        # NOTE(review): zip(*sd) fails to unpack if no marker was collected
        xx, yy = zip(*sd)
        ax.vlines(pp, 0, mlgsize, colors="beige")
        ax.plot(xx, yy, ".", color=color)
        rho = rhos[mlg]
        ax.text(.5, 1 - .4 * gap / height, r"$\rho$={0:.3f}".format(rho),
                    ha="center", va="top", transform=ax.transAxes, color="gray")
        tlg = mlg.replace("_", ".")
        tlgs.append((tlg, ypos, color))
        ax.set_xlim(0, chrsize)
        ax.set_ylim(0, mlgsize)
        ax.set_xticks([])
        while height / len(ax.get_yticks()) < .03 and len(
                ax.get_yticks()) >= 2:
            ax.set_yticks(ax.get_yticks()[::2])  # Sparsify the ticks
        yticklabels = [int(x) for x in ax.get_yticks()]
        ax.set_yticklabels(yticklabels, family='Helvetica')
        # Same sign test that sets `flip` for the GeneticMap above
        if rho < 0:
            ax.invert_yaxis()

    # Linkage-group labels down the middle; alternate sides when crowded
    for i, (tlg, ypos, color) in enumerate(tlgs):
        ha = "center"
        if len(tlgs) > 4:
            ha = "right" if i % 2 else "left"
        root.text(.5, ypos, tlg, color=color, rotation=90,
                      ha=ha, va="center")

    if opts.panels:
        labels = ((.04, .96, 'A'), (.48, .96, 'B'))
        panel_labels(root, labels)

    normalize_axes((ax1, ax2, root))
    image_name = seqid + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    plt.close(fig)
def path(args):
    """
    %prog path input.bed scaffolds.fasta

    Construct golden path given a set of genetic maps. The respective weight for
    each map is given in file `weights.txt`. The map with the highest weight is
    considered the pivot map. The final output is an AGP file that contains
    ordered scaffolds.
    """
    # NOTE: the docstring above is used verbatim as the CLI usage text.
    oargs = args  # kept untouched for the "# COMMAND:" line in the AGP header
    p = OptionParser(path.__doc__)
    # Hidden options; when given they override the positional arguments below.
    p.add_option("-b", "--bedfile", help=SUPPRESS_HELP)
    p.add_option("-s", "--fastafile", help=SUPPRESS_HELP)
    p.add_option("-w", "--weightsfile", default="weights.txt",
                 help="Use weights from file")
    p.add_option("--distance", default="rank", choices=distance_choices,
                 help="Distance function when building initial consensus")
    p.add_option("--linkage", default="double", choices=linkage_choices,
                 help="Linkage function when building initial consensus")
    p.add_option("--gapsize", default=100, type="int",
                 help="Insert gaps of size between scaffolds")
    p.add_option("--ngen", default=500, type="int",
                 help="Iterations in GA, more ~ slower")
    p.add_option("--npop", default=100, type="int",
                 help="Population size in GA, more ~ slower")
    p.add_option("--seqid", help="Only run partition with this seqid")
    p.add_option("--links", default=10, type="int",
                 help="Only plot matchings more than")
    p.add_option("--noplot", default=False, action="store_true",
                 help="Do not visualize the alignments")
    p.set_cpus(cpus=16)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, fastafile = args
    inputbed = opts.bedfile or inputbed
    fastafile = opts.fastafile or fastafile

    pf = inputbed.rsplit(".", 1)[0]
    bedfile = pf + ".bed"
    weightsfile = opts.weightsfile
    gapsize = opts.gapsize
    ngen = opts.ngen
    npop = opts.npop
    cpus = opts.cpus
    if sys.version_info[:2] < (2, 7):
        logging.debug("Python version: {0}. CPUs set to 1.".format(
            sys.version.splitlines()[0].strip()))
        cpus = 1

    function = get_function(opts.distance)
    cc = Map(bedfile, function)
    mapnames = cc.mapnames
    allseqids = cc.seqids
    weights = Weights(weightsfile, mapnames)
    pivot = weights.pivot
    ref = weights.ref
    linkage = opts.linkage
    oseqid = opts.seqid
    logging.debug("Linkage function: {0}-linkage".format(linkage))
    # Resolve the option name to the callable used for linkage.
    linkage = {"single": min, "double": double_linkage, "complete": max,
               "average": np.mean, "median": np.median}[linkage]

    # Partition the linkage groups into consensus clusters
    C = Grouper()
    # Initialize the partitions
    for mlg in cc.mlgs:
        C.join(mlg)

    logging.debug("Partition LGs based on {0}".format(ref))
    for mapname in mapnames:
        if mapname == ref:
            continue
        # Compute co-occurrence between LG pairs
        G = defaultdict(int)
        for s in allseqids:
            s = Scaffold(s, cc)
            s.add_LG_pairs(G, (ref, mapname))
        # Convert edge list to adj list
        nodes = defaultdict(list)
        for (a, b), w in G.items():
            nodes[a].append((b, w))
        # Find the best ref LG every non-ref LG matches to
        for n, neighbors in nodes.items():
            if n.split("-")[0] == ref:
                continue
            neighbors = dict(neighbors)
            best_neighbor, best_value = best_no_ambiguous(neighbors, n)
            if best_neighbor is None:
                continue
            C.join(n, best_neighbor)

    partitions = defaultdict(list)
    # Partition the scaffolds and assign them to one consensus
    for s in allseqids:
        s = Scaffold(s, cc)
        seqid = s.seqid
        counts = {}
        for mlg, count in s.mlg_counts.items():
            consensus = C[mlg]
            mapname = mlg.split("-")[0]
            mw = weights[mapname]
            if consensus not in counts:
                counts[consensus] = 0
            counts[consensus] += count * mw
        best_consensus, best_value = best_no_ambiguous(counts, seqid)
        if best_consensus is None:
            continue
        partitions[best_consensus].append(seqid)

    # Perform OO within each partition
    agpfile = pf + ".chr.agp"
    tourfile = pf + ".tour"
    sizes = Sizes(fastafile).mapping
    fwagp = must_open(agpfile, "w")
    fwtour = must_open(tourfile, "w")
    solutions = []
    for lgs, scaffolds in sorted(partitions.items()):
        if oseqid and oseqid not in lgs:
            continue
        tag = "|".join(lgs)
        lgs_maps = set(x.split("-")[0] for x in lgs)
        # Partitions that do not involve the pivot map are skipped.
        if pivot not in lgs_maps:
            logging.debug("Skipping {0} ...".format(tag))
            continue
        logging.debug("Working on {0} ...".format(tag))
        s = ScaffoldOO(lgs, scaffolds, cc, pivot, weights, sizes,
                       function=function, linkage=linkage,
                       ngen=ngen, npop=npop, cpus=cpus)

        # Echo each tour to stderr as well as to the tour file.
        for fw in (sys.stderr, fwtour):
            print(">{0} ({1})".format(s.object, tag), file=fw)
            print(" ".join("".join(x) for x in s.tour), file=fw)
        solutions.append(s)
    fwtour.close()

    # meta-data about the run parameters
    command = "# COMMAND: python -m jcvi.assembly.allmaps path {0}".format(
        " ".join(oargs))
    comment = "Generated by ALLMAPS v{0} ({1})\n{2}".format(
        version, get_today(), command)
    AGP.print_header(fwagp, comment=comment)

    for s in sorted(solutions, key=lambda x: x.object):
        order_to_agp(s.object, s.tour, sizes, fwagp, gapsize=gapsize,
                     gaptype="map")
    fwagp.close()

    logging.debug("AGP file written to `{0}`.".format(agpfile))
    logging.debug("Tour file written to `{0}`.".format(tourfile))

    build([inputbed, fastafile])

    summaryfile = pf + ".summary.txt"
    summary([inputbed, fastafile, "--outfile={0}".format(summaryfile)])

    if not opts.noplot:
        plotall([inputbed, "--links={0}".format(opts.links)])
def estimategaps(args):
    """
    %prog estimategaps input.bed

    Estimate sizes of inter-scaffold gaps. The AGP file generated by path()
    command has unknown gap sizes with a generic number of Ns (often 100 Ns).
    The AGP file `input.chr.agp` will be modified in-place.
    """
    # NOTE: the docstring above is used verbatim as the CLI usage text.
    # NOTE(review): despite the wording above, this function reads
    # `<prefix>.chr.agp` and writes `<prefix>.estimategaps.agp`; only the
    # final reindex() call is run with --inplace, on the new file.
    p = OptionParser(estimategaps.__doc__)
    p.add_option("--minsize", default=100, type="int",
                 help="Minimum gap size")
    p.add_option("--maxsize", default=500000, type="int",
                 help="Maximum gap size")
    p.add_option("--links", default=10, type="int",
                 help="Only use linkage grounds with matchings more than")
    p.set_verbose(help="Print details for each gap calculation")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    inputbed, = args
    pf = inputbed.rsplit(".", 1)[0]
    agpfile = pf + ".chr.agp"
    bedfile = pf + ".lifted.bed"

    cc = Map(bedfile, scaffold_info=True)
    agp = AGP(agpfile)
    minsize, maxsize = opts.minsize, opts.maxsize
    links = opts.links
    verbose = opts.verbose

    outagpfile = pf + ".estimategaps.agp"
    fw = must_open(outagpfile, "w")
    try:
        for ob, components in agp.iter_object():
            components = list(components)
            s = Scaffold(ob, cc)
            mlg_counts = s.mlg_counts
            gaps = [x for x in components if x.is_gap]
            gapsizes = [None] * len(gaps)   # master estimate, one per gap
            for mlg, count in mlg_counts.items():
                if count < links:
                    continue
                g = GapEstimator(cc, agp, ob, mlg)
                g.compute_all_gaps(minsize=minsize, maxsize=maxsize,
                                   verbose=verbose)
                # Merge evidence from this mlg into master: take the first
                # value seen, then keep the smaller of any later truthy values.
                assert len(g.gapsizes) == len(gaps)
                for i, gs in enumerate(gapsizes):
                    gg = g.gapsizes[i]
                    if gs is None:
                        gapsizes[i] = gg
                    elif gg:
                        gapsizes[i] = min(gs, gg)

            print(gapsizes)

            # Modify AGP
            i = 0
            for x in components:
                if x.is_gap:
                    # Fall back to minsize when no estimate is available
                    x.gap_length = gapsizes[i] or minsize
                    x.component_type = 'U' if x.gap_length == 100 else 'N'
                    i += 1
                print(x, file=fw)
    finally:
        # Close even on error so reindex() never sees a half-open handle.
        fw.close()

    reindex([outagpfile, "--inplace"])