def main(args):
    """Parse the command line and run the BLAST filter on one BLAST file.

    `args` is the list of command-line tokens (without the program name).
    Exactly one positional argument, the BLAST file, is required; otherwise
    the help text is printed and the process exits with a non-zero status.
    """
    parser = OptionParser(__doc__)
    parser.set_beds()
    parser.set_stripnames()

    parser.add_option(
        "--tandems_only",
        dest="tandems_only",
        action="store_true",
        default=False,
        help="only calculate tandems, write .localdup file and exit.",
    )
    parser.add_option(
        "--tandem_Nmax",
        type="int",
        default=10,
        help="merge tandem genes within distance [default: %default]",
    )
    cscore_help = (
        "retain hits that have good bitscore. a value of 0.5 means "
        "keep all values that are 50% or greater of the best hit. "
        "higher is more stringent [default: %default]"
    )
    parser.add_option("--cscore", type="float", default=0.7, help=cscore_help)

    opts, positional = parser.parse_args(args)

    if len(positional) == 1:
        # The parser itself is handed on so the callee can resolve BED options.
        blastfilter_main(positional[0], parser, opts)
        return

    # print_help() returns None, so this exits with a truthy (failure) status.
    sys.exit(not parser.print_help())
def rebuild(args):
    """
    %prog rebuild blocksfile blastfile

    Rebuild anchors file from pre-built blocks file.
    """
    # NOTE: the docstring doubles as the CLI usage text, so details live in
    # comments. Outputs are written to the current directory: `pairs`,
    # `tracks` and, with --write_blast, `pairs.blast`.
    p = OptionParser(rebuild.__doc__)
    p.add_option("--header", default=False, action="store_true",
                 help="First line is header [default: %default]")
    p.add_option("--write_blast", default=False, action="store_true",
                 help="Get blast records of rebuilt anchors [default: %default]")
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blocksfile, blastfile = args
    bk = BlockFile(blocksfile, header=opts.header)

    # One tab-separated gene pair per line. `with` guarantees the handle is
    # closed even if iteration fails; write() works on Python 2 and 3 alike
    # (the old `print >> fw` statement does not).
    with open("pairs", "w") as fw:
        for a, b, _ in bk.iter_all_pairs():
            fw.write("\t".join((a, b)) + "\n")

    if opts.write_blast:
        # `pairs` must be fully written and closed before it is re-read here.
        AnchorFile("pairs").blast(blastfile, "pairs.blast")

    # One "gene<TAB>column" line per entry yielded by iter_gene_col().
    with open("tracks", "w") as fw:
        for g, col in bk.iter_gene_col():
            fw.write("\t".join(str(x) for x in (g, col)) + "\n")
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed
    from jcvi.formats.base import get_number

    p = OptionParser(bed.__doc__)
    p.add_option("--switch", default=False, action="store_true",
                 help="Switch reference and aligned map elements")
    p.add_option("--scale", type="float",
                 help="Scale the aligned map distance by factor")
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile, = args
    switch = opts.switch
    scale = opts.scale
    ac = AnchorFile(anchorsfile)

    # query accession -> list of anchored subject accessions
    pairs = defaultdict(list)
    for a, b, block_id in ac.iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        if q.accn not in pairs:
            continue

        for partner in pairs[q.accn]:
            _, s = sorder[partner]

            # Re-read both sides for every pair: with --switch the values are
            # swapped below, and swapping shared per-query variables would
            # corrupt the next partner of the same query.
            qseqid, qstart, qend, qaccn = q.seqid, q.start, q.end, q.accn
            sseqid, sstart, saccn = s.seqid, s.start, s.accn

            if switch:
                qseqid, sseqid = sseqid, qseqid
                qstart, sstart = sstart, qstart
                qend = s.end
                qaccn, saccn = saccn, qaccn

            if scale:  # None or 0 means "do not scale"
                sstart /= scale

            try:
                newsseqid = get_number(sseqid)
            except ValueError:
                # Call-style raise is valid on both Python 2 and Python 3.
                raise ValueError(
                    "`{0}` is on `{1}` with no number to extract".format(
                        saccn, sseqid))

            # BED start is 0-based, hence `qstart - 1`; the name column holds
            # "<numeric seqid>:<position>" of the aligned element.
            bedline = "\t".join(str(x) for x in (
                qseqid, qstart - 1, qend,
                "{0}:{1}".format(newsseqid, sstart)))
            bd.add(bedline)

    bd.print_to_file(filename=opts.outfile, sorted=True)
def depth(args):
    """
    %prog depth anchorfile --qbed qbedfile --sbed sbedfile

    Calculate the depths in the two genomes in comparison, given in --qbed and
    --sbed. The synteny blocks will be layered on the genomes, and the
    multiplicity will be summarized to stderr.
    """
    from jcvi.utils.range import range_depth

    p = OptionParser(depth.__doc__)
    p.add_option("--depthfile",
                 help="Generate file with gene and depth [default: %default]")
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    depthfile = opts.depthfile
    ac = AnchorFile(anchorfile)

    # Each block contributes one (first, last) gene-index range per genome.
    qranges = []
    sranges = []
    for ib in ac.blocks:
        q, s, t = zip(*ib)
        q = [qorder[x] for x in q]
        s = [sorder[x] for x in s]
        qrange = (min(q)[0], max(q)[0])
        srange = (min(s)[0], max(s)[0])
        qranges.append(qrange)
        sranges.append(srange)
        if is_self:
            # Self comparison: both axes are the same genome, so the subject
            # range is layered onto the query genome as well.
            qranges.append(srange)

    qgenome = op.basename(qbed.filename).split(".")[0]
    sgenome = op.basename(sbed.filename).split(".")[0]

    fw = None
    try:
        sys.stderr.write("Genome {0} depths:\n".format(qgenome))
        ds, details = range_depth(qranges, len(qbed))
        if depthfile:
            fw = open(depthfile, "w")
            write_details(fw, details, qbed)

        if not is_self:
            sys.stderr.write("Genome {0} depths:\n".format(sgenome))
            ds, details = range_depth(sranges, len(sbed))
            if fw is not None:
                write_details(fw, details, sbed)
    finally:
        # Previously the handle stayed open on the self-comparison path and
        # whenever an exception was raised.
        if fw is not None:
            fw.close()

    if depthfile:
        logging.debug("Depth written to `{0}`.".format(depthfile))
def layout(args):
    """
    %prog layout query.subject.simple query.seqids subject.seqids

    Compute optimal seqids order in a second genome, based on seqids on one
    genome, given the pairwise blocks in .simple format.
    """
    from jcvi.algorithms.ec import GA_setup, GA_run

    p = OptionParser(layout.__doc__)
    p.set_beds()
    p.set_cpus(cpus=32)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    simplefile, qseqids, sseqids = args
    qbed, sbed, qorder, sorder, is_self = check_beds(simplefile, p, opts)

    # Both seqid arguments are comma-separated lists; map name -> position.
    qseqids = qseqids.strip().split(",")
    sseqids = sseqids.strip().split(",")
    qseqids_ii = dict((s, i) for i, s in enumerate(qseqids))
    sseqids_ii = dict((s, i) for i, s in enumerate(sseqids))

    blocks = SimpleFile(simplefile).blocks
    # (subject seqid position, query seqid) -> summed block score
    scores = defaultdict(int)
    for a, b, c, d, score, orientation, hl in blocks:
        qi, q = qorder[a]
        si, s = sorder[c]
        qseqid, sseqid = q.seqid, s.seqid
        # Dict membership is equivalent to scanning the list, but O(1).
        if sseqid not in sseqids_ii:
            continue
        scores[sseqids_ii[sseqid], qseqid] += score

    # Sorting the keys orders the data by subject position first.
    data = []
    for (a, b), score in sorted(scores.items()):
        if b not in qseqids_ii:
            continue
        data.append((qseqids_ii[b], score))

    # A real list is needed on Python 3, where range() is a lazy sequence.
    tour = list(range(len(qseqids)))
    toolbox = GA_setup(tour)
    toolbox.register("evaluate", colinear_evaluate_weights, data=data)
    tour, fitness = GA_run(toolbox, ngen=100, npop=100, cpus=opts.cpus)
    tour = [qseqids[x] for x in tour]

    # Single-argument print() behaves identically on Python 2 and 3.
    print(",".join(tour))
def mtdotplots(args):
    """
    %prog mtdotplots Mt3.5 Mt4.0 medicago.medicago.lifted.1x1.anchors

    Plot Mt3.5 and Mt4.0 side-by-side. This is essentially combined from two
    graphics.dotplot() function calls as panel A and B.
    """
    from jcvi.graphics.dotplot import check_beds, dotplot

    parser = OptionParser(mtdotplots.__doc__)
    parser.set_beds()
    opts, args, iopts = parser.set_image_options(args, figsize="16x8", dpi=90)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    dir_a, dir_b, anchors_name = args

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    # Axes are created in the same order as before: both half-canvases
    # first, then the two plotting areas inside them.
    left_canvas = fig.add_axes([0, 0, .5, 1])
    right_canvas = fig.add_axes([.5, 0, .5, 1])
    left_plot = fig.add_axes([.05, .1, .4, .8])
    right_plot = fig.add_axes([.55, .1, .4, .8])

    panels = (
        (dir_a, left_canvas, left_plot, "Mt3.5_Mt3.5"),
        (dir_b, right_canvas, right_plot, "Mt4.0_Mt4.0"),
    )
    for idx, (folder, canvas, plot_ax, names) in enumerate(panels):
        if idx:
            # Forget the BED files resolved for the previous panel so that
            # check_beds() resolves them afresh for this anchors file.
            opts.qbed = opts.sbed = None
        anchorfile = op.join(folder, anchors_name)
        qbed, sbed, qorder, sorder, is_self = \
            check_beds(anchorfile, parser, opts)
        dotplot(anchorfile, qbed, sbed, fig, canvas, plot_ax,
                is_self=is_self, genomenames=names)

    # Panel letters in the top-left corner of each half.
    root.text(.03, .95, "A", ha="center", va="center", size=36)
    root.text(.53, .95, "B", ha="center", va="center", size=36)
    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    image_name = "mtdotplots" + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def movieframe(args):
    """
    %prog movieframe tour test.clm contigs.ref.anchors

    Draw heatmap and synteny in the same plot.
    """
    parser = OptionParser(movieframe.__doc__)
    parser.add_option("--label", help="Figure title")
    parser.set_beds()
    parser.set_outfile(outfile=None)
    opts, args, iopts = parser.set_image_options(
        args, figsize="16x8", style="white", cmap="coolwarm",
        format="png", dpi=120)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    tour_arg, clmfile, anchorsfile = args
    contigs = tour_arg.split(",")

    # Output image: --outfile when given, otherwise a default name.
    image_name = opts.outfile
    if not image_name:
        image_name = "movieframe." + iopts.format

    # Title: --label when given, otherwise the image name minus extension.
    label = opts.label
    if not label:
        label = op.basename(image_name).rsplit(".", 1)[0]

    clm = CLMFile(clmfile)
    totalbins, bins, breaks = make_bins(contigs, clm.tig_to_size)
    matrix = read_clm(clm, totalbins, bins)

    fig = plt.figure(1, (iopts.w, iopts.h))
    canvas = fig.add_axes([0, 0, 1, 1])               # whole canvas
    heatmap_ax = fig.add_axes([.05, .1, .4, .8])      # heatmap
    dotplot_ax = fig.add_axes([.55, .1, .4, .8])      # dot plot
    dotplot_canvas = fig.add_axes([.5, 0, .5, 1])     # dot plot canvas

    # Left half: heatmap
    plot_heatmap(heatmap_ax, matrix, breaks, iopts)

    # Right half: synteny dot plot; the BED order is kept as-is (unsorted)
    qbed, sbed, qorder, sorder, is_self = check_beds(
        anchorsfile, parser, opts, sorted=False)
    dotplot(anchorsfile, qbed, sbed, fig, dotplot_canvas, dotplot_ax,
            sep=False, title="")

    # Two centered caption lines at the top of the figure.
    canvas.text(.5, .98, clm.name, color="g", ha="center", va="center")
    canvas.text(.5, .95, label, color="darkslategray",
                ha="center", va="center")
    normalize_axes(canvas)
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def offdiag(args):
    """
    %prog offdiag diploid.napus.1x1.lifted.anchors

    Find gene pairs that are off diagonal. "Off diagonal" are the pairs that are
    not on the orthologous chromosomes. For example, napus chrA01 and brapa A01.
    """
    p = OptionParser(offdiag.__doc__)
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)

    # Block ids are prefixed with the first two dot-separated name parts.
    pf = "-".join(anchorsfile.split(".")[:2])
    header = "Block-id|Napus|Diploid|Napus-chr|Diploid-chr|RBH?".split("|")
    # Single-argument print() behaves identically on Python 2 and 3.
    print("\t".join(header))

    i = -1
    # `with` closes the anchors file, which was previously left open.
    with open(anchorsfile) as fp:
        for row in fp:
            if row[0] == '#':
                # A '#' line starts a new block.
                i += 1
                continue

            q, s, score = row.split()
            # A score ending in 'L' is reported as a non-RBH pair.
            rbh = 'no' if score[-1] == 'L' else 'yes'
            qi, qq = qorder[q]
            si, ss = sorder[s]
            oqseqid = qseqid = qq.seqid
            osseqid = sseqid = ss.seqid

            # Normalize both chromosome names to a comparable 3-char form.
            sseqid = sseqid.split("_")[0][-3:]
            if qseqid[0] == 'A':
                qseqid = qseqid[-3:]        # A09 => A09
            elif qseqid[0] == 'C':
                qseqid = 'C0' + qseqid[-1]  # C9 => C09
            else:
                continue

            # On-diagonal pairs and subject ids ending in "nn" are skipped.
            if qseqid == sseqid or sseqid[-2:] == 'nn':
                continue

            block_id = pf + "-block-{0}".format(i)
            print("\t".join((block_id, q, s, oqseqid, osseqid, rbh)))
def collinear(args):
    """
    %prog collinear a.b.anchors

    Reduce synteny blocks to strictly collinear, use dynamic programming in a
    procedure similar to DAGchainer.
    """
    p = OptionParser(collinear.__doc__)
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    af = AnchorFile(anchorfile)
    newanchorfile = anchorfile.rsplit(".", 1)[0] + ".collinear.anchors"

    # `with` closes the output even when a gene lookup raises; write() is
    # used instead of the Python 2-only `print >> fw` statement.
    with open(newanchorfile, "w") as fw:
        for block in af.blocks:
            fw.write("#" * 3 + "\n")

            # Translate accessions to BED indices for the chaining step.
            iblock = []
            for q, s, score in block:
                qi, q = qorder[q]
                si, s = sorder[s]
                # Portable stand-in for the Python 2-only long(): drops an
                # optional trailing "L" marker before converting.
                score = int(str(score).rstrip("lL"))
                iblock.append([qi, si, score])

            # Map the surviving index pairs back to accessions.
            for q, s, score in get_collinear(iblock):
                q = qbed[q].accn
                s = sbed[s].accn
                fw.write("\t".join((q, s, str(score))) + "\n")
def collinear(args):
    """
    %prog collinear a.b.anchors

    Reduce synteny blocks to strictly collinear, use dynamic programming in a
    procedure similar to DAGchainer.
    """
    p = OptionParser(collinear.__doc__)
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    af = AnchorFile(anchorfile)
    newanchorfile = anchorfile.rsplit(".", 1)[0] + ".collinear.anchors"

    def as_int(text):
        # long() does not exist on Python 3; this accepts the same digits
        # and tolerates the optional trailing "L" marker.
        return int(str(text).rstrip("lL"))

    # The context manager closes the output even if a lookup below raises.
    with open(newanchorfile, "w") as fw:
        for block in af.blocks:
            print("#" * 3, file=fw)

            # Accessions -> BED indices, which is what get_collinear() takes.
            iblock = [
                [qorder[q][0], sorder[s][0], as_int(score)]
                for q, s, score in block
            ]

            # Surviving index pairs are written back out as accessions.
            for qi, si, score in get_collinear(iblock):
                print("\t".join((qbed[qi].accn, sbed[si].accn, str(score))),
                      file=fw)
def mergechrom(args):
    """
    %prog mergechrom a.b.anchors

    merge synteny blocks on the same chromosome
    """
    p = OptionParser(mergechrom.__doc__)
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    af = AnchorFile(anchorfile)
    newanchorfile = anchorfile.rsplit(".", 1)[0] + ".mergechrom.anchors"

    # accession -> chromosome (seqid) for both genomes
    qchrom_dic = dict((b.accn, b.seqid) for b in qbed)
    schrom_dic = dict((b.accn, b.seqid) for b in sbed)

    # Group block indices by the chromosome pair of each block's first anchor.
    # This runs before the output is opened, so a failed lookup no longer
    # leaves a half-created file with a dangling handle.
    block_dic = dict()
    blocks = af.blocks
    for i, block in enumerate(blocks):
        q, s, score = block[0]
        k = "%s_%s" % (qchrom_dic[q], schrom_dic[s])
        block_dic.setdefault(k, []).append(i)

    with open(newanchorfile, "w") as fw:
        for idxs in block_dic.values():
            # One "###" separator per merged chromosome pair.
            print("#" * 3, file=fw)
            for i in idxs:
                for q, s, score in blocks[i]:
                    print("\t".join((q, s, str(score))), file=fw)

    print("%d blocks merged to %d" % (len(blocks), len(block_dic)))
def mergechrom(args):
    """
    %prog mergechrom a.b.anchors

    merge synteny blocks on the same chromosome
    """
    p = OptionParser(mergechrom.__doc__)
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    af = AnchorFile(anchorfile)
    newanchorfile = anchorfile.rsplit(".", 1)[0] + ".mergechrom.anchors"

    # Lookup tables from gene accession to the chromosome it sits on.
    qchrom_dic = dict((b.accn, b.seqid) for b in qbed)
    schrom_dic = dict((b.accn, b.seqid) for b in sbed)

    blocks = af.blocks

    # "<qchrom>_<schrom>" -> indices of blocks whose first anchor maps there.
    block_dic = {}
    for i, block in enumerate(blocks):
        first_q, first_s, _ = block[0]
        key = "%s_%s" % (qchrom_dic[first_q], schrom_dic[first_s])
        if key in block_dic:
            block_dic[key].append(i)
        else:
            block_dic[key] = [i]

    # Opening the output only after grouping succeeded, and through a
    # context manager, ensures the handle is always closed.
    with open(newanchorfile, "w") as fw:
        for key in block_dic:
            print("#" * 3, file=fw)
            for i in block_dic[key]:
                for q, s, score in blocks[i]:
                    print("\t".join((q, s, str(score))), file=fw)

    print("%d blocks merged to %d" % (len(blocks), len(block_dic)))
def main(args):
    """Entry point: read the options, then filter the given BLAST file.

    Expects exactly one positional argument (the BLAST file). With any other
    count the usage text is shown and the process exits unsuccessfully.
    """
    p = OptionParser(__doc__)
    p.set_beds()
    p.set_stripnames()

    # Registered in a fixed order so the generated help text keeps its layout.
    option_specs = (
        ("--tandems_only", dict(
            dest="tandems_only", action="store_true", default=False,
            help="only calculate tandems, write .localdup file and exit.")),
        ("--tandem_Nmax", dict(
            type="int", default=10,
            help="merge tandem genes within distance [default: %default]")),
        ("--cscore", dict(
            type="float", default=.7,
            help="retain hits that have good bitscore. a value of 0.5 means "
                 "keep all values that are 50% or greater of the best hit. "
                 "higher is more stringent [default: %default]")),
    )
    for flag, spec in option_specs:
        p.add_option(flag, **spec)

    opts, remaining = p.parse_args(args)

    if len(remaining) != 1:
        # print_help() yields None, so the exit status is truthy (failure).
        sys.exit(not p.print_help())

    blastfile = remaining[0]
    blastfilter_main(blastfile, p, opts)
def rebuild(args):
    """
    %prog rebuild blocksfile blastfile

    Rebuild anchors file from pre-built blocks file.
    """
    # The docstring is reused as CLI usage text, so notes go in comments.
    # Files produced in the working directory: `pairs`, `tracks` and, when
    # --write_blast is set, `pairs.blast`.
    p = OptionParser(rebuild.__doc__)
    p.add_option("--header", default=False, action="store_true",
                 help="First line is header [default: %default]")
    p.add_option(
        "--write_blast", default=False, action="store_true",
        help="Get blast records of rebuilt anchors [default: %default]")
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blocksfile, blastfile = args
    bk = BlockFile(blocksfile, header=opts.header)

    # writelines() with explicit newlines replaces the Python 2-only
    # `print >> fw` statement; the context manager closes the file on error.
    with open("pairs", "w") as fw:
        fw.writelines(
            "\t".join((a, b)) + "\n" for a, b, _ in bk.iter_all_pairs())

    if opts.write_blast:
        # Reads back the `pairs` file that was just closed above.
        AnchorFile("pairs").blast(blastfile, "pairs.blast")

    with open("tracks", "w") as fw:
        fw.writelines(
            "\t".join((str(g), str(col))) + "\n"
            for g, col in bk.iter_gene_col())
def cluster(args):
    """
    %prog cluster blastfile anchorfile --qbed qbedfile --sbed sbedfile

    Cluster the segments and form PAD. This is the method described in Tang et
    al. (2010) PNAS paper. The anchorfile defines a list of synteny blocks,
    based on which the genome on one or both axis can be chopped up into pieces
    and clustered.
    """
    from jcvi.utils.range import Range

    p = OptionParser(cluster.__doc__)
    p.set_beds()
    p.add_option("--minsize", default=10, type="int",
                 help="Only segment using blocks >= size [default: %default]")
    p.add_option("--path", default="~/scratch/bin",
                 help="Path to the CLUSTER 3.0 binary [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, anchorfile = args
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, p, opts)

    minsize = opts.minsize
    ac = AnchorFile(anchorfile)
    qranges, sranges = [], []
    qextra = [x[1:] for x in qbed.get_breaks()]
    sextra = [x[1:] for x in sbed.get_breaks()]

    # Each qualifying block becomes one Range per genome; the two ranges
    # share a running 1-based id (no longer named `id`, which shadowed the
    # builtin).
    range_id = 0
    for block in ac.iter_blocks(minsize=minsize):
        # zip() is lazy on Python 3, so materialize it before slicing.
        q, s = list(zip(*block))[:2]
        q = [qorder[x][0] for x in q]
        s = [sorder[x][0] for x in s]
        minq, maxq = min(q), max(q)
        mins, maxs = min(s), max(s)
        range_id += 1

        qr = Range("0", minq, maxq, maxq - minq, range_id)
        sr = Range("0", mins, maxs, maxs - mins, range_id)
        qranges.append(qr)
        sranges.append(sr)

    qpads = list(get_segments(qranges, qextra))
    spads = list(get_segments(sranges, sextra))

    suffix = ".pad.bed"
    # NOTE(review): this reads the --qbed/--sbed option strings directly, so
    # it presumably requires both options to be given explicitly - confirm.
    qpf = opts.qbed.split(".")[0]
    spf = opts.sbed.split(".")[0]
    qpadfile = qpf + suffix
    spadfile = spf + suffix
    qnpads, qpadnames = write_PAD_bed(qpadfile, qpf, qpads, qbed)
    snpads, spadnames = write_PAD_bed(spadfile, spf, spads, sbed)

    qpadbed, spadbed = Bed(qpadfile), Bed(spadfile)

    logmp = make_arrays(blastfile, qpadbed, spadbed, qpadnames, spadnames)
    m, n = logmp.shape

    # Tab-separated matrix: header row of subject PAD names, then one row per
    # query PAD with values formatted to one decimal.
    matrixfile = ".".join((qpf, spf, "logmp.txt"))
    with open(matrixfile, "w") as fw:
        header = ["o"] + spadnames
        print("\t".join(header), file=fw)
        for i in range(m):  # range works on Python 2 and 3, xrange does not
            row = [qpadnames[i]] + \
                ["{0:.1f}".format(x) for x in logmp[i, :]]
            print("\t".join(row), file=fw)

    # Run CLUSTER 3.0 (Pearson correlation, average linkage)
    cmd = op.join(opts.path, "cluster")
    cmd += " -g 2 -e 2 -m a -f {0}".format(matrixfile)
    pf = matrixfile.rsplit(".", 1)[0]
    cdtfile = pf + ".cdt"
    if need_update(matrixfile, cdtfile):
        sh(cmd)
def pad(args):
    """
    %prog pad blastfile cdtfile --qbed q.pad.bed --sbed s.pad.bed

    Test and reconstruct candidate PADs.
    """
    from jcvi.formats.cdt import CDT

    p = OptionParser(pad.__doc__)
    p.set_beds()
    p.add_option("--cutoff", default=.3, type="float",
                 help="The clustering cutoff to call similar [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    cutoff = opts.cutoff
    blastfile, cdtfile = args
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, p, opts)

    cdt = CDT(cdtfile)
    qparts = list(cdt.iter_partitions(cutoff=cutoff))
    sparts = list(cdt.iter_partitions(cutoff=cutoff, gtr=False))

    # member name -> index of the partition it belongs to
    qid, sid = {}, {}
    for i, part in enumerate(qparts):
        qid.update(dict((x, i) for x in part))
    for i, part in enumerate(sparts):
        sid.update(dict((x, i) for x in part))

    # Without writing files, conversion from PAD to merged PAD is done in memory
    for q in qbed:
        q.seqid = qid[q.seqid]
    for s in sbed:
        s.seqid = sid[s.seqid]

    # Real lists, as range() returned on Python 2.
    qnames = list(range(len(qparts)))
    snames = list(range(len(sparts)))

    logmp = make_arrays(blastfile, qbed, sbed, qnames, snames)
    m, n = logmp.shape
    pvalue_cutoff = 1e-30
    # Kept separate from the clustering `cutoff` above, which the old code
    # silently overwrote with this significance threshold.
    score_cutoff = -log(pvalue_cutoff)

    significant = []
    for i in range(m):  # range works on Python 2 and 3, xrange does not
        for j in range(n):
            score = logmp[i, j]
            if score < score_cutoff:
                continue
            significant.append((qparts[i], sparts[j], score))

    for a, b, score in significant:
        print("|".join(a), "|".join(b), score)

    logging.debug(
        "Collected {0} PAR comparisons significant at (P < {1}).".format(
            len(significant), pvalue_cutoff))

    return significant
def movie(args):
    """
    %prog movie test.tour test.clm ref.contigs.last

    Plot optimization history.
    """
    parser = OptionParser(movie.__doc__)
    parser.add_option("--frames", default=500, type="int",
                      help="Only plot every N frames")
    parser.add_option("--engine", default="ffmpeg",
                      choices=("ffmpeg", "gifsicle"),
                      help="Movie engine, output MP4 or GIF")
    parser.set_beds()
    opts, args, iopts = parser.set_image_options(
        args, figsize="16x8", style="white", cmap="coolwarm",
        format="png", dpi=300)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    # Absolute paths, since the working directory is restored further down.
    tourfile, clmfile, lastfile = [op.abspath(x) for x in args]

    start_dir = os.getcwd()
    odir = op.basename(tourfile).rsplit(".", 1)[0] + "-movie"
    anchorsfile, qbedfile, contig_to_beds = prepare_synteny(
        tourfile, lastfile, odir, parser, opts)

    def with_serial(path, serial):
        # "name.ext..." -> "name_<serial>.ext..." (basename only)
        stem, rest = op.basename(path).split(".", 1)
        return stem + "_" + serial + "." + rest

    frame_args = []
    for i, label, tour, tour_o in iter_tours(tourfile, frames=opts.frames):
        serial = "{:06d}".format(i)

        # Make sure the anchorsfile and bedfile has the serial number in,
        # otherwise parallelization may fail
        ianchorsfile = with_serial(anchorsfile, serial)
        symlink(anchorsfile, ianchorsfile)

        # Make BED file with new order
        frame_bed = Bed()
        for contig, strand in zip(tour, tour_o):
            if contig in contig_to_beds:
                lines = contig_to_beds[contig][:]
                if strand == '-':
                    lines.reverse()
                for line in lines:
                    frame_bed.append(line)

        frame_bed.print_to_file(with_serial(qbedfile, serial))

        # Plot dot plot, but do not sort contigs by name (otherwise losing
        # order)
        image_name = serial + "." + iopts.format
        frame_args.append([[
            ",".join(tour), clmfile, ianchorsfile,
            "--outfile", image_name, "--label", label,
        ]])

    Jobs(movieframe, frame_args).run()

    os.chdir(start_dir)
    make_movie(odir, odir, engine=opts.engine, format=iopts.format)
def simple(args):
    """
    %prog simple anchorfile --qbed=qbedfile --sbed=sbedfile [options]

    Write the block ends for each block in the anchorfile.
    GeneA1 GeneA2 GeneB1 GeneB2 +/- score

    Optional additional columns:
    orderA1 orderA2 orderB1 orderB2 sizeA sizeB size block_id

    With base coordinates (--coords):
    block_id seqidA startA endA bpSpanA GeneA1 GeneA2 geneSpanA
    block_id seqidB startB endB bpSpanB GeneB1 GeneB2 geneSpanB
    """
    p = OptionParser(simple.__doc__)
    p.add_option("--rich", default=False, action="store_true",
                 help="Output additional columns [default: %default]")
    p.add_option("--coords", default=False, action="store_true",
                 help="Output columns with base coordinates [default: %default]")
    p.add_option("--bed", default=False, action="store_true",
                 help="Generate BED file for the blocks")
    p.add_option("--noheader", default=False, action="store_true",
                 help="Don't output header [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    additional = opts.rich
    coords = opts.coords
    header = not opts.noheader
    bed = opts.bed
    if bed:
        # BED output needs base coordinates, so --bed implies --coords.
        coords = True
        bbed = Bed()

    ac = AnchorFile(anchorfile)
    simplefile = anchorfile.rsplit(".", 1)[0] + ".simple"

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    pf = "-".join(anchorfile.split(".", 2)[:2])
    blocks = ac.blocks

    if coords:
        h = "Block|Chr|Start|End|Span|StartGene|EndGene|GeneSpan|Orientation"
    else:
        # NOTE(review): rows below are written as "... score, orientation"
        # while this header says "Orientation|Score"; header left unchanged.
        h = "StartGeneA|EndGeneA|StartGeneB|EndGeneB|Orientation|Score"
        if additional:
            h += "|StartOrderA|EndOrderA|StartOrderB|EndOrderB|"\
                 "SizeA|SizeB|Size|Block"

    def emit(handle, fields):
        # One tab-separated line; replaces the Python 2 `print >>` statement.
        handle.write("\t".join(str(x) for x in fields) + "\n")

    atotalbase = btotalbase = 0
    nblocks = 0  # counted explicitly so an empty anchors file cannot fail
    with open(simplefile, "w") as fws:
        if header:
            emit(fws, h.split("|"))

        for i, block in enumerate(blocks):
            nblocks += 1
            a, b, scores = zip(*block)
            a = [qorder[x] for x in a]
            b = [sorder[x] for x in b]
            ia, oa = zip(*a)
            ib, ob = zip(*b)

            astarti, aendi = min(ia), max(ia)
            bstarti, bendi = min(ib), max(ib)
            astart, aend = min(a)[1].accn, max(a)[1].accn
            bstart, bend = min(b)[1].accn, max(b)[1].accn

            sizeA = len(set(ia))
            sizeB = len(set(ib))
            size = len(block)

            # The sign of the fitted slope decides the block orientation.
            slope, intercept = np.polyfit(ia, ib, 1)
            orientation = "+" if slope >= 0 else '-'
            aspan = aendi - astarti + 1
            bspan = bendi - bstarti + 1
            # Score is the geometric mean of the two gene spans.
            score = str(int((aspan * bspan) ** .5))
            block_id = pf + "-block-{0}".format(i)

            if coords:
                aseqid, astartbase, aendbase = \
                    get_boundary_bases(astart, aend, qorder)
                bseqid, bstartbase, bendbase = \
                    get_boundary_bases(bstart, bend, sorder)
                abase = aendbase - astartbase + 1
                bbase = bendbase - bstartbase + 1
                atotalbase += abase
                btotalbase += bbase

                # Write dual lines
                aargs = [block_id, aseqid, astartbase, aendbase,
                         abase, astart, aend, aspan, "+"]
                bargs = [block_id, bseqid, bstartbase, bendbase,
                         bbase, bstart, bend, bspan, orientation]

                if bed:
                    bbed.append(BedLine("\t".join(str(x) for x in (
                        bseqid, bstartbase - 1, bendbase,
                        "{}:{}-{}".format(aseqid, astartbase, aendbase),
                        size, orientation))))

                for fields in (aargs, bargs):
                    emit(fws, fields)
                continue

            fields = [astart, aend, bstart, bend, score, orientation]
            if additional:
                fields += [astarti, aendi, bstarti, bendi,
                           sizeA, sizeB, size, block_id]
            emit(fws, fields)

    logging.debug("A total of {0} blocks written to `{1}`.".format(
        nblocks, simplefile))

    if coords:
        sys.stderr.write("Total block span in {0}: {1}\n".format(
            qbed.filename, human_size(atotalbase, precision=2)))
        sys.stderr.write("Total block span in {0}: {1}\n".format(
            sbed.filename, human_size(btotalbase, precision=2)))
        smaller = min(atotalbase, btotalbase)
        if smaller:
            # Skipped when a total is zero, which used to divide by zero.
            sys.stderr.write("Ratio: {0:.1f}x\n".format(
                max(atotalbase, btotalbase) * 1. / smaller))

    if bed:
        bedfile = simplefile + ".bed"
        bbed.print_to_file(filename=bedfile, sorted=True)
        logging.debug("Bed file written to `{}`".format(bedfile))
def pad(args):
    """
    %prog pad blastfile cdtfile --qbed q.pad.bed --sbed s.pad.bed

    Test and reconstruct candidate PADs.
    """
    from jcvi.formats.cdt import CDT

    parser = OptionParser(pad.__doc__)
    parser.set_beds()
    parser.add_option("--cutoff", default=0.3, type="float",
                      help="The clustering cutoff to call similar")

    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    cluster_cutoff = opts.cutoff
    blastfile, cdtfile = args
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, parser, opts)

    cdt = CDT(cdtfile)
    qparts = list(cdt.iter_partitions(cutoff=cluster_cutoff))
    sparts = list(cdt.iter_partitions(cutoff=cluster_cutoff, gtr=False))

    # Map every partition member to the index of its partition.
    qid = {}
    for index, members in enumerate(qparts):
        for member in members:
            qid[member] = index
    sid = {}
    for index, members in enumerate(sparts):
        for member in members:
            sid[member] = index

    # Without writing files, conversion from PAD to merged PAD is done in memory
    for feature in qbed:
        feature.seqid = qid[feature.seqid]
    for feature in sbed:
        feature.seqid = sid[feature.seqid]

    qnames = range(len(qparts))
    snames = range(len(sparts))

    logmp = make_arrays(blastfile, qbed, sbed, qnames, snames)
    nrows, ncols = logmp.shape
    pvalue_cutoff = 1e-30
    threshold = -log(pvalue_cutoff)

    # Keep every cell whose score is not below the threshold.
    significant = []
    for i in range(nrows):
        for j in range(ncols):
            score = logmp[i, j]
            if not score < threshold:
                significant.append((qparts[i], sparts[j], score))

    for qmembers, smembers, score in significant:
        print("|".join(qmembers), "|".join(smembers), score)

    message = "Collected {0} PAR comparisons significant at (P < {1}).".format(
        len(significant), pvalue_cutoff)
    logging.debug(message)

    return significant
logging.debug("Self comparisons, mirror ignored") else: batch_query(qbed, sbed, all_data, opts, fw=fw, c=c, transpose=True) if sqlite: c.execute("create index q on synteny (query)") conn.commit() c.close() else: fw.close() if __name__ == '__main__': p = OptionParser(__doc__) p.set_beds() p.set_stripnames() p.set_outfile() coge_group = OptionGroup(p, "CoGe-specific options") coge_group.add_option("--sqlite", help="Write sqlite database") coge_group.add_option("--qnote", default="null", help="Query dataset group id") coge_group.add_option("--snote", default="null", help="Subject dataset group id") params_group = OptionGroup(p, "Synteny parameters") params_group.add_option("--window", type="int",
def omgprepare(args):
    """
    %prog omgprepare ploidy anchorsfile blastfile

    Prepare to run Sankoff's OMG algorithm to get orthologs.
    """
    from jcvi.formats.blast import cscore
    from jcvi.formats.base import DictFile

    p = OptionParser(omgprepare.__doc__)
    p.add_option("--norbh", action="store_true",
                 help="Disable RBH hits [default: %default]")
    p.add_option("--pctid", default=0, type="int",
                 help="Percent id cutoff for RBH hits [default: %default]")
    p.add_option("--cscore", default=90, type="int",
                 help="C-score cutoff for RBH hits [default: %default]")
    p.set_stripnames()
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ploidy, anchorfile, blastfile = args
    norbh = opts.norbh
    pctid = opts.pctid
    cs = opts.cscore
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    # First column of each ploidy line -> its line number.
    with open(ploidy) as fp:
        genomeidx = dict((x.split()[0], i) for i, x in enumerate(fp))

    ploidy = DictFile(ploidy)

    geneinfo(qbed, qorder, genomeidx, ploidy)
    geneinfo(sbed, sorder, genomeidx, ploidy)

    pf = blastfile.rsplit(".", 1)[0]
    cscorefile = pf + ".cscore"
    cscore([blastfile, "-o", cscorefile, "--cutoff=0", "--pct"])
    ac = AnchorFile(anchorfile)
    pairs = set((a, b) for a, b, i in ac.iter_pairs())
    logging.debug("Imported {0} pairs from `{1}`.".format(len(pairs), anchorfile))

    weightsfile = pf + ".weights"
    npairs = 0
    # Both handles are now closed on every path; the input handle used to
    # stay open for the life of the process.
    with open(cscorefile) as fp, open(weightsfile, "w") as fw:
        for row in fp:
            a, b, c, pct = row.split()
            c, pct = float(c), float(pct)
            c = int(c * 100)
            if (a, b) not in pairs:
                # Not a syntenic anchor: only kept as an RBH hit when it
                # passes the C-score and percent-identity cutoffs.
                if norbh:
                    continue
                if c < cs:
                    continue
                if pct < pctid:
                    continue
                # Floor division keeps the weight an integer on Python 3,
                # matching what `/` did for ints on Python 2.
                c //= 10  # This severely penalizes RBH against synteny

            fw.write("\t".join((a, b, str(c))) + "\n")
            npairs += 1

    logging.debug("Write {0} pairs to `{1}`.".format(npairs, weightsfile))
def main(args):
    """Screen synteny blocks with `quota mapping` and write the kept ids.

    `args` holds the command-line tokens; exactly one positional argument,
    the anchors file, is required. The selected block ids are written as one
    comma-separated line to `<prefix>.<qa>x<qb>`. With --screen, a filtered
    `<prefix>.<qa>x<qb>.anchors` file is produced as well.

    Exits with status 1 when --quota is not of the form `x:y`; raises
    Exception when --self is given with unequal quota numbers.
    """
    p = OptionParser(__doc__)
    p.set_beds()
    p.add_option("--quota", default="1:1",
                 help="`quota mapping` procedure -- screen blocks to constrain mapping"
                      " (useful for orthology), "
                      "put in the format like (#subgenomes expected for genome X):"
                      "(#subgenomes expected for genome Y) "
                      "[default: %default]")
    p.add_option("--Nm", dest="Nmax", type="int", default=10,
                 help="distance cutoff to tolerate two blocks that are "
                      "slightly overlapping (cutoff for `quota mapping`) "
                      "[default: %default units (gene or bp dist)]")

    supported_solvers = ("SCIP", "GLPK")
    p.add_option("--self", dest="self_match", action="store_true", default=False,
                 help="you might turn this on when screening paralogous blocks, "
                      "esp. if you have reduced mirrored blocks into non-redundant set")
    p.add_option("--solver", default="SCIP", choices=supported_solvers,
                 help="use MIP solver [default: %default]")
    p.set_verbose(help="Show verbose solver output")
    p.add_option("--screen", default=False, action="store_true",
                 help="generate new anchors file [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    qa_file, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(qa_file, p, opts)

    # sanity check for the quota
    # Only the expected parse failures are caught (a bare `except:` would
    # also swallow KeyboardInterrupt/SystemExit). An empty quota now gets
    # the same message instead of a NameError on `qa` later on.
    try:
        qa, qb = opts.quota.split(":")
        qa, qb = int(qa), int(qb)
    except (AttributeError, ValueError):
        print("quota string should be the form x:x (2:4, 1:3, etc.)",
              file=sys.stderr)
        sys.exit(1)

    if opts.self_match and qa != qb:
        raise Exception("when comparing genome to itself, "
                        "quota must be the same number "
                        "(like 1:1, 2:2) you have %s" % opts.quota)
    quota = (qa, qb)

    self_match = opts.self_match

    clusters = read_clusters(qa_file, qorder, sorder)
    for cluster in clusters:
        assert len(cluster) > 0

    # below runs `quota mapping`
    work_dir = op.join(op.dirname(op.abspath(qa_file)), "work")

    selected_ids = solve_lp(clusters, quota, work_dir=work_dir,
                            Nmax=opts.Nmax, self_match=self_match,
                            solver=opts.solver, verbose=opts.verbose)

    logging.debug("Selected {0} blocks.".format(len(selected_ids)))
    prefix = qa_file.rsplit(".", 1)[0]
    suffix = "{0}x{1}".format(qa, qb)
    outfile = ".".join((prefix, suffix))
    fw = must_open(outfile, "w")
    try:
        print(",".join(str(x) for x in selected_ids), file=fw)
    finally:
        # Close even when writing fails, so `screen` never reads a file that
        # is still held open here.
        fw.close()
    logging.debug("Screened blocks ids written to `{0}`.".format(outfile))

    if opts.screen:
        from jcvi.compara.synteny import screen

        new_qa_file = ".".join((prefix, suffix, "anchors"))
        largs = [qa_file, new_qa_file, "--ids", outfile]
        if opts.qbed and opts.sbed:
            largs += ["--qbed={0}".format(opts.qbed)]
            largs += ["--sbed={0}".format(opts.sbed)]
        screen(largs)
def movie(args):
    """
    %prog movie test.tour test.clm ref.contigs.last

    Plot optimization history.
    """
    p = OptionParser(movie.__doc__)
    p.add_option("--frames", default=500, type="int",
                 help="Only plot every N frames")
    p.add_option("--engine", default="ffmpeg", choices=("ffmpeg", "gifsicle"),
                 help="Movie engine, output MP4 or GIF")
    p.set_beds()
    image_defaults = dict(figsize="16x8", style="white", cmap="coolwarm",
                          format="png", dpi=300)
    opts, args, iopts = p.set_image_options(args, **image_defaults)

    if len(args) != 3:
        sys.exit(not p.print_help())

    tourfile, clmfile, lastfile = args
    # Resolve to absolute paths before anything changes directory.
    tourfile = op.abspath(tourfile)
    clmfile = op.abspath(clmfile)
    lastfile = op.abspath(lastfile)
    original_cwd = os.getcwd()
    tour_stem = op.basename(tourfile).rsplit(".", 1)[0]
    odir = tour_stem + "-movie"
    anchorsfile, qbedfile, contig_to_beds = \
        prepare_synteny(tourfile, lastfile, odir, p, opts)

    anchors_stem, anchors_ext = op.basename(anchorsfile).split(".", 1)
    jobs_args = []
    for i, label, tour, tour_o in iter_tours(tourfile, frames=opts.frames):
        padi = "{:06d}".format(i)

        # Make sure the anchorsfile and bedfile has the serial number in,
        # otherwise parallelization may fail
        ianchorsfile = anchors_stem + "_" + padi + "." + anchors_ext
        symlink(anchorsfile, ianchorsfile)

        # Make BED file with new order
        ordered = Bed()
        for contig, orientation in zip(tour, tour_o):
            if contig not in contig_to_beds:
                continue
            bedlines = contig_to_beds[contig]
            # Reverse-strand contigs contribute their lines back to front.
            if orientation == '-':
                sequence = bedlines[::-1]
            else:
                sequence = bedlines[:]
            for bedline in sequence:
                ordered.append(bedline)

        bed_stem, bed_ext = op.basename(qbedfile).split(".", 1)
        ibedfile = bed_stem + "_" + padi + "." + bed_ext
        ordered.print_to_file(ibedfile)

        # Plot dot plot, but do not sort contigs by name (otherwise losing
        # order)
        image_name = padi + "." + iopts.format
        frame = [",".join(tour), clmfile, ianchorsfile,
                 "--outfile", image_name, "--label", label]
        jobs_args.append([frame])

    Jobs(movieframe, jobs_args).run()

    os.chdir(original_cwd)
    make_movie(odir, odir, engine=opts.engine, format=iopts.format)
def main(args):
    """Command-line entry point for `quota mapping` block screening.

    ``args`` is the list of command-line arguments: exactly one anchors file
    plus options. The selected block ids are written, comma separated on one
    line, to ``<prefix>.<qa>x<qb>``. With ``--screen``, a new anchors file
    ``<prefix>.<qa>x<qb>.anchors`` is produced through ``screen``.

    Exits with status 1 when the quota string is not of the form ``x:y``.
    Raises ValueError when ``--self`` is given with an asymmetric quota.
    """
    p = OptionParser(__doc__)

    p.set_beds()
    p.add_option("--quota", default="1:1",
                 help="`quota mapping` procedure -- screen blocks to constrain mapping"
                      " (useful for orthology), "
                      "put in the format like (#subgenomes expected for genome X):"
                      "(#subgenomes expected for genome Y) "
                      "[default: %default]")
    p.add_option("--Nm", dest="Nmax", type="int", default=10,
                 help="distance cutoff to tolerate two blocks that are "
                      "slightly overlapping (cutoff for `quota mapping`) "
                      "[default: %default units (gene or bp dist)]")

    supported_solvers = ("SCIP", "GLPK")
    p.add_option("--self", dest="self_match",
                 action="store_true", default=False,
                 help="you might turn this on when screening paralogous blocks, "
                      "esp. if you have reduced mirrored blocks into non-redundant set")
    p.add_option("--solver", default="SCIP", choices=supported_solvers,
                 help="use MIP solver [default: %default]")
    p.add_option("--verbose", action="store_true",
                 default=False, help="show verbose solver output")

    p.add_option("--screen", default=False, action="store_true",
                 help="generate new anchors file [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    qa_file, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(qa_file, p, opts)

    # Sanity check for the quota. An empty/missing quota is rejected here as
    # well, so that `qa` and `qb` are always bound below.
    try:
        qa, qb = opts.quota.split(":")
        qa, qb = int(qa), int(qb)
    except (AttributeError, ValueError):
        print("quota string should be the form x:x (2:4, 1:3, etc.)",
              file=sys.stderr)
        sys.exit(1)

    if opts.self_match and qa != qb:
        raise ValueError("when comparing genome to itself, "
                         "quota must be the same number "
                         "(like 1:1, 2:2) you have %s" % opts.quota)
    quota = (qa, qb)

    self_match = opts.self_match

    clusters = read_clusters(qa_file, qorder, sorder)
    for cluster in clusters:
        assert len(cluster) > 0

    # below runs `quota mapping`
    work_dir = op.join(op.dirname(op.abspath(qa_file)), "work")

    selected_ids = solve_lp(clusters, quota, work_dir=work_dir,
                            Nmax=opts.Nmax, self_match=self_match,
                            solver=opts.solver, verbose=opts.verbose)

    logging.debug("Selected {0} blocks.".format(len(selected_ids)))
    prefix = qa_file.rsplit(".", 1)[0]
    suffix = "{0}x{1}".format(qa, qb)
    outfile = ".".join((prefix, suffix))
    fw = must_open(outfile, "w")
    print(",".join(str(x) for x in selected_ids), file=fw)
    fw.close()
    logging.debug("Screened blocks ids written to `{0}`.".format(outfile))

    if opts.screen:
        from jcvi.compara.synteny import screen

        new_qa_file = ".".join((prefix, suffix, "anchors"))
        largs = [qa_file, new_qa_file, "--ids", outfile]
        # Forward the BED files only when both were given explicitly.
        if opts.qbed and opts.sbed:
            largs += ["--qbed={0}".format(opts.qbed)]
            largs += ["--sbed={0}".format(opts.sbed)]
        screen(largs)
def loss(args):
    """
    %prog loss a.b.i1.blocks [a.b-genomic.blast]

    Extract likely gene loss candidates between genome a and b.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(loss.__doc__)
    p.add_option("--bed", default=False, action="store_true",
                 help="Genomic BLAST is in bed format [default: %default]")
    p.add_option("--gdist", default=20, type="int",
                 help="Gene distance [default: %default]")
    p.add_option("--bdist", default=20000, type="int",
                 help="Base pair distance [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    blocksfile = args[0]
    # Without a genomic BLAST file, an empty placeholder is created and
    # removed again at the end.
    emptyblast = (len(args) == 1)
    if emptyblast:
        genomicblast = "empty.blast"
        sh("touch {0}".format(genomicblast))
    else:
        genomicblast = args[1]

    gdist, bdist = opts.gdist, opts.bdist
    qbed, sbed, qorder, sorder, is_self = check_beds(blocksfile, p, opts)
    blocks = []
    genetrack = {}
    proxytrack = {}
    # Each row of the blocks file is a two-column (gene, partner) record.
    with open(blocksfile) as fp:
        for row in fp:
            a, b = row.split()
            genetrack[a] = b
            blocks.append((a, b))

    # Group consecutive rows sharing the same partner column.
    data = []
    for key, rows in groupby(blocks, key=lambda x: x[-1]):
        rows = list(rows)
        data.append((key, rows))

    imax = len(data) - 1
    for i, (key, rows) in enumerate(data):
        # Only interior runs of '.' have a flanking partner on both sides.
        if i == 0 or i == imax:
            continue
        if key != '.':
            continue

        before, br = data[i - 1]
        after, ar = data[i + 1]
        bi, bx = sorder[before]
        ai, ax = sorder[after]
        dist = abs(bi - ai)
        if bx.seqid != ax.seqid or dist > gdist:
            continue

        # The proxy region spans both flanking partners, padded by bdist
        # on each side and clipped at position 1.
        start, end = range_minmax(((bx.start, bx.end), (ax.start, ax.end)))
        start, end = max(start - bdist, 1), end + bdist
        proxy = (bx.seqid, start, end)
        for a, b in rows:
            proxytrack[a] = proxy

    tags = {}
    if opts.bed:
        bed = Bed(genomicblast, sorted=False)
        key = lambda x: gene_name(x.accn.rsplit(".", 1)[0])
        for query, bb in groupby(bed, key=key):
            bb = list(bb)
            if query not in proxytrack:
                continue

            proxy = proxytrack[query]
            # "S" when a hit overlaps the proxy region, "NS" otherwise (the
            # first hit is then reported).
            tag = "NS"
            best_b = bb[0]
            for b in bb:
                hsp = (b.seqid, b.start, b.end)
                if range_overlap(proxy, hsp):
                    tag = "S"
                    best_b = b
                    break

            hsp = (best_b.seqid, best_b.start, best_b.end)
            proxytrack[query] = hsp
            tags[query] = tag

    else:
        blast = Blast(genomicblast)
        for query, bb in blast.iter_hits():
            bb = list(bb)
            query = gene_name(query)
            if query not in proxytrack:
                continue

            proxy = proxytrack[query]
            tag = "NS"
            best_b = bb[0]
            for b in bb:
                hsp = (b.subject, b.sstart, b.sstop)
                if range_overlap(proxy, hsp):
                    tag = "S"
                    best_b = b
                    break

            hsp = (best_b.subject, best_b.sstart, best_b.sstop)
            proxytrack[query] = hsp
            tags[query] = tag

    for b in qbed:
        accn = b.accn
        target_region = genetrack[accn]
        if accn in proxytrack:
            target_region = region_str(proxytrack[accn])
            # "[NF]" marks a proxy region for which no hit was recorded.
            if accn in tags:
                ptag = "[{0}]".format(tags[accn])
            else:
                ptag = "[NF]"
            target_region = ptag + target_region

        print("\t".join((b.seqid, accn, target_region)))

    if emptyblast:
        sh("rm -f {0}".format(genomicblast))
def ancestral(args):
    """
    %prog ancestral vplanifoliaA.vplanifoliaA.anchors > vplanifoliaA_blocks.bed

    Paint 14 chromosomes following alpha WGD.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    parser = OptionParser(ancestral.__doc__)
    parser.set_beds()

    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    (anchorsfile, ) = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, parser, opts)

    # We focus on the following chromosome pairs
    target_pairs = {
        (1, 1),
        (1, 6),
        (1, 8),
        (1, 13),
        (2, 4),
        (3, 12),
        (3, 14),
        (5, 6),
        (5, 8),
        (7, 9),
        (7, 11),
        (9, 10),
        (10, 11),
    }
    # Chromosome numbers that may be used as the name of a painted region.
    naming_chrs = (1, 2, 3, 5, 7, 10)

    def get_target(achr, bchr):
        # Return the sorted chromosome-number pair when it is one of the
        # target pairs, otherwise None.
        if "chr" not in achr and "chr" not in bchr:
            return None
        lo, hi = get_number(achr), get_number(bchr)
        if lo > hi:
            lo, hi = hi, lo
        return (lo, hi) if (lo, hi) in target_pairs else None

    def build_bedline(astart, aend, target_pair):
        # target_name = "{:02d}-{:02d}".format(*target_pair)
        names = [str(x) for x in target_pair if x in naming_chrs]
        fields = (astart.seqid, astart.start, aend.end, names[0])
        return "\t".join(str(x) for x in fields)

    # Iterate through the blocks, store any regions that has hits to one of the
    # target_pairs
    anchors = AnchorFile(anchorsfile)
    outbed = Bed()
    for block in anchors.blocks:
        qgenes, sgenes, scores = zip(*block)
        qhits = [qorder[g] for g in qgenes]
        shits = [sorder[g] for g in sgenes]
        # Entries are (order index, bed line); keep the bed lines at both ends.
        astart, aend = min(qhits)[1], max(qhits)[1]
        bstart, bend = min(shits)[1], max(shits)[1]

        # Now convert to BED lines with new accn
        target = get_target(astart.seqid, bstart.seqid)
        if target is None:
            continue
        for first, last in ((astart, aend), (bstart, bend)):
            outbed.add(build_bedline(first, last, target))

    outbed.print_to_file(sorted=True)
def dotplot_main(args):
    """Command-line entry point: draw a dot plot from an anchors (or .ks) file.

    ``args`` is the list of command-line arguments: exactly one anchors file
    plus options. A ``.ks`` input is first converted to ``<file>.anchors``
    (only when out of date) and the colormap label then defaults to
    "*Ks* values". The figure is saved to ``--outfile`` or, by default, to
    the anchors file name with its extension replaced by the image format.
    """
    p = OptionParser(__doc__)
    p.set_beds()
    p.add_option("--synteny", default=False, action="store_true",
                 help="Run a fast synteny scan and display blocks [default: %default]")
    p.add_option("--cmaptext", help="Draw colormap box on the bottom-left corner")
    p.add_option("--vmin", dest="vmin", type="float", default=0,
                 help="Minimum value in the colormap [default: %default]")
    p.add_option("--vmax", dest="vmax", type="float", default=2,
                 help="Maximum value in the colormap [default: %default]")
    p.add_option("--genomenames", type="string", default=None,
                 help="genome names for labeling axes in the form of qname_sname, "
                      "eg. \"Vitis vinifera_Oryza sativa\"")
    p.add_option("--nmax", dest="sample_number", type="int", default=10000,
                 help="Maximum number of data points to plot [default: %default]")
    p.add_option("--minfont", type="int", default=4,
                 help="Do not render labels with size smaller than")
    p.add_option("--colormap",
                 help="Two column file, block id to color mapping [default: %default]")
    p.add_option("--nosort", default=False, action="store_true",
                 help="Do not sort the seqids along the axes")
    p.add_option("--nosep", default=False, action="store_true",
                 help="Do not add contig lines")
    p.add_option("--nostdpf", default=False, action="store_true",
                 help="Do not standardize contig names")
    p.add_option("--skipempty", default=False, action="store_true",
                 help="Skip seqids that do not have matches")
    p.add_option("--title", help="Title of the dot plot")
    p.set_outfile(outfile=None)
    opts, args, iopts = p.set_image_options(args, figsize="8x8",
                                            style="dark", dpi=90, cmap="copper")

    if len(args) != 1:
        sys.exit(not p.print_help())

    palette = opts.colormap
    if palette:
        palette = Palette(palette)

    anchorfile, = args
    cmaptext = opts.cmaptext
    if anchorfile.endswith(".ks"):
        from jcvi.apps.ks import KsFile

        logging.debug("Anchors contain Ks values")
        cmaptext = cmaptext or "*Ks* values"
        anchorksfile = anchorfile + ".anchors"
        if need_update(anchorfile, anchorksfile):
            ksfile = KsFile(anchorfile)
            ksfile.print_to_anchors(anchorksfile)
        anchorfile = anchorksfile

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts,
                                                     sorted=(not opts.nosort))

    if opts.skipempty:
        ac = AnchorFile(anchorfile)
        # For a self comparison both axes share one set of seqids.
        if is_self:
            qseqids = sseqids = set()
        else:
            qseqids, sseqids = set(), set()

        for pair in ac.iter_pairs():
            q, s = pair[:2]
            qi, q = qorder[q]
            si, s = sorder[s]
            qseqids.add(q.seqid)
            sseqids.add(s.seqid)

        if is_self:
            qbed = sbed = subset_bed(qbed, qseqids)
        else:
            qbed = subset_bed(qbed, qseqids)
            sbed = subset_bed(sbed, sseqids)

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])  # the whole canvas
    ax = fig.add_axes([.1, .1, .8, .8])  # the dot plot

    # Pass the resolved `cmaptext` (not the raw option) so that the
    # "*Ks* values" default set above for .ks input actually takes effect.
    dotplot(anchorfile, qbed, sbed, fig, root, ax,
            vmin=opts.vmin, vmax=opts.vmax, is_self=is_self,
            synteny=opts.synteny, cmap_text=cmaptext, cmap=iopts.cmap,
            genomenames=opts.genomenames, sample_number=opts.sample_number,
            minfont=opts.minfont, palette=palette, sep=(not opts.nosep),
            title=opts.title, stdpf=(not opts.nostdpf))

    image_name = opts.outfile or \
        (op.splitext(anchorfile)[0] + "." + opts.format)
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    fig.clear()
def depth(args):
    """
    %prog depth anchorfile --qbed qbedfile --sbed sbedfile

    Calculate the depths in the two genomes in comparison, given in --qbed and
    --sbed. The synteny blocks will be layered on the genomes, and the
    multiplicity will be summarized to stderr.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.utils.range import range_depth

    p = OptionParser(depth.__doc__)
    p.add_option("--depthfile",
                 help="Generate file with gene and depth [default: %default]")
    p.add_option("--histogram", default=False, action="store_true",
                 help="Plot histograms in PDF")
    p.add_option("--xmax", type="int", help="x-axis maximum to display in plot")
    p.add_option("--title", default=None, help="Title to display in plot")
    p.add_option("--quota", help="Force to use this quota, e.g. 1:1, 1:2 ...")
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    depthfile = opts.depthfile
    ac = AnchorFile(anchorfile)
    qranges = []
    sranges = []
    blocks = ac.blocks
    for ib in blocks:
        q, s, t = zip(*ib)
        q = [qorder[x] for x in q]
        s = [sorder[x] for x in s]
        # Each block contributes its (first, last) gene order on both axes.
        qrange = (min(q)[0], max(q)[0])
        srange = (min(s)[0], max(s)[0])
        qranges.append(qrange)
        sranges.append(srange)
        # In a self comparison the subject range lies on the same genome.
        if is_self:
            qranges.append(srange)

    qgenome = op.basename(qbed.filename).split(".")[0]
    sgenome = op.basename(sbed.filename).split(".")[0]
    qtag = "Genome {0} depths".format(qgenome)
    print("{}:".format(qtag), file=sys.stderr)
    dsq, details = range_depth(qranges, len(qbed))
    if depthfile:
        fw = open(depthfile, "w")
        write_details(fw, details, qbed)

    if is_self:
        # Nothing else is written for a self comparison; close the depth
        # file here instead of leaking the handle.
        if depthfile:
            fw.close()
            logging.debug("Depth written to `{0}`.".format(depthfile))
        return

    stag = "Genome {0} depths".format(sgenome)
    print("{}:".format(stag), file=sys.stderr)
    dss, details = range_depth(sranges, len(sbed))
    if depthfile:
        write_details(fw, details, sbed)
        fw.close()
        logging.debug("Depth written to `{0}`.".format(depthfile))

    if not opts.histogram:
        return

    from jcvi.graphics.base import plt, quickplot_ax, savefig, normalize_axes

    # Plot two histograms one for query genome, one for subject genome
    plt.figure(1, (6, 3))
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)

    # Key views cannot be concatenated with `+` on Python 3; build one list.
    xmax = opts.xmax or max([4] + list(dsq) + list(dss))
    if opts.quota:
        speak, qpeak = opts.quota.split(":")
        qpeak, speak = int(qpeak), int(speak)
    else:
        qpeak = find_peak(dsq)
        speak = find_peak(dss)

    qtag = "# of {} blocks per {} gene".format(sgenome, qgenome)
    stag = "# of {} blocks per {} gene".format(qgenome, sgenome)
    quickplot_ax(ax1, dss, 0, xmax, stag, ylabel="Percentage of genome",
                 highlight=range(1, speak + 1))
    quickplot_ax(ax2, dsq, 0, xmax, qtag, ylabel=None,
                 highlight=range(1, qpeak + 1))

    title = opts.title or "{} vs {} syntenic depths\n{}:{} pattern"\
        .format(qgenome, sgenome, speak, qpeak)
    root = f.add_axes([0, 0, 1, 1])
    # A user-supplied title may have no newline; `partition` then leaves
    # the second line empty instead of failing on the unpack.
    vs, _, pattern = title.partition('\n')
    root.text(.5, .97, vs, ha="center", va="center", color="darkslategray")
    root.text(.5, .925, pattern, ha="center", va="center",
              color="tomato", size=16)
    print(title, file=sys.stderr)

    normalize_axes(root)

    pf = anchorfile.rsplit(".", 1)[0] + ".depth"
    image_name = pf + ".pdf"
    savefig(image_name)
logging.debug("Self comparisons, mirror ignored") else: batch_query(qbed, sbed, all_data, opts, fw=fw, c=c, transpose=True) if sqlite: c.execute("create index q on synteny (query)") conn.commit() c.close() else: fw.close() if __name__ == '__main__': p = OptionParser(__doc__) p.set_beds() p.set_stripnames() p.set_outfile() coge_group = OptionGroup(p, "CoGe-specific options") coge_group.add_option("--sqlite", help="Write sqlite database") coge_group.add_option("--qnote", default="null", help="Query dataset group id") coge_group.add_option("--snote", default="null", help="Subject dataset group id") params_group = OptionGroup(p, "Synteny parameters") params_group.add_option("--window", type="int", default=40, help="Synteny window size") params_group.add_option("--cutoff", type="float", default=.1, help="Minimum number of anchors to call synteny")
def simple(args):
    """
    %prog simple anchorfile --qbed=qbedfile --sbed=sbedfile [options]

    Write the block ends for each block in the anchorfile.
    GeneA1 GeneA2 GeneB1 GeneB2 score +/-

    Optional additional columns:
    orderA1 orderA2 orderB1 orderB2 sizeA sizeB size block_id

    With base coordinates (--coords):
    block_id  seqidA    startA    endA     bpSpanA  GeneA1   GeneA2  geneSpanA
    block_id  seqidB    startB    endB     bpSpanB  GeneB1   GeneB2  geneSpanB
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(simple.__doc__)
    p.add_option("--rich", default=False, action="store_true",
                 help="Output additional columns [default: %default]")
    p.add_option("--coords", default=False, action="store_true",
                 help="Output columns with base coordinates [default: %default]")
    p.add_option("--bed", default=False, action="store_true",
                 help="Generate BED file for the blocks")
    p.add_option("--noheader", default=False, action="store_true",
                 help="Don't output header [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    additional = opts.rich
    coords = opts.coords
    header = not opts.noheader
    write_bed = opts.bed
    # --bed implies --coords: the BED lines need the base coordinates.
    if write_bed:
        coords = True
        bbed = Bed()

    ac = AnchorFile(anchorfile)
    simplefile = anchorfile.rsplit(".", 1)[0] + ".simple"

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    pf = "-".join(anchorfile.split(".", 2)[:2])
    blocks = ac.blocks

    if coords:
        h = "Block|Chr|Start|End|Span|StartGene|EndGene|GeneSpan|Orientation"
    else:
        # Rows are written score first, then orientation; label the
        # columns in that same order.
        h = "StartGeneA|EndGeneA|StartGeneB|EndGeneB|Score|Orientation"
        if additional:
            h += "|StartOrderA|EndOrderA|StartOrderB|EndOrderB|"\
                 "SizeA|SizeB|Size|Block"

    atotalbase = btotalbase = 0
    nblocks = 0  # stays 0 when the anchors file has no blocks
    with open(simplefile, "w") as fws:
        if header:
            print("\t".join(h.split("|")), file=fws)

        for i, block in enumerate(blocks):
            nblocks = i + 1
            a, b, scores = zip(*block)
            a = [qorder[x] for x in a]
            b = [sorder[x] for x in b]
            ia, oa = zip(*a)
            ib, ob = zip(*b)

            astarti, aendi = min(ia), max(ia)
            bstarti, bendi = min(ib), max(ib)
            astart, aend = min(a)[1].accn, max(a)[1].accn
            bstart, bend = min(b)[1].accn, max(b)[1].accn

            sizeA = len(set(ia))
            sizeB = len(set(ib))
            size = len(block)

            # The sign of the fitted slope gives the block orientation.
            slope, intercept = np.polyfit(ia, ib, 1)
            orientation = "+" if slope >= 0 else '-'
            aspan = aendi - astarti + 1
            bspan = bendi - bstarti + 1
            # Score is the geometric mean of both gene spans, truncated.
            score = int((aspan * bspan) ** .5)
            score = str(score)
            block_id = pf + "-block-{0}".format(i)

            if coords:

                aseqid, astartbase, aendbase = \
                    get_boundary_bases(astart, aend, qorder)
                bseqid, bstartbase, bendbase = \
                    get_boundary_bases(bstart, bend, sorder)
                abase = aendbase - astartbase + 1
                bbase = bendbase - bstartbase + 1
                atotalbase += abase
                btotalbase += bbase

                # Write dual lines
                aargs = [block_id, aseqid, astartbase, aendbase,
                         abase, astart, aend, aspan, "+"]
                bargs = [block_id, bseqid, bstartbase, bendbase,
                         bbase, bstart, bend, bspan, orientation]

                if write_bed:
                    bbed.append(BedLine("\t".join(str(x) for x in
                                (bseqid, bstartbase - 1, bendbase,
                                 "{}:{}-{}".format(aseqid, astartbase, aendbase),
                                 size, orientation))))

                for fields in (aargs, bargs):
                    print("\t".join(str(x) for x in fields), file=fws)
                continue

            fields = [astart, aend, bstart, bend, score, orientation]
            if additional:
                fields += [astarti, aendi, bstarti, bendi,
                           sizeA, sizeB, size, block_id]
            print("\t".join(str(x) for x in fields), file=fws)

    logging.debug("A total of {0} blocks written to `{1}`.".format(nblocks,
                  simplefile))

    if coords:
        print("Total block span in {0}: {1}".format(qbed.filename,
              human_size(atotalbase, precision=2)), file=sys.stderr)
        print("Total block span in {0}: {1}".format(sbed.filename,
              human_size(btotalbase, precision=2)), file=sys.stderr)
        # The ratio is undefined when either total is zero (e.g. no blocks).
        smaller = min(atotalbase, btotalbase)
        if smaller:
            print("Ratio: {0:.1f}x".format(
                  max(atotalbase, btotalbase) * 1. / smaller),
                  file=sys.stderr)

    if write_bed:
        bedfile = simplefile + ".bed"
        bbed.print_to_file(filename=bedfile, sorted=True)
        logging.debug("Bed file written to `{}`".format(bedfile))
def cluster(args):
    """
    %prog cluster blastfile anchorfile --qbed qbedfile --sbed sbedfile

    Cluster the segments and form PAD. This is the method described in Tang et
    al. (2010) PNAS paper. The anchorfile defines a list of synteny blocks,
    based on which the genome on one or both axis can be chopped up into pieces
    and clustered.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.utils.range import Range

    parser = OptionParser(cluster.__doc__)
    parser.set_beds()
    parser.add_option("--minsize", default=10, type="int",
                      help="Only segment using blocks >= size")
    parser.add_option("--path", default="~/scratch/bin",
                      help="Path to the CLUSTER 3.0 binary")

    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    blastfile, anchorfile = args
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, parser, opts)

    anchors = AnchorFile(anchorfile)
    qranges, sranges = [], []
    qextra = [brk[1:] for brk in qbed.get_breaks()]
    sextra = [brk[1:] for brk in sbed.get_breaks()]

    # Every qualifying block becomes one range per axis; both ranges carry
    # the same 1-based serial number.
    for serial, block in enumerate(
            anchors.iter_blocks(minsize=opts.minsize), 1):
        qgenes, sgenes = list(zip(*block))[:2]
        qidx = [qorder[g][0] for g in qgenes]
        sidx = [sorder[g][0] for g in sgenes]
        qlo, qhi = min(qidx), max(qidx)
        slo, shi = min(sidx), max(sidx)
        qranges.append(Range("0", qlo, qhi, qhi - qlo, serial))
        sranges.append(Range("0", slo, shi, shi - slo, serial))

    qpads = list(get_segments(qranges, qextra))
    spads = list(get_segments(sranges, sextra))

    # Output names derive from the BED file names given on the command line.
    suffix = ".pad.bed"
    qpf = opts.qbed.split(".")[0]
    spf = opts.sbed.split(".")[0]
    qpadfile = qpf + suffix
    spadfile = spf + suffix
    qnpads, qpadnames = write_PAD_bed(qpadfile, qpf, qpads, qbed)
    snpads, spadnames = write_PAD_bed(spadfile, spf, spads, sbed)

    logmp = make_arrays(blastfile, Bed(qpadfile), Bed(spadfile),
                        qpadnames, spadnames)
    nrows, ncols = logmp.shape

    # Tab-delimited matrix: one header row, then one row per query PAD.
    matrixfile = ".".join((qpf, spf, "logmp.txt"))
    fw = open(matrixfile, "w")
    print("\t".join(["o"] + spadnames), file=fw)
    for i in range(nrows):
        cells = ["{0:.1f}".format(x) for x in logmp[i, :]]
        print("\t".join([qpadnames[i]] + cells), file=fw)
    fw.close()

    # Run CLUSTER 3.0 (Pearson correlation, average linkage)
    cmd = "{0} -g 2 -e 2 -m a -f {1}".format(
        op.join(opts.path, "cluster"), matrixfile)
    cdtfile = matrixfile.rsplit(".", 1)[0] + ".cdt"
    if need_update(matrixfile, cdtfile):
        sh(cmd)
def screen(args):
    """
    %prog screen anchorfile newanchorfile --qbed=qbedfile --sbed=sbedfile [options]

    Extract subset of blocks from anchorfile. Provide several options:

    1. Option --ids: a file with IDs, 0-based, comma separated, all in one line.
    2. Option --seqids: only allow seqids in this file.
    3. Option --seqpairs: only allow seqpairs in this file, one per line,
       e.g. "Chr01,Chr05".
    4. Option --minspan: remove blocks with less span than this.
    5. Option --minsize: remove blocks with less number of anchors than this.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(screen.__doc__)
    p.set_beds()

    p.add_option("--ids", help="File with block IDs (0-based) [default: %default]")
    p.add_option("--seqids", help="File with seqids [default: %default]")
    p.add_option("--seqpairs", help="File with seqpairs [default: %default]")
    p.add_option("--nointra", action="store_true",
                 help="Remove intra-chromosomal blocks [default: %default]")
    p.add_option("--minspan", default=0, type="int",
                 help="Only blocks with span >= [default: %default]")
    p.add_option("--minsize", default=0, type="int",
                 help="Only blocks with anchors >= [default: %default]")
    p.add_option("--simple", action="store_true",
                 help="Write simple anchorfile with block ends [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    anchorfile, newanchorfile = args
    ac = AnchorFile(anchorfile)
    idsfile = opts.ids
    seqidsfile = opts.seqids
    seqpairsfile = opts.seqpairs
    minspan = opts.minspan
    minsize = opts.minsize
    osimple = opts.simple
    nointra = opts.nointra
    ids, seqids, seqpairs = None, None, None

    if idsfile:
        ids = SetFile(idsfile, delimiter=',')
        ids = set(int(x) for x in ids)
    if seqidsfile:
        seqids = SetFile(seqidsfile, delimiter=',')
    if seqpairsfile:
        seqpairs = set()
        with open(seqpairsfile) as fp:
            for row in fp:
                row = row.strip()
                # Tolerate blank lines (e.g. a trailing newline).
                if not row:
                    continue
                a, b = row.split(",")
                # Pairs are accepted in either orientation.
                seqpairs.add((a, b))
                seqpairs.add((b, a))

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    blocks = ac.blocks
    selected = 0
    with open(newanchorfile, "w") as fw:
        for i, block in enumerate(blocks):
            if ids and i not in ids:
                continue

            a, b, scores = zip(*block)
            a = [qorder[x] for x in a]
            b = [sorder[x] for x in b]
            ia, oa = zip(*a)
            ib, ob = zip(*b)
            # Span is measured in gene order on each axis.
            aspan = max(ia) - min(ia) + 1
            bspan = max(ib) - min(ib) + 1
            # The seqid of the first anchor stands for the whole block.
            aseqid = oa[0].seqid
            bseqid = ob[0].seqid

            if seqids:
                if (aseqid not in seqids) or (bseqid not in seqids):
                    continue

            if seqpairs:
                if (aseqid, bseqid) not in seqpairs:
                    continue

            if nointra and aseqid == bseqid:
                continue

            if minsize:
                if len(block) < minsize:
                    continue

            if minspan:
                if aspan < minspan or bspan < minspan:
                    continue

            selected += 1
            # "###" starts each block in the output anchors file.
            print("###", file=fw)
            for line in block:
                print("\t".join(line), file=fw)

    if osimple:
        simple([newanchorfile, "--noheader",
                "--qbed=" + qbed.filename, "--sbed=" + sbed.filename])

    logging.debug("Before: {0} blocks, After: {1} blocks".
                  format(len(blocks), selected))
def omgprepare(args):
    """
    %prog omgprepare ploidy anchorsfile blastfile

    Prepare to run Sankoff's OMG algorithm to get orthologs.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.formats.blast import cscore
    from jcvi.formats.base import DictFile

    p = OptionParser(omgprepare.__doc__)
    p.add_option("--norbh", action="store_true",
                 help="Disable RBH hits [default: %default]")
    p.add_option("--pctid", default=0, type="int",
                 help="Percent id cutoff for RBH hits [default: %default]")
    p.add_option("--cscore", default=90, type="int",
                 help="C-score cutoff for RBH hits [default: %default]")
    p.set_stripnames()
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ploidy, anchorfile, blastfile = args
    norbh = opts.norbh
    pctid = opts.pctid
    cs = opts.cscore
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    # Index genomes by the line on which they appear in the ploidy file.
    with open(ploidy) as fp:
        genomeidx = dict((x.split()[0], i) for i, x in enumerate(fp))

    ploidy = DictFile(ploidy)

    geneinfo(qbed, qorder, genomeidx, ploidy)
    geneinfo(sbed, sorder, genomeidx, ploidy)

    pf = blastfile.rsplit(".", 1)[0]
    cscorefile = pf + ".cscore"
    cscore([blastfile, "-o", cscorefile, "--cutoff=0", "--pct"])
    ac = AnchorFile(anchorfile)
    pairs = set((a, b) for a, b, i in ac.iter_pairs())
    logging.debug("Imported {0} pairs from `{1}`.".format(
        len(pairs), anchorfile))

    weightsfile = pf + ".weights"
    npairs = 0
    with open(cscorefile) as fp, open(weightsfile, "w") as fw:
        for row in fp:
            a, b, c, pct = row.split()
            c, pct = float(c), float(pct)
            c = int(c * 100)
            if (a, b) not in pairs:
                # Pairs outside the anchors are kept only when RBH hits
                # are enabled and both cutoffs are met.
                if norbh:
                    continue
                if c < cs:
                    continue
                if pct < pctid:
                    continue
                # This severely penalizes RBH against synteny. Floor
                # division keeps the weight an integer on Python 3.
                c //= 10

            print("\t".join((a, b, str(c))), file=fw)
            npairs += 1

    logging.debug("Write {0} pairs to `{1}`.".format(npairs, weightsfile))
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed
    from jcvi.formats.base import get_number

    p = OptionParser(bed.__doc__)
    p.add_option(
        "--switch",
        default=False,
        action="store_true",
        help="Switch reference and aligned map elements",
    )
    p.add_option(
        "--scale", type="float", help="Scale the aligned map distance by factor"
    )
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (anchorsfile,) = args
    switch = opts.switch
    scale = opts.scale
    ac = AnchorFile(anchorsfile)
    # Map each query gene to all of its anchored subject genes.
    pairs = defaultdict(list)
    for a, b, block_id in ac.iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        if q.accn not in pairs:
            continue
        for partner in pairs[q.accn]:
            si, s = sorder[partner]
            # Choose the roles afresh for every pair. Swapping shared
            # per-query variables in place would carry the previous
            # partner's values over to the next pair under --switch.
            ref, aligned = (s, q) if switch else (q, s)
            sstart = aligned.start
            if scale:
                sstart /= scale
            try:
                newsseqid = get_number(aligned.seqid)
            except ValueError:
                raise ValueError(
                    "`{0}` is on `{1}` with no number to extract".format(
                        aligned.accn, aligned.seqid
                    )
                )
            # BED start is 0-based; the name column encodes the aligned
            # element as "<seqid number>:<start>".
            bedline = "\t".join(
                str(x)
                for x in (
                    ref.seqid,
                    ref.start - 1,
                    ref.end,
                    "{0}:{1}".format(newsseqid, sstart),
                )
            )
            bd.add(bedline)

    bd.print_to_file(filename=opts.outfile, sorted=True)
def screen(args):
    """
    %prog screen anchorfile newanchorfile --qbed=qbedfile --sbed=sbedfile [options]

    Extract subset of blocks from anchorfile. Provide several options:

    1. Option --ids: a file with IDs, 0-based, comma separated, all in one line.
    2. Option --seqids: only allow seqids in this file.
    3. Option --seqpairs: only allow seqpairs in this file, one per line,
       e.g. "Chr01,Chr05".
    4. Option --minspan: remove blocks with less span than this.
    5. Option --minsize: remove blocks with less number of anchors than this.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(screen.__doc__)
    p.set_beds()

    p.add_option("--ids", help="File with block IDs (0-based) [default: %default]")
    p.add_option("--seqids", help="File with seqids [default: %default]")
    p.add_option("--seqpairs", help="File with seqpairs [default: %default]")
    p.add_option("--nointra", action="store_true",
                 help="Remove intra-chromosomal blocks [default: %default]")
    p.add_option("--minspan", default=0, type="int",
                 help="Only blocks with span >= [default: %default]")
    p.add_option("--minsize", default=0, type="int",
                 help="Only blocks with anchors >= [default: %default]")
    p.add_option("--simple", action="store_true",
                 help="Write simple anchorfile with block ends [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    anchorfile, newanchorfile = args
    ac = AnchorFile(anchorfile)
    idsfile = opts.ids
    seqidsfile = opts.seqids
    seqpairsfile = opts.seqpairs
    minspan = opts.minspan
    minsize = opts.minsize
    osimple = opts.simple
    nointra = opts.nointra
    ids, seqids, seqpairs = None, None, None

    if idsfile:
        ids = SetFile(idsfile, delimiter=',')
        ids = set(int(x) for x in ids)
    if seqidsfile:
        seqids = SetFile(seqidsfile, delimiter=',')
    if seqpairsfile:
        seqpairs = set()
        with open(seqpairsfile) as fp:
            for row in fp:
                row = row.strip()
                # Tolerate blank lines (e.g. a trailing newline).
                if not row:
                    continue
                a, b = row.split(",")
                # Pairs are accepted in either orientation.
                seqpairs.add((a, b))
                seqpairs.add((b, a))

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    blocks = ac.blocks
    selected = 0
    with open(newanchorfile, "w") as fw:
        for i, block in enumerate(blocks):
            if ids and i not in ids:
                continue

            a, b, scores = zip(*block)
            a = [qorder[x] for x in a]
            b = [sorder[x] for x in b]
            ia, oa = zip(*a)
            ib, ob = zip(*b)
            # Span is measured in gene order on each axis.
            aspan = max(ia) - min(ia) + 1
            bspan = max(ib) - min(ib) + 1
            # The seqid of the first anchor stands for the whole block.
            aseqid = oa[0].seqid
            bseqid = ob[0].seqid

            if seqids:
                if (aseqid not in seqids) or (bseqid not in seqids):
                    continue

            if seqpairs:
                if (aseqid, bseqid) not in seqpairs:
                    continue

            if nointra and aseqid == bseqid:
                continue

            if minsize:
                if len(block) < minsize:
                    continue

            if minspan:
                if aspan < minspan or bspan < minspan:
                    continue

            selected += 1
            # "###" starts each block in the output anchors file.
            print("###", file=fw)
            for line in block:
                print("\t".join(line), file=fw)

    if osimple:
        simple([newanchorfile, "--noheader",
                "--qbed=" + qbed.filename, "--sbed=" + sbed.filename])

    logging.debug("Before: {0} blocks, After: {1} blocks".
                  format(len(blocks), selected))
def depth(args):
    """
    %prog depth anchorfile --qbed qbedfile --sbed sbedfile

    Calculate the depths in the two genomes in comparison, given in --qbed and
    --sbed. The synteny blocks will be layered on the genomes, and the
    multiplicity will be summarized to stderr.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.utils.range import range_depth

    p = OptionParser(depth.__doc__)
    p.add_option("--depthfile",
                 help="Generate file with gene and depth [default: %default]")
    p.add_option("--histogram", default=False, action="store_true",
                 help="Plot histograms in PDF")
    p.add_option("--xmax", type="int", help="x-axis maximum to display in plot")
    p.add_option("--title", default=None, help="Title to display in plot")
    p.add_option("--quota", help="Force to use this quota, e.g. 1:1, 1:2 ...")
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    depthfile = opts.depthfile
    ac = AnchorFile(anchorfile)
    qranges = []
    sranges = []
    blocks = ac.blocks
    for ib in blocks:
        q, s, t = zip(*ib)
        q = [qorder[x] for x in q]
        s = [sorder[x] for x in s]
        # Each block contributes its (first, last) gene order on both axes.
        qrange = (min(q)[0], max(q)[0])
        srange = (min(s)[0], max(s)[0])
        qranges.append(qrange)
        sranges.append(srange)
        # In a self comparison the subject range lies on the same genome.
        if is_self:
            qranges.append(srange)

    qgenome = op.basename(qbed.filename).split(".")[0]
    sgenome = op.basename(sbed.filename).split(".")[0]
    qtag = "Genome {0} depths".format(qgenome)
    print("{}:".format(qtag), file=sys.stderr)
    dsq, details = range_depth(qranges, len(qbed))
    if depthfile:
        fw = open(depthfile, "w")
        write_details(fw, details, qbed)

    if is_self:
        # Nothing else is written for a self comparison; close the depth
        # file here instead of leaking the handle.
        if depthfile:
            fw.close()
            logging.debug("Depth written to `{0}`.".format(depthfile))
        return

    stag = "Genome {0} depths".format(sgenome)
    print("{}:".format(stag), file=sys.stderr)
    dss, details = range_depth(sranges, len(sbed))
    if depthfile:
        write_details(fw, details, sbed)
        fw.close()
        logging.debug("Depth written to `{0}`.".format(depthfile))

    if not opts.histogram:
        return

    from jcvi.graphics.base import plt, quickplot_ax, savefig, normalize_axes

    # Plot two histograms one for query genome, one for subject genome
    plt.figure(1, (6, 3))
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)

    # Key views cannot be concatenated with `+` on Python 3; build one list.
    xmax = opts.xmax or max([4] + list(dsq) + list(dss))
    if opts.quota:
        speak, qpeak = opts.quota.split(":")
        qpeak, speak = int(qpeak), int(speak)
    else:
        qpeak = find_peak(dsq)
        speak = find_peak(dss)

    qtag = "# of {} blocks per {} gene".format(sgenome, qgenome)
    stag = "# of {} blocks per {} gene".format(qgenome, sgenome)
    quickplot_ax(ax1, dss, 0, xmax, stag, ylabel="Percentage of genome",
                 highlight=range(1, speak + 1))
    quickplot_ax(ax2, dsq, 0, xmax, qtag, ylabel=None,
                 highlight=range(1, qpeak + 1))

    title = opts.title or "{} vs {} syntenic depths\n{}:{} pattern"\
        .format(qgenome, sgenome, speak, qpeak)
    root = f.add_axes([0, 0, 1, 1])
    # A user-supplied title may have no newline; `partition` then leaves
    # the second line empty instead of failing on the unpack.
    vs, _, pattern = title.partition('\n')
    root.text(.5, .97, vs, ha="center", va="center", color="darkslategray")
    root.text(.5, .925, pattern, ha="center", va="center",
              color="tomato", size=16)
    print(title, file=sys.stderr)

    normalize_axes(root)

    pf = anchorfile.rsplit(".", 1)[0] + ".depth"
    image_name = pf + ".pdf"
    savefig(image_name)