def group(args):
    """
    %prog group anchorfiles

    Group the anchors into ortho-groups. Can input multiple anchor files.
    """
    # The docstring above doubles as the usage text given to the option parser.
    #
    # args: list of command-line tokens; every positional one is read as an
    #       anchors file.
    # Returns the output file name taken from the parsed options.
    p = OptionParser(group.__doc__)
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    # Union every anchored pair, across all input files, into one grouper.
    groups = Grouper()
    for anchorfile in args:
        ac = AnchorFile(anchorfile)
        for a, b, _ in ac.iter_pairs():
            groups.join(a, b)

    logging.debug(
        "Created {0} groups with {1} members.".format(len(groups), groups.num_members)
    )

    outfile = opts.outfile
    fw = must_open(outfile, "w")
    try:
        for g in groups:
            # fw.write() works on Python 2 and 3 alike, unlike the
            # `print >> fw` statement, which is a SyntaxError on Python 3.
            fw.write(",".join(sorted(g)) + "\n")
    finally:
        fw.close()

    return outfile
def fuse(args):
    """
    %prog fuse *.bed *.anchors

    Fuse gene orders based on anchors file.
    """
    # The docstring above is handed to the option parser as usage text.
    # args: command-line tokens; inputs are told apart by their extension.
    from jcvi.algorithms.graph import BiGraph

    parser = OptionParser(fuse.__doc__)
    opts, args = parser.parse_args(args)

    if not args:
        sys.exit(not parser.print_help())

    bedfiles, anchorfiles = [], []
    for name in args:
        if name.endswith(".bed"):
            bedfiles.append(name)
        if name.endswith(".anchors"):
            anchorfiles.append(name)

    # TODO: Use Markov clustering to sparsify the edges
    families = Grouper()
    for anchorfile in anchorfiles:
        for gene_a, gene_b, _ in AnchorFile(anchorfile).iter_pairs():
            families.join(gene_a, gene_b)

    # Only genes that belong to some family are loaded from the BED files.
    allowed = set(families.keys())
    family_count, member_count = len(families), len(allowed)
    logging.debug(
        "Total families: {}, Gene members: {}".format(family_count, member_count)
    )

    # TODO: Use C++ implementation of BiGraph() when available
    # For now just serialize this to the disk
    graph = BiGraph()
    for bedfile in bedfiles:
        print_edges(graph, Bed(bedfile, include=allowed), families)
def fuse(args):
    """
    %prog fuse *.bed *.anchors

    Fuse gene orders based on anchors file.
    """
    # The docstring above is handed to the option parser as usage text.
    # args: command-line tokens; ".bed" and ".anchors" names are picked out.
    from jcvi.algorithms.graph import BiGraph

    parser = OptionParser(fuse.__doc__)
    opts, args = parser.parse_args(args)

    if not args:
        sys.exit(not parser.print_help())

    def with_suffix(suffix):
        # Positional arguments carrying the given file extension, in order.
        return [name for name in args if name.endswith(suffix)]

    bedfiles = with_suffix(".bed")
    anchorfiles = with_suffix(".anchors")

    # TODO: Use Markov clustering to sparsify the edges
    families = Grouper()
    for path in anchorfiles:
        anchors = AnchorFile(path)
        for pair in anchors.iter_pairs():
            left, right, _ = pair
            families.join(left, right)

    # Genes seen in any anchor pair; used to restrict what is read from BED.
    allowed = set(families.keys())
    message = "Total families: {}, Gene members: {}".format(
        len(families), len(allowed)
    )
    logging.debug(message)

    # TODO: Use C++ implementation of BiGraph() when available
    # For now just serialize this to the disk
    for path in bedfiles:
        print_edges(Bed(path, include=allowed), families)
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    # The docstring above is handed to the option parser as usage text.
    #
    # args: command-line tokens; exactly one positional anchors file.
    # Raises ValueError when no number can be extracted from the seqid that
    # ends up on the "aligned" side.
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed
    from jcvi.formats.base import get_number

    p = OptionParser(bed.__doc__)
    p.add_option("--switch", default=False, action="store_true",
                 help="Switch reference and aligned map elements")
    p.add_option("--scale", type="float",
                 help="Scale the aligned map distance by factor")
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile, = args
    switch = opts.switch
    scale = opts.scale
    ac = AnchorFile(anchorsfile)

    # query accession -> list of subject accessions it is anchored to
    pairs = defaultdict(list)
    for a, b, _ in ac.iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        qseqid, qstart, qend, qaccn = q.seqid, q.start, q.end, q.accn
        if qaccn not in pairs:
            continue

        # NOTE(review): the nesting was reconstructed from a whitespace-mangled
        # source. As written here, only the LAST subject paired with a query
        # is emitted - confirm against the original layout.
        for s in pairs[qaccn]:
            si, s = sorder[s]
            sseqid, sstart, send, saccn = s.seqid, s.start, s.end, s.accn

        if switch:
            qseqid, sseqid = sseqid, qseqid
            qstart, sstart = sstart, qstart
            qend, send = send, qend
            qaccn, saccn = saccn, qaccn

        if scale:
            sstart /= scale

        try:
            newsseqid = get_number(sseqid)
        except ValueError:
            # Call-style raise: the `raise ValueError, "..."` statement form is
            # a SyntaxError on Python 3; this spelling works on 2 and 3.
            raise ValueError(
                "`{0}` is on `{1}` with no number to extract".format(saccn, sseqid)
            )

        # BED starts are 0-based, hence qstart - 1.
        bedline = "\t".join(
            str(x)
            for x in (qseqid, qstart - 1, qend, "{0}:{1}".format(newsseqid, sstart))
        )
        bd.add(bedline)

    bd.print_to_file(filename=opts.outfile, sorted=True)
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    # The docstring above is handed to the option parser as usage text.
    # args: command-line tokens; exactly one positional anchors file.
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed, BedLine
    from jcvi.formats.base import get_number

    parser = OptionParser(bed.__doc__)
    parser.add_option("--switch", default=False, action="store_true",
                      help="Switch reference and aligned map elements")
    parser.add_option("--scale", type="float",
                      help="Scale the aligned map distance by factor")
    parser.set_beds()
    parser.set_outfile()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    [anchorsfile] = args
    switch, scale = opts.switch, opts.scale

    # query accession -> subject accessions anchored to it
    pairs = defaultdict(list)
    for query_name, subject_name, _ in AnchorFile(anchorsfile).iter_pairs():
        pairs[query_name].append(subject_name)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, parser, opts)
    out = Bed()
    for feature in qbed:
        if feature.accn not in pairs:
            continue

        # NOTE(review): nesting reconstructed from a whitespace-mangled source;
        # as written, the last subject listed for a query is the one used.
        for subject_name in pairs[feature.accn]:
            _, subject = sorder[subject_name]

        reference = (feature.seqid, feature.start, feature.end)
        aligned = (subject.seqid, subject.start, subject.end)
        if switch:
            reference, aligned = aligned, reference

        ref_seqid, ref_start, ref_end = reference
        aln_seqid, aln_start, _ = aligned
        if scale:
            aln_start /= scale

        label = "{0}:{1}".format(get_number(aln_seqid), aln_start)
        # BED starts are 0-based, hence ref_start - 1.
        fields = (ref_seqid, ref_start - 1, ref_end, label)
        out.append(BedLine("\t".join(str(x) for x in fields)))

    out.print_to_file(filename=opts.outfile, sorted=True)
def from_block_orientation(cls, anchorfile, qbed, sbed, forward_color="#e7298a", reverse_color="#3690c0"):
    """Build a palette that colors every synteny block by its orientation.

    Blocks are numbered from 1 in the order the anchors file lists them; a
    block whose orientation is reported as "-" gets `reverse_color`, any other
    block gets `forward_color`.

    Args:
        anchorfile (str): Path to the .anchors file
        qbed (BedFile): Query BED
        sbed (BedFile): Subject BED
        forward_color (str, optional): Color of forward block. Defaults to "#e7298a".
        reverse_color (str, optional): Color of reverse block. Defaults to "#3690c0".

    Returns:
        An instance of `cls` constructed with `palettedict` set to the
        block_id -> color mapping.
    """
    blocks = AnchorFile(anchorfile).blocks
    q_lookup = qbed.order
    s_lookup = sbed.order

    colors = {}
    for block_id, block in enumerate(blocks, start=1):
        q_names, s_names, _ = zip(*block)
        q_hits = [q_lookup[name] for name in q_names]
        s_hits = [s_lookup[name] for name in s_names]
        # Each lookup yields a pair; only its first item (the rank) is needed.
        q_ranks, _ = zip(*q_hits)
        s_ranks, _ = zip(*s_hits)
        if get_orientation(q_ranks, s_ranks) == "-":
            colors[block_id] = reverse_color
        else:
            colors[block_id] = forward_color

    return cls(palettedict=colors)
def pairs(args):
    """
    %prog pairs anchorsfile prefix

    Convert anchorsfile to pairsfile.
    """
    # The docstring above is handed to the option parser as usage text.
    # args: command-line tokens; two positionals (anchors file, prefix).
    # Output goes to "<prefix>.pairs".
    parser = OptionParser(pairs.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    anchorfile, prefix = args
    outfile = prefix + ".pairs"
    fw = open(outfile, "w")

    blocks = AnchorFile(anchorfile).blocks
    # Zero-pad block numbers to the width of the largest block number.
    width = len(str(len(blocks)))
    npairs = 0
    for number, block in enumerate(blocks, start=1):
        block_id = "{0}{1:0{2}d}".format(prefix, number, width)
        rows = sorted(
            "\t".join((q, s, score.replace("L", ""), block_id))
            for q, s, score in block
        )
        npairs += len(rows)
        print("\n".join(rows), file=fw)

    fw.close()
    summary = "A total of {0} pairs written to `{1}`.".format(npairs, outfile)
    logging.debug(summary)
def filter_exclude(blast_list, exclude=None):
    """Filter gene pairs from an excluded list

    A hit is dropped when its (query, subject) pair appears in the excluded
    anchors file in either orientation. When no exclude file is given, every
    hit is passed through unchanged.

    Args:
        blast_list (List[BlastLine]): List of BlastLines
        exclude (str, optional): Path to the excluded anchors file. Defaults to None.

    Yields:
        The items of `blast_list`, in order, that are not excluded.
    """
    if not exclude:
        # Nothing to exclude. Previously the default exclude=None was still
        # fed to AnchorFile, which made the default unusable.
        for hit in blast_list:
            yield hit
        return

    from jcvi.compara.synteny import AnchorFile

    # Store both orientations so the lookup below is direction-agnostic.
    excluded_pairs = set()
    for a, b, _ in AnchorFile(exclude).iter_pairs():
        excluded_pairs.add((a, b))
        excluded_pairs.add((b, a))

    for hit in blast_list:
        if (hit.query, hit.subject) in excluded_pairs:
            continue
        yield hit
def anchor2tsv(args):
    """Print the blocks of an anchors file to stdout as a tab-separated table.

    A header row (bid, gid1, gid2, score) is printed first, then one row per
    anchor. Block ids are "b" followed by the 1-based block number, zero-padded
    to the width returned by `ndigit` for the number of blocks.

    Args:
        args: object whose `fi` attribute is passed to `AnchorFile`.
    """
    blocks = AnchorFile(args.fi).blocks
    bid_template = "b%%0%dd" % ndigit(len(blocks))
    print("\t".join('bid gid1 gid2 score'.split()))
    for number, block in enumerate(blocks, start=1):
        bid = bid_template % number
        for gid1, gid2, score in block:
            print("\t".join((bid, gid1, gid2, score)))
def fuse(args):
    """
    %prog fuse *.bed *.anchors

    Fuse gene orders based on anchors file.
    """
    # The docstring above is handed to the option parser as usage text.
    # args: command-line tokens; only names ending in ".anchors" are read by
    #       this version (the previously collected ".bed" list was never used).
    p = OptionParser(fuse.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    anchorfiles = [x for x in args if x.endswith(".anchors")]

    # Union the two genes of every anchor pair across all anchors files.
    aligned_genes = Grouper()
    for anchorfile in anchorfiles:
        af = AnchorFile(anchorfile)
        for a, b, _ in af.iter_pairs():
            aligned_genes.join(a, b)

    # Parenthesised single-argument print is valid on Python 2 and 3; the bare
    # `print list(...)` statement is a SyntaxError on Python 3.
    print(list(aligned_genes))
    logging.debug("Total aligned genes: {}".format(len(aligned_genes)))
def read_clusters(qa_file, qorder, sorder):
    """Load the blocks of an anchors file as lists of positioned matches.

    Args:
        qa_file: path handed to `AnchorFile`.
        qorder: mapping from query name to an (index, feature) pair.
        sorder: mapping from subject name to an (index, feature) pair.

    Returns:
        One list per block; each item is
        ((query seqid, query index), (subject seqid, subject index), score).
    """

    def locate(a, b, score):
        # Swap the two names for (seqid, index) coordinates; keep the score.
        ia, oa = qorder[a]
        ib, ob = sorder[b]
        return (oa.seqid, ia), (ob.seqid, ib), score

    return [
        [locate(a, b, score) for a, b, score in block]
        for block in AnchorFile(qa_file).blocks
    ]
def zipbed(args):
    """
    %prog zipbed species.bed collinear.anchors

    Build ancestral contig from collinear blocks. For example, to build pre-rho
    order, use `zipbed rice.bed rice.rice.1x1.collinear.anchors`. The algorithms
    proceeds by interleaving the genes together.
    """
    # The docstring above is handed to the option parser as usage text.
    #
    # args: command-line tokens; two positionals (BED file, anchors file).
    # Writes "<prefix>.bed" with one unit-length feature per interleaved gene.
    p = OptionParser(zipbed.__doc__)
    p.add_option("--prefix", default="b",
                 help="Prefix for the new seqid [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, anchorfile = args
    prefix = opts.prefix
    bed = Bed(bedfile)
    order = bed.order

    newbedfile = prefix + ".bed"
    af = AnchorFile(anchorfile)
    blocks = af.blocks
    # Zero-pad block numbers to the width of the largest block number.
    pad = len(str(len(blocks)))

    # The handle used to be left open, so the "written" message could be
    # logged before the data was flushed; `with` closes it first.
    with open(newbedfile, "w") as fw:
        for i, block in enumerate(blocks):
            block_id = "{0}{1:0{2}d}".format(prefix, i + 1, pad)
            pairs = []
            for q, s, _ in block:
                qi, _ = order[q]
                si, _ = order[s]
                pairs.append((qi, si))
            newbed = list(interleave_pairs(pairs))
            # Separate index name: the inner loop used to clobber the outer `i`.
            for pos, b in enumerate(newbed):
                accn = bed[b].accn
                print("\t".join(str(x) for x in (block_id, pos, pos + 1, accn)),
                      file=fw)

    logging.debug("Reconstructed bedfile written to `{0}`.".format(newbedfile))
def collinear(args):
    """
    %prog collinear a.b.anchors

    Reduce synteny blocks to strictly collinear, use dynamic programming in a
    procedure similar to DAGchainer.
    """
    # The docstring above is handed to the option parser as usage text.
    # args: command-line tokens; exactly one positional anchors file.
    # Output goes next to the input, with its last extension replaced by
    # ".collinear.anchors".
    parser = OptionParser(collinear.__doc__)
    parser.set_beds()

    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    [anchorfile] = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, parser, opts)

    anchors = AnchorFile(anchorfile)
    stem = anchorfile.rsplit(".", 1)[0]
    newanchorfile = stem + ".collinear.anchors"
    fw = open(newanchorfile, "w")

    for block in anchors.blocks:
        # Every block starts with a "###" separator line.
        print("###", file=fw)

        # Translate names to BED indices and scores to numbers.
        indexed = [
            [qorder[q][0], sorder[s][0], get_number(score)]
            for q, s, score in block
        ]

        # Translate the surviving index pairs back to accessions.
        for qi, si, score in get_collinear(indexed):
            row = (qbed[qi].accn, sbed[si].accn, str(score))
            print("\t".join(row), file=fw)

    fw.close()
def mergechrom(args):
    """
    %prog mergechrom a.b.anchors

    merge synteny blocks on the same chromosome
    """
    # The docstring above is handed to the option parser as usage text.
    # args: command-line tokens; exactly one positional anchors file.
    # Output goes next to the input, with its last extension replaced by
    # ".mergechrom.anchors"; a one-line summary is printed to stdout.
    parser = OptionParser(mergechrom.__doc__)
    parser.set_beds()

    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    [anchorfile] = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, parser, opts)

    anchors = AnchorFile(anchorfile)
    newanchorfile = anchorfile.rsplit(".", 1)[0] + ".mergechrom.anchors"
    fw = open(newanchorfile, "w")

    # accession -> seqid lookups for both genomes
    q_seqid_of = {feat.accn: feat.seqid for feat in qbed}
    s_seqid_of = {feat.accn: feat.seqid for feat in sbed}

    # "<query seqid>_<subject seqid>" -> indices of the blocks that fall there;
    # a block is classified by its first anchor only.
    merged = {}
    blocks = anchors.blocks
    for index, block in enumerate(blocks):
        first_q, first_s, _ = block[0]
        key = "%s_%s" % (q_seqid_of[first_q], s_seqid_of[first_s])
        merged.setdefault(key, []).append(index)

    for indices in merged.values():
        print("###", file=fw)
        for index in indices:
            for q, s, score in blocks[index]:
                print("\t".join((q, s, str(score))), file=fw)

    fw.close()
    print("%d blocks merged to %d" % (len(blocks), len(merged)))
def read_clusters(qa_file, qorder, sorder):
    """Read the anchors file and express every match as positions plus score.

    Args:
        qa_file (str): Path to input file
        qorder (dict): Maps a query feature name to an (index, feature) pair
        sorder (dict): Maps a subject feature name to an (index, feature) pair

    Returns:
        List: One list per block; each entry is
        ((query seqid, query index), (subject seqid, subject index), score)
    """
    anchors = AnchorFile(qa_file)

    clusters = []
    for block in anchors.blocks:
        matches = []
        for query_name, subject_name, score in block:
            query_index, query_feature = qorder[query_name]
            subject_index, subject_feature = sorder[subject_name]
            query_pos = (query_feature.seqid, query_index)
            subject_pos = (subject_feature.seqid, subject_index)
            matches.append((query_pos, subject_pos, score))
        clusters.append(matches)

    return clusters
def dotplot_main(args):
    """Parse the command line, prepare the inputs and draw the dot plot.

    The option parser's usage text is the module-level `__doc__`. A positional
    argument ending in ".ks" is first converted to "<name>.anchors" (only when
    `need_update` says so) and that file is plotted instead.

    Args:
        args: list of command-line tokens; exactly one positional argument
            (an anchors file or a .ks file) is expected.
    """
    p = OptionParser(__doc__)
    p.set_beds()
    p.add_option("--synteny", default=False, action="store_true",
                 help="Run a fast synteny scan and display blocks [default: %default]")
    p.add_option("--cmaptext", help="Draw colormap box on the bottom-left corner")
    p.add_option("--vmin", dest="vmin", type="float", default=0,
                 help="Minimum value in the colormap [default: %default]")
    p.add_option("--vmax", dest="vmax", type="float", default=2,
                 help="Maximum value in the colormap [default: %default]")
    p.add_option("--genomenames", type="string", default=None,
                 help="genome names for labeling axes in the form of qname_sname, "
                      "eg. \"Vitis vinifera_Oryza sativa\"")
    p.add_option("--nmax", dest="sample_number", type="int", default=10000,
                 help="Maximum number of data points to plot [default: %default]")
    p.add_option("--minfont", type="int", default=4,
                 help="Do not render labels with size smaller than")
    p.add_option("--colormap",
                 help="Two column file, block id to color mapping [default: %default]")
    p.add_option("--nosort", default=False, action="store_true",
                 help="Do not sort the seqids along the axes")
    p.add_option("--nosep", default=False, action="store_true",
                 help="Do not add contig lines")
    p.add_option("--nostdpf", default=False, action="store_true",
                 help="Do not standardize contig names")
    p.add_option("--skipempty", default=False, action="store_true",
                 help="Skip seqids that do not have matches")
    p.add_option("--title", help="Title of the dot plot")
    p.set_outfile(outfile=None)
    opts, args, iopts = p.set_image_options(args, figsize="8x8",
                                            style="dark", dpi=90, cmap="copper")

    if len(args) != 1:
        sys.exit(not p.print_help())

    palette = opts.colormap
    if palette:
        palette = Palette(palette)

    anchorfile, = args
    cmaptext = opts.cmaptext
    if anchorfile.endswith(".ks"):
        from jcvi.apps.ks import KsFile

        logging.debug("Anchors contain Ks values")
        # Default colormap label for Ks input when the user gave none.
        cmaptext = cmaptext or "*Ks* values"
        anchorksfile = anchorfile + ".anchors"
        if need_update(anchorfile, anchorksfile):
            ksfile = KsFile(anchorfile)
            ksfile.print_to_anchors(anchorksfile)
        anchorfile = anchorksfile

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts,
                                                     sorted=(not opts.nosort))

    if opts.skipempty:
        ac = AnchorFile(anchorfile)
        if is_self:
            # One shared set: in a self comparison both axes show the same seqids.
            qseqids = sseqids = set()
        else:
            qseqids, sseqids = set(), set()

        for pair in ac.iter_pairs():
            q, s = pair[:2]
            _, q = qorder[q]
            _, s = sorder[s]
            qseqids.add(q.seqid)
            sseqids.add(s.seqid)

        if is_self:
            qbed = sbed = subset_bed(qbed, qseqids)
        else:
            qbed = subset_bed(qbed, qseqids)
            sbed = subset_bed(sbed, sseqids)

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])  # the whole canvas
    ax = fig.add_axes([.1, .1, .8, .8])  # the dot plot

    # Pass the resolved `cmaptext`, not `opts.cmaptext`: otherwise the
    # "*Ks* values" default computed above never reaches the plot.
    dotplot(anchorfile, qbed, sbed, fig, root, ax,
            vmin=opts.vmin, vmax=opts.vmax, is_self=is_self,
            synteny=opts.synteny, cmap_text=cmaptext, cmap=iopts.cmap,
            genomenames=opts.genomenames, sample_number=opts.sample_number,
            minfont=opts.minfont, palette=palette, sep=(not opts.nosep),
            title=opts.title, stdpf=(not opts.nostdpf))

    # Default image name: the anchors file with its extension replaced by the
    # chosen image format.
    image_name = opts.outfile or \
        (op.splitext(anchorfile)[0] + "." + opts.format)
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    fig.clear()
def omgprepare(args):
    """
    %prog omgprepare ploidy anchorsfile blastfile

    Prepare to run Sankoff's OMG algorithm to get orthologs.
    """
    # The docstring above is handed to the option parser as usage text.
    #
    # args: command-line tokens; three positionals (ploidy file, anchors file,
    #       BLAST file).
    # Writes "<blast prefix>.cscore" (via cscore) and "<blast prefix>.weights".
    from jcvi.formats.blast import cscore
    from jcvi.formats.base import DictFile

    p = OptionParser(omgprepare.__doc__)
    p.add_option("--norbh", action="store_true",
                 help="Disable RBH hits [default: %default]")
    p.add_option("--pctid", default=0, type="int",
                 help="Percent id cutoff for RBH hits [default: %default]")
    p.add_option("--cscore", default=90, type="int",
                 help="C-score cutoff for RBH hits [default: %default]")
    p.set_stripnames()
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ploidy, anchorfile, blastfile = args
    norbh = opts.norbh
    pctid = opts.pctid
    cs = opts.cscore
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    # First token of each ploidy line -> its 0-based line number.
    with open(ploidy) as fp:
        genomeidx = dict((x.split()[0], i) for i, x in enumerate(fp))

    ploidy = DictFile(ploidy)

    geneinfo(qbed, qorder, genomeidx, ploidy)
    geneinfo(sbed, sorder, genomeidx, ploidy)

    pf = blastfile.rsplit(".", 1)[0]
    cscorefile = pf + ".cscore"
    cscore([blastfile, "-o", cscorefile, "--cutoff=0", "--pct"])
    ac = AnchorFile(anchorfile)
    pairs = set((a, b) for a, b, i in ac.iter_pairs())
    logging.debug("Imported {0} pairs from `{1}`.".format(
        len(pairs), anchorfile))

    weightsfile = pf + ".weights"
    npairs = 0
    # `with` closes both handles; the cscore reader used to be left open.
    with open(cscorefile) as fp, open(weightsfile, "w") as fw:
        for row in fp:
            a, b, c, pct = row.split()
            c, pct = float(c), float(pct)
            c = int(c * 100)
            if (a, b) not in pairs:
                # Not a syntenic anchor: keep only if it passes the RBH filters.
                if norbh:
                    continue
                if c < cs:
                    continue
                if pct < pctid:
                    continue
                # This severely penalizes RBH against synteny. Floor division
                # keeps the weight an integer on Python 3, as `/=` did on 2.
                c //= 10

            # fw.write() replaces `print >> fw`, a SyntaxError on Python 3.
            fw.write("\t".join((a, b, str(c))) + "\n")
            npairs += 1

    logging.debug("Write {0} pairs to `{1}`.".format(npairs, weightsfile))
def omgprepare(args):
    """
    %prog omgprepare ploidy anchorsfile blastfile

    Prepare to run Sankoff's OMG algorithm to get orthologs.
    """
    # The docstring above is handed to the option parser as usage text.
    #
    # args: command-line tokens; three positionals (ploidy file, anchors file,
    #       BLAST file).
    # Side effects: runs cscore to produce "<blast prefix>.cscore", then writes
    # "<blast prefix>.weights" with one "a<TAB>b<TAB>weight" line per kept pair.
    from jcvi.formats.blast import cscore
    from jcvi.formats.base import DictFile

    p = OptionParser(omgprepare.__doc__)
    p.add_option("--norbh", action="store_true",
                 help="Disable RBH hits [default: %default]")
    p.add_option("--pctid", default=0, type="int",
                 help="Percent id cutoff for RBH hits [default: %default]")
    p.add_option("--cscore", default=90, type="int",
                 help="C-score cutoff for RBH hits [default: %default]")
    p.set_stripnames()
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ploidy, anchorfile, blastfile = args
    norbh = opts.norbh
    pctid = opts.pctid
    cs = opts.cscore
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    # First token of each ploidy line -> its 0-based line number.
    with open(ploidy) as fp:
        genomeidx = dict((x.split()[0], i) for i, x in enumerate(fp))

    ploidy = DictFile(ploidy)

    geneinfo(qbed, qorder, genomeidx, ploidy)
    geneinfo(sbed, sorder, genomeidx, ploidy)

    pf = blastfile.rsplit(".", 1)[0]
    cscorefile = pf + ".cscore"
    cscore([blastfile, "-o", cscorefile, "--cutoff=0", "--pct"])
    ac = AnchorFile(anchorfile)
    pairs = set((a, b) for a, b, i in ac.iter_pairs())
    logging.debug("Imported {0} pairs from `{1}`.".format(len(pairs), anchorfile))

    weightsfile = pf + ".weights"
    npairs = 0
    # Both handles are closed on exit; the cscore reader was never closed before.
    with open(cscorefile) as fp, open(weightsfile, "w") as fw:
        for row in fp:
            a, b, c, pct = row.split()
            c, pct = float(c), float(pct)
            c = int(c * 100)
            is_anchor = (a, b) in pairs
            if not is_anchor:
                # Non-anchor hits must pass the RBH filters.
                if norbh or c < cs or pct < pctid:
                    continue
                # This severely penalizes RBH against synteny. `//=` keeps the
                # weight integral on Python 3 (where `/=` would yield a float).
                c //= 10

            # Explicit write instead of the Python 2-only `print >> fw`.
            fw.write("\t".join((a, b, str(c))) + "\n")
            npairs += 1

    logging.debug("Write {0} pairs to `{1}`.".format(npairs, weightsfile))
def cluster(args):
    """
    %prog cluster blastfile anchorfile --qbed qbedfile --sbed sbedfile

    Cluster the segments and form PAD. This is the method described in Tang et
    al. (2010) PNAS paper. The anchorfile defines a list of synteny blocks,
    based on which the genome on one or both axis can be chopped up into pieces
    and clustered.
    """
    # The docstring above is handed to the option parser as usage text.
    #
    # args: command-line tokens; two positionals (BLAST file, anchors file).
    # Writes "<q>.pad.bed", "<s>.pad.bed" and "<q>.<s>.logmp.txt", then runs the
    # external `cluster` binary when its .cdt output needs updating.
    from jcvi.utils.range import Range

    p = OptionParser(cluster.__doc__)
    p.set_beds()
    p.add_option("--minsize", default=10, type="int",
                 help="Only segment using blocks >= size [default: %default]")
    p.add_option("--path", default="~/scratch/bin",
                 help="Path to the CLUSTER 3.0 binary [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, anchorfile = args
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, p, opts)

    minsize = opts.minsize
    ac = AnchorFile(anchorfile)
    qranges, sranges = [], []
    qextra = [x[1:] for x in qbed.get_breaks()]
    sextra = [x[1:] for x in sbed.get_breaks()]

    # `block_no` replaces a local named `id`, which shadowed the builtin.
    block_no = 0
    for block in ac.iter_blocks(minsize=minsize):
        # list(...) is required on Python 3, where zip() is not subscriptable.
        q, s = list(zip(*block))[:2]
        q = [qorder[x][0] for x in q]
        s = [sorder[x][0] for x in s]
        minq, maxq = min(q), max(q)
        mins, maxs = min(s), max(s)
        block_no += 1

        # One index range per axis, both tagged with the same block number.
        qranges.append(Range("0", minq, maxq, maxq - minq, block_no))
        sranges.append(Range("0", mins, maxs, maxs - mins, block_no))

    qpads = list(get_segments(qranges, qextra))
    spads = list(get_segments(sranges, sextra))

    suffix = ".pad.bed"
    qpf = opts.qbed.split(".")[0]
    spf = opts.sbed.split(".")[0]
    qpadfile = qpf + suffix
    spadfile = spf + suffix
    _, qpadnames = write_PAD_bed(qpadfile, qpf, qpads, qbed)
    _, spadnames = write_PAD_bed(spadfile, spf, spads, sbed)

    qpadbed, spadbed = Bed(qpadfile), Bed(spadfile)

    logmp = make_arrays(blastfile, qpadbed, spadbed, qpadnames, spadnames)
    m, _ = logmp.shape

    matrixfile = ".".join((qpf, spf, "logmp.txt"))
    # `with` guarantees the matrix is flushed and closed before the external
    # tool reads it, even if formatting a row raises.
    with open(matrixfile, "w") as fw:
        header = ["o"] + spadnames
        print("\t".join(header), file=fw)
        # range() instead of xrange(), which does not exist on Python 3.
        for i in range(m):
            row = [qpadnames[i]] + ["{0:.1f}".format(x) for x in logmp[i, :]]
            print("\t".join(row), file=fw)

    # Run CLUSTER 3.0 (Pearson correlation, average linkage)
    cmd = op.join(opts.path, "cluster")
    cmd += " -g 2 -e 2 -m a -f {0}".format(matrixfile)
    pf = matrixfile.rsplit(".", 1)[0]
    cdtfile = pf + ".cdt"
    if need_update(matrixfile, cdtfile):
        sh(cmd)
help="Skip seqids that do not have matches") opts, args, iopts = p.set_image_options(sys.argv[1:], figsize="8x8", style="dark", dpi=90, cmap="copper") if len(args) != 1: sys.exit(not p.print_help()) palette = opts.colormap if palette: palette = Palette(palette) anchorfile, = args qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts) if opts.skipempty: ac = AnchorFile(anchorfile) if is_self: qseqids = sseqids = set() else: qseqids, sseqids = set(), set() for pair in ac.iter_pairs(): q, s = pair[:2] qi, q = qorder[q] si, s = sorder[s] qseqids.add(q.seqid) sseqids.add(s.seqid) if is_self: qbed = sbed = subset_bed(qbed, qseqids) else:
def ancestral(args):
    """
    %prog ancestral vplanifoliaA.vplanifoliaA.anchors > vplanifoliaA_blocks.bed

    Paint 14 chromosomes following alpha WGD.
    """
    # The docstring above is handed to the option parser as usage text.
    # args: command-line tokens; exactly one positional anchors file.
    parser = OptionParser(ancestral.__doc__)
    parser.set_beds()

    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    [anchorsfile] = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, parser, opts)

    # We focus on the following chromosome pairs
    target_pairs = {
        (1, 1),
        (1, 6),
        (1, 8),
        (1, 13),
        (2, 4),
        (3, 12),
        (3, 14),
        (5, 6),
        (5, 8),
        (7, 9),
        (7, 11),
        (9, 10),
        (10, 11),
    }
    # Chromosome numbers that may serve as the name of a painted region.
    naming_chromosomes = (1, 2, 3, 5, 7, 10)

    def get_target(achr, bchr):
        # Skipped only when neither seqid contains "chr".
        if not ("chr" in achr or "chr" in bchr):
            return None
        low, high = get_number(achr), get_number(bchr)
        if low > high:
            low, high = high, low
        candidate = (low, high)
        return candidate if candidate in target_pairs else None

    def build_bedline(first, last, target_pair):
        # Name the region after the first member of the pair that is one of
        # the naming chromosomes.
        names = [str(x) for x in target_pair if x in naming_chromosomes]
        columns = (first.seqid, first.start, last.end, names[0])
        return "\t".join(str(x) for x in columns)

    # Iterate through the blocks, store any regions that has hits to one of the
    # target_pairs
    blocks = AnchorFile(anchorsfile).blocks
    outbed = Bed()
    for block in blocks:
        q_names, s_names, _ = zip(*block)
        q_hits = [qorder[x] for x in q_names]
        s_hits = [sorder[x] for x in s_names]
        # Each hit is a tuple; min/max pick the extreme ones and [1] takes the
        # second item, whose seqid/start/end are used below.
        astart, aend = min(q_hits)[1], max(q_hits)[1]
        bstart, bend = min(s_hits)[1], max(s_hits)[1]

        # Now convert to BED lines with new accn
        target = get_target(astart.seqid, bstart.seqid)
        if target is not None:
            outbed.add(build_bedline(astart, aend, target))
            outbed.add(build_bedline(bstart, bend, target))

    outbed.print_to_file(sorted=True)
def cluster(args):
    """
    %prog cluster blastfile anchorfile --qbed qbedfile --sbed sbedfile

    Cluster the segments and form PAD. This is the method described in Tang et
    al. (2010) PNAS paper. The anchorfile defines a list of synteny blocks,
    based on which the genome on one or both axis can be chopped up into pieces
    and clustered.
    """
    # The docstring above is handed to the option parser as usage text.
    # args: command-line tokens; two positionals (BLAST file, anchors file).
    # Writes the two PAD BED files and the logmp matrix, then runs the
    # external `cluster` binary when its .cdt output needs updating.
    from jcvi.utils.range import Range

    parser = OptionParser(cluster.__doc__)
    parser.set_beds()
    parser.add_option(
        "--minsize", default=10, type="int", help="Only segment using blocks >= size"
    )
    parser.add_option(
        "--path", default="~/scratch/bin", help="Path to the CLUSTER 3.0 binary"
    )

    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    blastfile, anchorfile = args
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, parser, opts)

    minsize = opts.minsize
    anchors = AnchorFile(anchorfile)
    qranges, sranges = [], []
    qextra = [brk[1:] for brk in qbed.get_breaks()]
    sextra = [brk[1:] for brk in sbed.get_breaks()]

    # Each qualifying block contributes one index range per axis, tagged with
    # a 1-based block number.
    for number, block in enumerate(anchors.iter_blocks(minsize=minsize), start=1):
        q_names, s_names = list(zip(*block))[:2]
        q_idx = [qorder[name][0] for name in q_names]
        s_idx = [sorder[name][0] for name in s_names]
        q_lo, q_hi = min(q_idx), max(q_idx)
        s_lo, s_hi = min(s_idx), max(s_idx)

        q_range = Range("0", q_lo, q_hi, q_hi - q_lo, number)
        s_range = Range("0", s_lo, s_hi, s_hi - s_lo, number)
        qranges.append(q_range)
        sranges.append(s_range)

    qpads = list(get_segments(qranges, qextra))
    spads = list(get_segments(sranges, sextra))

    # Output names derive from the part of each BED name before the first dot.
    qpf = opts.qbed.split(".")[0]
    spf = opts.sbed.split(".")[0]
    qpadfile = qpf + ".pad.bed"
    spadfile = spf + ".pad.bed"
    _, qpadnames = write_PAD_bed(qpadfile, qpf, qpads, qbed)
    _, spadnames = write_PAD_bed(spadfile, spf, spads, sbed)

    qpadbed, spadbed = Bed(qpadfile), Bed(spadfile)

    logmp = make_arrays(blastfile, qpadbed, spadbed, qpadnames, spadnames)
    nrows, _ = logmp.shape

    matrixfile = ".".join((qpf, spf, "logmp.txt"))
    fw = open(matrixfile, "w")
    fw.write("\t".join(["o"] + spadnames) + "\n")
    for i in range(nrows):
        cells = ["{0:.1f}".format(value) for value in logmp[i, :]]
        fw.write("\t".join([qpadnames[i]] + cells) + "\n")

    fw.close()

    # Run CLUSTER 3.0 (Pearson correlation, average linkage)
    binary = op.join(opts.path, "cluster")
    cmd = binary + " -g 2 -e 2 -m a -f {0}".format(matrixfile)
    cdtfile = matrixfile.rsplit(".", 1)[0] + ".cdt"
    if need_update(matrixfile, cdtfile):
        sh(cmd)
figsize="8x8", style="dark", dpi=90) if len(args) != 1: sys.exit(not p.print_help()) palette = opts.colormap if palette: palette = Palette(palette) anchorfile, = args qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts) if opts.skipempty: ac = AnchorFile(anchorfile) if is_self: qseqids = sseqids = set() else: qseqids, sseqids = set(), set() for pair in ac.iter_pairs(): q, s = pair[:2] qi, q = qorder[q] si, s = sorder[s] qseqids.add(q.seqid) sseqids.add(s.seqid) if is_self: qbed = sbed = subset_bed(qbed, qseqids) else: