def group(args):
    """
    %prog group anchorfiles

    Group the anchors into ortho-groups. Can input multiple anchor files.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so its wording is part of the program's output.
    #
    # args: list of command-line tokens (one or more anchor files plus the
    #       options registered by p.set_outfile()).
    # Returns the value of opts.outfile after one comma-separated line per
    # group has been written to it.
    p = OptionParser(group.__doc__)
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    anchorfiles = args
    groups = Grouper()
    # Every anchor pair, from every input file, is joined into one Grouper.
    for anchorfile in anchorfiles:
        ac = AnchorFile(anchorfile)
        for a, b, _ in ac.iter_pairs():
            groups.join(a, b)

    logging.debug("Created {0} groups with {1} members.".format(
        len(groups), groups.num_members))

    outfile = opts.outfile
    fw = must_open(outfile, "w")
    try:
        for g in groups:
            # fw.write() instead of the Python 2-only `print >> fw` statement.
            fw.write(",".join(sorted(g)) + "\n")
    finally:
        # Close the handle even if writing fails part-way through.
        fw.close()

    return outfile
def fuse(args):
    """
    %prog fuse *.bed *.anchors

    Fuse gene orders based on anchors file.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    from jcvi.algorithms.graph import BiGraph

    parser = OptionParser(fuse.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) < 1:
        sys.exit(not parser.print_help())

    # Split the positional arguments by extension; anything that is neither
    # a .bed nor an .anchors file is ignored.
    bedfiles, anchorfiles = [], []
    for path in args:
        if path.endswith(".bed"):
            bedfiles.append(path)
        elif path.endswith(".anchors"):
            anchorfiles.append(path)

    # TODO: Use Markov clustering to sparsify the edges
    families = Grouper()
    for path in anchorfiles:
        for gene_a, gene_b, _block_id in AnchorFile(path).iter_pairs():
            families.join(gene_a, gene_b)

    # Only genes that take part in at least one anchor pair are loaded
    # from the BED files below.
    allowed = set(families.keys())
    summary = "Total families: {}, Gene members: {}"
    logging.debug(summary.format(len(families), len(allowed)))

    # TODO: Use C++ implementation of BiGraph() when available
    # For now just serialize this to the disk
    for path in bedfiles:
        print_edges(Bed(path, include=allowed), families)
def fuse(args):
    """
    %prog fuse *.bed *.anchors

    Fuse gene orders based on anchors file.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    from jcvi.algorithms.graph import BiGraph

    parser = OptionParser(fuse.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) < 1:
        sys.exit(not parser.print_help())

    # Split the positional arguments by extension; anything that is neither
    # a .bed nor an .anchors file is ignored.
    bedfiles, anchorfiles = [], []
    for path in args:
        if path.endswith(".bed"):
            bedfiles.append(path)
        elif path.endswith(".anchors"):
            anchorfiles.append(path)

    # TODO: Use Markov clustering to sparsify the edges
    families = Grouper()
    for path in anchorfiles:
        for gene_a, gene_b, _block_id in AnchorFile(path).iter_pairs():
            families.join(gene_a, gene_b)

    # Only genes that take part in at least one anchor pair are loaded
    # from the BED files below.
    allowed = set(families.keys())
    summary = "Total families: {}, Gene members: {}"
    logging.debug(summary.format(len(families), len(allowed)))

    # TODO: Use C++ implementation of BiGraph() when available
    # For now just serialize this to the disk
    graph = BiGraph()
    for path in bedfiles:
        # One shared graph object is passed along with every BED file.
        print_edges(graph, Bed(path, include=allowed), families)
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # args: list of command-line tokens; exactly one positional argument
    #       (the anchors file) is required.
    # Raises ValueError when get_number() cannot extract a number from the
    # seqid of the aligned feature.
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed
    from jcvi.formats.base import get_number

    p = OptionParser(bed.__doc__)
    p.add_option("--switch", default=False, action="store_true",
                 help="Switch reference and aligned map elements")
    p.add_option("--scale", type="float",
                 help="Scale the aligned map distance by factor")
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile, = args
    switch = opts.switch
    scale = opts.scale

    # Map each first-column gene to the list of genes it is anchored to.
    ac = AnchorFile(anchorsfile)
    pairs = defaultdict(list)
    for a, b, block_id in ac.iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        if q.accn not in pairs:
            continue

        for saccn in pairs[q.accn]:
            si, s = sorder[saccn]
            # Choose the roles afresh for every pair.  The previous code
            # swapped the query variables in place, so with --switch the
            # second and later pairs of one query saw already-swapped values.
            if switch:
                ref, aligned = s, q
            else:
                ref, aligned = q, s

            aligned_start = aligned.start
            if scale:
                aligned_start /= scale

            try:
                newsseqid = get_number(aligned.seqid)
            except ValueError:
                raise ValueError(
                    "`{0}` is on `{1}` with no number to extract".format(
                        aligned.accn, aligned.seqid))

            # The BED start is written as start - 1.
            fields = (ref.seqid, ref.start - 1, ref.end,
                      "{0}:{1}".format(newsseqid, aligned_start))
            bd.add("\t".join(str(x) for x in fields))

    bd.print_to_file(filename=opts.outfile, sorted=True)
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # args: list of command-line tokens; exactly one positional argument
    #       (the anchors file) is required.
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed, BedLine
    from jcvi.formats.base import get_number

    p = OptionParser(bed.__doc__)
    p.add_option("--switch", default=False, action="store_true",
                 help="Switch reference and aligned map elements")
    p.add_option("--scale", type="float",
                 help="Scale the aligned map distance by factor")
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile, = args
    switch = opts.switch
    scale = opts.scale

    # Map each first-column gene to the list of genes it is anchored to.
    ac = AnchorFile(anchorsfile)
    pairs = defaultdict(list)
    for a, b, block_id in ac.iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        if q.accn not in pairs:
            continue

        for saccn in pairs[q.accn]:
            si, s = sorder[saccn]
            # Choose the roles afresh for every pair.  The previous code
            # swapped the query variables in place, so with --switch the
            # second and later pairs of one query saw already-swapped values.
            if switch:
                ref, aligned = s, q
            else:
                ref, aligned = q, s

            aligned_start = aligned.start
            if scale:
                aligned_start /= scale

            # The BED start is written as start - 1.
            fields = (ref.seqid, ref.start - 1, ref.end,
                      "{0}:{1}".format(get_number(aligned.seqid),
                                       aligned_start))
            bd.append(BedLine("\t".join(str(x) for x in fields)))

    bd.print_to_file(filename=opts.outfile, sorted=True)
def filter_exclude(blast_list, exclude=None):
    """
    Filter gene pairs from an excluded list

    Pairs are matched in either orientation: a hit is dropped when its
    (query, subject) appears in the anchors file as (a, b) or as (b, a).

    Args:
        blast_list (List[BlastLine]): List of BlastLines
        exclude (str, optional): Path to the excluded anchors file. Defaults to
            None, in which case nothing is excluded and every hit is yielded.

    Yields:
        The items of `blast_list`, in order, whose (query, subject) pair is
        not in the exclusion file.
    """
    if exclude is None:
        # No exclusion file: pass everything through instead of handing
        # None to AnchorFile.
        for hit in blast_list:
            yield hit
        return

    from jcvi.compara.synteny import AnchorFile

    excluded_pairs = set()
    ac = AnchorFile(exclude)
    for a, b, block in ac.iter_pairs():
        # Store both orientations so lookup is direction-agnostic.
        excluded_pairs.add((a, b))
        excluded_pairs.add((b, a))

    for hit in blast_list:
        if (hit.query, hit.subject) in excluded_pairs:
            continue
        yield hit
def fuse(args):
    """
    %prog fuse *.bed *.anchors

    Fuse gene orders based on anchors file.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # args: list of command-line tokens; .bed and .anchors files are told
    #       apart by extension.  In this version only the anchors files are
    #       consumed; bedfiles is collected but not used yet.
    p = OptionParser(fuse.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    bedfiles = [x for x in args if x.endswith(".bed")]
    anchorfiles = [x for x in args if x.endswith(".anchors")]

    # Join every anchor pair from every anchors file into one Grouper.
    aligned_genes = Grouper()
    for anchorfile in anchorfiles:
        af = AnchorFile(anchorfile)
        for a, b, block_id in af.iter_pairs():
            aligned_genes.join(a, b)

    # Parenthesised single-argument form prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print(list(aligned_genes))
    logging.debug("Total aligned genes: {}".format(len(aligned_genes)))
def omgprepare(args):
    """
    %prog omgprepare ploidy anchorsfile blastfile

    Prepare to run Sankoff's OMG algorithm to get orthologs.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # args: list of command-line tokens; exactly three positional arguments
    #       (ploidy file, anchors file, blast file) are required.
    # Side effects: runs cscore() to produce `<prefix>.cscore` and writes
    # `<prefix>.weights`, where <prefix> is the blast file name without its
    # last extension.
    from jcvi.formats.blast import cscore
    from jcvi.formats.base import DictFile

    p = OptionParser(omgprepare.__doc__)
    p.add_option("--norbh", action="store_true",
                 help="Disable RBH hits [default: %default]")
    p.add_option("--pctid", default=0, type="int",
                 help="Percent id cutoff for RBH hits [default: %default]")
    p.add_option("--cscore", default=90, type="int",
                 help="C-score cutoff for RBH hits [default: %default]")
    p.set_stripnames()
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ploidy, anchorfile, blastfile = args
    norbh = opts.norbh
    pctid = opts.pctid
    cs = opts.cscore
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    # First column of each ploidy line -> its 0-based line index.
    with open(ploidy) as fp:
        genomeidx = dict((x.split()[0], i) for i, x in enumerate(fp))

    ploidy = DictFile(ploidy)

    geneinfo(qbed, qorder, genomeidx, ploidy)
    geneinfo(sbed, sorder, genomeidx, ploidy)

    pf = blastfile.rsplit(".", 1)[0]
    cscorefile = pf + ".cscore"
    cscore([blastfile, "-o", cscorefile, "--cutoff=0", "--pct"])
    ac = AnchorFile(anchorfile)
    pairs = set((a, b) for a, b, i in ac.iter_pairs())
    logging.debug("Imported {0} pairs from `{1}`.".format(len(pairs), anchorfile))

    weightsfile = pf + ".weights"
    npairs = 0
    # Both handles are closed on exit, including on error; the input handle
    # used to be left open.
    with open(cscorefile) as fp, open(weightsfile, "w") as fw:
        for row in fp:
            a, b, c, pct = row.split()
            c, pct = float(c), float(pct)
            c = int(c * 100)
            if (a, b) not in pairs:
                # Pairs absent from the anchors file must pass the cutoffs.
                if norbh:
                    continue
                if c < cs:
                    continue
                if pct < pctid:
                    continue
                # Floor division keeps the weight an integer on Python 3 as
                # well as Python 2.
                c //= 10  # This severely penalizes RBH against synteny

            fw.write("\t".join((a, b, str(c))) + "\n")
            npairs += 1

    logging.debug("Write {0} pairs to `{1}`.".format(npairs, weightsfile))
def dotplot_main(args):
    """Parse the command line and draw a dot plot for one anchors file.

    Args:
        args: list of command-line tokens; exactly one positional argument
            (the anchors file, or a `.ks` file) is required.

    A `.ks` input is converted through KsFile.print_to_anchors() into
    `<input>.anchors` (only when need_update() says so) and that file is
    plotted instead.  The image is saved to --outfile when given, otherwise
    to the anchors file name with its extension replaced by the image format.
    """
    # The usage text is the module docstring, not this function's.
    p = OptionParser(__doc__)
    p.set_beds()
    p.add_option("--synteny", default=False, action="store_true",
                 help="Run a fast synteny scan and display blocks [default: %default]")
    p.add_option("--cmaptext", help="Draw colormap box on the bottom-left corner")
    p.add_option("--vmin", dest="vmin", type="float", default=0,
                 help="Minimum value in the colormap [default: %default]")
    p.add_option("--vmax", dest="vmax", type="float", default=2,
                 help="Maximum value in the colormap [default: %default]")
    p.add_option("--genomenames", type="string", default=None,
                 help="genome names for labeling axes in the form of qname_sname, "
                      "eg. \"Vitis vinifera_Oryza sativa\"")
    p.add_option("--nmax", dest="sample_number", type="int", default=10000,
                 help="Maximum number of data points to plot [default: %default]")
    p.add_option("--minfont", type="int", default=4,
                 help="Do not render labels with size smaller than")
    p.add_option("--colormap",
                 help="Two column file, block id to color mapping [default: %default]")
    p.add_option("--nosort", default=False, action="store_true",
                 help="Do not sort the seqids along the axes")
    p.add_option("--nosep", default=False, action="store_true",
                 help="Do not add contig lines")
    p.add_option("--nostdpf", default=False, action="store_true",
                 help="Do not standardize contig names")
    p.add_option("--skipempty", default=False, action="store_true",
                 help="Skip seqids that do not have matches")
    p.add_option("--title", help="Title of the dot plot")
    p.set_outfile(outfile=None)
    opts, args, iopts = p.set_image_options(args, figsize="8x8",
                                            style="dark", dpi=90, cmap="copper")

    if len(args) != 1:
        sys.exit(not p.print_help())

    palette = opts.colormap
    if palette:
        palette = Palette(palette)

    anchorfile, = args
    cmaptext = opts.cmaptext
    if anchorfile.endswith(".ks"):
        from jcvi.apps.ks import KsFile

        logging.debug("Anchors contain Ks values")
        # Default colormap label for Ks input unless the user supplied one.
        cmaptext = cmaptext or "*Ks* values"
        anchorksfile = anchorfile + ".anchors"
        if need_update(anchorfile, anchorksfile):
            ksfile = KsFile(anchorfile)
            ksfile.print_to_anchors(anchorksfile)
        anchorfile = anchorksfile

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts,
                                                     sorted=(not opts.nosort))

    if opts.skipempty:
        ac = AnchorFile(anchorfile)
        if is_self:
            # Self comparison: both axes deliberately share one set.
            qseqids = sseqids = set()
        else:
            qseqids, sseqids = set(), set()

        # Collect the seqids that carry at least one anchor.
        for pair in ac.iter_pairs():
            q, s = pair[:2]
            qi, q = qorder[q]
            si, s = sorder[s]
            qseqids.add(q.seqid)
            sseqids.add(s.seqid)

        if is_self:
            qbed = sbed = subset_bed(qbed, qseqids)
        else:
            qbed = subset_bed(qbed, qseqids)
            sbed = subset_bed(sbed, sseqids)

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])  # the whole canvas
    ax = fig.add_axes([.1, .1, .8, .8])  # the dot plot

    # Pass the resolved `cmaptext`, not `opts.cmaptext`: the latter dropped
    # the "*Ks* values" default computed above for .ks input.
    dotplot(anchorfile, qbed, sbed, fig, root, ax,
            vmin=opts.vmin, vmax=opts.vmax, is_self=is_self,
            synteny=opts.synteny, cmap_text=cmaptext, cmap=iopts.cmap,
            genomenames=opts.genomenames, sample_number=opts.sample_number,
            minfont=opts.minfont, palette=palette, sep=(not opts.nosep),
            title=opts.title, stdpf=(not opts.nostdpf))

    image_name = opts.outfile or \
        (op.splitext(anchorfile)[0] + "." + opts.format)
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    fig.clear()
def omgprepare(args):
    """
    %prog omgprepare ploidy anchorsfile blastfile

    Prepare to run Sankoff's OMG algorithm to get orthologs.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # args: list of command-line tokens; exactly three positional arguments
    #       (ploidy file, anchors file, blast file) are required.
    # Side effects: runs cscore() to produce `<prefix>.cscore` and writes
    # `<prefix>.weights`, where <prefix> is the blast file name without its
    # last extension.
    from jcvi.formats.blast import cscore
    from jcvi.formats.base import DictFile

    p = OptionParser(omgprepare.__doc__)
    p.add_option("--norbh", action="store_true",
                 help="Disable RBH hits [default: %default]")
    p.add_option("--pctid", default=0, type="int",
                 help="Percent id cutoff for RBH hits [default: %default]")
    p.add_option("--cscore", default=90, type="int",
                 help="C-score cutoff for RBH hits [default: %default]")
    p.set_stripnames()
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ploidy, anchorfile, blastfile = args
    norbh = opts.norbh
    pctid = opts.pctid
    cs = opts.cscore
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    # First column of each ploidy line -> its 0-based line index.
    with open(ploidy) as fp:
        genomeidx = dict((x.split()[0], i) for i, x in enumerate(fp))

    ploidy = DictFile(ploidy)

    geneinfo(qbed, qorder, genomeidx, ploidy)
    geneinfo(sbed, sorder, genomeidx, ploidy)

    pf = blastfile.rsplit(".", 1)[0]
    cscorefile = pf + ".cscore"
    cscore([blastfile, "-o", cscorefile, "--cutoff=0", "--pct"])
    ac = AnchorFile(anchorfile)
    pairs = set((a, b) for a, b, i in ac.iter_pairs())
    logging.debug("Imported {0} pairs from `{1}`.".format(
        len(pairs), anchorfile))

    weightsfile = pf + ".weights"
    npairs = 0
    # Both handles are closed on exit, including on error; the input handle
    # used to be left open.
    with open(cscorefile) as fp, open(weightsfile, "w") as fw:
        for row in fp:
            a, b, c, pct = row.split()
            c, pct = float(c), float(pct)
            c = int(c * 100)
            if (a, b) not in pairs:
                # Pairs absent from the anchors file must pass the cutoffs.
                if norbh:
                    continue
                if c < cs:
                    continue
                if pct < pctid:
                    continue
                # Floor division keeps the weight an integer on Python 3 as
                # well as Python 2.
                c //= 10  # This severely penalizes RBH against synteny

            fw.write("\t".join((a, b, str(c))) + "\n")
            npairs += 1

    logging.debug("Write {0} pairs to `{1}`.".format(npairs, weightsfile))
palette = opts.colormap if palette: palette = Palette(palette) anchorfile, = args qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts) if opts.skipempty: ac = AnchorFile(anchorfile) if is_self: qseqids = sseqids = set() else: qseqids, sseqids = set(), set() for pair in ac.iter_pairs(): q, s = pair[:2] qi, q = qorder[q] si, s = sorder[s] qseqids.add(q.seqid) sseqids.add(s.seqid) if is_self: qbed = sbed = subset_bed(qbed, qseqids) else: qbed = subset_bed(qbed, qseqids) sbed = subset_bed(sbed, sseqids) image_name = op.splitext(anchorfile)[0] + "." + opts.format dotplot_main(anchorfile, qbed,
palette = opts.colormap if palette: palette = Palette(palette) anchorfile, = args qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts) if opts.skipempty: ac = AnchorFile(anchorfile) if is_self: qseqids = sseqids = set() else: qseqids, sseqids = set(), set() for pair in ac.iter_pairs(): q, s = pair[:2] qi, q = qorder[q] si, s = sorder[s] qseqids.add(q.seqid) sseqids.add(s.seqid) if is_self: qbed = sbed = subset_bed(qbed, qseqids) else: qbed = subset_bed(qbed, qseqids) sbed = subset_bed(sbed, sseqids) image_name = op.splitext(anchorfile)[0] + "." + opts.format dotplot_main(anchorfile, qbed, sbed, image_name, iopts, vmin=opts.vmin, vmax=opts.vmax, is_self=is_self,