def pad(args): """ %prog pad blastfile cdtfile --qbed q.pad.bed --sbed s.pad.bed Test and reconstruct candidate PADs. """ from jcvi.formats.cdt import CDT p = OptionParser(pad.__doc__) add_beds(p) p.add_option("--cutoff", default=.3, type="float", help="The clustering cutoff to call similar [default: %default]") opts, args = p.parse_args(args) qbed, sbed, qorder, sorder, is_self = check_beds(p, opts) if len(args) != 2: sys.exit(not p.print_help()) cutoff = opts.cutoff blastfile, cdtfile = args cdt = CDT(cdtfile) qparts = list(cdt.iter_partitions(cutoff=cutoff)) sparts = list(cdt.iter_partitions(cutoff=cutoff, gtr=False)) qid, sid = {}, {} for i, part in enumerate(qparts): qid.update(dict((x, i) for x in part)) for i, part in enumerate(sparts): sid.update(dict((x, i) for x in part)) # Without writing files, conversion from PAD to merged PAD is done in memory for q in qbed: q.seqid = qid[q.seqid] for s in sbed: s.seqid = sid[s.seqid] qnames = range(len(qparts)) snames = range(len(sparts)) logmp = make_arrays(blastfile, qbed, sbed, qnames, snames) m, n = logmp.shape pvalue_cutoff = 1e-30 cutoff = - log(pvalue_cutoff) significant = [] for i in xrange(m): for j in xrange(n): score = logmp[i, j] if score < cutoff: continue significant.append((qparts[i], sparts[j], score)) for a, b, score in significant: print "|".join(a), "|".join(b), score logging.debug("Collected {0} PAR comparisons significant at (P < {1}).".\ format(len(significant), pvalue_cutoff))
set_human_axis(ax) plt.setp(ax.get_xticklabels() + ax.get_yticklabels(), color='gray', size=10) root.set_xlim(0, 1) root.set_ylim(0, 1) root.set_axis_off() logging.debug("Print image to `{0}` {1}".format(image_name, iopts)) plt.savefig(image_name, dpi=iopts.dpi) if __name__ == "__main__": p = OptionParser(__doc__) add_beds(p) p.add_option("--synteny", default=False, action="store_true", help="Run a fast synteny scan and display blocks [default: %default]") p.add_option("--cmap", default="Synonymous substitutions (Ks)", help="Draw colormap box on the bottom-left corner " "[default: `%default`]") p.add_option("--vmin", dest="vmin", type="float", default=0, help="Minimum value in the colormap [default: %default]") p.add_option("--vmax", dest="vmax", type="float", default=1, help="Maximum value in the colormap [default: %default]") opts, args, iopts = set_image_options(p, sys.argv[1:], figsize="8x8", dpi=90) if len(args) != 1: sys.exit(not p.print_help()) qbed, sbed, qorder, sorder, is_self = check_beds(p, opts)
def cluster(args):
    """
    %prog cluster blastfile anchorfile --qbed qbedfile --sbed sbedfile

    Cluster the segments and form PAD. This is the method described in Tang et
    al. (2010) PNAS paper. The anchorfile defines a list of synteny blocks,
    based on which the genome on one or both axis can be chopped up into pieces
    and clustered.
    """
    from jcvi.utils.range import Range

    p = OptionParser(cluster.__doc__)
    add_beds(p)
    p.add_option("--minsize", default=10, type="int",
            help="Only segment using blocks >= size [default: %default]")
    p.add_option("--path", default="~/scratch/bin",
            help="Path to the CLUSTER 3.0 binary [default: %default]")

    opts, args = p.parse_args(args)
    qbed, sbed, qorder, sorder, is_self = check_beds(p, opts)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, anchorfile = args
    minsize = opts.minsize
    ac = AnchorFile(anchorfile)
    qranges, sranges = [], []
    qextra = [x[1:] for x in qbed.get_breaks()]
    sextra = [x[1:] for x in sbed.get_breaks()]

    # Convert each synteny block into a pair of Ranges (query/subject) that
    # share the same block id. Named `block_id` to avoid shadowing builtin id().
    block_id = 0
    for q, s in ac.iter_blocks(minsize=minsize):
        q = [qorder[x][0] for x in q]
        s = [sorder[x][0] for x in s]
        minq, maxq = min(q), max(q)
        mins, maxs = min(s), max(s)
        block_id += 1
        qr = Range("0", minq, maxq, maxq - minq, block_id)
        sr = Range("0", mins, maxs, maxs - mins, block_id)
        qranges.append(qr)
        sranges.append(sr)

    # Chop each genome axis into segments (PADs) at block/break boundaries.
    qpads = list(get_segments(qranges, qextra))
    spads = list(get_segments(sranges, sextra))

    suffix = ".pad.bed"
    qpf = opts.qbed.split(".")[0]
    spf = opts.sbed.split(".")[0]
    qpadfile = qpf + suffix
    spadfile = spf + suffix
    qnpads, qpadnames = write_PAD_bed(qpadfile, qpf, qpads, qbed)
    snpads, spadnames = write_PAD_bed(spadfile, spf, spads, sbed)

    qpadbed, spadbed = Bed(qpadfile), Bed(spadfile)

    logmp = make_arrays(blastfile, qpadbed, spadbed, qpadnames, spadnames)
    m, n = logmp.shape

    # Dump the log match-probability matrix in the tab-delimited layout that
    # CLUSTER 3.0 expects ("o" is the corner cell of the header row).
    # `with` guarantees the handle is closed even if a write fails.
    matrixfile = ".".join((qpf, spf, "logmp.txt"))
    with open(matrixfile, "w") as fw:
        header = ["o"] + spadnames
        print >> fw, "\t".join(header)
        for i in xrange(m):
            row = [qpadnames[i]] + ["{0:.1f}".format(x) for x in logmp[i, :]]
            print >> fw, "\t".join(row)

    # Run CLUSTER 3.0 (Pearson correlation, average linkage)
    cmd = op.join(opts.path, "cluster")
    cmd += " -g 2 -e 2 -m a -f {0}".format(matrixfile)
    pf = matrixfile.rsplit(".", 1)[0]
    cdtfile = pf + ".cdt"
    if need_update(matrixfile, cdtfile):
        sh(cmd)
# these are already sorted. hits = [x[1] for x in hits] for ia, a in enumerate(hits[:-1]): b = hits[ia + 1] # on the same chr and rank difference no larger than tandem_Nmax if b[1] - a[1] <= tandem_Nmax and b[0] == a[0]: standems.join(a[1], b[1]) return standems if __name__ == "__main__": import optparse p = optparse.OptionParser(__doc__) add_beds(p) p.add_option("--no_strip_names", dest="strip_names", action="store_false", default=True, help="do not strip alternative splicing " "(e.g. At5g06540.1 -> At5g06540)") p.add_option("--tandems_only", dest="tandems_only", action="store_true", default=False, help="only calculate tandems, write .localdup file and exit.") filter_group = optparse.OptionGroup(p, "BLAST filters") filter_group.add_option( "--tandem_Nmax",