def pad(args):
    """
    %prog pad blastfile cdtfile --qbed q.pad.bed --sbed s.pad.bed

    Test and reconstruct candidate PADs.
    """
    from jcvi.formats.cdt import CDT

    p = OptionParser(pad.__doc__)
    add_beds(p)
    p.add_option("--cutoff", default=.3, type="float",
            help="The clustering cutoff to call similar [default: %default]")

    opts, args = p.parse_args(args)
    qbed, sbed, qorder, sorder, is_self = check_beds(p, opts)

    if len(args) != 2:
        sys.exit(not p.print_help())

    cutoff = opts.cutoff
    blastfile, cdtfile = args

    cdt = CDT(cdtfile)
    qparts = list(cdt.iter_partitions(cutoff=cutoff))
    sparts = list(cdt.iter_partitions(cutoff=cutoff, gtr=False))

    qid, sid = {}, {}
    for i, part in enumerate(qparts):
        qid.update(dict((x, i) for x in part))
    for i, part in enumerate(sparts):
        sid.update(dict((x, i) for x in part))

    # Without writing files, conversion from PAD to merged PAD is done in memory
    for q in qbed:
        q.seqid = qid[q.seqid]
    for s in sbed:
        s.seqid = sid[s.seqid]

    qnames = range(len(qparts))
    snames = range(len(sparts))

    logmp = make_arrays(blastfile, qbed, sbed, qnames, snames)
    m, n = logmp.shape
    pvalue_cutoff = 1e-30
    cutoff = - log(pvalue_cutoff)

    significant = []
    for i in xrange(m):
        for j in xrange(n):
            score = logmp[i, j]
            if score < cutoff:
                continue
            significant.append((qparts[i], sparts[j], score))

    for a, b, score in significant:
        print "|".join(a), "|".join(b), score

    logging.debug("Collected {0} PAD comparisons significant at (P < {1}).".\
            format(len(significant), pvalue_cutoff))

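def _pad_significance_demo():
    """
    Minimal sketch (not used by the pipeline) of the thresholding step in
    pad(): make_arrays() is assumed to return a matrix of -log(P) scores
    for each (query PAD, subject PAD) pair, so a cell is significant when
    its score exceeds -log(pvalue_cutoff). All names and values here are
    illustrative only.
    """
    from math import log
    import numpy as np

    logmp = np.array([[75.2, 10.1],
                      [ 5.0, 80.3]])   # toy -log(P) matrix
    cutoff = -log(1e-30)               # ~69.08
    m, n = logmp.shape
    hits = [(i, j, logmp[i, j])
            for i in range(m) for j in range(n)
            if logmp[i, j] >= cutoff]
    return hits                        # [(0, 0, 75.2), (1, 1, 80.3)]
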
if __name__ == "__main__": p = OptionParser(__doc__) add_beds(p) p.add_option("--synteny", default=False, action="store_true", help="Run a fast synteny scan and display blocks [default: %default]") p.add_option("--cmap", default="Synonymous substitutions (Ks)", help="Draw colormap box on the bottom-left corner " "[default: `%default`]") p.add_option("--vmin", dest="vmin", type="float", default=0, help="Minimum value in the colormap [default: %default]") p.add_option("--vmax", dest="vmax", type="float", default=1, help="Maximum value in the colormap [default: %default]") opts, args, iopts = set_image_options(p, sys.argv[1:], figsize="8x8", dpi=90) if len(args) != 1: sys.exit(not p.print_help()) qbed, sbed, qorder, sorder, is_self = check_beds(p, opts) synteny = opts.synteny vmin, vmax = opts.vmin, opts.vmax cmap_text = opts.cmap anchorfile = args[0] image_name = op.splitext(anchorfile)[0] + "." + opts.format dotplot(anchorfile, qbed, sbed, image_name, vmin, vmax, iopts, is_self=is_self, synteny=synteny, cmap_text=cmap_text)
def cluster(args):
    """
    %prog cluster blastfile anchorfile --qbed qbedfile --sbed sbedfile

    Cluster the segments and form PAD. This is the method described in Tang et
    al. (2010) PNAS paper. The anchorfile defines a list of synteny blocks,
    based on which the genome on one or both axes can be chopped up into
    pieces and clustered.
    """
    from jcvi.utils.range import Range

    p = OptionParser(cluster.__doc__)
    add_beds(p)
    p.add_option("--minsize", default=10, type="int",
            help="Only segment using blocks >= size [default: %default]")
    p.add_option("--path", default="~/scratch/bin",
            help="Path to the CLUSTER 3.0 binary [default: %default]")

    opts, args = p.parse_args(args)
    qbed, sbed, qorder, sorder, is_self = check_beds(p, opts)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, anchorfile = args
    minsize = opts.minsize
    ac = AnchorFile(anchorfile)
    qranges, sranges = [], []
    qextra = [x[1:] for x in qbed.get_breaks()]
    sextra = [x[1:] for x in sbed.get_breaks()]

    id = 0
    for q, s in ac.iter_blocks(minsize=minsize):
        q = [qorder[x][0] for x in q]
        s = [sorder[x][0] for x in s]
        minq, maxq = min(q), max(q)
        mins, maxs = min(s), max(s)
        id += 1
        qr = Range("0", minq, maxq, maxq - minq, id)
        sr = Range("0", mins, maxs, maxs - mins, id)
        qranges.append(qr)
        sranges.append(sr)

    qpads = list(get_segments(qranges, qextra))
    spads = list(get_segments(sranges, sextra))

    suffix = ".pad.bed"
    qpf = opts.qbed.split(".")[0]
    spf = opts.sbed.split(".")[0]
    qpadfile = qpf + suffix
    spadfile = spf + suffix
    qnpads, qpadnames = write_PAD_bed(qpadfile, qpf, qpads, qbed)
    snpads, spadnames = write_PAD_bed(spadfile, spf, spads, sbed)

    qpadbed, spadbed = Bed(qpadfile), Bed(spadfile)

    logmp = make_arrays(blastfile, qpadbed, spadbed, qpadnames, spadnames)
    m, n = logmp.shape

    matrixfile = ".".join((qpf, spf, "logmp.txt"))
    fw = open(matrixfile, "w")
    header = ["o"] + spadnames
    print >> fw, "\t".join(header)
    for i in xrange(m):
        row = [qpadnames[i]] + ["{0:.1f}".format(x) for x in logmp[i, :]]
        print >> fw, "\t".join(row)

    fw.close()

    # Run CLUSTER 3.0 (Pearson correlation, average linkage)
    cmd = op.join(opts.path, "cluster")
    cmd += " -g 2 -e 2 -m a -f {0}".format(matrixfile)
    pf = matrixfile.rsplit(".", 1)[0]
    cdtfile = pf + ".cdt"
    if need_update(matrixfile, cdtfile):
        sh(cmd)

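# The matrix written by cluster() is a plain tab-delimited table: "o" in the
# top-left corner, subject PAD names across the header, and one row of
# -log(P) scores per query PAD. A toy example (PAD names hypothetical):
#
#   o           speach_0    speach_1
#   qgrape_0        75.2        10.1
#   qgrape_1         5.0        80.3
#
# CLUSTER 3.0 is then run on this file with the flags used above
# (per the comment in cluster(): Pearson correlation, average linkage):
#
#   ~/scratch/bin/cluster -g 2 -e 2 -m a -f grape.peach.logmp.txt
#
# producing grape.peach.logmp.cdt, which pad() consumes via CDT().
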
def main(blast_file, p, opts):
    # `p` is the OptionParser built by the caller; check_beds() needs it
    # along with the parsed opts.
    qbed, sbed, qorder, sorder, is_self = check_beds(p, opts)

    tandem_Nmax = opts.tandem_Nmax
    filter_repeats = opts.filter_repeats
    cscore = opts.cscore

    fp = open(blast_file)
    total_lines = sum(1 for line in fp)
    logging.debug("Load BLAST file `%s` (total %d lines)" % \
            (blast_file, total_lines))
    fp.seek(0)
    blasts = sorted([BlastLine(line) for line in fp], \
            key=lambda b: b.score, reverse=True)

    filtered_blasts = []
    seen = set()
    ostrip = opts.strip_names
    nwarnings = 0
    for b in blasts:
        query, subject = b.query, b.subject
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)
        if query not in qorder:
            if nwarnings < 100:
                logging.warning("{0} not in {1}".format(query, qbed.filename))
            elif nwarnings == 100:
                logging.warning("too many warnings.. suppressed")
            nwarnings += 1
            continue
        if subject not in sorder:
            if nwarnings < 100:
                logging.warning("{0} not in {1}".format(subject, sbed.filename))
            elif nwarnings == 100:
                logging.warning("too many warnings.. suppressed")
            nwarnings += 1
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # move all hits to same side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        key = query, subject
        if key in seen:
            continue
        seen.add(key)

        b.query, b.subject = key
        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q.seqid, s.seqid

        filtered_blasts.append(b)

    if tandem_Nmax is not None:
        logging.debug("running the local dups filter (tandem_Nmax=%d) .." % \
                tandem_Nmax)

        qtandems = tandem_grouper(qbed, filtered_blasts,
                flip=True, tandem_Nmax=tandem_Nmax)
        standems = tandem_grouper(sbed, filtered_blasts,
                flip=False, tandem_Nmax=tandem_Nmax)

        qdups_fh = open(op.splitext(qbed.filename)[0] + ".localdups", "w") \
                if opts.tandems_only else None

        if is_self:
            for s in standems:
                qtandems.join(*s)
            qdups_to_mother = write_localdups(qtandems, qbed, qdups_fh)
            sdups_to_mother = qdups_to_mother
        else:
            qdups_to_mother = write_localdups(qtandems, qbed, qdups_fh)
            sdups_fh = open(op.splitext(sbed.filename)[0] + ".localdups", "w") \
                    if opts.tandems_only else None
            sdups_to_mother = write_localdups(standems, sbed, sdups_fh)

        if opts.tandems_only:
            # write out new .bed after tandem removal
            write_new_bed(qbed, qdups_to_mother)
            if not is_self:
                write_new_bed(sbed, sdups_to_mother)
            # just want to use this script as a tandem finder.
            sys.exit()

        before_filter = len(filtered_blasts)
        filtered_blasts = list(filter_tandem(filtered_blasts, \
                qdups_to_mother, sdups_to_mother))
        logging.debug("after filter (%d->%d) .." % \
                (before_filter, len(filtered_blasts)))

    if filter_repeats:
        before_filter = len(filtered_blasts)
        logging.debug("running the repeat filter")
        filtered_blasts = list(filter_repeat(filtered_blasts))
        logging.debug("after filter (%d->%d) .." % \
                (before_filter, len(filtered_blasts)))

    if cscore is not None:
        before_filter = len(filtered_blasts)
        logging.debug("running the cscore filter (cscore>=%.2f) .." % cscore)
        filtered_blasts = list(filter_cscore(filtered_blasts, cscore=cscore))
        logging.debug("after filter (%d->%d) .." % \
                (before_filter, len(filtered_blasts)))

    blastfilteredfile = blast_file + ".filtered"
    fw = open(blastfilteredfile, "w")
    write_new_blast(filtered_blasts, fh=fw)
    fw.close()

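def _filter_cscore_demo(blasts, cscore=.7):
    """
    Sketch of the assumed behavior of filter_cscore() above: the C-score of
    a hit (q, s) is its score divided by the larger of the best score seen
    for q and the best score seen for s; hits below the cutoff are dropped.
    This helper is illustrative only and is not called by main().
    """
    best_q, best_s = {}, {}
    for b in blasts:
        best_q[b.query] = max(b.score, best_q.get(b.query, 0))
        best_s[b.subject] = max(b.score, best_s.get(b.subject, 0))

    for b in blasts:
        # keep hits whose score is within `cscore` of both partners' best
        if b.score >= cscore * max(best_q[b.query], best_s[b.subject]):
            yield b
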