def print_gffline(self, fw, f, seqid, parent=None):
    """Format feature `f` as a GffLine and print it to `fw`.

    Args:
        fw: writable file handle receiving the GFF line.
        f: feature exposing `type`, `location.start`, `location.end`,
            `strand` and `qualifiers` (presumably a Biopython SeqFeature --
            TODO confirm against caller).
        seqid: value used for the first GFF column; also used as the ID
            when the `MT` qualifier is present.
        parent: optional feature; when truthy its `LT` qualifier overrides
            the ID derived from `f`.

    Side effects:
        May insert `LT` into `f.qualifiers`, increments `self.counter` for
        CDS features, and stores the final ID in `self.current_id`.
    """
    score = phase = "."
    # "source" features are reported as "contig" in the output.
    ftype = "contig" if f.type == "source" else f.type

    attr = "ID=tmp"  # placeholder, replaced below
    source = self.source

    # The start coordinate is shifted by one; the end is used as-is.
    start = get_number(f.location.start) + 1
    end = get_number(f.location.end)
    strand = "-" if f.strand < 0 else "+"
    fields = (seqid, source, ftype, start, end, score, strand, phase, attr)
    g = GffLine("\t".join(str(x) for x in fields))

    qual = f.qualifiers
    if MT in qual:
        fid = seqid
    elif LT in qual:
        (fid,) = qual[LT]
    else:
        # No identifier on this feature: inherit the most recent one.
        qual[LT] = [self.current_id]
        (fid,) = qual[LT]

    fid = fid.split()[0]

    if parent:
        (fid,) = parent.qualifiers[LT]
        fid = fid.split()[0]

    if ftype == "CDS":
        # Each CDS gets a numbered ID under its parent.
        parent_id = fid
        self.counter[fid] += 1
        fid = parent_id + ".cds.{0}".format(self.counter[parent_id])
        g.attributes["Parent"] = [parent_id]

    assert fid != "tmp", f
    g.attributes["ID"] = [fid]

    # NOTE(review): the "product" check is reconstructed as nested under the
    # mRNA branch; the collapsed source does not show the nesting -- confirm.
    if ftype == "mRNA":
        g.attributes["Name"] = g.attributes["ID"]
        if "product" in qual:
            (note,) = qual["product"]
            g.attributes["Note"] = [note]

    if "pseudo" in qual:
        g.attributes["Note"] = ["Pseudogene"]

    g.update_attributes()
    # Function-call form of print, matching the other writers in this file.
    print(g, file=fw)

    self.current_id = fid
def get_target(achr, bchr):
    """Return the ordered chromosome-number pair if it is a target pair.

    Returns None when neither name contains "chr", or when the ordered
    pair of numbers is not in `target_pairs`.
    """
    if not ("chr" in achr or "chr" in bchr):
        return None

    first, second = get_number(achr), get_number(bchr)
    # Normalise so the smaller number always comes first.
    pair = (second, first) if first > second else (first, second)
    return pair if pair in target_pairs else None
def print_gffline(self, fw, f, seqid, parent=None):
    """Format feature `f` as a GffLine and print it to `fw`.

    Args:
        fw: writable file handle receiving the GFF line.
        f: feature exposing `type`, `location.start`, `location.end`,
            `strand` and `qualifiers` (presumably a Biopython SeqFeature --
            TODO confirm against caller).
        seqid: value used for the first GFF column; also used as the ID
            when the `MT` qualifier is present.
        parent: optional feature; when truthy its `LT` qualifier overrides
            the ID derived from `f`.

    Side effects:
        May insert `LT` into `f.qualifiers`, appends the (start, end) span
        to `self.counter[(id, type)]`, updates `self.start` / `self.end`
        for mRNA and gene features, and sets `self.current_id`.
        Returns early, without printing, for a repeated gene.
    """
    score = phase = "."
    # "source" features are reported as "contig" in the output.
    ftype = "contig" if f.type == "source" else f.type

    attr = "ID=tmp"  # placeholder, replaced below
    source = self.source

    start = get_number(f.location.start) + 1
    end = get_number(f.location.end)
    strand = "-" if f.strand < 0 else "+"
    fields = (seqid, source, ftype, start, end, score, strand, phase, attr)
    g = GffLine("\t".join(str(x) for x in fields))

    qual = f.qualifiers
    if MT in qual:
        fid = seqid
    elif LT in qual:
        (fid,) = qual[LT]
    else:
        # No identifier on this feature: inherit the most recent one.
        qual[LT] = [self.current_id]
        (fid,) = qual[LT]

    fid = fid.split()[0]

    if parent:
        (fid,) = parent.qualifiers[LT]
        fid = fid.split()[0]

    assert fid != "tmp", f
    oid = fid

    spans = self.counter[(oid, ftype)]
    spans.append((start, end))
    count = len(spans)

    if ftype in ("mRNA", "gene"):
        if ftype == "gene" and count > 1:
            # Only the first occurrence of a gene is printed.
            return
        self.start = min(a for a, b in spans)
        # Fixed: the original took max() over the start coordinates, so
        # `self.end` never reflected any end position.
        self.end = max(b for a, b in spans)
        self.set_attribute("gene", "Alias", qual, g)
        self.set_attribute("product", "Note", qual, g)
    else:
        # Child features get a numbered ID under the parent's ID.
        fid = fid + ".{0}.{1}".format(ftype.lower(), count)
        g.attributes["Parent"] = [oid]
        self.set_attribute("product", "Note", qual, g)

    g.attributes["ID"] = [fid]
    g.update_attributes()
    print(g, file=fw)

    self.current_id = oid
def print_gffline(self, fw, f, seqid, parent=None):
    """Format feature `f` as a GffLine and print it to `fw`.

    Args:
        fw: writable file handle receiving the GFF line.
        f: feature exposing `type`, `location.start`, `location.end`,
            `strand` and `qualifiers` (presumably a Biopython SeqFeature --
            TODO confirm against caller).
        seqid: value used for the first GFF column; also used as the ID
            when the `MT` qualifier is present.
        parent: optional feature; when truthy its `LT` qualifier overrides
            the ID derived from `f`.

    Side effects:
        May insert `LT` into `f.qualifiers`, appends the (start, end) span
        to `self.counter[(id, type)]`, updates `self.start` / `self.end`
        for mRNA and gene features, and sets `self.current_id`.
        Returns early, without printing, for a repeated gene.
    """
    score = phase = "."
    feature_type = f.type
    if feature_type == "source":
        # "source" features are reported as "contig" in the output.
        feature_type = "contig"

    attr = "ID=tmp"  # placeholder, replaced below
    source = self.source

    start = get_number(f.location.start) + 1
    end = get_number(f.location.end)
    strand = "-" if f.strand < 0 else "+"
    g = "\t".join(
        str(x)
        for x in (seqid, source, feature_type, start, end, score, strand,
                  phase, attr)
    )
    g = GffLine(g)

    qual = f.qualifiers
    if MT in qual:
        feature_id = seqid
    elif LT in qual:
        (feature_id,) = qual[LT]
    else:
        # No identifier on this feature: inherit the most recent one.
        qual[LT] = [self.current_id]
        (feature_id,) = qual[LT]

    feature_id = feature_id.split()[0]

    if parent:
        (feature_id,) = parent.qualifiers[LT]
        feature_id = feature_id.split()[0]

    assert feature_id != "tmp", f
    oid = feature_id

    key = (oid, feature_type)
    self.counter[key].append((start, end))
    count = len(self.counter[key])

    if feature_type in ("mRNA", "gene"):
        if feature_type == "gene" and count > 1:
            # Only the first occurrence of a gene is printed.
            return
        self.start = min(a for a, b in self.counter[key])
        # Fixed: the original took max() over the start coordinates, so
        # `self.end` never reflected any end position.
        self.end = max(b for a, b in self.counter[key])
        self.set_attribute("gene", "Alias", qual, g)
        self.set_attribute("product", "Note", qual, g)
    else:
        # Child features get a numbered ID under the parent's ID.
        suffix = ".{0}.{1}".format(feature_type.lower(), count)
        feature_id = feature_id + suffix
        g.attributes["Parent"] = [oid]
        self.set_attribute("product", "Note", qual, g)

    g.attributes["ID"] = [feature_id]
    g.update_attributes()
    # Function-call form of print, matching the other writers in this file;
    # `print >> fw, g` fails on Python 3.
    print(g, file=fw)

    self.current_id = oid
def atg_name(name, retval="chr,rank", trimpad0=True):
    """Extract named parts of a locus identifier.

    `retval` is a comma-separated list of regex group names (`locus`,
    `prefix`, `chr`, `sep`, `rank`, `iso`). When `name` matches and its
    separator is one of the known ones, the requested parts are returned:
    a generator when several were requested, the bare value when only one
    was. Otherwise a generator yielding one None per requested part is
    returned.

    `chr` goes through `chr_number`; `rank` goes through `get_number` when
    `trimpad0` is true; every other group is returned as matched.
    """
    atg_name_pat = re.compile(
        r"""
            ^(?P<locus>
                (?:(?P<prefix>\D+[\D\d\D])\.?)(?P<chr>[\d|C|M]+)(?P<sep>[A-z]+)(?P<rank>\d+)
            )
            \.?(?P<iso>\d+)?
            """, re.VERBOSE)

    known_seps = ["g", "te", "trna", "s", "u"]
    zero_padded = ["rank"]

    hit = re.match(atg_name_pat, name) if name is not None else None
    if hit is None or hit.group('sep').lower() not in known_seps:
        return (None for x in retval.split(","))

    parts = []
    for grp in retval.split(","):
        raw = hit.group(grp)
        if grp == 'chr':
            parts.append(chr_number(raw))
        elif trimpad0 and grp in zero_padded:
            parts.append(get_number(raw))
        else:
            parts.append(raw)

    if len(parts) > 1:
        return (x for x in parts)
    return parts[0]
def coge(args):
    """
    %prog coge *.gff

    Prepare coge datasets.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    p = OptionParser(coge.__doc__)
    opts, args = p.parse_args(args)

    if not args:
        sys.exit(not p.print_help())

    for gff in args:
        # The second-to-last dot-separated token must look like "gid<N>".
        name_parts = op.basename(gff).split(".")
        gid_token = name_parts[-2]
        assert gid_token.startswith("gid")
        genome_fasta = "genome_{0}.faa.fasta".format(get_number(gid_token))

        # The species label is the first two "_"-separated words of the
        # leading token.
        species = "_".join(name_parts[0].split("_")[:2])
        cds_fasta = species + ".cds.fasta"

        load_args = [
            gff,
            genome_fasta,
            "--id_attribute=Parent",
            "--outfile={0}".format(cds_fasta),
        ]
        load(load_args)
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed
    from jcvi.formats.base import get_number

    p = OptionParser(bed.__doc__)
    p.add_option("--switch", default=False, action="store_true",
                 help="Switch reference and aligned map elements")
    p.add_option("--scale", type="float",
                 help="Scale the aligned map distance by factor")
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (anchorsfile,) = args
    switch = opts.switch
    scale = opts.scale

    # Map each query accession to the subject accessions it is anchored to.
    ac = AnchorFile(anchorsfile)
    pairs = defaultdict(list)
    for a, b, block_id in ac.iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        if q.accn not in pairs:
            continue

        for s in pairs[q.accn]:
            si, s = sorder[s]
            # Re-read the query fields for every anchor. The original read
            # them once per query, so the --switch swap below overwrote
            # them and corrupted the second and later anchors of a query.
            qseqid, qstart, qend, qaccn = q.seqid, q.start, q.end, q.accn
            sseqid, sstart, send, saccn = s.seqid, s.start, s.end, s.accn
            if switch:
                qseqid, sseqid = sseqid, qseqid
                qstart, sstart = sstart, qstart
                qend, send = send, qend
                qaccn, saccn = saccn, qaccn
            if scale:
                sstart /= scale
            try:
                newsseqid = get_number(sseqid)
            except ValueError:
                # Call form of raise: `raise ValueError, "..."` is a syntax
                # error on Python 3.
                raise ValueError(
                    "`{0}` is on `{1}` with no number to extract".
                    format(saccn, sseqid))
            bedline = "\t".join(
                str(x) for x in (qseqid, qstart - 1, qend,
                                 "{0}:{1}".format(newsseqid, sstart)))
            bd.add(bedline)

    bd.print_to_file(filename=opts.outfile, sorted=True)
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed, BedLine
    from jcvi.formats.base import get_number

    p = OptionParser(bed.__doc__)
    p.add_option("--switch", default=False, action="store_true",
                 help="Switch reference and aligned map elements")
    p.add_option("--scale", type="float",
                 help="Scale the aligned map distance by factor")
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (anchorsfile,) = args
    switch = opts.switch
    scale = opts.scale

    # Map each query accession to the subject accessions it is anchored to.
    ac = AnchorFile(anchorsfile)
    pairs = defaultdict(list)
    for a, b, block_id in ac.iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        if q.accn not in pairs:
            continue

        for s in pairs[q.accn]:
            si, s = sorder[s]
            # Re-read the query fields for every anchor. The original read
            # them once per query, so the --switch swap below overwrote
            # them and corrupted the second and later anchors of a query.
            qseqid, qstart, qend, qaccn = q.seqid, q.start, q.end, q.accn
            sseqid, sstart, send, saccn = s.seqid, s.start, s.end, s.accn
            if switch:
                qseqid, sseqid = sseqid, qseqid
                qstart, sstart = sstart, qstart
                qend, send = send, qend
                qaccn, saccn = saccn, qaccn
            if scale:
                sstart /= scale

            # The name column encodes "<subject chromosome number>:<pos>".
            name = "{0}:{1}".format(get_number(sseqid), sstart)
            bedline = "\t".join(
                str(x) for x in (qseqid, qstart - 1, qend, name))
            bd.append(BedLine(bedline))

    bd.print_to_file(filename=opts.outfile, sorted=True)
def remove_isoforms(ids):
    """
    Keep one id per group of ids that share everything before the last ".".

    This is more or less a hack to remove the GMAP multiple mappings, which
    show up as names ending in .mrna1, .mrna2, etc. Within each group the id
    whose last "."-separated token has the smallest `get_number` value is
    kept. Groups are returned in sorted order of their shared stem.
    """
    def stem(name):
        return name.rsplit(".", 1)[0]

    def isoform_rank(name):
        return get_number(name.split(".")[-1])

    # groupby only merges adjacent items, hence the sort on the same key.
    ordered = sorted(ids, key=stem)
    return [
        min(list(members), key=isoform_rank)
        for _, members in groupby(ordered, key=stem)
    ]
def atg_name(name, retval="chr,rank", trimpad0=True):
    """Extract named parts of a locus identifier using `atg_name_pat`.

    `retval` is a comma-separated list of regex group names. On a match
    whose separator is one of the known ones, the requested parts are
    returned: a generator when several were requested, the bare value when
    only one was. Otherwise a generator yielding one None per requested
    part is returned.

    `chr` goes through `chr_number`; `rank` goes through `get_number` when
    `trimpad0` is true; every other group is returned as matched.
    """
    known_seps = ["g", "te", "trna", "s", "u", "nc"]
    zero_padded = ["rank"]

    def no_match():
        return (None for _ in retval.split(","))

    if name is None:
        return no_match()

    m = re.match(atg_name_pat, name)
    if m is None or m.group("sep").lower() not in known_seps:
        return no_match()

    def extract(grp):
        if grp == "chr":
            return chr_number(m.group(grp))
        if trimpad0 and grp in zero_padded:
            return get_number(m.group(grp))
        return m.group(grp)

    retvals = [extract(grp) for grp in retval.split(",")]
    if len(retvals) > 1:
        return (x for x in retvals)
    return retvals[0]
def annotate_chr(chr, chrbed, g, scores, nbedline, abedline, opts, splits):
    """Annotate the bed lines of one chromosome with their best-scoring hits.

    For every line in `chrbed`, the line's `accn` is rewritten to include
    the accepted hit accessions from `scores`, and the line is stored in
    `abedline`. Lines whose group in `g` contains more than one accession
    matching `new_id_pat` are recorded in `splits` instead.

    Mutates `scores` (sorted in place per accession; hits may be re-filed
    under the joined group key), `abedline`, `splits` and `line.accn`.
    `nbedline` is accepted but not used here.

    Returns:
        The `(abedline, splits)` pair that was passed in.

    NOTE(review): the block nesting is reconstructed from a collapsed
    source line -- confirm against version control before relying on it.
    """
    current_chr = get_number(chr)

    for line in chrbed:
        accn = line.accn
        # Lines without a group, or off-chromosome lines in atg_name mode,
        # are passed through untouched.
        if accn not in g or (opts.atg_name and "chr" not in chr):
            abedline[accn] = line
            continue

        gaccns = g[accn]
        new = [a for a in gaccns if re.search(new_id_pat, a)]
        newgrp = ";".join(sorted(new))

        if accn in scores:
            # Two stable sorts: the primary key is field 3 descending,
            # ties are ordered by field 1.
            scores[accn] = sorted(scores[accn], key=lambda x: x[1])
            scores[accn] = sorted(scores[accn], key=lambda x: float(x[3]),
                                  reverse=True)

            accns = []
            print(accn, file=sys.stderr)
            for elem in scores[accn]:
                print("\t" + ", ".join([str(x) for x in elem[1:]]),
                      file=sys.stderr)
                if opts.atg_name:
                    achr, arank = atg_name(elem[1])
                    # Skip hits that do not parse or sit on another
                    # chromosome.
                    if not achr or achr != current_chr:
                        continue

                accns.append(elem[1])
                if len(new) > 1:
                    if newgrp not in scores:
                        scores[newgrp] = []
                    scores[newgrp].append(elem)
                else:
                    accns[0:0] = [accn]
                    line.accn = ";".join([str(x) for x in accns])
                if len(scores[accn]) > 1:
                    break

        if len(new) > 1:
            splits.add(newgrp)
        else:
            abedline[line.accn] = line

    return abedline, splits
def atg_name(name, retval="chr,rank", trimpad0=True):
    """Extract named parts of a locus identifier using `atg_name_pat`.

    `retval` is a comma-separated list of regex group names. On a match
    whose separator is one of the known ones, the requested parts are
    returned: a generator when several were requested, the bare value when
    only one was. Otherwise a generator yielding one None per requested
    part is returned.

    `chr` goes through `chr_number`; `rank` goes through `get_number` when
    `trimpad0` is true; every other group is returned as matched.
    """
    accepted_seps = ["g", "te", "trna", "s", "u", "nc"]
    padded_groups = ["rank"]

    match = None
    if name is not None:
        match = re.match(atg_name_pat, name)

    if match is not None and match.group('sep').lower() in accepted_seps:
        values = []
        for grp in retval.split(","):
            text = match.group(grp)
            if grp == 'chr':
                values.append(chr_number(text))
                continue
            strip_padding = trimpad0 and grp in padded_groups
            values.append(get_number(text) if strip_padding else text)

        if len(values) > 1:
            return (x for x in values)
        return values[0]

    return (None for x in retval.split(","))
def collinear(args):
    """
    %prog collinear a.b.anchors

    Reduce synteny blocks to strictly collinear, use dynamic programming in a
    procedure similar to DAGchainer.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    p = OptionParser(collinear.__doc__)
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (anchorfile,) = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    af = AnchorFile(anchorfile)
    newanchorfile = anchorfile.rsplit(".", 1)[0] + ".collinear.anchors"

    # Context manager so the output handle is closed even when a lookup or
    # get_collinear() raises; the original leaked it on any error.
    with open(newanchorfile, "w") as fw:
        for block in af.blocks:
            print("#" * 3, file=fw)

            # Translate accessions into their order indices for chaining.
            iblock = []
            for q, s, score in block:
                qi, q = qorder[q]
                si, s = sorder[s]
                iblock.append([qi, si, get_number(score)])

            # Translate the surviving index pairs back into accessions.
            for qi, si, score in get_collinear(iblock):
                print("\t".join((qbed[qi].accn, sbed[si].accn, str(score))),
                      file=fw)
def coge(args):
    """
    %prog coge *.gff

    Prepare coge datasets.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    p = OptionParser(coge.__doc__)
    opts, args = p.parse_args(args)

    if len(args) == 0:
        sys.exit(not p.print_help())

    for gff in args:
        tokens = op.basename(gff).split(".")

        # The second-to-last dot-separated token must look like "gid<N>".
        gid = tokens[-2]
        assert gid.startswith("gid")
        gid = get_number(gid)

        # The species label is the first two "_"-separated words of the
        # leading token.
        species_words = tokens[0].split("_")[:2]
        outfile_option = "--outfile={0}".format(
            "_".join(species_words) + ".cds.fasta")

        load([gff,
              "genome_{0}.faa.fasta".format(gid),
              "--id_attribute=Parent",
              outfile_option])
def atg_name(name, retval="chr,rank", trimpad0=True):
    """Extract named parts of a locus identifier.

    `retval` is a comma-separated list of regex group names (`locus`,
    `prefix`, `chr`, `sep`, `rank`, `iso`). On a match whose separator is
    one of the known ones, a generator over the requested parts is
    returned; `chr` and `rank` go through `get_number` when `trimpad0` is
    true, other groups are returned as matched.

    In every other case (`name` is None, no match, unknown separator) a
    generator yielding one None per requested part is returned, so callers
    can always unpack the result. The original returned a bare None on one
    of those paths, which broke `achr, arank = atg_name(...)`.
    """
    atg_name_pat = re.compile(r"""
            ^(?P<locus>
                (?:(?P<prefix>\D+[\D\d\D])\.?)(?P<chr>[\d|C|M]+)?(?P<sep>[A-z]+)(?P<rank>\d+)
            )
            \.?(?P<iso>\d+)?
            """, re.VERBOSE)

    seps = ["g", "te", "trna", "s", "u"]
    pad0s = ["chr", "rank"]

    if name is not None:
        m = atg_name_pat.match(name)
        if m is not None and m.group('sep').lower() in seps:
            retvals = []
            for grp in retval.split(","):
                val = get_number(m.group(grp)) \
                    if trimpad0 and grp in pad0s \
                    else m.group(grp)
                retvals.append(val)

            return (x for x in retvals)

    # Single fall-through for every "no usable match" case.
    return (None for x in retval.split(","))
def multihistogram(args):
    """
    %prog multihistogram *.histogram species

    Plot the histogram based on a set of K-mer hisotograms. The method is based
    on Star et al.'s method (Atlantic Cod genome paper).
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    p = OptionParser(multihistogram.__doc__)
    p.add_option("--kmin", default=15, type="int",
                 help="Minimum K-mer size, inclusive")
    p.add_option("--kmax", default=30, type="int",
                 help="Maximum K-mer size, inclusive")
    p.add_option("--vmin", default=2, type="int",
                 help="Minimum value, inclusive")
    p.add_option("--vmax", default=100, type="int",
                 help="Maximum value, inclusive")
    opts, args, iopts = p.set_image_options(args, figsize="10x5", dpi=300)

    # At least one histogram file plus the species name is required. The
    # original accepted a lone argument and then crashed unpacking an empty
    # zip() further down.
    if len(args) < 2:
        sys.exit(not p.print_help())

    histfiles = args[:-1]
    species = args[-1]
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    A = fig.add_axes([0.08, 0.12, 0.38, 0.76])  # left panel: spectra
    B = fig.add_axes([0.58, 0.12, 0.38, 0.76])  # right panel: size estimates

    lines = []
    legends = []
    genomesizes = []
    for histfile in histfiles:
        ks = KmerSpectrum(histfile)
        x, y = ks.get_xy(opts.vmin, opts.vmax)
        # K is read from the file name: the text after the last "-" in the
        # part before the first ".".
        K = get_number(op.basename(histfile).split(".")[0].split("-")[-1])
        if not opts.kmin <= K <= opts.kmax:
            continue

        (line,) = A.plot(x, y, "-", lw=1)
        lines.append(line)
        legends.append("K = {0}".format(K))
        ks.analyze(K=K, method="allpaths")
        genomesizes.append((K, ks.genomesize / 1e6))

    leg = A.legend(lines, legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)

    title = "{0} genome K-mer histogram".format(species)
    A.set_title(markup(title))
    xlabel, ylabel = "Coverage (X)", "Counts"
    A.set_xlabel(xlabel)
    A.set_ylabel(ylabel)
    set_human_axis(A)

    title = "{0} genome size estimate".format(species)
    B.set_title(markup(title))
    x, y = zip(*genomesizes)
    B.plot(x, y, "ko", mfc="w")
    # Quadratic fit of estimated genome size against K.
    t = np.linspace(opts.kmin - 0.5, opts.kmax + 0.5, 100)
    fit = np.poly1d(np.polyfit(x, y, 2))
    B.plot(t, fit(t), "r:")

    xlabel, ylabel = "K-mer size", "Estimated genome size (Mb)"
    B.set_xlabel(xlabel)
    B.set_ylabel(ylabel)
    set_ticklabels_helvetica(B)

    labels = ((0.04, 0.96, "A"), (0.54, 0.96, "B"))
    panel_labels(root, labels)

    normalize_axes(root)
    imagename = species + ".multiK.pdf"
    savefig(imagename, dpi=iopts.dpi, iopts=iopts)
def ancestral(args):
    """
    %prog ancestral ancestral.txt assembly.fasta

    Karyotype evolution of pineapple. The figure is inspired by Amphioxus paper
    Figure 3 and Tetradon paper Figure 9.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    p = OptionParser(ancestral.__doc__)
    opts, args, iopts = p.set_image_options(args, figsize="8x7")

    if len(args) != 2:
        sys.exit(not p.print_help())

    regionsfile, sizesfile = args
    regions = RegionsFile(regionsfile)
    sizes = Sizes(sizesfile).mapping
    # Keep only sequences whose name starts with "LG". `.items()` replaces
    # the Python 2-only `.iteritems()`.
    sizes = dict((k, v) for (k, v) in sizes.items() if k[:2] == "LG")
    maxsize = max(sizes.values())
    ratio = .5 / maxsize

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes((0, 0, 1, 1))

    from jcvi.graphics.base import set2
    # Reorder the first seven palette colours.
    a, b, c, d, e, f, g = set2[:7]
    set2 = (c, g, b, e, d, a, f)

    # Upper panel is the evolution of segments
    # All segments belong to one of seven karyotypes 1 to 7
    karyotypes = regions.karyotypes
    xgap = 1. / (1 + len(karyotypes))
    ygap = .05
    mgap = xgap / 4.5
    gwidth = mgap * .75
    tip = .02
    coords = {}
    for i, k in enumerate(regions.karyotypes):
        x = (i + 1) * xgap
        y = .9
        root.text(x, y + tip, "Anc" + k, ha="center")
        root.plot((x, x), (y, y - ygap), "k-", lw=2)
        y -= 2 * ygap
        # Four leaf nodes a-d, joined pairwise and then into one root.
        coords['a'] = (x - 1.5 * mgap, y)
        coords['b'] = (x - .5 * mgap, y)
        coords['c'] = (x + .5 * mgap, y)
        coords['d'] = (x + 1.5 * mgap, y)
        coords['ab'] = join_nodes_vertical(root, coords, 'a', 'b', y + ygap / 2)
        coords['cd'] = join_nodes_vertical(root, coords, 'c', 'd', y + ygap / 2)
        coords['abcd'] = join_nodes_vertical(root, coords, 'ab', 'cd', y + ygap)
        for n in 'abcd':
            nx, ny = coords[n]
            root.text(nx, ny - tip, n, ha="center")
            coords[n] = (nx, ny - ygap / 2)

        kdata = regions.get_karyotype(k)
        for kd in kdata:
            g = kd.group
            gx, gy = coords[g]
            gsize = ratio * kd.span
            gy -= gsize
            patch = Rectangle((gx - gwidth / 2, gy), gwidth, gsize, lw=0,
                              color=set2[i])
            root.add_patch(patch)
            root.text(gx, gy + gsize / 2, kd.chromosome,
                      ha="center", va="center", color='w')
            # The next segment of this group stacks below this one.
            coords[g] = (gx, gy - tip)

    # Bottom panel shows the location of segments on chromosomes
    # TODO: redundant code, similar to graphics.chromosome
    ystart = .54
    n_chromosomes = len(sizes)
    xstart, xend = xgap - 2 * mgap, 1 - xgap + 2 * mgap
    # max(..., 1) avoids a ZeroDivisionError when only one chromosome is
    # present; the interval is unused in that case (index 0 only).
    xinterval = (xend - xstart - gwidth) / max(n_chromosomes - 1, 1)
    chrpos = {}
    for a, (seqname, clen) in enumerate(sorted(sizes.items())):
        chrom = get_number(seqname)
        xx = xstart + a * xinterval + gwidth / 2
        chrpos[chrom] = xx
        root.text(xx, ystart + .01, chrom, ha="center")
        Chromosome(root, xx, ystart, ystart - clen * ratio, width=gwidth)

    # Start painting
    for r in regions:
        xx = chrpos[r.chromosome]
        yystart = ystart - r.start * ratio
        yyend = ystart - r.end * ratio
        patch = Rectangle((xx - gwidth / 2, yystart), gwidth, yyend - yystart,
                          color=set2[int(r.karyotype) - 1], lw=0)
        root.add_patch(patch)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    pf = "pineapple-karyotype"
    image_name = pf + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def multihistogram(args):
    """
    %prog multihistogram *.histogram species

    Plot the histogram based on a set of K-mer hisotograms. The method is based
    on Star et al.'s method (Atlantic Cod genome paper).
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    import sys

    p = OptionParser(multihistogram.__doc__)
    p.add_option("--kmin", default=15, type="int",
                 help="Minimum K-mer size, inclusive")
    p.add_option("--kmax", default=30, type="int",
                 help="Maximum K-mer size, inclusive")
    p.add_option("--vmin", default=2, type="int",
                 help="Minimum value, inclusive")
    p.add_option("--vmax", default=100, type="int",
                 help="Maximum value, inclusive")
    opts, args, iopts = p.set_image_options(args, figsize="10x5", dpi=300)

    # At least one histogram file plus the species name is required. The
    # original had no check, so empty args raised IndexError on args[-1].
    if len(args) < 2:
        sys.exit(not p.print_help())

    histfiles = args[:-1]
    species = args[-1]
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    A = fig.add_axes([0.08, 0.12, 0.38, 0.76])  # left panel: spectra
    B = fig.add_axes([0.58, 0.12, 0.38, 0.76])  # right panel: size estimates

    lines = []
    legends = []
    genomesizes = []
    for histfile in histfiles:
        ks = KmerSpectrum(histfile)
        x, y = ks.get_xy(opts.vmin, opts.vmax)
        # K is read from the file name: the text after the last "-" in the
        # part before the first ".".
        K = get_number(op.basename(histfile).split(".")[0].split("-")[-1])
        if not opts.kmin <= K <= opts.kmax:
            continue

        (line,) = A.plot(x, y, "-", lw=1)
        lines.append(line)
        legends.append("K = {0}".format(K))
        ks.analyze(K=K)
        genomesizes.append((K, ks.genomesize / 1e6))

    leg = A.legend(lines, legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)

    title = "{0} genome K-mer histogram".format(species)
    A.set_title(markup(title))
    xlabel, ylabel = "Coverage (X)", "Counts"
    A.set_xlabel(xlabel)
    A.set_ylabel(ylabel)
    set_human_axis(A)

    title = "{0} genome size estimate".format(species)
    B.set_title(markup(title))
    x, y = zip(*genomesizes)
    B.plot(x, y, "ko", mfc="w")
    # Quadratic fit of estimated genome size against K.
    t = np.linspace(opts.kmin - 0.5, opts.kmax + 0.5, 100)
    fit = np.poly1d(np.polyfit(x, y, 2))
    B.plot(t, fit(t), "r:")

    xlabel, ylabel = "K-mer size", "Estimated genome size (Mb)"
    B.set_xlabel(xlabel)
    B.set_ylabel(ylabel)
    set_ticklabels_helvetica(B)

    labels = ((0.04, 0.96, "A"), (0.54, 0.96, "B"))
    panel_labels(root, labels)

    normalize_axes(root)
    imagename = species + ".multiK.pdf"
    savefig(imagename, dpi=iopts.dpi, iopts=iopts)
def renumber(args):
    """
    %prog renumber Mt35.consolidated.bed > tagged.bed

    Renumber genes for annotation updates.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    from jcvi.algorithms.lis import longest_increasing_subsequence
    from jcvi.utils.grouper import Grouper

    p = OptionParser(renumber.__doc__)
    p.add_option("--pad0", default=6, type="int",
                 help="Pad gene identifiers with 0 [default: %default]")
    p.add_option("--prefix", default="Medtr",
                 help="Genome prefix [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (bedfile,) = args

    pf = bedfile.rsplit(".", 1)[0]
    abedfile = pf + ".a.bed"
    bbedfile = pf + ".b.bed"
    if need_update(bedfile, (abedfile, bbedfile)):
        prepare(bedfile)

    # Group accessions that share a ";"-joined name in the .b.bed file.
    mbed = Bed(bbedfile)
    g = Grouper()
    for s in mbed:
        accn = s.accn
        g.join(*accn.split(";"))

    bed = Bed(abedfile)
    for chr, sbed in bed.sub_beds():
        if "chr" not in chr:
            continue

        current_chr = get_number(chr)
        ranks = []

        # Collect the ranks of genes whose name places them on this
        # chromosome.
        gg = set()
        for s in sbed:
            accn = s.accn
            achr, arank = atg_name(accn)
            if achr != current_chr:
                continue
            ranks.append(arank)
            gg.add(accn)

        lranks = longest_increasing_subsequence(ranks)
        print(current_chr, len(sbed), "==>", len(ranks),
              "==>", len(lranks), file=sys.stderr)

        # Names (default and "te" separator) for the ranks that lie on the
        # longest increasing subsequence.
        granks = set(gene_name(current_chr, x) for x in lranks) | \
            set(gene_name(current_chr, x, sep="te") for x in lranks)

        tagstore = {}
        for s in sbed:
            achr, arank = atg_name(s.accn)
            accn = s.accn
            if accn in granks:
                tag = (accn, FRAME)
            elif accn in gg:
                tag = (accn, RETAIN)
            else:
                tag = (".", NEW)

            tagstore[accn] = tag

        # Find cases where genes overlap
        for s in sbed:
            accn = s.accn
            gaccn = g[accn]
            tags = [((tagstore[x][-1] if x in tagstore else NEW), x)
                    for x in gaccn]
            # Within a group, the member whose tag comes first in PRIORITY
            # wins.
            group = [(PRIORITY.index(tag), x) for tag, x in tags]
            best = min(group)[-1]

            if accn != best:
                tag = (best, OVERLAP)
            else:
                tag = tagstore[accn]

            print("\t".join((str(s), "|".join(tag))))
def allocate(self, info, chr, start_id, end_id, id_table):
    """Assign gene ids between `start_id` and `end_id` to the items in `info`.

    Args:
        info: bed-like records (with `start`, `end`, `accn`) needing ids.
        chr: sequence name; its number is extracted with `get_number`.
        start_id, end_id: exclusive bounds of the usable id range.
        id_table: dict updated in place with `accn -> gene name`.

    Side effects:
        Runs `intersectBed` against `self.gapfile` through `popen`, writes
        progress to stderr, and adds every assigned `(chr, rank)` to
        `self.black`.
    """
    start_bp = info[0].start
    end_bp = info[-1].end

    current_chr = get_number(chr)
    needed = info
    assert end_id > start_id, \
        "end ({0}) > start ({1})".format(end_id, start_id)

    spots = end_id - start_id - 1
    # `range` replaces the Python 2-only `xrange`.
    available = [x for x in range(start_id + 1, end_id)
                 if (current_chr, x) not in self.black]

    message = "chr{0} need {1} ids, has {2} spots ({3} available)".\
        format(current_chr, len(needed), spots, len(available))

    start_gene = gene_name(current_chr, start_id)
    end_gene = gene_name(current_chr, end_id)
    message += " between {0} - {1}\n".format(start_gene, end_gene)

    assert end_bp > start_bp

    # Find the gaps that fall inside the region.
    b = "\t".join(str(x) for x in (chr, start_bp - 1, end_bp))
    cmd = "echo '{0}' |".format(b)
    cmd += " intersectBed -a {0} -b stdin".format(self.gapfile)
    gaps = list(BedLine(x) for x in popen(cmd, debug=False))
    ngaps = len(gaps)

    # Reserve ids for gaps in proportion to their size.
    gapsexpanded = []
    GeneDensity = 10000.  # assume 10Kb per gene
    for gap in gaps:
        gap_bp = int(gap.score)
        gap_ids = int(round(gap_bp / GeneDensity))
        gapsexpanded += [gap] * gap_ids

    lines = sorted(info + gapsexpanded, key=lambda x: x.start)

    message += "between bp: {0} - {1}, there are {2} gaps (total {3} ids)".\
        format(start_bp, end_bp, ngaps, len(lines))

    needed = lines
    stride = Stride(needed, available)
    conf = stride.conf
    message += " stride: {0}".format(conf)
    print(message, file=sys.stderr)

    nneeded = len(needed)
    if conf is None:  # prefix rule - prepend version number for spills
        magic = 400000  # version 4
        firstdigit = 100000
        step = 10  # stride for the prefixed ids
        rank = start_id + magic
        if rank > magic + firstdigit:
            rank -= firstdigit
        available = []
        while len(available) != nneeded:
            rank += step
            if (current_chr, rank) in self.black:  # avoid blacklisted ids
                continue
            available.append(rank)

    else:  # follow the best stride
        available = stride.available
        if start_id == 0:  # follow right flank at start of chr
            available = available[-nneeded:]
        else:  # follow left flank otherwise
            available = available[:nneeded]

    # Finally assign the ids
    assert len(needed) == len(available)
    for b, rank in zip(needed, available):
        name = gene_name(current_chr, rank)
        print("\t".join((str(b), name)), file=sys.stderr)
        id_table[b.accn] = name
        self.black.add((current_chr, rank))
    print(file=sys.stderr)
def draw_chromosomes(
    root,
    bedfile,
    sizes,
    iopts,
    mergedist,
    winsize,
    imagemap,
    mappingfile=None,
    gauge=False,
    legend=True,
    empty=False,
    title=None,
):
    """Paint the features of `bedfile` onto chromosome ideograms on `root`.

    Args:
        root: axes to draw on.
        bedfile: BED file of features; each `accn` is treated as a class.
        sizes: optional sizes file; when falsy, chromosome lengths are
            taken from the largest feature end per sequence.
        iopts: image options (`w`, `h`, `dpi` are read for the image map).
        mergedist: same-class features closer than this are drawn as
            touching the previous one.
        winsize: window size used to chunk image-map areas.
        imagemap: when truthy, write `<prefix>.map` with HTML areas.
        mappingfile: optional tab-delimited accn -> class file; a third
            column, when present, supplies preset colours.
        gauge: draw a length gauge.
        legend: draw the class legend.
        empty: label for an extra unfilled legend box; falsy disables it.
        title: optional figure title.

    NOTE(review): block nesting is reconstructed from a collapsed source
    -- confirm against version control before relying on it.
    """
    bed = Bed(bedfile)
    prefix = bedfile.rsplit(".", 1)[0]

    # Image-map lines are collected here and written in one go at the end,
    # so no file handle stays open (or leaks on error) while drawing.
    map_lines = []
    if imagemap:
        imgmapfile = prefix + ".map"
        map_lines.append('<map id="' + prefix + '">')

    if mappingfile:
        mappings = DictFile(mappingfile, delimiter="\t")
        classes = sorted(set(mappings.values()))
        preset_colors = (DictFile(
            mappingfile, keypos=1, valuepos=2, delimiter="\t")
            if DictFile.num_columns(mappingfile) >= 3 else {})
    else:
        classes = sorted(set(x.accn for x in bed))
        mappings = dict((x, x) for x in classes)
        preset_colors = {}

    logging.debug("A total of {} classes found: {}".format(
        len(classes), ",".join(classes)))

    # Assign colors to classes
    ncolors = max(3, min(len(classes), 12))
    palette = set1_n if ncolors <= 8 else set3_n
    colorset = palette(number=ncolors)
    colorset = sample_N(colorset, len(classes))
    class_colors = dict(zip(classes, colorset))
    class_colors.update(preset_colors)
    logging.debug("Assigned colors: {}".format(class_colors))

    chr_lens = {}
    centromeres = {}
    if sizes:
        chr_lens = Sizes(sizes).sizes_mapping
    else:
        for b, blines in groupby(bed, key=(lambda x: x.seqid)):
            blines = list(blines)
            maxlen = max(x.end for x in blines)
            chr_lens[b] = maxlen

    # Record centromere positions and relabel every feature with its class
    # ("-" when it has none).
    for b in bed:
        accn = b.accn
        if accn == "centromere":
            centromeres[b.seqid] = b.start
        if accn in mappings:
            b.accn = mappings[accn]
        else:
            b.accn = "-"

    chr_number = len(chr_lens)
    if centromeres:
        assert chr_number == len(
            centromeres), "chr_number = {}, centromeres = {}".format(
                chr_number, centromeres)

    r = 0.7  # width and height of the whole chromosome set
    xstart, ystart = 0.15, 0.85
    xinterval = r / chr_number
    xwidth = xinterval * 0.5  # chromosome width
    max_chr_len = max(chr_lens.values())
    ratio = r / max_chr_len  # canvas / base

    def map_area(xx, seqid, accns, seg_start, seg_end):
        # One image-map line covering [seg_start, seg_end] on `seqid`.
        tlx, tly = xx, (1 - ystart) + seg_start * ratio
        brx, bry = xx + xwidth, (1 - ystart) + seg_end * ratio
        return "\t" + write_ImageMapLine(
            tlx, tly, brx, bry,
            iopts.w, iopts.h, iopts.dpi,
            seqid + ":" + ",".join(accns),
            seg_start, seg_end,
        )

    # first the chromosomes
    for a, (chr, clen) in enumerate(sorted(chr_lens.items())):
        xx = xstart + a * xinterval + 0.5 * xwidth
        root.text(xx, ystart + 0.01, str(get_number(chr)), ha="center")
        if centromeres:
            yy = ystart - centromeres[chr] * ratio
            ChromosomeWithCentromere(root, xx, ystart, yy,
                                     ystart - clen * ratio, width=xwidth)
        else:
            Chromosome(root, xx, ystart, ystart - clen * ratio, width=xwidth)

    chr_idxs = dict((a, i) for i, a in enumerate(sorted(chr_lens.keys())))

    alpha = 1
    # color the regions
    for chr in sorted(chr_lens.keys()):
        excess = 0
        bac_list = []
        prev_end, prev_klass = 0, None
        for b in bed.sub_bed(chr):
            idx = chr_idxs[chr]
            klass = b.accn
            if klass == "centromere":
                continue
            start = b.start
            end = b.end
            # Close small gaps between consecutive same-class features.
            if start < prev_end + mergedist and klass == prev_klass:
                start = prev_end
            xx = xstart + idx * xinterval
            yystart = ystart - end * ratio
            yyend = ystart - start * ratio
            root.add_patch(
                Rectangle(
                    (xx, yystart),
                    xwidth,
                    yyend - yystart,
                    fc=class_colors.get(klass, "lightslategray"),
                    lw=0,
                    alpha=alpha,
                ))
            prev_end, prev_klass = b.end, klass

            if imagemap:
                # `segment` : size of current BAC being investigated + `excess`
                # `excess`  : left-over bases from the previous BAC, as a
                #             result of iterating over `winsize` regions of
                #             `segment`
                if excess == 0:
                    segment_start = start
                segment = (end - start + 1) + excess
                while True:
                    if segment < winsize:
                        bac_list.append(b.accn)
                        excess = segment
                        break
                    segment_end = segment_start + winsize - 1
                    map_lines.append(
                        map_area(xx, chr, bac_list, segment_start,
                                 segment_end))

                    segment_start += winsize
                    segment -= winsize
                    bac_list = []

        # Flush the trailing partial window of this chromosome.
        if imagemap and excess > 0:
            bac_list.append(b.accn)
            segment_end = end
            map_lines.append(
                map_area(xx, chr, bac_list, segment_start, segment_end))

    if imagemap:
        map_lines.append("</map>")
        with open(imgmapfile, "w") as mapfh:
            for map_line in map_lines:
                print(map_line, file=mapfh)
        logging.debug("Image map written to `{0}`".format(imgmapfile))

    if gauge:
        xstart, ystart = 0.9, 0.85
        Gauge(root, xstart, ystart - r, ystart, max_chr_len)

    if "centromere" in class_colors:
        del class_colors["centromere"]

    # Legend geometry is set unconditionally so the `empty` box below has
    # defined coordinates even when `legend` is False. The original set
    # these only inside `if legend:`, leaving `yy` possibly unbound.
    xstart = 0.1
    xwidth = 0.04
    yy = 0.08

    # class legends, four in a row
    if legend:
        # max(..., 1) avoids a ZeroDivisionError when no class is left.
        xinterval = 0.8 / max(len(class_colors), 1)
        for klass, cc in sorted(class_colors.items()):
            if klass == "-":
                continue
            root.add_patch(
                Rectangle((xstart, yy), xwidth, xwidth, fc=cc, lw=0,
                          alpha=alpha))
            root.text(xstart + xwidth + 0.01, yy, latex(klass), fontsize=10)
            xstart += xinterval

    if empty:
        root.add_patch(
            Rectangle((xstart, yy), xwidth, xwidth, fill=False, lw=1))
        root.text(xstart + xwidth + 0.01, yy, empty, fontsize=10)

    if title:
        root.text(0.5, 0.95, markup(title), ha="center", va="center")