Example #1
0
    def print_gffline(self, fw, f, seqid, parent=None):
        """Write feature `f`, located on `seqid`, to `fw` as one GFF line.

        The feature ID is `seqid` when the MT qualifier is present, otherwise
        the value of the LT qualifier (which is back-filled from
        `self.current_id` when the feature has none).  When `parent` is given,
        the parent's LT qualifier takes precedence.  CDS features receive a
        numbered ``.cds.N`` suffix plus a Parent attribute; mRNA features
        receive Name and, when available, Note attributes.  The final ID is
        remembered in `self.current_id`.
        """
        feature_type = f.type
        if feature_type == "source":
            feature_type = "contig"

        source = self.source

        # Only the start is shifted by one -- presumably converting a 0-based
        # start into the 1-based coordinate GFF expects (confirm).
        start = get_number(f.location.start) + 1
        end = get_number(f.location.end)
        if f.strand < 0:
            strand = "-"
        else:
            strand = "+"

        # Score and phase are always "."; "ID=tmp" is a placeholder attribute
        # that is overwritten once the real ID is known.
        columns = (seqid, source, feature_type, start, end,
                   ".", strand, ".", "ID=tmp")
        g = GffLine("\t".join(map(str, columns)))

        qual = f.qualifiers
        if MT in qual:
            feature_id = seqid
        elif LT in qual:
            (feature_id,) = qual[LT]
        else:
            # No locus tag on this feature: inherit the most recent one.
            qual[LT] = [self.current_id]
            (feature_id,) = qual[LT]

        feature_id = feature_id.split()[0]

        if parent:
            (feature_id,) = parent.qualifiers[LT]
            feature_id = feature_id.split()[0]

        if feature_type == 'CDS':
            parent_id = feature_id
            self.counter[parent_id] += 1
            feature_id = parent_id + ".cds.{0}".format(self.counter[parent_id])
            g.attributes["Parent"] = [parent_id]

        assert feature_id != "tmp", f
        g.attributes["ID"] = [feature_id]

        if feature_type == "mRNA":
            g.attributes["Name"] = g.attributes["ID"]
            if "product" in qual:
                (product,) = qual["product"]
                g.attributes["Note"] = [product]

            # A pseudo qualifier overrides any product-derived note.
            if "pseudo" in qual:
                g.attributes["Note"] = ["Pseudogene"]

        g.update_attributes()
        print >> fw, g

        self.current_id = feature_id
Example #2
0
 def get_target(achr, bchr):
     """Return the ordered chromosome-number pair if it is a target pair.

     Both names are reduced to numbers with `get_number`, ordered so that the
     smaller comes first, and looked up in `target_pairs`.  Returns None when
     neither name contains "chr" or when the pair is not a target.
     """
     if not ("chr" in achr or "chr" in bchr):
         return None

     lo, hi = get_number(achr), get_number(bchr)
     if lo > hi:
         lo, hi = hi, lo

     pair = (lo, hi)
     return pair if pair in target_pairs else None
Example #3
0
    def print_gffline(self, fw, f, seqid, parent=None):
        """Write feature `f`, located on `seqid`, to `fw` as one GFF line.

        The base ID is `seqid` when the MT qualifier is present, otherwise the
        LT qualifier (back-filled from `self.current_id` when missing); when
        `parent` is given, the parent's LT qualifier is used instead.

        Every (base ID, feature type) pair accumulates its (start, end) spans
        in `self.counter`.  For "gene" and "mRNA" features the ID is the base
        ID itself, and `self.start` / `self.end` are set to the overall extent
        of the spans seen so far; a repeated "gene" is silently skipped.  Any
        other feature type becomes a child: its ID is
        ``<base>.<type lowercased>.<n>`` and it gets a Parent attribute.

        Args:
            fw: writable file handle receiving the GFF line.
            f: feature object exposing `type`, `location`, `strand` and
                `qualifiers`.
            seqid: identifier written in the first GFF column.
            parent: optional parent feature whose LT qualifier supplies the ID.
        """
        score = phase = "."
        ftype = f.type
        if ftype == "source":
            ftype = "contig"

        attr = "ID=tmp"  # placeholder; the real ID attribute is assigned below
        source = self.source

        # Only the start is shifted by one -- presumably converting a 0-based
        # start into the 1-based coordinate GFF expects (confirm).
        start = get_number(f.location.start) + 1
        end = get_number(f.location.end)
        strand = "-" if f.strand < 0 else "+"
        g = "\t".join(
            str(x)
            for x in (seqid, source, ftype, start, end, score, strand, phase, attr)
        )
        g = GffLine(g)

        qual = f.qualifiers
        fid = "tmp"
        if MT in qual:
            fid = seqid
        elif LT in qual:
            (fid,) = qual[LT]
        else:
            # No locus tag on this feature: inherit the most recent one.
            qual[LT] = [self.current_id]
            (fid,) = qual[LT]

        fid = fid.split()[0]

        if parent:
            (fid,) = parent.qualifiers[LT]
            fid = fid.split()[0]

        assert fid != "tmp", f
        oid = fid
        self.counter[(oid, ftype)].append((start, end))
        count = len(self.counter[(oid, ftype)])

        if ftype in ("mRNA", "gene"):
            if ftype == "gene" and count > 1:
                # Gene already emitted once; nothing is printed or updated.
                return
            spans = self.counter[(oid, ftype)]
            self.start = min(a for a, b in spans)
            # The extent ends at the largest span END (previously this took
            # the largest span start, under-reporting the extent).
            self.end = max(b for a, b in spans)
            self.set_attribute("gene", "Alias", qual, g)
            self.set_attribute("product", "Note", qual, g)
        else:
            suffix = ".{0}.{1}".format(ftype.lower(), count)
            fid = fid + suffix
            g.attributes["Parent"] = [oid]
            self.set_attribute("product", "Note", qual, g)

        g.attributes["ID"] = [fid]
        g.update_attributes()
        print(g, file=fw)

        # Later features lacking a locus tag inherit the un-suffixed base ID.
        self.current_id = oid
Example #4
0
    def print_gffline(self, fw, f, seqid, parent=None):
        """Write feature `f`, located on `seqid`, to `fw` as one GFF line.

        The base ID is `seqid` when the MT qualifier is present, otherwise the
        LT qualifier (back-filled from `self.current_id` when missing); when
        `parent` is given, the parent's LT qualifier is used instead.

        Every (base ID, feature type) pair accumulates its (start, end) spans
        in `self.counter`.  For "gene" and "mRNA" features the ID is the base
        ID itself, and `self.start` / `self.end` are set to the overall extent
        of the spans seen so far; a repeated "gene" is silently skipped.  Any
        other feature type becomes a child: its ID is
        ``<base>.<type lowercased>.<n>`` and it gets a Parent attribute.

        Args:
            fw: writable file handle receiving the GFF line.
            f: feature object exposing `type`, `location`, `strand` and
                `qualifiers`.
            seqid: identifier written in the first GFF column.
            parent: optional parent feature whose LT qualifier supplies the ID.
        """
        score = phase = "."
        ftype = f.type
        if ftype == "source":
            ftype = "contig"

        attr = "ID=tmp"  # placeholder; the real ID attribute is assigned below
        source = self.source

        # Only the start is shifted by one -- presumably converting a 0-based
        # start into the 1-based coordinate GFF expects (confirm).
        start = get_number(f.location.start) + 1
        end = get_number(f.location.end)
        strand = '-' if f.strand < 0 else '+'
        g = "\t".join(str(x) for x in \
            (seqid, source, ftype, start, end, score, strand, phase, attr))
        g = GffLine(g)

        qual = f.qualifiers
        fid = "tmp"
        if MT in qual:
            fid = seqid
        elif LT in qual:
            fid, = qual[LT]
        else:
            # No locus tag on this feature: inherit the most recent one.
            qual[LT] = [self.current_id]
            fid, = qual[LT]

        fid = fid.split()[0]

        if parent:
            fid, = parent.qualifiers[LT]
            fid = fid.split()[0]

        assert fid != "tmp", f
        oid = fid
        self.counter[(oid, ftype)].append((start, end))
        count = len(self.counter[(oid, ftype)])

        if ftype in ("mRNA", "gene"):
            if ftype == "gene" and count > 1:
                # Gene already emitted once; nothing is printed or updated.
                return
            spans = self.counter[(oid, ftype)]
            self.start = min(a for a, b in spans)
            # The extent ends at the largest span END (previously this took
            # the largest span start, under-reporting the extent).
            self.end = max(b for a, b in spans)
            self.set_attribute("gene", "Alias", qual, g)
            self.set_attribute("product", "Note", qual, g)
        else:
            suffix = ".{0}.{1}".format(ftype.lower(), count)
            fid = fid + suffix
            g.attributes["Parent"] = [oid]
            self.set_attribute("product", "Note", qual, g)

        g.attributes["ID"] = [fid]
        g.update_attributes()
        print >> fw, g

        # Later features lacking a locus tag inherit the un-suffixed base ID.
        self.current_id = oid
Example #5
0
def atg_name(name, retval="chr,rank", trimpad0=True):
    """Split an ATG-style locus name into the requested components.

    `retval` is a comma-separated list of regex group names (locus, prefix,
    chr, sep, rank, iso).  The "chr" group is converted with `chr_number`;
    groups listed in the zero-pad set ("rank") are converted with `get_number`
    when `trimpad0` is true; everything else is returned as matched text.

    Returns a single value when one group is requested and a generator of
    values when several are.  If `name` is None, does not match, or uses an
    unsupported separator, a generator yielding one None per requested group
    is returned.
    """
    atg_name_pat = re.compile(
        r"""
            ^(?P<locus>
                (?:(?P<prefix>\D+[\D\d\D])\.?)(?P<chr>[\d|C|M]+)(?P<sep>[A-z]+)(?P<rank>\d+)
            )
            \.?(?P<iso>\d+)?
            """, re.VERBOSE)

    seps = ["g", "te", "trna", "s", "u"]
    pad0s = ["rank"]

    hit = None if name is None else atg_name_pat.match(name)
    if hit is None or hit.group('sep').lower() not in seps:
        return (None for x in retval.split(","))

    def pick(grp):
        # Convert one matched group according to its kind.
        text = hit.group(grp)
        if grp == 'chr':
            return chr_number(text)
        if trimpad0 and grp in pad0s:
            return get_number(text)
        return text

    retvals = [pick(grp) for grp in retval.split(",")]
    if len(retvals) > 1:
        return (x for x in retvals)
    return retvals[0]
Example #6
0
def coge(args):
    """
    %prog coge *.gff

    Prepare coge datasets.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(coge.__doc__)
    opts, args = p.parse_args(args)

    if not args:
        sys.exit(not p.print_help())

    for gff in args:
        # The second-to-last dot-separated token of the file name must be a
        # "gid..." tag; its number selects the genome FASTA.
        atoms = op.basename(gff).split(".")
        gid_token = atoms[-2]
        assert gid_token.startswith("gid")
        gid = get_number(gid_token)

        # Species label = first two underscore-separated words of the first token.
        species = "_".join(atoms[0].split("_")[:2])
        cdsfasta = species + ".cds.fasta"

        load_args = [gff, "genome_{0}.faa.fasta".format(gid)]
        load_args.append("--id_attribute=Parent")
        load_args.append("--outfile={0}".format(cdsfasta))
        load(load_args)
Example #7
0
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    # NOTE: the docstring above doubles as the command-line usage text
    # (it is passed to OptionParser below), so it is left untouched.
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed
    from jcvi.formats.base import get_number

    p = OptionParser(bed.__doc__)
    p.add_option("--switch", default=False, action="store_true",
                 help="Switch reference and aligned map elements")
    p.add_option("--scale", type="float",
                 help="Scale the aligned map distance by factor")
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile, = args
    switch = opts.switch
    scale = opts.scale
    ac = AnchorFile(anchorsfile)
    # Map each query accession to every subject accession it is anchored to.
    pairs = defaultdict(list)
    for a, b, block_id in ac.iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        qseqid, qstart, qend, qaccn = q.seqid, q.start, q.end, q.accn
        if qaccn not in pairs:
            continue
        # NOTE(review): only these two lines are inside the inner loop, so the
        # code below sees just the LAST subject in pairs[qaccn]; one BED line
        # is emitted per query. Confirm whether one line per pair was intended.
        for s in pairs[qaccn]:
            si, s = sorder[s]
            sseqid, sstart, send, saccn = s.seqid, s.start, s.end, s.accn
        if switch:
            # Exchange the roles of query and subject.
            qseqid, sseqid = sseqid, qseqid
            qstart, sstart = sstart, qstart
            qend, send = send, qend
            qaccn, saccn = saccn, qaccn
        if scale:
            # NOTE(review): the position is DIVIDED by the factor; confirm this
            # matches the intent of the --scale help text.
            sstart /= scale
        try:
            newsseqid = get_number(sseqid)
        except ValueError:
            raise ValueError, "`{0}` is on `{1}` with no number to extract".\
                                format(saccn, sseqid)
        # BED start is written as qstart - 1; the name column is
        # "<subject seqid number>:<subject start>".
        bedline = "\t".join(str(x) for x in (qseqid, qstart - 1, qend,
                            "{0}:{1}".format(newsseqid, sstart)))
        bd.add(bedline)

    bd.print_to_file(filename=opts.outfile, sorted=True)
Example #8
0
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    # NOTE: the docstring above doubles as the command-line usage text
    # (it is passed to OptionParser below), so it is left untouched.
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed, BedLine
    from jcvi.formats.base import get_number

    p = OptionParser(bed.__doc__)
    p.add_option("--switch",
                 default=False,
                 action="store_true",
                 help="Switch reference and aligned map elements")
    p.add_option("--scale",
                 type="float",
                 help="Scale the aligned map distance by factor")
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile, = args
    switch = opts.switch
    scale = opts.scale
    ac = AnchorFile(anchorsfile)
    # Map each query accession to every subject accession it is anchored to.
    pairs = defaultdict(list)
    for a, b, block_id in ac.iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        qseqid, qstart, qend, qaccn = q.seqid, q.start, q.end, q.accn
        if qaccn not in pairs:
            continue
        # NOTE(review): only these two lines are inside the inner loop, so the
        # code below sees just the LAST subject in pairs[qaccn]; one BED line
        # is emitted per query. Confirm whether one line per pair was intended.
        for s in pairs[qaccn]:
            si, s = sorder[s]
            sseqid, sstart, send, saccn = s.seqid, s.start, s.end, s.accn
        if switch:
            # Exchange the roles of query and subject.
            qseqid, sseqid = sseqid, qseqid
            qstart, sstart = sstart, qstart
            qend, send = send, qend
            qaccn, saccn = saccn, qaccn
        if scale:
            # NOTE(review): the position is DIVIDED by the factor; confirm this
            # matches the intent of the --scale help text.
            sstart /= scale
        # BED start is written as qstart - 1; the name column is
        # "<subject seqid number>:<subject start>".
        bedline = "\t".join(
            str(x) for x in (qseqid, qstart - 1, qend,
                             "{0}:{1}".format(get_number(sseqid), sstart)))
        bd.append(BedLine(bedline))

    bd.print_to_file(filename=opts.outfile, sorted=True)
Example #9
0
File: bed.py Project: yangjl/jcvi
def remove_isoforms(ids):
    """
    Collapse GMAP multiple mappings (names ending in .mrna1, .mrna2, ...) so
    that only one id survives per base name: the one whose final dot-separated
    token carries the smallest number. This is more or less a hack.
    """
    def base_name(accn):
        # Everything before the last dot identifies the group.
        return accn.rsplit(".", 1)[0]

    def iso_number(accn):
        return get_number(accn.split(".")[-1])

    ordered = sorted(ids, key=base_name)
    return [
        min(list(members), key=iso_number)
        for _, members in groupby(ordered, key=base_name)
    ]
Example #10
0
File: bed.py Project: radaniba/jcvi
def remove_isoforms(ids):
    """
    Hack to drop GMAP multiple mappings. Ids that differ only in their last
    dot-separated token (.mrna1, .mrna2, etc.) are grouped together and only
    the member with the lowest number in that token is kept.
    """
    def stem(accn):
        return accn.rsplit(".", 1)[0]

    def mapping_number(accn):
        return get_number(accn.split(".")[-1])

    kept = []
    # groupby only merges adjacent items, hence the sort on the same key.
    for _, group in groupby(sorted(ids, key=stem), key=stem):
        candidates = list(group)
        kept.append(min(candidates, key=mapping_number))
    return kept
Example #11
0
def atg_name(name, retval="chr,rank", trimpad0=True):
    """Split an ATG-style locus name into the requested components.

    `retval` is a comma-separated list of group names of the module-level
    `atg_name_pat` pattern.  The "chr" group is converted with `chr_number`;
    "rank" is converted with `get_number` when `trimpad0` is true; any other
    group is returned as matched text.

    Returns a single value when one group is requested and a generator of
    values when several are.  If `name` is None, does not match, or uses an
    unsupported separator, a generator yielding one None per requested group
    is returned.
    """
    seps = ["g", "te", "trna", "s", "u", "nc"]
    pad0s = ["rank"]

    if name is not None:
        hit = re.match(atg_name_pat, name)
        if hit is not None and hit.group("sep").lower() in seps:

            def pick(grp):
                # Convert one matched group according to its kind.
                text = hit.group(grp)
                if grp == "chr":
                    return chr_number(text)
                if trimpad0 and grp in pad0s:
                    return get_number(text)
                return text

            retvals = [pick(grp) for grp in retval.split(",")]
            if len(retvals) > 1:
                return (x for x in retvals)
            return retvals[0]

    return (None for _ in retval.split(","))
Example #12
0
def annotate_chr(chr, chrbed, g, scores, nbedline, abedline, opts, splits):
    """Annotate the bed lines of one chromosome with their best-scoring matches.

    For every line in `chrbed` whose accession belongs to a group in `g`, the
    score records in `scores` are ranked and the accepted record's accession is
    joined (";"-separated) onto `line.accn`.  Groups containing more than one
    accession matching `new_id_pat` are recorded in `splits` instead, with the
    accepted records copied into `scores` under the joined group key.

    `abedline`, `scores` and `splits` are modified in place; `nbedline` is not
    used in this function.  Returns the tuple (abedline, splits).
    """
    current_chr = get_number(chr)

    for line in chrbed:
        accn = line.accn
        # Lines outside any group (or, in atg_name mode, on sequences without
        # "chr" in the name) are passed through unchanged.
        if accn not in g or (opts.atg_name and "chr" not in chr):
            abedline[accn] = line
            continue

        gaccns = g[accn]
        # Group members whose names match the new-id pattern.
        new = [a for a in gaccns if re.search(new_id_pat, a)]
        newgrp = ";".join(sorted(new))

        if accn in scores:
            # Two stable sorts: primary key float(x[3]) descending, ties
            # ordered by x[1] ascending.
            scores[accn] = sorted(scores[accn], key=lambda x: x[1])
            scores[accn] = sorted(scores[accn], key=lambda x: float(x[3]), reverse=True)

            accns = []
            print >> sys.stderr, accn
            for elem in scores[accn]:
                print >> sys.stderr, "\t" + ", ".join([str(x)\
                        for x in elem[1:]])
                if opts.atg_name:
                    # Skip candidates that do not parse or sit on another chromosome.
                    achr, arank = atg_name(elem[1])
                    if not achr or achr != current_chr:
                        continue

                accns.append(elem[1])
                if len(new) > 1:
                    if newgrp not in scores: scores[newgrp] = []
                    scores[newgrp].append(elem)
                else:
                    # Prepend the line's own accession and rewrite line.accn.
                    accns[0:0] = [accn]
                    line.accn = ";".join([str(x) for x in accns])
                # With several candidates, stop after the first accepted one
                # (skipped candidates never reach this line).
                if len(scores[accn]) > 1: break

        if len(new) > 1:
            splits.add(newgrp)
        else:
            abedline[line.accn] = line

    return abedline, splits
Example #13
0
def atg_name(name, retval="chr,rank", trimpad0=True):
    """Extract the requested parts of an ATG-style locus name.

    Each comma-separated entry of `retval` names a group of the module-level
    `atg_name_pat` pattern.  "chr" goes through `chr_number`, "rank" goes
    through `get_number` when `trimpad0` is set, and every other group is
    returned verbatim.

    One requested group yields a bare value; several yield a generator.  A
    None name, a failed match or an unsupported separator yields a generator
    of Nones, one per requested group.
    """
    seps = ["g", "te", "trna", "s", "u", "nc"]
    pad0s = ["rank"]

    if name is None:
        return (None for x in retval.split(","))

    m = re.match(atg_name_pat, name)
    if m is None or m.group('sep').lower() not in seps:
        return (None for x in retval.split(","))

    retvals = []
    for grp in retval.split(","):
        text = m.group(grp)
        if grp == 'chr':
            val = chr_number(text)
        elif trimpad0 and grp in pad0s:
            val = get_number(text)
        else:
            val = text
        retvals.append(val)

    if len(retvals) > 1:
        return (x for x in retvals)
    return retvals[0]
Example #14
0
def collinear(args):
    """
    %prog collinear a.b.anchors

    Reduce synteny blocks to strictly collinear, use dynamic programming in a
    procedure similar to DAGchainer.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(collinear.__doc__)
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (anchorfile, ) = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    af = AnchorFile(anchorfile)
    # Output goes next to the input: <stem>.collinear.anchors
    newanchorfile = anchorfile.rsplit(".", 1)[0] + ".collinear.anchors"

    # The context manager closes the output even if an accession lookup or
    # get_collinear() raises part-way through (the handle used to leak).
    with open(newanchorfile, "w") as fw:
        for block in af.blocks:
            print("#" * 3, file=fw)  # block separator line

            # Translate accessions into bed indices so the block can be
            # handed to get_collinear as [query index, subject index, score].
            iblock = []
            for q, s, score in block:
                qi, q = qorder[q]
                si, s = sorder[s]
                score = get_number(score)
                iblock.append([qi, si, score])

            # Translate the surviving indices back into accessions.
            for qi, si, score in get_collinear(iblock):
                q = qbed[qi].accn
                s = sbed[si].accn
                print("\t".join((q, s, str(score))), file=fw)
Example #15
0
def coge(args):
    """
    %prog coge *.gff

    Prepare coge datasets.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(coge.__doc__)
    opts, args = p.parse_args(args)

    if not args:
        sys.exit(not p.print_help())

    for gff in args:
        name_parts = op.basename(gff).split(".")

        # The token before the file extension must be a "gid..." tag; the
        # number inside it picks the matching genome FASTA.
        gid_tag = name_parts[-2]
        assert gid_tag.startswith("gid")
        genomefasta = "genome_{0}.faa.fasta".format(get_number(gid_tag))

        # First two underscore-separated words of the leading token.
        species_words = name_parts[0].split("_")[:2]
        cdsfasta = "_".join(species_words) + ".cds.fasta"

        load([
            gff,
            genomefasta,
            "--id_attribute=Parent",
            "--outfile={0}".format(cdsfasta),
        ])
Example #16
0
def atg_name(name, retval="chr,rank", trimpad0=True):
    """Split an ATG-style locus name into the requested components.

    Args:
        name: locus name to parse, or None.
        retval: comma-separated list of regex group names to return
            (locus, prefix, chr, sep, rank, iso).
        trimpad0: when true, the "chr" and "rank" groups are converted with
            `get_number` instead of being returned as matched text.

    Returns:
        A generator over the requested values.  If `name` is None, does not
        match the pattern, or uses a separator outside the supported set, the
        generator yields one None per requested group, so callers can always
        unpack the result.  (Previously the no-match / unsupported-separator
        case fell off the end of the function and returned a bare None.)
    """
    atg_name_pat = re.compile(r"""
            ^(?P<locus>
                (?:(?P<prefix>\D+[\D\d\D])\.?)(?P<chr>[\d|C|M]+)?(?P<sep>[A-z]+)(?P<rank>\d+)
            )
            \.?(?P<iso>\d+)?
            """, re.VERBOSE)

    seps = ["g", "te", "trna", "s", "u"]
    pad0s = ["chr", "rank"]

    if name is not None:
        m = atg_name_pat.match(name)
        if m is not None and m.group('sep').lower() in seps:
            retvals = []
            for grp in retval.split(","):
                val = get_number(m.group(grp)) \
                        if trimpad0 and grp in pad0s \
                        else m.group(grp)
                retvals.append(val)

            return (x for x in retvals)

    # Reached for a None name, a failed match, or an unsupported separator.
    return (None for x in retval.split(","))
Example #17
0
def multihistogram(args):
    """
    %prog multihistogram *.histogram species

    Plot the histogram based on a set of K-mer hisotograms. The method is based
    on Star et al.'s method (Atlantic Cod genome paper).
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(multihistogram.__doc__)
    int_options = (
        ("--kmin", 15, "Minimum K-mer size, inclusive"),
        ("--kmax", 30, "Maximum K-mer size, inclusive"),
        ("--vmin", 2, "Minimum value, inclusive"),
        ("--vmax", 100, "Maximum value, inclusive"),
    )
    for flag, default, text in int_options:
        p.add_option(flag, default=default, type="int", help=text)
    opts, args, iopts = p.set_image_options(args, figsize="10x5", dpi=300)

    if len(args) < 1:
        sys.exit(not p.print_help())

    # Last positional argument is the species label, the rest are histograms.
    species = args[-1]
    histfiles = args[:-1]

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    A = fig.add_axes([0.08, 0.12, 0.38, 0.76])
    B = fig.add_axes([0.58, 0.12, 0.38, 0.76])

    # Panel A: one spectrum curve per K within [kmin, kmax].
    curves = []
    curve_labels = []
    estimates = []
    for histfile in histfiles:
        ks = KmerSpectrum(histfile)
        x, y = ks.get_xy(opts.vmin, opts.vmax)
        # K is parsed from the file name: the part after the last "-" of the
        # first dot-separated token.
        stem = op.basename(histfile).split(".")[0]
        K = get_number(stem.split("-")[-1])
        if K < opts.kmin or K > opts.kmax:
            continue

        (curve, ) = A.plot(x, y, "-", lw=1)
        curves.append(curve)
        curve_labels.append("K = {0}".format(K))
        ks.analyze(K=K, method="allpaths")
        estimates.append((K, ks.genomesize / 1e6))

    A.legend(curves, curve_labels, shadow=True,
             fancybox=True).get_frame().set_alpha(0.5)

    A.set_title(markup("{0} genome K-mer histogram".format(species)))
    A.set_xlabel("Coverage (X)")
    A.set_ylabel("Counts")
    set_human_axis(A)

    # Panel B: genome size estimate against K, with a quadratic fit.
    B.set_title(markup("{0} genome size estimate".format(species)))
    ksizes, gsizes = zip(*estimates)
    B.plot(ksizes, gsizes, "ko", mfc="w")
    t = np.linspace(opts.kmin - 0.5, opts.kmax + 0.5, 100)
    fit = np.poly1d(np.polyfit(ksizes, gsizes, 2))
    B.plot(t, fit(t), "r:")

    B.set_xlabel("K-mer size")
    B.set_ylabel("Estimated genome size (Mb)")
    set_ticklabels_helvetica(B)

    panel_labels(root, ((0.04, 0.96, "A"), (0.54, 0.96, "B")))

    normalize_axes(root)
    savefig(species + ".multiK.pdf", dpi=iopts.dpi, iopts=iopts)
Example #18
0
def ancestral(args):
    """
    %prog ancestral ancestral.txt assembly.fasta

    Karyotype evolution of pineapple. The figure is inspired by Amphioxus paper
    Figure 3 and Tetradon paper Figure 9.
    """
    # NOTE: the docstring above doubles as the command-line usage text
    # (it is passed to OptionParser below), so it is left untouched.
    p = OptionParser(ancestral.__doc__)
    opts, args, iopts = p.set_image_options(args, figsize="8x7")

    if len(args) != 2:
        sys.exit(not p.print_help())

    regionsfile, sizesfile = args
    regions = RegionsFile(regionsfile)
    sizes = Sizes(sizesfile).mapping
    # Keep only sequences whose name starts with "LG".
    sizes = dict((k, v) for (k, v) in sizes.iteritems() if k[:2] == "LG")
    maxsize = max(sizes.values())
    # Scale factor: the longest sequence spans 0.5 in figure coordinates.
    ratio = .5 / maxsize

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes((0, 0, 1, 1))

    # Reorder the first seven palette colours; set2 is rebound locally.
    from jcvi.graphics.base import set2
    a, b, c, d, e, f, g = set2[:7]
    set2 = (c, g, b, e, d, a, f)

    # Upper panel is the evolution of segments
    # All segments belong to one of seven karyotypes 1 to 7
    karyotypes = regions.karyotypes
    xgap = 1. / (1 + len(karyotypes))
    ygap = .05
    mgap = xgap / 4.5
    gwidth = mgap * .75
    tip = .02
    # Node name -> (x, y); reused and overwritten for every karyotype.
    coords = {}
    for i, k in enumerate(regions.karyotypes):
        x = (i + 1) * xgap
        y = .9
        root.text(x, y + tip, "Anc" + k, ha="center")
        root.plot((x, x), (y, y - ygap), "k-", lw=2)
        y -= 2 * ygap
        # Four leaves a-d spaced mgap apart around x, joined pairwise into
        # ab and cd, which are then joined into abcd.
        coords['a'] = (x - 1.5 * mgap , y)
        coords['b'] = (x - .5 * mgap , y)
        coords['c'] = (x + .5 * mgap , y)
        coords['d'] = (x + 1.5 * mgap , y)
        coords['ab'] = join_nodes_vertical(root, coords, 'a', 'b', y + ygap / 2)
        coords['cd'] = join_nodes_vertical(root, coords, 'c', 'd', y + ygap / 2)
        coords['abcd'] = join_nodes_vertical(root, coords, 'ab', 'cd', y + ygap)
        for n in 'abcd':
            nx, ny = coords[n]
            root.text(nx, ny - tip, n, ha="center")
            # Move the leaf anchor down so segments are stacked below the label.
            coords[n] = (nx, ny - ygap / 2)

        kdata = regions.get_karyotype(k)
        for kd in kdata:
            # NOTE: `g` and `p` are rebound here (they previously held a
            # palette colour and the option parser, neither needed any more).
            g = kd.group
            gx, gy = coords[g]
            gsize = ratio * kd.span
            gy -= gsize
            p = Rectangle((gx - gwidth / 2, gy),
                           gwidth, gsize, lw=0, color=set2[i])
            root.add_patch(p)
            root.text(gx, gy + gsize / 2, kd.chromosome,
                      ha="center", va="center", color='w')
            # The next segment of this group is stacked beneath this one.
            coords[g] = (gx, gy - tip)

    # Bottom panel shows the location of segments on chromosomes
    # TODO: redundant code, similar to graphics.chromosome
    ystart = .54
    chr_number = len(sizes)
    xstart, xend = xgap - 2 * mgap, 1 - xgap + 2 * mgap
    # NOTE(review): divides by zero when exactly one "LG" sequence is present.
    xinterval = (xend - xstart - gwidth) / (chr_number - 1)
    # get_number(name) -> x position of that chromosome.
    chrpos = {}
    for a, (chr, clen) in enumerate(sorted(sizes.items())):
        chr = get_number(chr)
        xx = xstart + a * xinterval + gwidth / 2
        chrpos[chr] = xx
        root.text(xx, ystart + .01, chr, ha="center")
        Chromosome(root, xx, ystart, ystart - clen * ratio, width=gwidth)

    # Start painting
    for r in regions:
        # Assumes r.chromosome equals the get_number() key used above -- confirm.
        xx = chrpos[r.chromosome]
        yystart = ystart - r.start * ratio
        yyend = ystart - r.end * ratio
        p = Rectangle((xx - gwidth / 2, yystart), gwidth, yyend - yystart,
                      color=set2[int(r.karyotype) - 1], lw=0)
        root.add_patch(p)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    pf = "pineapple-karyotype"
    image_name = pf + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
Example #19
0
def multihistogram(args):
    """
    %prog multihistogram *.histogram species

    Plot the histogram based on a set of K-mer hisotograms. The method is based
    on Star et al.'s method (Atlantic Cod genome paper).
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(multihistogram.__doc__)
    int_options = (
        ("--kmin", 15, "Minimum K-mer size, inclusive"),
        ("--kmax", 30, "Maximum K-mer size, inclusive"),
        ("--vmin", 2, "Minimum value, inclusive"),
        ("--vmax", 100, "Maximum value, inclusive"),
    )
    for flag, default, text in int_options:
        p.add_option(flag, default=default, type="int", help=text)
    opts, args, iopts = p.set_image_options(args, figsize="10x5", dpi=300)

    # Last positional argument is the species label, the rest are histograms.
    species = args[-1]
    histfiles = args[:-1]

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    A = fig.add_axes([0.08, 0.12, 0.38, 0.76])
    B = fig.add_axes([0.58, 0.12, 0.38, 0.76])

    # Panel A: one spectrum curve per K within [kmin, kmax].
    curves = []
    curve_labels = []
    estimates = []
    for histfile in histfiles:
        ks = KmerSpectrum(histfile)
        x, y = ks.get_xy(opts.vmin, opts.vmax)
        # K is parsed from the file name: the part after the last "-" of the
        # first dot-separated token.
        stem = op.basename(histfile).split(".")[0]
        K = get_number(stem.split("-")[-1])
        if K < opts.kmin or K > opts.kmax:
            continue

        curve, = A.plot(x, y, "-", lw=1)
        curves.append(curve)
        curve_labels.append("K = {0}".format(K))
        ks.analyze(K=K)
        estimates.append((K, ks.genomesize / 1e6))

    A.legend(curves, curve_labels, shadow=True,
             fancybox=True).get_frame().set_alpha(0.5)

    A.set_title(markup("{0} genome K-mer histogram".format(species)))
    A.set_xlabel("Coverage (X)")
    A.set_ylabel("Counts")
    set_human_axis(A)

    # Panel B: genome size estimate against K, with a quadratic fit.
    B.set_title(markup("{0} genome size estimate".format(species)))
    ksizes, gsizes = zip(*estimates)
    B.plot(ksizes, gsizes, "ko", mfc="w")
    t = np.linspace(opts.kmin - 0.5, opts.kmax + 0.5, 100)
    fit = np.poly1d(np.polyfit(ksizes, gsizes, 2))
    B.plot(t, fit(t), "r:")

    B.set_xlabel("K-mer size")
    B.set_ylabel("Estimated genome size (Mb)")
    set_ticklabels_helvetica(B)

    panel_labels(root, ((0.04, 0.96, "A"), (0.54, 0.96, "B")))

    normalize_axes(root)
    savefig(species + ".multiK.pdf", dpi=iopts.dpi, iopts=iopts)
Example #20
0
def renumber(args):
    """
    %prog renumber Mt35.consolidated.bed > tagged.bed

    Renumber genes for annotation updates.
    """
    # NOTE: the docstring above doubles as the command-line usage text
    # (it is passed to OptionParser below), so it is left untouched.
    from jcvi.algorithms.lis import longest_increasing_subsequence
    from jcvi.utils.grouper import Grouper

    p = OptionParser(renumber.__doc__)
    p.add_option("--pad0", default=6, type="int",
                 help="Pad gene identifiers with 0 [default: %default]")
    p.add_option("--prefix", default="Medtr",
                 help="Genome prefix [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args

    # Derived files <stem>.a.bed / <stem>.b.bed; prepare() is called when
    # need_update() reports them stale (presumably it writes them -- confirm).
    pf = bedfile.rsplit(".", 1)[0]
    abedfile = pf + ".a.bed"
    bbedfile = pf + ".b.bed"
    if need_update(bedfile, (abedfile, bbedfile)):
        prepare(bedfile)

    # Accessions in the b-bed are ";"-separated lists; each list is joined
    # into one group.
    mbed = Bed(bbedfile)
    g = Grouper()
    for s in mbed:
        accn = s.accn
        g.join(*accn.split(";"))

    bed = Bed(abedfile)
    for chr, sbed in bed.sub_beds():
        # Only sequences with "chr" in their name are processed.
        if "chr" not in chr:
            continue

        current_chr = get_number(chr)
        ranks = []

        # Accessions whose parsed chromosome matches the one they sit on.
        gg = set()
        for s in sbed:
            accn = s.accn
            achr, arank = atg_name(accn)
            if achr != current_chr:
                continue
            ranks.append(arank)
            gg.add(accn)

        # Ranks on the longest increasing subsequence form the "frame".
        lranks = longest_increasing_subsequence(ranks)
        print >> sys.stderr, current_chr, len(sbed), "==>", len(ranks), \
                    "==>", len(lranks)

        # Frame names are generated with both the default and "te" separators.
        granks = set(gene_name(current_chr, x) for x in lranks) | \
                 set(gene_name(current_chr, x, sep="te") for x in lranks)

        # First pass: tag each accession as FRAME, RETAIN or NEW.
        tagstore = {}
        for s in sbed:
            # achr / arank are parsed here but not used in this loop.
            achr, arank = atg_name(s.accn)
            accn = s.accn
            if accn in granks:
                tag = (accn, FRAME)
            elif accn in gg:
                tag = (accn, RETAIN)
            else:
                tag = (".", NEW)

            tagstore[accn] = tag

        # Find cases where genes overlap
        for s in sbed:
            accn = s.accn
            gaccn = g[accn]
            # Group members without a tag on this chromosome count as NEW.
            tags = [((tagstore[x][-1] if x in tagstore else NEW), x) for x in gaccn]
            # Lowest PRIORITY index wins; ties are broken by accession name
            # through tuple comparison.
            group = [(PRIORITY.index(tag), x) for tag, x in tags]
            best = min(group)[-1]

            if accn != best:
                tag = (best, OVERLAP)
            else:
                tag = tagstore[accn]

            print "\t".join((str(s), "|".join(tag)))
Example #21
0
def ancestral(args):
    """
    %prog ancestral ancestral.txt assembly.fasta

    Karyotype evolution of pineapple. The figure is inspired by Amphioxus paper
    Figure 3 and Tetradon paper Figure 9.
    """
    # NOTE: the docstring above doubles as the command-line usage text
    # (it is passed to OptionParser below), so it is left untouched.
    p = OptionParser(ancestral.__doc__)
    opts, args, iopts = p.set_image_options(args, figsize="8x7")

    if len(args) != 2:
        sys.exit(not p.print_help())

    regionsfile, sizesfile = args
    regions = RegionsFile(regionsfile)
    sizes = Sizes(sizesfile).mapping
    # Keep only sequences whose name starts with "LG".
    sizes = dict((k, v) for (k, v) in sizes.iteritems() if k[:2] == "LG")
    maxsize = max(sizes.values())
    # Scale factor: the longest sequence spans 0.5 in figure coordinates.
    ratio = .5 / maxsize

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes((0, 0, 1, 1))

    # Reorder the first seven palette colours; set2 is rebound locally.
    from jcvi.graphics.base import set2
    a, b, c, d, e, f, g = set2[:7]
    set2 = (c, g, b, e, d, a, f)

    # Upper panel is the evolution of segments
    # All segments belong to one of seven karyotypes 1 to 7
    karyotypes = regions.karyotypes
    xgap = 1. / (1 + len(karyotypes))
    ygap = .05
    mgap = xgap / 4.5
    gwidth = mgap * .75
    tip = .02
    # Node name -> (x, y); reused and overwritten for every karyotype.
    coords = {}
    for i, k in enumerate(regions.karyotypes):
        x = (i + 1) * xgap
        y = .9
        root.text(x, y + tip, "Anc" + k, ha="center")
        root.plot((x, x), (y, y - ygap), "k-", lw=2)
        y -= 2 * ygap
        # Four leaves a-d spaced mgap apart around x, joined pairwise into
        # ab and cd, which are then joined into abcd.
        coords['a'] = (x - 1.5 * mgap , y)
        coords['b'] = (x - .5 * mgap , y)
        coords['c'] = (x + .5 * mgap , y)
        coords['d'] = (x + 1.5 * mgap , y)
        coords['ab'] = join_nodes_vertical(root, coords, 'a', 'b', y + ygap / 2)
        coords['cd'] = join_nodes_vertical(root, coords, 'c', 'd', y + ygap / 2)
        coords['abcd'] = join_nodes_vertical(root, coords, 'ab', 'cd', y + ygap)
        for n in 'abcd':
            nx, ny = coords[n]
            root.text(nx, ny - tip, n, ha="center")
            # Move the leaf anchor down so segments are stacked below the label.
            coords[n] = (nx, ny - ygap / 2)

        kdata = regions.get_karyotype(k)
        for kd in kdata:
            # NOTE: `g` and `p` are rebound here (they previously held a
            # palette colour and the option parser, neither needed any more).
            g = kd.group
            gx, gy = coords[g]
            gsize = ratio * kd.span
            gy -= gsize
            p = Rectangle((gx - gwidth / 2, gy),
                           gwidth, gsize, lw=0, color=set2[i])
            root.add_patch(p)
            root.text(gx, gy + gsize / 2, kd.chromosome,
                      ha="center", va="center", color='w')
            # The next segment of this group is stacked beneath this one.
            coords[g] = (gx, gy - tip)

    # Bottom panel shows the location of segments on chromosomes
    # TODO: redundant code, similar to graphics.chromosome
    ystart = .54
    chr_number = len(sizes)
    xstart, xend = xgap - 2 * mgap, 1 - xgap + 2 * mgap
    # NOTE(review): divides by zero when exactly one "LG" sequence is present.
    xinterval = (xend - xstart - gwidth) / (chr_number - 1)
    # get_number(name) -> x position of that chromosome.
    chrpos = {}
    for a, (chr, clen) in enumerate(sorted(sizes.items())):
        chr = get_number(chr)
        xx = xstart + a * xinterval + gwidth / 2
        chrpos[chr] = xx
        root.text(xx, ystart + .01, chr, ha="center")
        Chromosome(root, xx, ystart, ystart - clen * ratio, width=gwidth)

    # Start painting
    for r in regions:
        # Assumes r.chromosome equals the get_number() key used above -- confirm.
        xx = chrpos[r.chromosome]
        yystart = ystart - r.start * ratio
        yyend = ystart - r.end * ratio
        p = Rectangle((xx - gwidth / 2, yystart), gwidth, yyend - yystart,
                      color=set2[int(r.karyotype) - 1], lw=0)
        root.add_patch(p)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    pf = "pineapple-karyotype"
    image_name = pf + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
Example #22
0
    def allocate(self, info, chr, start_id, end_id, id_table):
        """Assign new gene ids to the records in `info` between two anchor ids.

        Ids are taken from the open interval (start_id, end_id) on chromosome
        `chr`, skipping any (chromosome, rank) pair already in `self.black`.
        Gaps overlapping the region (found by intersecting with
        `self.gapfile`) reserve extra ids in proportion to their size.  When
        no stride fits, ids are drawn from a prefixed fallback range instead.

        Results are written into `id_table` (accession -> new name), and every
        rank used is added to `self.black`.  Progress is reported on stderr.
        """

        start_bp = info[0].start
        end_bp = info[-1].end

        current_chr = get_number(chr)
        needed = info
        assert end_id > start_id, \
            "end ({0}) > start ({1})".format(end_id, start_id)

        # Ids strictly between the two anchors, minus blacklisted ones.
        spots = end_id - start_id - 1
        available = [x for x in xrange(start_id + 1, end_id) if
                            (current_chr, x) not in self.black]

        message = "chr{0} need {1} ids, has {2} spots ({3} available)".\
                format(current_chr, len(needed), spots, len(available))

        start_gene = gene_name(current_chr, start_id)
        end_gene = gene_name(current_chr, end_id)
        message += " between {0} - {1}\n".format(start_gene, end_gene)

        assert end_bp > start_bp

        # NOTE(review): the region is interpolated into a shell pipeline;
        # `chr` and `self.gapfile` are assumed to be trusted values.
        b = "\t".join(str(x) for x in (chr, start_bp - 1, end_bp))
        cmd = "echo '{0}' |".format(b)
        cmd += " intersectBed -a {0} -b stdin".format(self.gapfile)
        gaps = list(BedLine(x) for x in popen(cmd, debug=False))
        ngaps = len(gaps)

        # Each gap is repeated once per id it should reserve; the score column
        # is read as the gap size in bp (assumption -- confirm).
        gapsexpanded = []
        GeneDensity = 10000.  # assume 10Kb per gene
        for gap in gaps:
            gap_bp = int(gap.score)
            gap_ids = int(round(gap_bp / GeneDensity))
            gapsexpanded += [gap] * gap_ids

        lines = sorted(info + gapsexpanded, key=lambda x: x.start)

        message += "between bp: {0} - {1}, there are {2} gaps (total {3} ids)".\
                format(start_bp, end_bp, ngaps, len(lines))

        # From here on `needed` covers both genes and gap placeholders.
        needed = lines
        stride = Stride(needed, available)
        conf = stride.conf
        message += " stride: {0}".format(conf)
        print >> sys.stderr, message

        nneeded = len(needed)
        if conf is None: # prefix rule - prepend version number for spills
            magic = 400000  # version 4
            firstdigit = 100000
            step = 10  # stride for the prefixed ids
            rank = start_id + magic
            # Keep the fallback rank within the 4xxxxx range.
            if rank > magic + firstdigit:
                rank -= firstdigit
            available = []
            while len(available) != nneeded:
                rank += step
                if (current_chr, rank) in self.black:  # avoid blacklisted ids
                    continue
                available.append(rank)

        else: # follow the best stride
            available = stride.available
            if start_id == 0:  # follow right flank at start of chr
                available = available[- nneeded:]
            else:  # follow left flank otherwise
                available = available[:nneeded]

        # Finally assign the ids
        assert len(needed) == len(available)
        for b, rank in zip(needed, available):
            # Gap placeholders are assigned (and blacklisted) like genes.
            name = gene_name(current_chr, rank)
            print >> sys.stderr, "\t".join((str(b), name))
            id_table[b.accn] = name
            self.black.add((current_chr, rank))
        print >> sys.stderr
Example #23
0
def draw_chromosomes(
    root,
    bedfile,
    sizes,
    iopts,
    mergedist,
    winsize,
    imagemap,
    mappingfile=None,
    gauge=False,
    legend=True,
    empty=False,
    title=None,
):
    """Paint chromosomes and color their regions by class onto `root`.

    Args:
        root: Drawing surface; only its `text()` and `add_patch()` methods are
            used here (presumably a matplotlib axes -- confirm with caller).
        bedfile: Path of the BED file, loaded through `Bed`. The path minus
            its last extension names the image-map file and the `<map>` id.
        sizes: When truthy, handed to `Sizes` to obtain chromosome lengths;
            otherwise each length is the largest feature end on that seqid.
        iopts: Object whose `w`, `h` and `dpi` attributes are forwarded to
            `write_ImageMapLine`.
        mergedist: A feature starting closer than this to the end of the
            previous feature of the same class is drawn from that end, so the
            two patches touch.
        winsize: Size (bp) of the windows emitted as image-map lines.
        imagemap: When truthy, write an HTML image map to `<prefix>.map`.
        mappingfile: Optional tab-delimited file read through `DictFile` to
            map feature names to classes; when it has at least three columns,
            preset colors are also read from it (keypos=1, valuepos=2).
        gauge: When truthy, draw a `Gauge` next to the chromosomes.
        legend: When truthy, draw one colored box plus label per class.
        empty: Label for an extra unfilled legend box; falsy draws none.
        title: Optional title text placed at the top of the figure.

    Note:
        The `accn` of every record in the loaded bed is overwritten with its
        class name ("-" when the feature has no class in the mapping).
    """
    bed = Bed(bedfile)
    prefix = bedfile.rsplit(".", 1)[0]

    if imagemap:
        imgmapfile = prefix + ".map"
        mapfh = open(imgmapfile, "w")
        print('<map id="' + prefix + '">', file=mapfh)

    if mappingfile:
        mappings = DictFile(mappingfile, delimiter="\t")
        classes = sorted(set(mappings.values()))
        preset_colors = (DictFile(
            mappingfile, keypos=1, valuepos=2, delimiter="\t")
                         if DictFile.num_columns(mappingfile) >= 3 else {})
    else:
        # Without a mapping file every distinct feature name is its own class
        classes = sorted(set(x.accn for x in bed))
        mappings = {x: x for x in classes}
        preset_colors = {}

    logging.debug("A total of {} classes found: {}".format(
        len(classes), ",".join(classes)))

    # Assign colors to classes; preset colors override the palette
    ncolors = max(3, min(len(classes), 12))
    palette = set1_n if ncolors <= 8 else set3_n
    colorset = palette(number=ncolors)
    colorset = sample_N(colorset, len(classes))
    class_colors = dict(zip(classes, colorset))
    class_colors.update(preset_colors)
    logging.debug("Assigned colors: {}".format(class_colors))

    chr_lens = {}
    centromeres = {}
    if sizes:
        chr_lens = Sizes(sizes).sizes_mapping
    else:
        # groupby only merges consecutive records, so this relies on the bed
        # being ordered by seqid
        for b, blines in groupby(bed, key=(lambda x: x.seqid)):
            blines = list(blines)
            maxlen = max(x.end for x in blines)
            chr_lens[b] = maxlen

    # Record centromere positions, then relabel every feature with its class
    for b in bed:
        accn = b.accn
        if accn == "centromere":
            centromeres[b.seqid] = b.start
        if accn in mappings:
            b.accn = mappings[accn]
        else:
            b.accn = "-"

    chr_number = len(chr_lens)
    if centromeres:
        assert chr_number == len(
            centromeres), "chr_number = {}, centromeres = {}".format(
                chr_number, centromeres)

    r = 0.7  # width and height of the whole chromosome set
    xstart, ystart = 0.15, 0.85
    xinterval = r / chr_number
    xwidth = xinterval * 0.5  # chromosome width
    max_chr_len = max(chr_lens.values())
    ratio = r / max_chr_len  # canvas / base

    # first the chromosomes
    for a, (seqid, clen) in enumerate(sorted(chr_lens.items())):
        xx = xstart + a * xinterval + 0.5 * xwidth
        root.text(xx, ystart + 0.01, str(get_number(seqid)), ha="center")
        if centromeres:
            yy = ystart - centromeres[seqid] * ratio
            ChromosomeWithCentromere(root,
                                     xx,
                                     ystart,
                                     yy,
                                     ystart - clen * ratio,
                                     width=xwidth)
        else:
            Chromosome(root, xx, ystart, ystart - clen * ratio, width=xwidth)

    chr_idxs = {a: i for i, a in enumerate(sorted(chr_lens.keys()))}

    alpha = 1
    # color the regions
    for seqid in sorted(chr_lens.keys()):
        segment_size, excess = 0, 0
        bac_list = []
        prev_end, prev_klass = 0, None
        # Left edge of this chromosome's patches; same for all its features
        idx = chr_idxs[seqid]
        xx = xstart + idx * xinterval
        for b in bed.sub_bed(seqid):
            klass = b.accn
            if klass == "centromere":
                continue
            start = b.start
            end = b.end
            if start < prev_end + mergedist and klass == prev_klass:
                start = prev_end
            yystart = ystart - end * ratio
            yyend = ystart - start * ratio
            root.add_patch(
                Rectangle(
                    (xx, yystart),
                    xwidth,
                    yyend - yystart,
                    fc=class_colors.get(klass, "lightslategray"),
                    lw=0,
                    alpha=alpha,
                ))
            prev_end, prev_klass = b.end, klass

            if imagemap:
                # `segment` : size of current BAC being investigated + `excess`
                # `excess`  : left-over bases from the previous BAC, as a
                #             result of iterating over `winsize` regions of
                #             `segment`
                if excess == 0:
                    segment_start = start
                segment = (end - start + 1) + excess
                while True:
                    if segment < winsize:
                        # Not enough for a full window: carry it forward
                        bac_list.append(b.accn)
                        excess = segment
                        break
                    segment_end = segment_start + winsize - 1
                    tlx, tly, brx, bry = (
                        xx,
                        (1 - ystart) + segment_start * ratio,
                        xx + xwidth,
                        (1 - ystart) + segment_end * ratio,
                    )
                    print(
                        "\t" + write_ImageMapLine(
                            tlx,
                            tly,
                            brx,
                            bry,
                            iopts.w,
                            iopts.h,
                            iopts.dpi,
                            seqid + ":" + ",".join(bac_list),
                            segment_start,
                            segment_end,
                        ),
                        file=mapfh,
                    )

                    segment_start += winsize
                    segment -= winsize
                    bac_list = []

        # Flush the trailing partial window; `excess > 0` implies the loop
        # above ran, so `b`, `end` and `segment_start` are bound here
        if imagemap and excess > 0:
            bac_list.append(b.accn)
            segment_end = end
            tlx, tly, brx, bry = (
                xx,
                (1 - ystart) + segment_start * ratio,
                xx + xwidth,
                (1 - ystart) + segment_end * ratio,
            )
            print(
                "\t" + write_ImageMapLine(
                    tlx,
                    tly,
                    brx,
                    bry,
                    iopts.w,
                    iopts.h,
                    iopts.dpi,
                    seqid + ":" + ",".join(bac_list),
                    segment_start,
                    segment_end,
                ),
                file=mapfh,
            )

    if imagemap:
        print("</map>", file=mapfh)
        mapfh.close()
        logging.debug("Image map written to `{0}`".format(mapfh.name))

    if gauge:
        xstart, ystart = 0.9, 0.85
        Gauge(root, xstart, ystart - r, ystart, max_chr_len)

    if "centromere" in class_colors:
        del class_colors["centromere"]

    # Legend geometry is shared by the class legend and the `empty` box, so
    # set it whenever either is drawn (previously `empty` without `legend`
    # failed on the unbound `yy`)
    if legend or empty:
        xstart = 0.1
        xwidth = 0.04
        yy = 0.08

    # class legends, spread evenly along the bottom
    if legend:
        # max(..., 1) avoids dividing by zero when no class is left
        xinterval = 0.8 / max(len(class_colors), 1)
        for klass, cc in sorted(class_colors.items()):
            if klass == "-":
                continue
            root.add_patch(
                Rectangle((xstart, yy),
                          xwidth,
                          xwidth,
                          fc=cc,
                          lw=0,
                          alpha=alpha))
            root.text(xstart + xwidth + 0.01, yy, latex(klass), fontsize=10)
            xstart += xinterval

    # Unfilled box placed after the last class legend entry (if any)
    if empty:
        root.add_patch(
            Rectangle((xstart, yy), xwidth, xwidth, fill=False, lw=1))
        root.text(xstart + xwidth + 0.01, yy, empty, fontsize=10)

    if title:
        root.text(0.5, 0.95, markup(title), ha="center", va="center")