Exemple #1
0
def draw_gauge(ax, margin, maxl, rightmargin=None):
    """Draw a tick-marked gauge along the top of the canvas.

    Args:
        ax: Object exposing ``plot`` and ``text`` (used like a matplotlib
            axes).
        margin: Left margin, also used as the distance from the top edge.
        maxl: Length covered by the gauge; a tick is drawn every
            ``autoscale(maxl)`` units from 0 through ``maxl``.
        rightmargin: Right margin; falls back to ``margin`` when omitted
            (or falsy).

    Returns:
        ``best_stride / xinterval``, i.e. length units per unit of canvas
        width.
    """
    rightmargin = rightmargin or margin
    ax.plot([margin, 1 - rightmargin], [1 - margin, 1 - margin], "k-", lw=2)

    best_stride = autoscale(maxl)
    nintervals = maxl * 1. / best_stride

    xx, yy = margin, 1 - margin
    tip = .005
    xinterval = (1 - margin - rightmargin) / nintervals

    # Defaults guard against labels that carry no "?b" unit: previously
    # `target` and `suffix` were only bound inside the `if`, so such a label
    # raised UnboundLocalError in the loop below.
    suffix, target = "", None
    l = human_size(best_stride)
    if l[-1] == 'b':
        # Pin every tick label to the unit chosen for the stride
        suffix = target = l[-2:]

    for i in range(0, maxl + 1, best_stride):
        if target is None:
            l = human_size(i, precision=0)
        else:
            l = human_size(i, precision=0, target=target)
        if l[-1] == 'b':
            # Strip the unit from the tick label; it is printed once at the end
            l, suffix = l[:-2], l[-2:]
        ax.plot([xx, xx], [yy, yy + tip], "k-", lw=2)
        ax.text(xx, yy + 2 * tip, l, ha="center", size=13)
        xx += xinterval

    # Step back to the last tick and place the unit just to its right
    xx += 4 * tip - xinterval
    ax.text(xx + tip, yy + 2 * tip, suffix)

    return best_stride / xinterval
Exemple #2
0
def draw_gauge(ax, margin, maxl, rightmargin=None):
    """Draw a tick-marked gauge along the top of the canvas.

    Args:
        ax: Object exposing ``plot`` and ``text`` (used like a matplotlib
            axes).
        margin: Left margin, also used as the distance from the top edge.
        maxl: Length covered by the gauge; a tick is drawn every
            ``autoscale(maxl)`` units from 0 through ``maxl``.
        rightmargin: Right margin; falls back to ``margin`` when omitted
            (or falsy).

    Returns:
        ``best_stride / xinterval``, i.e. length units per unit of canvas
        width.
    """
    rightmargin = rightmargin or margin
    ax.plot([margin, 1 - rightmargin], [1 - margin, 1 - margin], "k-", lw=2)

    best_stride = autoscale(maxl)
    nintervals = maxl * 1.0 / best_stride

    xx, yy = margin, 1 - margin
    tip = 0.005
    xinterval = (1 - margin - rightmargin) / nintervals

    # Defaults guard against labels that carry no "?b" unit: previously
    # `target` and `suffix` were only bound inside the `if`, so such a label
    # raised UnboundLocalError in the loop below.
    suffix, target = "", None
    l = human_size(best_stride)
    if l[-1] == "b":
        # Pin every tick label to the unit chosen for the stride
        suffix = target = l[-2:]

    for i in range(0, maxl + 1, best_stride):
        if target is None:
            l = human_size(i, precision=0)
        else:
            l = human_size(i, precision=0, target=target)
        if l[-1] == "b":
            # Strip the unit from the tick label; it is printed once at the end
            l, suffix = l[:-2], l[-2:]
        ax.plot([xx, xx], [yy, yy + tip], "k-", lw=2)
        ax.text(xx, yy + 2 * tip, l, ha="center", size=13)
        xx += xinterval

    # Step back to the last tick and place the unit just to its right
    xx += 4 * tip - xinterval
    ax.text(xx + tip, yy + 2 * tip, suffix)

    return best_stride / xinterval
Exemple #3
0
def summary(args):
    """
    %prog summary old.new.chain old.fasta new.fasta

    Provide stats of the chain file.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim.
    from jcvi.formats.fasta import summary as fsummary
    from jcvi.utils.cbook import percentage, human_size

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    chainfile, oldfasta, newfasta = args
    chain = Chain(chainfile)
    ungapped, dt, dq = chain.ungapped, chain.dt, chain.dq
    # print() with file= replaces the Python 2 `print >>` statement, which
    # raises TypeError when run under Python 3.
    print("File `{0}` contains {1} chains.".format(chainfile, len(chain)),
          file=sys.stderr)
    print("ungapped={0} dt={1} dq={2}".format(
        human_size(ungapped), human_size(dt), human_size(dq)),
        file=sys.stderr)

    # Only the first element of each fasta summary is used below
    oldreal, _, _ = fsummary([oldfasta, "--outfile=/dev/null"])
    print("Old fasta (`{0}`) mapped: {1}".format(
        oldfasta, percentage(ungapped, oldreal)),
        file=sys.stderr)

    newreal, _, _ = fsummary([newfasta, "--outfile=/dev/null"])
    print("New fasta (`{0}`) mapped: {1}".format(
        newfasta, percentage(ungapped, newreal)),
        file=sys.stderr)
Exemple #4
0
def summary(args):
    """
    %prog summary old.new.chain old.fasta new.fasta

    Provide stats of the chain file.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim.
    from jcvi.formats.fasta import summary as fsummary
    from jcvi.utils.cbook import percentage, human_size

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    chainfile, oldfasta, newfasta = args
    chain = Chain(chainfile)
    ungapped, dt, dq = chain.ungapped, chain.dt, chain.dq
    # print() with file= replaces the Python 2 `print >>` statement, which
    # raises TypeError when run under Python 3.
    print("File `{0}` contains {1} chains.".format(chainfile, len(chain)),
          file=sys.stderr)
    print("ungapped={0} dt={1} dq={2}".format(
        human_size(ungapped), human_size(dt), human_size(dq)),
        file=sys.stderr)

    # Only the first element of each fasta summary is used below
    oldreal, _, _ = fsummary([oldfasta, "--outfile=/dev/null"])
    print("Old fasta (`{0}`) mapped: {1}".format(
        oldfasta, percentage(ungapped, oldreal)),
        file=sys.stderr)

    newreal, _, _ = fsummary([newfasta, "--outfile=/dev/null"])
    print("New fasta (`{0}`) mapped: {1}".format(
        newfasta, percentage(ungapped, newreal)),
        file=sys.stderr)
Exemple #5
0
    def rstats(self, object, bacs, components, scaffold_sizes, length):
        """Build one row of statistics for `object`.

        Returns a tuple of (object, number of bacs, components, number of
        scaffolds, n50, l50, length), where l50 and length have been passed
        through `human_size`.
        """
        from jcvi.utils.cbook import human_size

        bac_count = len(bacs)
        scaffold_count = len(scaffold_sizes)
        # The first value returned by calculate_A50 is not reported
        _, l50, n50 = calculate_A50(scaffold_sizes)

        return (
            object,
            bac_count,
            components,
            scaffold_count,
            n50,
            human_size(l50),
            human_size(length),
        )
Exemple #6
0
    def rstats(self, object, bacs, components, scaffold_sizes, length):
        """Build one row of statistics for `object`.

        Returns a tuple of (object, number of bacs, components, number of
        scaffolds, n50, l50, length), where l50 and length have been passed
        through `human_size`.
        """
        from jcvi.utils.cbook import human_size

        bac_count = len(bacs)
        scaffold_count = len(scaffold_sizes)
        # The first value returned by calculate_A50 is not reported
        _, l50, n50 = calculate_A50(scaffold_sizes)

        return (
            object,
            bac_count,
            components,
            scaffold_count,
            n50,
            human_size(l50),
            human_size(length),
        )
Exemple #7
0
def lineplot(ax, binfiles, nbins, chr, window, shift, color="br"):
    """Draw one or two line tracks over `nbins` bins on `ax`.

    The first bin file is plotted on `ax` in `color[0]`; an optional second
    one goes on a twin y-axis in `color[1]`. X tick labels are rendered with
    `human_readable_base` after scaling the bin index by `shift`.
    """
    assert len(binfiles) <= 2, "A max of two line plots are supported"

    xs = np.arange(nbins)
    first = binfiles[0]
    ax.plot(xs, linearray(first, chr, window, shift),
            "{0}-".format(color[0]), lw=2)

    def bin_to_base(x, pos):
        # Convert a bin index back to a base position for the tick label
        return human_readable_base(int(x) * shift, pos)

    ax.xaxis.set_major_formatter(ticker.FuncFormatter(bin_to_base))
    for ticklabel in ax.get_xticklabels():
        ticklabel.set_color("darkslategray")

    # The y label is the file name up to the first dot, plus the window size
    label = first.filename.split(".")[0]
    perw = "per {0}".format(human_size(window, precision=0))
    ax.set_ylabel(label + " " + perw, color=color[0])

    if len(binfiles) == 2:
        twin = ax.twinx()
        second = binfiles[1]
        twin.plot(xs, linearray(second, chr, window, shift),
                  "{0}-".format(color[1]), lw=2)
        # Colour each axis' tick labels like its line so they can be told apart
        for axis, shade in ((ax, color[0]), (twin, color[1])):
            for ticklabel in axis.get_yticklabels():
                ticklabel.set_color(shade)

        label = second.filename.split(".")[0]
        twin.set_ylabel(label + " " + perw, color=color[1])

    ax.set_xlim(0, nbins)
Exemple #8
0
def lineplot(ax, binfiles, nbins, chr, window, shift, color="br"):
    """Draw one or two line tracks over `nbins` bins on `ax`.

    The first bin file is plotted on `ax` in `color[0]`; an optional second
    one goes on a twin y-axis in `color[1]`. X tick labels are rendered with
    `human_readable_base` after scaling the bin index by `shift`.
    """
    assert len(binfiles) <= 2, "A max of two line plots are supported"

    xs = np.arange(nbins)
    first = binfiles[0]
    ax.plot(xs, linearray(first, chr, window, shift),
            "{0}-".format(color[0]), lw=2)

    def bin_to_base(x, pos):
        # Convert a bin index back to a base position for the tick label
        return human_readable_base(int(x) * shift, pos)

    ax.xaxis.set_major_formatter(ticker.FuncFormatter(bin_to_base))
    for ticklabel in ax.get_xticklabels():
        ticklabel.set_color('darkslategray')

    # The y label is the file name up to the first dot, plus the window size
    label = first.filename.split(".")[0]
    perw = "per {0}".format(human_size(window, precision=0))
    ax.set_ylabel(label + " " + perw, color=color[0])

    if len(binfiles) == 2:
        twin = ax.twinx()
        second = binfiles[1]
        twin.plot(xs, linearray(second, chr, window, shift),
                  "{0}-".format(color[1]), lw=2)
        # Colour each axis' tick labels like its line so they can be told apart
        for axis, shade in ((ax, color[0]), (twin, color[1])):
            for ticklabel in axis.get_yticklabels():
                ticklabel.set_color(shade)

        label = second.filename.split(".")[0]
        twin.set_ylabel(label + " " + perw, color=color[1])

    ax.set_xlim(0, nbins)
Exemple #9
0
def velvet(args):
    """
    %prog velvet readsize genomesize numreads K

    Calculate velvet memory requirement.
    <http://seqanswers.com/forums/showthread.php?t=2101>

    Ram required for velvetg = -109635 + 18977*ReadSize + 86326*GenomeSize +
    233353*NumReads - 51092*K

    Read size is in bases.
    Genome size is in millions of bases (Mb)
    Number of reads is in millions
    K is the kmer hash value used in velveth
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim.
    p = OptionParser(velvet.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    readsize, genomesize, numreads, K = [int(x) for x in args]
    ram = (-109635 + 18977 * readsize + 86326 * genomesize
           + 233353 * numreads - 51092 * K)
    # print() with file= replaces the Python 2 `print >>` statement, which
    # raises TypeError when run under Python 3.
    print("ReadSize: {0}".format(readsize), file=sys.stderr)
    print("GenomeSize: {0}Mb".format(genomesize), file=sys.stderr)
    print("NumReads: {0}M".format(numreads), file=sys.stderr)
    print("K: {0}".format(K), file=sys.stderr)

    # NOTE(review): the factor 1000 presumably converts the formula's result
    # from kilobytes to bytes - confirm against the linked thread.
    ram = human_size(ram * 1000, a_kilobyte_is_1024_bytes=True)
    print("RAM usage: {0} (MAXKMERLENGTH=31)".format(ram), file=sys.stderr)
Exemple #10
0
def size(args):
    """
    find folder -type l | %prog size

    Get the size for all the paths that are pointed by the links
    """
    from jcvi.utils.cbook import human_size

    # The parser is built from the docstring but `args` is never parsed here
    p = OptionParser(size.__doc__)

    sized_links = []
    for line in sys.stdin:
        path = line.strip()
        # Anything on stdin that is not a symbolic link is ignored
        if op.islink(path):
            target = get_abs_path(path)
            sized_links.append((op.getsize(target), op.basename(path)))

    # Largest files first
    sized_links.sort(reverse=True)
    for nbytes, name in sized_links:
        readable = human_size(nbytes, a_kilobyte_is_1024_bytes=True)
        print("%10s\t%s" % (readable, name), file=sys.stderr)
Exemple #11
0
def size(args):
    """
    find folder -type l | %prog size

    Get the size for all the paths that are pointed by the links
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim.
    from jcvi.utils.cbook import human_size

    # The parser is built from the docstring but `args` is never parsed here
    p = OptionParser(size.__doc__)
    fp = sys.stdin

    results = []
    for link_name in fp:
        link_name = link_name.strip()
        # Anything on stdin that is not a symbolic link is ignored
        if not op.islink(link_name):
            continue

        source = get_abs_path(link_name)

        link_name = op.basename(link_name)
        filesize = op.getsize(source)
        results.append((filesize, link_name))

    # sort by descending file size
    for filesize, link_name in sorted(results, reverse=True):
        filesize = human_size(filesize, a_kilobyte_is_1024_bytes=True)
        # print() with file= replaces the Python 2 `print >>` statement,
        # which raises TypeError when run under Python 3.
        print("%10s\t%s" % (filesize, link_name), file=sys.stderr)
Exemple #12
0
def velvet(args):
    """
    %prog velvet readsize genomesize numreads K

    Calculate velvet memory requirement.
    <http://seqanswers.com/forums/showthread.php?t=2101>

    Ram required for velvetg = -109635 + 18977*ReadSize + 86326*GenomeSize +
    233353*NumReads - 51092*K

    Read size is in bases.
    Genome size is in millions of bases (Mb)
    Number of reads is in millions
    K is the kmer hash value used in velveth
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim.
    p = OptionParser(velvet.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    readsize, genomesize, numreads, K = [int(x) for x in args]
    ram = (-109635 + 18977 * readsize + 86326 * genomesize
           + 233353 * numreads - 51092 * K)
    # print() with file= replaces the Python 2 `print >>` statement, which
    # raises TypeError when run under Python 3.
    print("ReadSize: {0}".format(readsize), file=sys.stderr)
    print("GenomeSize: {0}Mb".format(genomesize), file=sys.stderr)
    print("NumReads: {0}M".format(numreads), file=sys.stderr)
    print("K: {0}".format(K), file=sys.stderr)

    # NOTE(review): the factor 1000 presumably converts the formula's result
    # from kilobytes to bytes - confirm against the linked thread.
    ram = human_size(ram * 1000, a_kilobyte_is_1024_bytes=True)
    print("RAM usage: {0} (MAXKMERLENGTH=31)".format(ram), file=sys.stderr)
Exemple #13
0
def jellyfish(args):
    """
    %prog jellyfish [*.fastq|*.fasta]

    Run jellyfish to dump histogram to be used in kmer.histogram().
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim.
    from jcvi.apps.base import getfilesize
    from jcvi.utils.cbook import human_size
    p = OptionParser(jellyfish.__doc__)
    p.add_option("-K", default=23, type="int",
                 help="K-mer size [default: %default]")
    p.add_option("--coverage", default=40, type="int",
                 help="Expected sequence coverage [default: %default]")
    p.add_option("--prefix", default="jf",
                 help="Database prefix [default: %default]")
    p.add_option("--nohist", default=False, action="store_true",
                 help="Do not print histogram [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = args
    K = opts.K
    coverage = opts.coverage

    totalfilesize = sum(getfilesize(x) for x in fastqfiles)
    fq = fastqfiles[0]
    pf = opts.prefix
    # Only the first input decides whether the inputs are treated as gzipped
    gzip = fq.endswith(".gz")

    # Floor division keeps the hash size an integer on both Python 2 and 3;
    # true division would put a float into the `-s` argument built below.
    hashsize = totalfilesize // coverage
    logging.debug("Total file size: {0}, hashsize (-s): {1}".format(
        human_size(totalfilesize, a_kilobyte_is_1024_bytes=True), hashsize))

    jfpf = "{0}-K{1}".format(pf, K)
    jfdb = jfpf
    fastqfiles = " ".join(fastqfiles)

    cmd = "jellyfish count -t {0} -C -o {1}".format(opts.cpus, jfpf)
    cmd += " -s {0} -m {1}".format(hashsize, K)
    if gzip:
        # Decompress on the fly and feed the reads through stdin
        cmd = "gzip -dc {0} | ".format(fastqfiles) + cmd + " /dev/fd/0"
    else:
        cmd += " " + fastqfiles

    # NOTE(review): need_update receives the space-joined string, not the
    # original list - confirm it copes with more than one input file.
    if need_update(fastqfiles, jfdb):
        sh(cmd)

    if opts.nohist:
        return

    jfhisto = jfpf + ".histogram"
    cmd = "jellyfish histo -t 64 {0} -o {1}".format(jfdb, jfhisto)

    if need_update(jfdb, jfhisto):
        sh(cmd)
Exemple #14
0
def summary(args):
    """
    %prog summary gffile fastafile

    Print summary stats, including:
    - Gene/Exon/Intron
    - Number
    - Average size (bp)
    - Median size (bp)
    - Total length (Mb)
    - % of genome
    - % GC
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim.
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gff_file, ref = args
    s = Fasta(ref)
    g = make_index(gff_file)
    geneseqs, exonseqs, intronseqs = [], [], []  # Calc % GC
    for f in g.features_of_type("gene"):
        fid = f.id
        fseq = s.sequence({'chr': f.chrom, 'start': f.start, 'stop': f.stop})
        geneseqs.append(fseq)
        # The set drops exon intervals that occur more than once under a gene
        exons = set((c.chrom, c.start, c.stop) for c in g.children(fid, 2)
                    if c.featuretype == "exon")
        exons = list(exons)
        for chrom, start, stop in exons:
            fseq = s.sequence({'chr': chrom, 'start': start, 'stop': stop})
            exonseqs.append(fseq)
        # Introns are derived from the exon intervals by range_interleave
        introns = range_interleave(exons)
        for chrom, start, stop in introns:
            fseq = s.sequence({'chr': chrom, 'start': start, 'stop': stop})
            intronseqs.append(fseq)

    r = {}  # Report, keyed by (feature type, statistic name)
    for t, tseqs in zip(("Gene", "Exon", "Intron"),
                        (geneseqs, exonseqs, intronseqs)):
        tsizes = [len(x) for x in tseqs]
        tsummary = SummaryStats(tsizes, dtype="int")
        r[t, "Number"] = tsummary.size
        r[t, "Average size (bp)"] = tsummary.mean
        r[t, "Median size (bp)"] = tsummary.median
        r[t, "Total length (Mb)"] = human_size(tsummary.sum,
                                               precision=0,
                                               target="Mb")
        r[t, "% of genome"] = percentage(tsummary.sum,
                                         s.totalsize,
                                         precision=0,
                                         mode=-1)
        r[t, "% GC"] = gc(tseqs)

    # print() with file= replaces the Python 2 `print >>` statement, which
    # raises TypeError when run under Python 3.
    print(tabulate(r), file=sys.stderr)
Exemple #15
0
def summary(args):
    """
    %prog summary gffile fastafile

    Print summary stats, including:
    - Gene/Exon/Intron
    - Number
    - Average size (bp)
    - Median size (bp)
    - Total length (Mb)
    - % of genome
    - % GC
    """
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gff_file, ref = args
    genome = Fasta(ref)
    db = make_index(gff_file)

    def fetch(chrom, start, stop):
        # Pull the sequence of one interval out of the reference
        return genome.sequence({'chr': chrom, 'start': start, 'stop': stop})

    # Sequences are collected per feature type so that % GC can be computed
    geneseqs, exonseqs, intronseqs = [], [], []
    for gene in db.features_of_type("gene"):
        fid = gene.id
        geneseqs.append(fetch(gene.chrom, gene.start, gene.stop))

        # Going through a set drops exon intervals that occur more than once
        exons = list(set(
            (c.chrom, c.start, c.stop)
            for c in db.children(fid, 2)
            if c.featuretype == "exon"
        ))
        exonseqs.extend(
            fetch(chrom, start, stop) for chrom, start, stop in exons
        )
        intronseqs.extend(
            fetch(chrom, start, stop)
            for chrom, start, stop in range_interleave(exons)
        )

    # Report, keyed by (feature type, statistic name)
    report = {}
    collected = (("Gene", geneseqs), ("Exon", exonseqs), ("Intron", intronseqs))
    for label, seqs in collected:
        stats = SummaryStats([len(x) for x in seqs], dtype="int")
        report[label, "Number"] = stats.size
        report[label, "Average size (bp)"] = stats.mean
        report[label, "Median size (bp)"] = stats.median
        report[label, "Total length (Mb)"] = human_size(
            stats.sum, precision=0, target="Mb"
        )
        report[label, "% of genome"] = percentage(
            stats.sum, genome.totalsize, precision=0, mode=-1
        )
        report[label, "% GC"] = gc(seqs)

    print(tabulate(report), file=sys.stderr)
Exemple #16
0
def plot_heatmap(ax, M, breaks, iopts, binsize=BINSIZE):
    """Render matrix `M` as a heatmap on `ax`, with white lines at `breaks`.

    The last entry of `breaks` gets no line. The y-axis is flipped so the
    origin sits at the top, and the x-axis is labelled with the bin size.
    """
    ax.imshow(M, cmap=iopts.cmap, interpolation='none')
    extent = ax.get_xlim()

    # One vertical and one horizontal white line per break
    for pos in breaks[:-1]:
        ax.plot([pos, pos], extent, 'w-')
        ax.plot(extent, [pos, pos], 'w-')

    ax.set_xlim(extent)
    # Flip the y-axis so the origin is at the top
    ax.set_ylim((extent[1], extent[0]))

    tickfont = dict(family='Helvetica', color="gray")
    ax.set_xticklabels([int(tick) for tick in ax.get_xticks()], **tickfont)
    ax.set_yticklabels([int(tick) for tick in ax.get_yticks()], **tickfont)

    ax.set_xlabel(
        "Bins ({} per bin)".format(human_size(binsize, precision=0))
    )
Exemple #17
0
def plot_heatmap(ax, M, breaks, iopts, binsize=BINSIZE):
    """Render matrix `M` as a heatmap on `ax`, with white lines at `breaks`.

    The image is drawn with origin="lower" and both axes share the x limits.
    The last entry of `breaks` gets no line. The x-axis is labelled with the
    bin size.
    """
    ax.imshow(M, cmap=iopts.cmap, origin="lower", interpolation='none')
    extent = ax.get_xlim()

    # One vertical and one horizontal white line per break
    for pos in breaks[:-1]:
        ax.plot([pos, pos], extent, 'w-')
        ax.plot(extent, [pos, pos], 'w-')

    ax.set_xlim(extent)
    ax.set_ylim(extent)

    tickfont = dict(family='Helvetica', color="gray")
    ax.set_xticklabels([int(tick) for tick in ax.get_xticks()], **tickfont)
    ax.set_yticklabels([int(tick) for tick in ax.get_yticks()], **tickfont)

    ax.set_xlabel(
        "Bins ({} per bin)".format(human_size(binsize, precision=0))
    )
Exemple #18
0
    def __init__(self, filename, human=False):
        """Load key/value pairs from a FastQC data file into this mapping.

        Args:
            filename: Path to the FastQC data file. When it does not exist,
                only "Filename" (derived from the parent directory name) and
                "na" placeholders are stored.
            human: When true, "Total Sequences" and "Total Bases" are stored
                as `human_size` strings with trailing "b" stripped;
                otherwise as integers.
        """
        super(FastQCdata, self).__init__(filename)
        if not op.exists(filename):
            logging.debug("File `{0}` not found.".format(filename))
            # Sample_RF37-1/RF37-1_GATCAG_L008_R2_fastqc =>
            # RF37-1_GATCAG_L008_R2
            self["Filename"] = op.basename(
                op.split(filename)[0]).rsplit("_", 1)[0]
            self["Total Sequences"] = self["Sequence length"] = \
                self["Total Bases"] = "na"
            return

        # The context manager closes the handle; it was previously leaked
        with open(filename) as fp:
            for row in fp:
                atoms = row.rstrip().split("\t")
                if atoms[0] in ("#", ">"):
                    continue
                # Only two-column rows are kept as key/value pairs
                if len(atoms) != 2:
                    continue

                a, b = atoms
                self[a] = b

        ts = self["Total Sequences"]
        sl = self["Sequence length"]
        if "-" in sl:
            # A length range "a-b" is reduced to its midpoint; floor division
            # gives the same integer as the former `/` followed by int()
            a, b = sl.split("-")
            sl = (int(a) + int(b)) // 2
            # NOTE(review): a lower bound of exactly "30" makes the upper
            # bound win - the reason is not visible here, confirm.
            if a == "30":
                sl = int(b)

        ts, sl = int(ts), int(sl)
        tb = ts * sl

        self["Total Sequences"] = human_size(ts).rstrip("b") if human else ts
        self["Total Bases"] = human_size(tb).rstrip("b") if human else tb
Exemple #19
0
    def __init__(self, filename, human=False):
        """Load key/value pairs from a FastQC data file into this mapping.

        Args:
            filename: Path to the FastQC data file. When it does not exist,
                only "Filename" (derived from the parent directory name) and
                "na" placeholders are stored.
            human: When true, "Total Sequences" and "Total Bases" are stored
                as `human_size` strings with trailing "b" stripped;
                otherwise as integers.
        """
        super(FastQCdata, self).__init__(filename)
        if not op.exists(filename):
            logging.debug("File `{0}` not found.".format(filename))
            # Sample_RF37-1/RF37-1_GATCAG_L008_R2_fastqc =>
            # RF37-1_GATCAG_L008_R2
            self["Filename"] = op.basename(
                op.split(filename)[0]).rsplit("_", 1)[0]
            self["Total Sequences"] = self["Sequence length"] = \
                self["Total Bases"] = "na"
            return

        # The context manager closes the handle; it was previously leaked
        with open(filename) as fp:
            for row in fp:
                atoms = row.rstrip().split("\t")
                if atoms[0] in ("#", ">"):
                    continue
                # Only two-column rows are kept as key/value pairs
                if len(atoms) != 2:
                    continue

                a, b = atoms
                self[a] = b

        ts = self["Total Sequences"]
        sl = self["Sequence length"]
        if "-" in sl:
            # A length range "a-b" is reduced to its midpoint; floor division
            # gives the same integer as the former `/` followed by int()
            a, b = sl.split("-")
            sl = (int(a) + int(b)) // 2
            # NOTE(review): a lower bound of exactly "30" makes the upper
            # bound win - the reason is not visible here, confirm.
            if a == "30":
                sl = int(b)

        ts, sl = int(ts), int(sl)
        tb = ts * sl

        self["Total Sequences"] = human_size(ts).rstrip("b") if human else ts
        self["Total Bases"] = human_size(tb).rstrip("b") if human else tb
Exemple #20
0
def plot_heatmap(ax, M, breaks, iopts):
    """Render matrix `M` as a heatmap on `ax`, with white lines at `breaks`.

    The image is drawn with origin="lower" and both axes share the x limits.
    The last entry of `breaks` gets no line. Both axes are labelled with the
    module-level BINSIZE.
    """
    ax.imshow(M, cmap=iopts.cmap, origin="lower", interpolation='none')
    extent = ax.get_xlim()

    # One vertical and one horizontal white line per break
    for pos in breaks[:-1]:
        ax.plot([pos, pos], extent, 'w-')
        ax.plot(extent, [pos, pos], 'w-')

    ax.set_xlim(extent)
    ax.set_ylim(extent)

    tickfont = dict(family='Helvetica', color="gray")
    ax.set_xticklabels([int(tick) for tick in ax.get_xticks()], **tickfont)
    ax.set_yticklabels([int(tick) for tick in ax.get_yticks()], **tickfont)

    axis_title = "Bins ({} per bin)".format(human_size(BINSIZE, precision=0))
    ax.set_xlabel(axis_title)
    ax.set_ylabel(axis_title)
Exemple #21
0
def velvet(readsize, genomesize, numreads, K):
    """
    Calculate velvet memory requirement.
    <http://seqanswers.com/forums/showthread.php?t=2101>

    Ram required for velvetg = -109635 + 18977*ReadSize + 86326*GenomeSize +
    233353*NumReads - 51092*K

    Read size is in bases.
    Genome size is in millions of bases (Mb)
    Number of reads is in millions
    K is the kmer hash value used in velveth
    """
    ram = (-109635 + 18977 * readsize + 86326 * genomesize
           + 233353 * numreads - 51092 * K)
    # print() with file= replaces the Python 2 `print >>` statement, which
    # raises TypeError when run under Python 3.
    print("ReadSize: {0}".format(readsize), file=sys.stderr)
    print("GenomeSize: {0}Mb".format(genomesize), file=sys.stderr)
    print("NumReads: {0}M".format(numreads), file=sys.stderr)
    print("K: {0}".format(K), file=sys.stderr)

    # NOTE(review): the factor 1000 presumably converts the formula's result
    # from kilobytes to bytes - confirm against the linked thread.
    ram = human_size(ram * 1000, a_kilobyte_is_1024_bytes=True)
    print("RAM usage: {0} (MAXKMERLENGTH=31)".format(ram), file=sys.stderr)
Exemple #22
0
def velvet(readsize, genomesize, numreads, K):
    """
    Calculate velvet memory requirement.
    <http://seqanswers.com/forums/showthread.php?t=2101>

    Ram required for velvetg = -109635 + 18977*ReadSize + 86326*GenomeSize +
    233353*NumReads - 51092*K

    Read size is in bases.
    Genome size is in millions of bases (Mb)
    Number of reads is in millions
    K is the kmer hash value used in velveth
    """
    ram = (-109635 + 18977 * readsize + 86326 * genomesize
           + 233353 * numreads - 51092 * K)
    # print() with file= replaces the Python 2 `print >>` statement, which
    # raises TypeError when run under Python 3.
    print("ReadSize: {0}".format(readsize), file=sys.stderr)
    print("GenomeSize: {0}Mb".format(genomesize), file=sys.stderr)
    print("NumReads: {0}M".format(numreads), file=sys.stderr)
    print("K: {0}".format(K), file=sys.stderr)

    # NOTE(review): the factor 1000 presumably converts the formula's result
    # from kilobytes to bytes - confirm against the linked thread.
    ram = human_size(ram * 1000, a_kilobyte_is_1024_bytes=True)
    print("RAM usage: {0} (MAXKMERLENGTH=31)".format(ram), file=sys.stderr)
Exemple #23
0
def plot(args):
    """
    %prog plot input.bed seqid

    Plot the matchings between the reconstructed pseudomolecules and the maps.
    Two types of visualizations are available in one canvas:

    1. Parallel axes, and matching markers are shown in connecting lines;
    2. Scatter plot.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    from jcvi.graphics.base import plt, savefig, normalize_axes, \
                set2, panel_labels
    from jcvi.graphics.chromosome import Chromosome, GeneticMap, \
                HorizontalChromosome

    p = OptionParser(plot.__doc__)
    add_allmaps_plot_options(p)
    opts, args, iopts = p.set_image_options(args, figsize="10x6")

    if len(args) != 2:
        sys.exit(not p.print_help())

    # Companion files are located through the prefix of the input bed file
    inputbed, seqid = args
    pf = inputbed.rsplit(".", 1)[0]
    bedfile = pf + ".lifted.bed"
    agpfile = pf + ".agp"
    weightsfile = opts.weightsfile
    links = opts.links

    function = get_function(opts.distance)
    cc = Map(bedfile, function)
    allseqids = cc.seqids
    mapnames = cc.mapnames
    weights = Weights(weightsfile, mapnames)
    assert seqid in allseqids, "{0} not in {1}".format(seqid, allseqids)

    # Keep linkage groups with at least `links` markers; halve the threshold
    # until at least one group qualifies
    s = Scaffold(seqid, cc)
    mlgs = [k for k, v in s.mlg_counts.items() if v >= links]
    while not mlgs:
        links /= 2
        logging.error("No markers to plot, --links reset to {0}".format(links))
        mlgs = [k for k, v in s.mlg_counts.items() if v >= links]

    # Size of each linkage group = largest value of `function` over its markers
    mlgsizes = {}
    for mlg in mlgs:
        mm = cc.extract_mlg(mlg)
        mlgsize = max(function(x) for x in mm)
        mlgsizes[mlg] = mlgsize

    # root spans the whole figure; ax1 is the left half, ax2 the right half
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    ax1 = fig.add_axes([0, 0, .5, 1])
    ax2 = fig.add_axes([.5, 0, .5, 1])

    # Find the layout first
    ystart, ystop = .9, .1
    L = Layout(mlgsizes)
    coords = L.coords

    tip = .02
    marker_pos = {}
    # Palette: one color per map; each linkage group takes the color of the
    # map named by the part of its name before the first "-"
    colors = dict((mapname, set2[i]) for i, mapname in enumerate(mapnames))
    colors = dict((mlg, colors[mlg.split("-")[0]]) for mlg in mlgs)

    rhos = {}
    # Parallel coordinates
    for mlg, (x, y1, y2) in coords.items():
        mm = cc.extract_mlg(mlg)
        markers = [(m.accn, function(m)) for m in mm]  # exhaustive marker list
        xy = [(m.pos, function(m)) for m in mm if m.seqid == seqid]
        mx, my = zip(*xy)
        rho = spearmanr(mx, my)
        rhos[mlg] = rho
        # A negative correlation means the map is drawn flipped
        flip = rho < 0

        g = GeneticMap(ax1, x, y1, y2, markers, tip=tip, flip=flip)
        # Put the label on the outer side of the map axis
        extra = -3 * tip if x < .5 else 3 * tip
        ha = "right" if x < .5 else "left"
        mapname = mlg.split("-")[0]
        tlg = mlg.replace("_", ".")  # Latex does not like underscore char
        label = "{0} (w={1})".format(tlg, weights[mapname])
        ax1.text(x + extra, (y1 + y2) / 2, label, color=colors[mlg],
                 ha=ha, va="center", rotation=90)
        marker_pos.update(g.marker_pos)

    # Restrict the AGP records to the requested sequence
    agp = AGP(agpfile)
    agp = [x for x in agp if x.object == seqid]
    chrsize = max(x.object_end for x in agp)

    # Pseudomolecules in the center
    r = ystart - ystop
    ratio = r / chrsize
    # f maps a position on the sequence to a y coordinate, running downwards
    # from ystart
    f = lambda x: (ystart - ratio * x)
    patchstart = [f(x.object_beg) for x in agp if not x.is_gap]
    Chromosome(ax1, .5, ystart, ystop, width=2 * tip, patch=patchstart, lw=2)

    label = "{0} ({1})".format(seqid, human_size(chrsize, precision=0))
    ax1.text(.5, ystart + tip, label, ha="center")

    scatter_data = defaultdict(list)
    # Connecting lines
    for b in s.markers:
        marker_name = b.accn
        if marker_name not in marker_pos:
            continue

        cx = .5
        cy = f(b.pos)
        mx = coords[b.mlg][0]
        my = marker_pos[marker_name]

        extra = -tip if mx < cx else tip
        extra *= 1.25  # leave boundaries for aesthetic reasons
        cx += extra
        mx -= extra
        ax1.plot((cx, mx), (cy, my), "-", color=colors[b.mlg])
        scatter_data[b.mlg].append((b.pos, function(b)))

    # Scatter plot, same data as parallel coordinates
    xstart, xstop = sorted((ystart, ystop))
    # f is rebound: it now maps a position to an x coordinate from xstart
    f = lambda x: (xstart + ratio * x)
    pp = [x.object_beg for x in agp if not x.is_gap]
    patchstart = [f(x) for x in pp]
    HorizontalChromosome(ax2, xstart, xstop, ystop,
                         height=2 * tip, patch=patchstart, lw=2)

    gap = .03
    # ratio is rebound: canvas height per map unit once the gaps between the
    # stacked panels and the tip have been subtracted
    ratio = (r - gap * len(mlgs) - tip) / sum(mlgsizes.values())

    tlgs = []
    # One scatter panel per linkage group, stacked downwards from ystart
    for mlg, mlgsize in sorted(mlgsizes.items()):
        height = ratio * mlgsize
        ystart -= height
        xx = .5 + xstart / 2
        width = r / 2
        color = colors[mlg]
        ax = fig.add_axes([xx, ystart, width, height])
        ypos = ystart + height / 2
        ystart -= gap
        sd = scatter_data[mlg]
        xx, yy = zip(*sd)
        ax.vlines(pp, 0, mlgsize, colors="beige")
        ax.plot(xx, yy, ".", color=color)
        rho = rhos[mlg]
        ax.text(.5, 1 - .4 * gap / height, r"$\rho$={0:.3f}".format(rho),
                    ha="center", va="top", transform=ax.transAxes, color="gray")
        tlg = mlg.replace("_", ".")
        tlgs.append((tlg, ypos, color))
        ax.set_xlim(0, chrsize)
        ax.set_ylim(0, mlgsize)
        ax.set_xticks([])
        while height / len(ax.get_yticks()) < .03 and len(ax.get_yticks()) >= 2:
            ax.set_yticks(ax.get_yticks()[::2])  # Sparsify the ticks
        yticklabels = [int(x) for x in ax.get_yticks()]
        ax.set_yticklabels(yticklabels, family='Helvetica')
        if rho < 0:
            ax.invert_yaxis()

    # Panel titles along the middle of the figure; alternate the alignment
    # when there are more than four of them
    for i, (tlg, ypos, color) in enumerate(tlgs):
        ha = "center"
        if len(tlgs) > 4:
            ha = "right" if i % 2 else "left"
        root.text(.5, ypos, tlg, color=color, rotation=90,
                      ha=ha, va="center")

    if opts.panels:
        labels = ((.04, .96, 'A'), (.48, .96, 'B'))
        panel_labels(root, labels)

    # The image is saved as <seqid>.<format>
    normalize_axes((ax1, ax2, root))
    image_name = seqid + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    plt.close(fig)
Exemple #24
0
    def __init__(self,
                 fig,
                 root,
                 datafile,
                 bedfile,
                 layoutfile,
                 switch=None,
                 tree=None,
                 extra_features=None,
                 chr_label=True,
                 loc_label=True,
                 pad=.04,
                 scalebar=False):
        """Draw one region per block-file column, then shade matching pairs.

        Args:
            fig: Figure; its width/height set the vertical padding and it
                hosts the optional tree axes.
            root: Axes the regions, shades and scale bar are drawn on.
            datafile: Path read through `BlockFile`.
            bedfile: Path read through `Bed`.
            layoutfile: Path read through `Layout`.
            switch: Optional tab-delimited file loaded as a `DictFile`.
            tree: Optional tree file; each tree gets its own axes.
            extra_features: Optional bed file of extra features per region.
            chr_label: Forwarded to each `Region`.
            loc_label: Forwarded to each `Region`.
            pad: Accepted but not used in this method.
            scalebar: When true, draw a scale bar near the top left.
        """
        w, h = fig.get_figwidth(), fig.get_figheight()
        bed = Bed(bedfile)
        order = bed.order
        bf = BlockFile(datafile)
        self.layout = lo = Layout(layoutfile)
        switch = DictFile(switch, delimiter="\t") if switch else None
        if extra_features:
            extra_features = Bed(extra_features)

        exts = []
        extras = []
        # range() replaces the Python 2-only xrange()
        for i in range(bf.ncols):
            ext = bf.get_extent(i, order)
            exts.append(ext)
            if extra_features:
                start, end, si, ei, chr, orientation, span = ext
                start, end = start.start, end.end  # start, end coordinates
                ef = list(extra_features.extract(chr, start, end))

                # Pruning removes minor features with < 0.1% of the region
                ef_pruned = [x for x in ef if x.span >= span / 1000]
                # print() with file= replaces the Python 2 `print >>`
                # statement, which raises TypeError under Python 3
                print("Extracted {0} features "
                      "({1} after pruning)".format(len(ef), len(ef_pruned)),
                      file=sys.stderr)
                extras.append(ef_pruned)

        # The widest extent fixes the scale shared by all regions
        maxspan = max(exts, key=lambda x: x[-1])[-1]
        scale = maxspan / .65

        self.gg = gg = {}
        self.rr = []
        ymids = []
        vpad = .012 * w / h
        for i in range(bf.ncols):
            ext = exts[i]
            ef = extras[i] if extras else None
            r = Region(root,
                       ext,
                       lo[i],
                       bed,
                       scale,
                       switch,
                       chr_label=chr_label,
                       loc_label=loc_label,
                       vpad=vpad,
                       extra_features=ef)
            self.rr.append(r)
            # Use tid and accn to store gene positions
            gg.update(dict(((i, k), v) for k, v in r.gg.items()))
            ymids.append(r.y)

        for i, j in lo.edges:
            # All pairs first, then the highlighted ones on top (zorder=2)
            for ga, gb, h in bf.iter_pairs(i, j):
                a, b = gg[(i, ga)], gg[(j, gb)]
                ymid = (ymids[i] + ymids[j]) / 2
                Shade(root, a, b, ymid, fc="gainsboro", lw=0, alpha=1)

            for ga, gb, h in bf.iter_pairs(i, j, highlight=True):
                a, b = gg[(i, ga)], gg[(j, gb)]
                ymid = (ymids[i] + ymids[j]) / 2
                Shade(root, a, b, ymid, alpha=1, highlight=h, zorder=2)

        if scalebar:
            print("Build scalebar (scale={})".format(scale), file=sys.stderr)
            # Find the best length of the scalebar
            ar = [1, 2, 5]
            candidates = [1000 * x for x in ar] + [10000 * x for x in ar] + \
                         [100000 * x for x in ar]
            # Find the one that's close to an optimal canvas size
            dists = [(abs(x / scale - .12), x) for x in candidates]
            dist, candidate = min(dists)
            # dist is rebound to the bar length in canvas units
            dist = candidate / scale
            x, y, yp = .2, .96, .005
            a, b = x - dist / 2, x + dist / 2
            lsg = "lightslategrey"
            root.plot([a, a], [y - yp, y + yp], "-", lw=2, color=lsg)
            root.plot([b, b], [y - yp, y + yp], "-", lw=2, color=lsg)
            root.plot([a, b], [y, y], "-", lw=2, color=lsg)
            root.text(x,
                      y + .02,
                      human_size(candidate, precision=0),
                      ha="center",
                      va="center")

        if tree:
            from jcvi.graphics.tree import draw_tree, read_trees

            trees = read_trees(tree)
            ntrees = len(trees)
            logging.debug("A total of {0} trees imported.".format(ntrees))
            # Trees are laid out side by side below the lowest region
            xiv = 1. / ntrees
            yiv = .3
            xstart = 0
            ystart = min(ymids) - .4
            for i in range(ntrees):
                ax = fig.add_axes([xstart, ystart, xiv, yiv])
                label, outgroup, tx = trees[i]
                draw_tree(ax, tx, outgroup=outgroup, rmargin=.4, leaffont=11)
                xstart += xiv
                RoundLabel(ax,
                           .5,
                           .3,
                           label,
                           fill=True,
                           fc="lavender",
                           color="r")
Exemple #25
0
def histogram(args):
    """
    %prog histogram [reads.fasta|reads.fastq]

    Plot read length distribution for reads. The plot would be similar to the
    one generated by SMRT-portal, for example:

    http://blog.pacificbiosciences.com/2013/10/data-release-long-read-shotgun.html

    Plot has two axes - corresponding to pdf and cdf, respectively.  Also adding
    number of reads, average/median, N50, and total length.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it must stay in its "%prog ..." form.
    from jcvi.utils.cbook import human_size, thousands, SUFFIXES
    from jcvi.formats.fastq import fasta
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.graphics.base import (
        plt,
        markup,
        human_formatter,
        human_base_formatter,
        savefig,
        set2,
        set_ticklabels_helvetica,
    )

    p = OptionParser(histogram.__doc__)
    p.set_histogram(vmax=50000,
                    bins=100,
                    xlabel="Read length",
                    title="Read length distribution")
    p.add_option("--ylabel1",
                 default="Counts",
                 help="Label of y-axis on the left")
    p.add_option(
        "--color",
        default="0",
        choices=[str(x) for x in range(8)],
        help="Color of bars, which is an index 0-7 in brewer set2",
    )
    opts, args, iopts = p.set_image_options(args, figsize="6x6", style="dark")

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile, ) = args
    # fasta() hands back two values; only the first (the sequence file) is
    # used from here on.
    fastafile, _ = fasta([fastafile, "--seqtk"])
    sizes = Sizes(fastafile)
    all_sizes = sorted(sizes.sizes)
    if not all_sizes:
        # Nothing to plot: bail out with a readable message instead of the
        # unpacking error the cumulative curve below would raise.
        sys.exit("No reads found in {0}".format(fastafile))

    xmin, xmax, bins = opts.vmin, opts.vmax, opts.bins
    left, height = stem_leaf_plot(all_sizes, xmin, xmax, bins)

    plt.figure(1, (iopts.w, iopts.h))
    ax1 = plt.gca()

    # Left axis: bar chart of counts, bars half as wide as a bin.
    width = (xmax - xmin) * 0.5 / bins
    color = set2[int(opts.color)]
    ax1.bar(left, height, width=width, linewidth=0, fc=color, align="center")
    ax1.set_xlabel(markup(opts.xlabel))
    ax1.set_ylabel(opts.ylabel1)

    # Right axis: total bases contained in reads at or above each length.
    ax2 = ax1.twinx()
    total_size, l50, _ = sizes.summary
    hsize = human_size(total_size)
    # NOTE(review): assumes human_size() ends with a two-character suffix
    # that is listed in SUFFIXES[1000]; confirm for very small totals.
    tag = hsize[-2:]
    unit = 1000**SUFFIXES[1000].index(tag)

    cur_size = 0
    cdf = {}
    for size in all_sizes:
        # Record the value only the first time a given length is seen, i.e.
        # before any read of that length has been subtracted.
        if size not in cdf:
            cdf[size] = (total_size - cur_size) * 1.0 / unit
        cur_size += size
    lengths, bases_above = zip(*sorted(cdf.items()))
    ax2.plot(lengths, bases_above, "-", color="darkslategray")
    ylabel2 = "{0} above read length".format(tag)
    ax2.set_ylabel(ylabel2)

    for ax in (ax1, ax2):
        set_ticklabels_helvetica(ax)
        ax.set_xlim((xmin - width / 2, xmax + width / 2))

    # Summary lines: echoed to stderr and stacked in the top-right corner.
    tc = "gray"
    axt = ax1.transAxes
    xx, yy = 0.95, 0.95
    messages = (
        "Total bases: {0}".format(hsize),
        "Total reads: {0}".format(thousands(len(sizes))),
        "Average read length: {0}bp".format(thousands(np.mean(all_sizes))),
        "Median read length: {0}bp".format(thousands(np.median(all_sizes))),
        "N50 read length: {0}bp".format(thousands(l50)),
    )
    for t in messages:
        print(t, file=sys.stderr)
        ax1.text(xx, yy, t, color=tc, transform=axt, ha="right")
        yy -= 0.05

    ax1.set_title(markup(opts.title))
    # Seaborn removes ticks for all styles except 'ticks'. Now add them back:
    ax1.tick_params(
        axis="x",
        direction="out",
        length=3,
        left=False,
        right=False,
        top=False,
        bottom=True,
    )
    ax1.xaxis.set_major_formatter(human_base_formatter)
    ax1.yaxis.set_major_formatter(human_formatter)
    figname = sizes.filename + ".pdf"
    savefig(figname)
Exemple #26
0
def histogram(args):
    """
    %prog histogram [reads.fasta|reads.fastq]

    Plot read length distribution for reads. The plot would be similar to the
    one generated by SMRT-portal, for example:

    http://blog.pacificbiosciences.com/2013/10/data-release-long-read-shotgun.html

    Plot has two axes - corresponding to pdf and cdf, respectively.  Also adding
    number of reads, average/median, N50, and total length.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it must stay in its "%prog ..." form.
    from jcvi.utils.cbook import human_size, thousands, SUFFIXES
    from jcvi.formats.fastq import fasta
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.graphics.base import (plt, markup, human_formatter,
                                    human_base_formatter, savefig, set2,
                                    set_ticklabels_helvetica)

    p = OptionParser(histogram.__doc__)
    p.set_histogram(vmax=50000, bins=100, xlabel="Read length",
                    title="Read length distribution")
    p.add_option("--ylabel1", default="Counts",
                 help="Label of y-axis on the left")
    p.add_option("--color", default='0', choices=[str(x) for x in range(8)],
                 help="Color of bars, which is an index 0-7 in brewer set2")
    opts, args, iopts = p.set_image_options(args, figsize="6x6", style="dark")

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    # fasta() hands back two values; only the first (the sequence file) is
    # used from here on.
    fastafile, _ = fasta([fastafile, "--seqtk"])
    sizes = Sizes(fastafile)
    all_sizes = sorted(sizes.sizes)
    if not all_sizes:
        # Nothing to plot: bail out with a readable message instead of the
        # unpacking error the cumulative curve below would raise.
        sys.exit("No reads found in {0}".format(fastafile))

    xmin, xmax, bins = opts.vmin, opts.vmax, opts.bins
    left, height = stem_leaf_plot(all_sizes, xmin, xmax, bins)

    plt.figure(1, (iopts.w, iopts.h))
    ax1 = plt.gca()

    # Left axis: bar chart of counts, bars half as wide as a bin.
    width = (xmax - xmin) * .5 / bins
    color = set2[int(opts.color)]
    ax1.bar(left, height, width=width, linewidth=0, fc=color, align="center")
    ax1.set_xlabel(markup(opts.xlabel))
    ax1.set_ylabel(opts.ylabel1)

    # Right axis: total bases contained in reads at or above each length.
    ax2 = ax1.twinx()
    total_size, l50, _ = sizes.summary
    hsize = human_size(total_size)
    # NOTE(review): assumes human_size() ends with a two-character suffix
    # that is listed in SUFFIXES[1000]; confirm for very small totals.
    tag = hsize[-2:]
    unit = 1000 ** SUFFIXES[1000].index(tag)

    cur_size = 0
    cdf = {}
    for size in all_sizes:
        # Record the value only the first time a given length is seen, i.e.
        # before any read of that length has been subtracted.
        if size not in cdf:
            cdf[size] = (total_size - cur_size) * 1. / unit
        cur_size += size
    lengths, bases_above = zip(*sorted(cdf.items()))
    ax2.plot(lengths, bases_above, '-', color="darkslategray")
    ylabel2 = "{0} above read length".format(tag)
    ax2.set_ylabel(ylabel2)

    for ax in (ax1, ax2):
        set_ticklabels_helvetica(ax)
        ax.set_xlim((xmin - width / 2, xmax + width / 2))

    # Summary lines: echoed to stderr and stacked in the top-right corner.
    tc = "gray"
    axt = ax1.transAxes
    xx, yy = .95, .95
    messages = (
        "Total bases: {0}".format(hsize),
        "Total reads: {0}".format(thousands(len(sizes))),
        "Average read length: {0}bp".format(thousands(np.mean(all_sizes))),
        "Median read length: {0}bp".format(thousands(np.median(all_sizes))),
        "N50 read length: {0}bp".format(thousands(l50)),
    )
    for t in messages:
        # "print >> sys.stderr, t" only works as a Python 2 statement (it is
        # a TypeError on Python 3); a direct write behaves the same on both.
        sys.stderr.write(t + "\n")
        ax1.text(xx, yy, t, color=tc, transform=axt, ha="right")
        yy -= .05

    ax1.set_title(markup(opts.title))
    # Seaborn removes ticks for all styles except 'ticks'. Now add them back:
    ax1.tick_params(axis="x", direction="out", length=3,
                    left=False, right=False, top=False, bottom=True)
    ax1.xaxis.set_major_formatter(human_base_formatter)
    ax1.yaxis.set_major_formatter(human_formatter)
    figname = sizes.filename + ".pdf"
    savefig(figname)
Exemple #27
0
    def __init__(self, ax, ext, layout, bed, scale, switch=None,
                 chr_label=True, pad=.04, vpad=.012, extra_features=None):
        """Place one region on `ax`: backbone line, gene glyphs, optional
        extra features and the two text labels.

        `ext` unpacks to (start, end, si, ei, chr, orientation, span) and the
        genes of the region are `bed[si:ei + 1]`.  `scale` is divided by
        `layout.ratio` and then used to turn position offsets into canvas
        units.  Besides drawing, this records `self.y`, `self.xstart`,
        `self.xend`, `self.genes` and `self.gg`, which maps each gene
        accession to the (a, b) pair returned by `self.get_coordinates`.
        """
        cx, cy = layout.x, layout.y
        scale /= layout.ratio
        self.y = cy
        angle = layout.rotation
        # Rotate about the region centre first, then map into axes space.
        rotated = mpl.transforms.Affine2D().rotate_deg_around(cx, cy, angle) \
            + ax.transAxes
        to_axes = ax.transAxes.inverted()

        first, last, si, ei, chr, orientation, span = ext
        half = span / scale / 2
        left, right = cx - half, cx + half
        self.xstart, self.xend = left, right

        hidden = layout.hidden

        # Backbone of the region
        if not hidden:
            ax.plot((left, right), (cy, cy), color="gray", transform=rotated,
                    lw=2, zorder=1)

        genes = bed[si: ei + 1]
        self.genes = genes
        bp_from, bp_to = first.start, last.end
        flipped = orientation == '-'
        if flipped:
            bp_from, bp_to = bp_to, bp_from

        def to_canvas(pos):
            # Distance from the (possibly flipped) region start, in canvas units
            return left + abs(pos - bp_from) / scale

        if switch:
            chr = switch.get(chr, chr)
        span_label = "-".join((human_size(bp_from, target="Mb")[:-2],
                               human_size(bp_to, target="Mb")))

        height = .012
        self.gg = {}
        # Gene glyphs
        for gene in genes:
            gstart, gend = gene.start, gene.end
            strand = gene.strand
            if strand == '-':
                gstart, gend = gend, gstart
            if flipped:
                # A reversed region shows every gene on the opposite strand
                strand = "+" if strand == "-" else "-"

            x1, x2, a, b = self.get_coordinates(gstart, gend, cy, to_canvas,
                                                rotated, to_axes)
            self.gg[gene.accn] = (a, b)

            color = forward if strand == "+" else backward
            if hidden:
                continue
            glyph = Glyph(ax, x1, x2, cy, height, gradient=False, fc=color,
                          zorder=3)
            glyph.set_transform(rotated)

        # Extra features (like repeats), drawn slightly thinner than genes
        for feat in extra_features or ():
            gstart, gend = feat.start, feat.end
            x1, x2, a, b = self.get_coordinates(gstart, gend, cy, to_canvas,
                                                rotated, to_axes)
            glyph = Glyph(ax, x1, x2, cy, height * 3 / 4, gradient=False,
                          fc='#ff7f00', zorder=2)
            glyph.set_transform(rotated)

        ha, va = layout.ha, layout.va

        # A label requested on one side is anchored just outside that end of
        # the track, hence the mirrored text alignment.
        hpad = .02
        if ha == "left":
            anchor_x, ha = left - hpad, "right"
        elif ha == "right":
            anchor_x, ha = right + hpad, "left"
        else:
            anchor_x, ha = cx, "center"

        # Tentative solution to labels stick into glyph: widen the vertical
        # offset once the rotation exceeds `magic` degrees.
        magic = 40.
        tilt = abs(angle)
        if tilt > magic:
            cc = tilt / magic
        else:
            cc = 1
        if va == "top":
            anchor_y = cy + cc * pad
        elif va == "bottom":
            anchor_y = cy - cc * pad - .01
        else:
            anchor_y = cy

        anchor = np.array((anchor_x, anchor_y))
        trans_angle = ax.transAxes.transform_angles(
            np.array((angle, )), anchor.reshape((1, 2)))[0]
        lx, ly = anchor
        if hidden or not chr_label:
            return

        # Chromosome name above, coordinate range below, on a shared backdrop
        bbox = dict(boxstyle="round", fc='w', ec='w', alpha=.5)
        shared = dict(ha=ha, va="center", rotation=trans_angle,
                      bbox=bbox, zorder=10)
        ax.text(lx, ly + vpad, markup(chr), color=layout.color, **shared)
        ax.text(lx, ly - vpad, span_label, color="lightslategrey", size=10,
                **shared)
Exemple #28
0
    def __init__(self, ax, ext, layout, bed, scale, switch=None, chr_label=True,
                 pad=.04, vpad=.012):
        """Place one region on `ax`: backbone line, gene glyphs and the two
        text labels.

        `ext` unpacks to (start, end, si, ei, chr, orientation, span) and the
        genes of the region are `bed[si:ei + 1]`.  `scale` is divided by
        `layout.ratio` and then used to turn position offsets into canvas
        units.  Besides drawing, this records `self.y`, `self.xstart`,
        `self.xend`, `self.genes` and `self.gg`, which maps each gene
        accession to its two end points after the rotation has been applied
        and `ax.transAxes` has been undone again.
        """
        cx, cy = layout.x, layout.y
        scale /= layout.ratio
        self.y = cy
        angle = layout.rotation
        # Rotate about the region centre first, then map into axes space.
        rotated = Affine2D().rotate_deg_around(cx, cy, angle) + ax.transAxes
        to_axes = ax.transAxes.inverted()

        first, last, si, ei, chr, orientation, span = ext
        half = span / scale / 2
        left, right = cx - half, cx + half
        self.xstart, self.xend = left, right

        hidden = layout.hidden

        # Backbone of the region
        if not hidden:
            ax.plot((left, right), (cy, cy), color="gray", transform=rotated,
                    lw=2, zorder=1)

        genes = bed[si: ei + 1]
        self.genes = genes
        bp_from, bp_to = first.start, last.end
        flipped = orientation == '-'
        if flipped:
            bp_from, bp_to = bp_to, bp_from

        def to_canvas(pos):
            # Distance from the (possibly flipped) region start, in canvas units
            return left + abs(pos - bp_from) / scale

        if switch:
            chr = switch.get(chr, chr)
        span_label = "-".join((human_size(bp_from, target="Mb")[:-2],
                               human_size(bp_to, target="Mb")))

        height = .012
        self.gg = {}
        # Gene glyphs
        for gene in genes:
            gstart, gend = gene.start, gene.end
            strand = gene.strand
            if strand == '-':
                gstart, gend = gend, gstart
            if flipped:
                # A reversed region shows every gene on the opposite strand
                strand = "+" if strand == "-" else "-"

            x1, x2 = to_canvas(gstart), to_canvas(gend)
            # Push both ends through the rotation, then back out of
            # ax.transAxes.
            ends = [rotated.transform((px, cy)) for px in (x1, x2)]
            a, b = [to_axes.transform(pt) for pt in ends]
            self.gg[gene.accn] = (a, b)

            color = "b" if strand == "+" else "g"
            if hidden:
                continue
            glyph = Glyph(ax, x1, x2, cy, height, gradient=False, fc=color,
                          zorder=3)
            glyph.set_transform(rotated)

        ha, va = layout.ha, layout.va

        # A label requested on one side is anchored just outside that end of
        # the track, hence the mirrored text alignment.
        hpad = .02
        if ha == "left":
            anchor_x, ha = left - hpad, "right"
        elif ha == "right":
            anchor_x, ha = right + hpad, "left"
        else:
            anchor_x, ha = cx, "center"

        # Tentative solution to labels stick into glyph: widen the vertical
        # offset once the rotation exceeds `magic` degrees.
        magic = 40.
        tilt = abs(angle)
        if tilt > magic:
            cc = tilt / magic
        else:
            cc = 1
        if va == "top":
            anchor_y = cy + cc * pad
        elif va == "bottom":
            anchor_y = cy - cc * pad
        else:
            anchor_y = cy

        anchor = np.array((anchor_x, anchor_y))
        trans_angle = ax.transAxes.transform_angles(
            np.array((angle, )), anchor.reshape((1, 2)))[0]
        lx, ly = anchor
        if hidden or not chr_label:
            return

        # Chromosome name above, coordinate range below
        shared = dict(ha=ha, va="center", rotation=trans_angle)
        ax.text(lx, ly + vpad, markup(chr), color=layout.color, **shared)
        ax.text(lx, ly - vpad, span_label, color="k", **shared)
Exemple #29
0
def jellyfish(args):
    """
    %prog jellyfish [*.fastq|*.fasta]

    Run jellyfish to dump histogram to be used in kmer.histogram().
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it must stay in its "%prog ..." form.
    from jcvi.apps.base import getfilesize
    from jcvi.utils.cbook import human_size

    p = OptionParser(jellyfish.__doc__)
    p.add_option("-K", default=23, type="int", help="K-mer size")
    p.add_option(
        "--coverage",
        default=40,
        type="int",
        help="Expected sequence coverage",
    )
    p.add_option("--prefix", default="jf", help="Database prefix")
    p.add_option(
        "--nohist",
        default=False,
        action="store_true",
        help="Do not print histogram",
    )
    p.set_home("jellyfish")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = args
    K = opts.K
    coverage = opts.coverage

    totalfilesize = sum(getfilesize(x) for x in fastqfiles)
    fq = fastqfiles[0]
    pf = opts.prefix
    # Only the first input decides whether the inputs are piped through gzip.
    gzip = fq.endswith(".gz")

    # Floor division keeps the hash size a whole number.  A plain "/" is true
    # division on Python 3 and would put a float such as "2500.5" after "-s"
    # on the jellyfish command line.
    hashsize = int(totalfilesize // coverage)
    logging.debug("Total file size: {0}, hashsize (-s): {1}".format(
        human_size(totalfilesize, a_kilobyte_is_1024_bytes=True), hashsize))

    jfpf = "{0}-K{1}".format(pf, K)
    jfdb = jfpf
    fastqfiles = " ".join(fastqfiles)

    # Step 1: count k-mers into the database file.
    jfcmd = op.join(opts.jellyfish_home, "jellyfish")
    cmd = jfcmd
    cmd += " count -t {0} -C -o {1}".format(opts.cpus, jfpf)
    cmd += " -s {0} -m {1}".format(hashsize, K)
    if gzip:
        # Decompress on the fly and let jellyfish read from stdin
        cmd = "gzip -dc {0} | ".format(fastqfiles) + cmd + " /dev/fd/0"
    else:
        cmd += " " + fastqfiles

    # NOTE(review): fastqfiles is a single space-joined string at this point;
    # confirm need_update() copes with that when several inputs are given.
    if need_update(fastqfiles, jfdb):
        sh(cmd)

    if opts.nohist:
        return

    # Step 2: dump the k-mer histogram from the database.
    jfhisto = jfpf + ".histogram"
    cmd = jfcmd + " histo -t 64 {0} -o {1}".format(jfdb, jfhisto)

    if need_update(jfdb, jfhisto):
        sh(cmd)
Exemple #30
0
def plot(args):
    """
    %prog plot input.bed seqid

    Plot the matchings between the reconstructed pseudomolecules and the maps.
    Two types of visualizations are available in one canvas:

    1. Parallel axes, and matching markers are shown in connecting lines;
    2. Scatter plot.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # Overview: reads "<prefix>.lifted.bed" and "<prefix>.agp" (prefix being
    # input.bed minus its last extension), draws the parallel-coordinates
    # panel on the left half of the canvas and one scatter plot per linkage
    # group on the right half, then saves the figure as "<seqid>.<format>".
    from jcvi.graphics.base import plt, savefig, normalize_axes, \
                set2, panel_labels
    from jcvi.graphics.chromosome import Chromosome, GeneticMap, \
                HorizontalChromosome

    p = OptionParser(plot.__doc__)
    add_allmaps_plot_options(p)
    opts, args, iopts = p.set_image_options(args, figsize="10x6")

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, seqid = args
    pf = inputbed.rsplit(".", 1)[0]
    bedfile = pf + ".lifted.bed"
    agpfile = pf + ".agp"
    weightsfile = opts.weightsfile
    links = opts.links

    # `function` extracts the map value of a marker; it is chosen by
    # opts.distance and reused for every marker below.
    function = get_function(opts.distance)
    cc = Map(bedfile, function)
    allseqids = cc.seqids
    mapnames = cc.mapnames
    weights = Weights(weightsfile, mapnames)
    assert seqid in allseqids, "{0} not in {1}".format(seqid, allseqids)

    s = Scaffold(seqid, cc)
    # Keep only linkage groups whose count in s.mlg_counts reaches opts.links
    mlgs = [k for k, v in s.mlg_counts.items() if v >= links]
    # Size of a linkage group = largest map value among its markers
    mlgsizes = {}
    for mlg in mlgs:
        mm = cc.extract_mlg(mlg)
        mlgsize = max(function(x) for x in mm)
        mlgsizes[mlg] = mlgsize

    # root spans the whole canvas; ax1 is the left half, ax2 the right half
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    ax1 = fig.add_axes([0, 0, .5, 1])
    ax2 = fig.add_axes([.5, 0, .5, 1])

    # Find the layout first
    ystart, ystop = .9, .1
    L = Layout(mlgsizes)
    coords = L.coords

    tip = .02
    marker_pos = {}
    # Palette
    # One set2 colour per map name; each linkage group then takes the colour
    # of the map named by the part of its id before the first "-".
    colors = dict((mapname, set2[i]) for i, mapname in enumerate(mapnames))
    colors = dict((mlg, colors[mlg.split("-")[0]]) for mlg in mlgs)

    rhos = {}
    # Parallel coordinates
    for mlg, (x, y1, y2) in coords.items():
        mm = cc.extract_mlg(mlg)
        markers = [(m.accn, function(m)) for m in mm]  # exhaustive marker list
        # Only markers located on this seqid feed the correlation
        xy = [(m.pos, function(m)) for m in mm if m.seqid == seqid]
        mx, my = zip(*xy)
        rho = spearmanr(mx, my)
        rhos[mlg] = rho
        # A negative correlation draws the map flipped
        flip = rho < 0

        g = GeneticMap(ax1, x, y1, y2, markers, tip=tip, flip=flip)
        # Put the label on the outer side: left of maps drawn at x < .5,
        # right of the others.
        extra = -3 * tip if x < .5 else 3 * tip
        ha = "right" if x < .5 else "left"
        mapname = mlg.split("-")[0]
        tlg = mlg.replace("_", ".")  # Latex does not like underscore char
        label = "{0} (w={1})".format(tlg, weights[mapname])
        ax1.text(x + extra, (y1 + y2) / 2,
                 label,
                 color=colors[mlg],
                 ha=ha,
                 va="center",
                 rotation=90)
        marker_pos.update(g.marker_pos)

    # Restrict the AGP records to this seqid; its size is the largest
    # object_end among them.
    agp = AGP(agpfile)
    agp = [x for x in agp if x.object == seqid]
    chrsize = max(x.object_end for x in agp)

    # Pseudomolecules in the center
    r = ystart - ystop
    ratio = r / chrsize
    # Position -> y coordinate: 0 maps to ystart, chrsize maps to ystop
    f = lambda x: (ystart - ratio * x)
    patchstart = [f(x.object_beg) for x in agp if not x.is_gap]
    Chromosome(ax1, .5, ystart, ystop, width=2 * tip, patch=patchstart, lw=2)

    label = "{0} ({1})".format(seqid, human_size(chrsize, precision=0))
    ax1.text(.5, ystart + tip, label, ha="center")

    scatter_data = defaultdict(list)
    # Connecting lines
    for b in s.markers:
        marker_name = b.accn
        # Markers that were not placed on any drawn map are skipped
        if marker_name not in marker_pos:
            continue

        # (cx, cy): end of the line on the central chromosome;
        # (mx, my): end of the line on the marker's map.
        cx = .5
        cy = f(b.pos)
        mx = coords[b.mlg][0]
        my = marker_pos[marker_name]

        extra = -tip if mx < cx else tip
        extra *= 1.25  # leave boundaries for aesthetic reasons
        cx += extra
        mx -= extra
        ax1.plot((cx, mx), (cy, my), "-", color=colors[b.mlg])
        scatter_data[b.mlg].append((b.pos, function(b)))

    # Scatter plot, same data as parallel coordinates
    xstart, xstop = sorted((ystart, ystop))
    # `f` is rebound here: position -> x coordinate, 0 maps to xstart and
    # chrsize maps to xstop (`ratio` is still r / chrsize at this point).
    f = lambda x: (xstart + ratio * x)
    pp = [x.object_beg for x in agp if not x.is_gap]
    patchstart = [f(x) for x in pp]
    HorizontalChromosome(ax2,
                         xstart,
                         xstop,
                         ystop,
                         height=2 * tip,
                         patch=patchstart,
                         lw=2)

    gap = .03
    # `ratio` is rebound as well: canvas height per unit of linkage-group
    # size, after reserving one `gap` per group plus `tip`.
    ratio = (r - gap * len(mlgs) - tip) / sum(mlgsizes.values())

    tlgs = []
    # Stack one scatter axes per linkage group, walking ystart downwards
    for mlg, mlgsize in sorted(mlgsizes.items()):
        height = ratio * mlgsize
        ystart -= height
        # Left edge and width in figure coordinates: the [xstart, xstop]
        # range squeezed into the right half of the canvas.
        xx = .5 + xstart / 2
        width = r / 2
        color = colors[mlg]
        ax = fig.add_axes([xx, ystart, width, height])
        ypos = ystart + height / 2
        ystart -= gap
        sd = scatter_data[mlg]
        # NOTE: `xx` is reused from here on for the scatter x values
        xx, yy = zip(*sd)
        # Vertical guide lines at the start of every non-gap AGP component
        ax.vlines(pp, 0, mlgsize, colors="beige")
        ax.plot(xx, yy, ".", color=color)
        rho = rhos[mlg]
        ax.text(.5,
                1 - .4 * gap / height,
                r"$\rho$={0:.3f}".format(rho),
                ha="center",
                va="top",
                transform=ax.transAxes,
                color="gray")
        tlg = mlg.replace("_", ".")
        tlgs.append((tlg, ypos, color))
        ax.set_xlim(0, chrsize)
        ax.set_ylim(0, mlgsize)
        ax.set_xticks([])
        # Drop every other tick until each tick has at least .03 of canvas
        # height, or fewer than two ticks remain.
        while height / len(ax.get_yticks()) < .03 and len(
                ax.get_yticks()) >= 2:
            ax.set_yticks(ax.get_yticks()[::2])  # Sparsify the ticks
        yticklabels = [int(x) for x in ax.get_yticks()]
        ax.set_yticklabels(yticklabels, family='Helvetica')
        # Mirror the flip applied to the map in the parallel-coordinates panel
        if rho < 0:
            ax.invert_yaxis()

    # Linkage-group names down the middle of the canvas; with more than four
    # groups the alignment alternates between "left" and "right".
    for i, (tlg, ypos, color) in enumerate(tlgs):
        ha = "center"
        if len(tlgs) > 4:
            ha = "right" if i % 2 else "left"
        root.text(.5, ypos, tlg, color=color, rotation=90, ha=ha, va="center")

    if opts.panels:
        labels = ((.04, .96, 'A'), (.48, .96, 'B'))
        panel_labels(root, labels)

    normalize_axes((ax1, ax2, root))
    image_name = seqid + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    plt.close(fig)
Exemple #31
0
    def __init__(
        self,
        ax,
        ext,
        layout,
        bed,
        scale,
        switch=None,
        chr_label=True,
        loc_label=True,
        genelabelsize=0,
        pad=0.05,
        vpad=0.015,
        extra_features=None,
        glyphstyle="box",
        glyphcolor: BasePalette = OrientationPalette(),
    ):
        """Place one region on `ax`: backbone line, gene glyphs (optionally
        named), extra features and the chromosome / location labels.

        `ext` unpacks to (start, end, si, ei, chr, orientation, span) and the
        genes of the region are `bed[si:ei + 1]`.  `scale` is divided by
        `layout.ratio` and then used to turn position offsets into canvas
        units.  Glyph colour and z-order come from `glyphcolor`, which is
        queried by strand when it is an `OrientationPalette` and by gene name
        otherwise.  Besides drawing, this records `self.y`, `self.xstart`,
        `self.xend`, `self.genes` and `self.gg`, which maps each gene
        accession to the (a, b) pair returned by `self.get_coordinates`.

        The location label is only drawn together with the chromosome label;
        with `chr_label` off no label is drawn at all.
        """
        cx, cy = layout.x, layout.y
        scale /= layout.ratio
        self.y = cy
        angle = layout.rotation
        # Rotate about the region centre first, then map into axes space.
        rotated = (
            mpl.transforms.Affine2D().rotate_deg_around(cx, cy, angle) + ax.transAxes
        )
        to_axes = ax.transAxes.inverted()

        first, last, si, ei, chr, orientation, span = ext
        half = span / scale / 2
        left, right = cx - half, cx + half
        self.xstart, self.xend = left, right

        hidden = layout.hidden

        # Backbone of the region
        if not hidden:
            ax.plot(
                (left, right), (cy, cy), color="gray", transform=rotated, lw=2, zorder=1
            )

        genes = bed[si : ei + 1]
        self.genes = genes
        bp_from, bp_to = first.start, last.end
        flipped = orientation == "-"
        if flipped:
            bp_from, bp_to = bp_to, bp_from

        def to_canvas(pos):
            # Distance from the (possibly flipped) region start, in canvas units
            return left + abs(pos - bp_from) / scale

        if switch:
            chr = switch.get(chr, chr)
        # An explicit layout label wins over the (possibly switched) name
        if layout.label:
            chr = layout.label

        range_from = human_size(bp_from, target="Mb", precision=2)[:-2]
        range_to = human_size(bp_to, target="Mb", precision=2)
        label = "-".join((range_from, range_to))

        height = 0.012
        self.gg = {}
        by_strand = isinstance(glyphcolor, OrientationPalette)
        # Gene glyphs
        for gene in genes:
            gstart, gend = gene.start, gene.end
            strand = gene.strand
            if strand == "-":
                gstart, gend = gend, gstart
            if flipped:
                # A reversed region shows every gene on the opposite strand
                strand = "+" if strand == "-" else "-"

            x1, x2, a, b = self.get_coordinates(
                gstart, gend, cy, to_canvas, rotated, to_axes
            )
            gene_name = gene.accn
            self.gg[gene_name] = (a, b)

            palette_key = strand if by_strand else gene_name
            color, zorder = glyphcolor.get_color_and_zorder(palette_key)

            if not hidden:
                glyph = Glyph(
                    ax,
                    x1,
                    x2,
                    cy,
                    height,
                    gradient=False,
                    fc=color,
                    style=glyphstyle,
                    zorder=zorder,
                )
                glyph.set_transform(rotated)
                if genelabelsize:
                    # Slanted gene name just above the glyph
                    ax.text(
                        (x1 + x2) / 2,
                        cy + height / 2 + genelabelsize * vpad / 3,
                        markup(gene_name),
                        size=genelabelsize,
                        rotation=25,
                        ha="left",
                        va="center",
                        color="lightslategray",
                    )

        # Extra features (like repeats), drawn slightly thinner than genes
        for feat in extra_features or ():
            gstart, gend = feat.start, feat.end
            x1, x2, a, b = self.get_coordinates(
                gstart, gend, cy, to_canvas, rotated, to_axes
            )
            glyph = Glyph(
                ax,
                x1,
                x2,
                cy,
                height * 3 / 4,
                gradient=False,
                fc="#ff7f00",
                style=glyphstyle,
                zorder=2,
            )
            glyph.set_transform(rotated)

        ha, va = layout.ha, layout.va

        # A label requested on one side is anchored just outside that end of
        # the track, hence the mirrored text alignment.
        hpad = 0.02
        if ha == "left":
            anchor_x, ha = left - hpad, "right"
        elif ha == "right":
            anchor_x, ha = right + hpad, "left"
        else:
            anchor_x, ha = cx, "center"

        # Tentative solution to labels stick into glyph: widen the vertical
        # offset once the rotation exceeds `magic` degrees.
        magic = 40.0
        tilt = abs(angle)
        if tilt > magic:
            cc = tilt / magic
        else:
            cc = 1
        if va == "top":
            anchor_y = cy + cc * pad
        elif va == "bottom":
            anchor_y = cy - cc * pad
        else:
            anchor_y = cy

        anchor = np.array((anchor_x, anchor_y))
        trans_angle = ax.transAxes.transform_angles(
            np.array((angle,)), anchor.reshape((1, 2))
        )[0]
        lx, ly = anchor
        if hidden:
            return

        text_style = {
            "ha": ha,
            "va": "center",
            "rotation": trans_angle,
            "bbox": dict(boxstyle="round", fc="w", ec="w", alpha=0.5),
            "zorder": 10,
        }

        # TODO: I spent several hours on trying to make this work - with no
        # good solutions. To generate labels on multiple lines, each line
        # with a different style is difficult in matplotlib. The only way,
        # if you can tolerate an extra dot (.), is to use the recipe below.
        # chr_label = r"\noindent " + markup(chr) + r" \\ ." if chr_label else None
        # loc_label = r"\noindent . \\ " + label if loc_label else None

        chr_text = markup(chr) if chr_label else None
        loc_text = label if loc_label else None
        if not chr_text:
            return
        if not loc_text:
            # Chromosome name alone sits exactly on the anchor
            ax.text(lx, ly, chr_text, color=layout.color, **text_style)
            return
        # Both labels: name above the anchor, location below it
        ax.text(lx, ly + vpad, chr_text, color=layout.color, **text_style)
        ax.text(
            lx,
            ly - vpad,
            loc_text,
            color="lightslategrey",
            size=10,
            **text_style
        )
Exemple #32
0
def simple(args):
    """
    %prog simple anchorfile --qbed=qbedfile --sbed=sbedfile [options]

    Write the block ends for each block in the anchorfile.
    GeneA1    GeneA2    GeneB1    GeneB2   +/-      score

    Optional additional columns:
    orderA1   orderA2   orderB1   orderB2  sizeA    sizeB   size    block_id

    With base coordinates (--coords):
    block_id  seqidA    startA    endA     bpSpanA  GeneA1   GeneA2  geneSpanA
    block_id  seqidB    startB    endB     bpSpanB  GeneB1   GeneB2  geneSpanB
    """
    # The docstring above is also the usage text handed to OptionParser, so its
    # wording is part of the program's output and is left untouched.
    #
    # `args` is the list of command-line tokens. The function writes
    # `<anchorfile minus last extension>.simple` (one or two tab-separated
    # rows per block) and, with --bed, `<simplefile>.bed`. It returns None.
    p = OptionParser(simple.__doc__)
    p.add_option("--rich", default=False, action="store_true",
                 help="Output additional columns [default: %default]")
    p.add_option("--coords", default=False, action="store_true",
                 help="Output columns with base coordinates [default: %default]")
    p.add_option("--bed", default=False, action="store_true",
                 help="Generate BED file for the blocks")
    p.add_option("--noheader", default=False, action="store_true",
                 help="Don't output header [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    additional = opts.rich
    coords = opts.coords
    header = not opts.noheader
    bed = opts.bed
    if bed:
        # BED records are built from base coordinates, so --bed forces --coords
        coords = True
        bbed = Bed()

    ac = AnchorFile(anchorfile)
    simplefile = anchorfile.rsplit(".", 1)[0] + ".simple"

    qbed, sbed, qorder, sorder, _ = check_beds(anchorfile, p, opts)
    pf = "-".join(anchorfile.split(".", 2)[:2])
    blocks = ac.blocks

    if coords:
        h = "Block|Chr|Start|End|Span|StartGene|EndGene|GeneSpan|Orientation"
    else:
        # NOTE(review): this header names Orientation before Score, while the
        # rows written below carry score before orientation. Left as is because
        # readers of the file may depend on either one - confirm which is meant.
        h = "StartGeneA|EndGeneA|StartGeneB|EndGeneB|Orientation|Score"
        if additional:
            h += "|StartOrderA|EndOrderA|StartOrderB|EndOrderB|" \
                 "SizeA|SizeB|Size|Block"

    nblocks = 0  # stays 0 when the anchor file has no blocks
    atotalbase = btotalbase = 0
    # `with` guarantees the output is flushed and closed even if a block fails
    with open(simplefile, "w") as fws:
        if header:
            fws.write("\t".join(h.split("|")) + "\n")

        for i, block in enumerate(blocks):
            nblocks = i + 1

            a, b, _ = zip(*block)
            # Map gene names to (rank, bed line) pairs
            a = [qorder[x] for x in a]
            b = [sorder[x] for x in b]
            ia, _ = zip(*a)
            ib, _ = zip(*b)

            astarti, aendi = min(ia), max(ia)
            bstarti, bendi = min(ib), max(ib)
            astart, aend = min(a)[1].accn, max(a)[1].accn
            bstart, bend = min(b)[1].accn, max(b)[1].accn

            sizeA = len(set(ia))
            sizeB = len(set(ib))
            size = len(block)

            # Sign of the fitted slope of rank-vs-rank gives the orientation
            slope, _ = np.polyfit(ia, ib, 1)
            orientation = "+" if slope >= 0 else '-'
            aspan = aendi - astarti + 1
            bspan = bendi - bstarti + 1
            # Score is the geometric mean of the two gene-rank spans
            score = str(int((aspan * bspan) ** .5))
            block_id = pf + "-block-{0}".format(i)

            if coords:
                aseqid, astartbase, aendbase = \
                    get_boundary_bases(astart, aend, qorder)
                bseqid, bstartbase, bendbase = \
                    get_boundary_bases(bstart, bend, sorder)
                abase = aendbase - astartbase + 1
                bbase = bendbase - bstartbase + 1
                atotalbase += abase
                btotalbase += bbase

                # Write dual lines
                aargs = [block_id, aseqid, astartbase, aendbase,
                         abase, astart, aend, aspan, "+"]
                bargs = [block_id, bseqid, bstartbase, bendbase,
                         bbase, bstart, bend, bspan, orientation]

                if bed:
                    bedfields = (bseqid, bstartbase - 1, bendbase,
                                 "{}:{}-{}".format(aseqid, astartbase, aendbase),
                                 size, orientation)
                    bbed.append(BedLine("\t".join(str(x) for x in bedfields)))

                for row in (aargs, bargs):
                    fws.write("\t".join(str(x) for x in row) + "\n")
                continue

            row = [astart, aend, bstart, bend, score, orientation]
            if additional:
                row += [astarti, aendi, bstarti, bendi,
                        sizeA, sizeB, size, block_id]
            fws.write("\t".join(str(x) for x in row) + "\n")

    logging.debug("A total of {0} blocks written to `{1}`.".format(nblocks, simplefile))

    if coords:
        sys.stderr.write("Total block span in {0}: {1}\n".format(
            qbed.filename, human_size(atotalbase, precision=2)))
        sys.stderr.write("Total block span in {0}: {1}\n".format(
            sbed.filename, human_size(btotalbase, precision=2)))
        smaller = min(atotalbase, btotalbase)
        if smaller > 0:
            # Skip the ratio when one side has zero span (e.g. no blocks)
            sys.stderr.write("Ratio: {0:.1f}x\n".format(
                max(atotalbase, btotalbase) * 1. / smaller))

    if bed:
        bedfile = simplefile + ".bed"
        bbed.print_to_file(filename=bedfile, sorted=True)
        logging.debug("Bed file written to `{}`".format(bedfile))
Exemple #33
0
    def __init__(
        self,
        fig,
        root,
        datafile,
        bedfile,
        layoutfile,
        switch=None,
        tree=None,
        extra_features=None,
        chr_label=True,
        loc_label=True,
        genelabelsize=0,
        pad=0.05,
        vpad=0.015,
        scalebar=False,
        shadestyle="curve",
        glyphstyle="arrow",
        glyphcolor: BasePalette = OrientationPalette(),
    ):
        """
        Build one Region per column of the block file, connect them with
        Shade objects along the layout edges, and optionally add a scalebar
        and trees.

        Args:
            fig: only used to add one axes per tree when `tree` is given.
            root: axes handed to Region and Shade; the scalebar is drawn on it.
            datafile: path passed to BlockFile.
            bedfile: path passed to Bed; its `order` is used to get extents.
            layoutfile: path passed to Layout.
            switch: optional tab-delimited file loaded with DictFile and
                forwarded to every Region.
            tree: optional input for `read_trees`.
            extra_features: optional path passed to Bed; features overlapping
                each region are extracted and forwarded to the Region.
            chr_label, loc_label, genelabelsize, vpad, glyphstyle: forwarded
                unchanged to every Region.
            pad: base unit for the vertical offset of edges with `samearc`.
            scalebar: when true, draw a scalebar on `root`.
            shadestyle: forwarded to Shade as `style`.
            glyphcolor: either a palette object, used as given, or a string:
                "orientation" selects OrientationPalette, any other string
                selects OrthoGroupPalette built from the block file's grouper.

        Raises:
            ValueError: if a layout edge carries an unknown `samearc` value.
        """
        bed = Bed(bedfile)
        order = bed.order
        bf = BlockFile(datafile)
        self.layout = lo = Layout(layoutfile)
        switch = DictFile(switch, delimiter="\t") if switch else None
        if extra_features:
            extra_features = Bed(extra_features)

        exts = []
        extras = []
        for i in range(bf.ncols):
            ext = bf.get_extent(i, order)
            exts.append(ext)
            if not extra_features:
                continue
            start, end, _, _, seqid, _, span = ext
            ef = list(extra_features.extract(seqid, start.start, end.end))

            # Pruning removes minor features with < 0.1% of the region
            ef_pruned = [x for x in ef if x.span >= span / 1000]
            print(
                "Extracted {0} features "
                "({1} after pruning)".format(len(ef), len(ef_pruned)),
                file=sys.stderr,
            )
            extras.append(ef_pruned)

        # The last field of an extent is its span; the widest region takes
        # 0.65 of the canvas width and the others are scaled accordingly.
        maxspan = max(exts, key=lambda x: x[-1])[-1]
        scale = maxspan / 0.65

        self.gg = gg = {}
        self.rr = []
        ymids = []
        # Only translate palette *names*. A palette object (including the
        # declared default) used to fail the string comparison and was always
        # replaced by OrthoGroupPalette.
        if isinstance(glyphcolor, str):
            glyphcolor = (
                OrientationPalette()
                if glyphcolor == "orientation"
                else OrthoGroupPalette(bf.grouper())
            )
        for i, ext in enumerate(exts):
            ef = extras[i] if extras else None
            r = Region(
                root,
                ext,
                lo[i],
                bed,
                scale,
                switch,
                genelabelsize=genelabelsize,
                chr_label=chr_label,
                loc_label=loc_label,
                vpad=vpad,
                extra_features=ef,
                glyphstyle=glyphstyle,
                glyphcolor=glyphcolor,
            )
            self.rr.append(r)
            # Use tid and accn to store gene positions
            gg.update({(i, accn): pos for accn, pos in r.gg.items()})
            ymids.append(r.y)

        samearc_offsets = {
            "above": 2 * pad,
            "above2": 4 * pad,
            "below": -2 * pad,
            "below2": -4 * pad,
        }

        def offset(samearc):
            # Fail with a clear message instead of returning None, which
            # previously surfaced as a TypeError in the addition below.
            if samearc not in samearc_offsets:
                raise ValueError("Unknown samearc value: {!r}".format(samearc))
            return samearc_offsets[samearc]

        def edge_ymid(i, j, samearc):
            # With samearc the shade is anchored relative to track i only;
            # otherwise it sits halfway between the two tracks.
            if samearc is not None:
                return ymids[i] + offset(samearc)
            return (ymids[i] + ymids[j]) / 2

        for i, j, blockcolor, samearc in lo.edges:
            ymid = edge_ymid(i, j, samearc)
            for ga, gb, _ in bf.iter_pairs(i, j):
                a, b = gg[(i, ga)], gg[(j, gb)]
                Shade(root, a, b, ymid, fc=blockcolor, lw=0, alpha=1, style=shadestyle)

            for ga, gb, highlight in bf.iter_pairs(i, j, highlight=True):
                a, b = gg[(i, ga)], gg[(j, gb)]
                Shade(
                    root,
                    a,
                    b,
                    ymid,
                    alpha=1,
                    highlight=highlight,
                    zorder=2,
                    style=shadestyle,
                )

        if scalebar:
            print("Build scalebar (scale={})".format(scale), file=sys.stderr)
            # Find the best length of the scalebar
            ar = [1, 2, 5]
            candidates = (
                [1000 * x for x in ar]
                + [10000 * x for x in ar]
                + [100000 * x for x in ar]
            )
            # Find the one that's close to an optimal canvas size
            _, candidate = min((abs(x / scale - 0.12), x) for x in candidates)
            dist = candidate / scale
            x, y, yp = 0.22, 0.92, 0.005
            a, b = x - dist / 2, x + dist / 2
            lsg = "lightslategrey"
            root.plot([a, a], [y - yp, y + yp], "-", lw=2, color=lsg)
            root.plot([b, b], [y - yp, y + yp], "-", lw=2, color=lsg)
            root.plot([a, b], [y, y], "-", lw=2, color=lsg)
            root.text(
                x,
                y + 0.02,
                human_size(candidate, precision=0),
                ha="center",
                va="center",
            )

        if tree:
            from jcvi.graphics.tree import draw_tree, read_trees

            trees = read_trees(tree)
            ntrees = len(trees)
            logging.debug("A total of {0} trees imported.".format(ntrees))
            if ntrees:
                # Trees share the canvas width equally, below the lowest track
                xiv = 1.0 / ntrees
                yiv = 0.3
                xstart = 0
                ystart = min(ymids) - 0.4
                for label, outgroup, color, tx in trees:
                    ax = fig.add_axes([xstart, ystart, xiv, yiv])
                    draw_tree(
                        ax,
                        tx,
                        outgroup=outgroup,
                        rmargin=0.4,
                        leaffont=11,
                        treecolor=color,
                        supportcolor=color,
                        leafcolor=color,
                    )
                    xstart += xiv
                    RoundLabel(
                        ax, 0.5, 0.3, label, fill=True, fc="lavender", color=color
                    )
Exemple #34
0
    def __init__(self, fig, root, datafile, bedfile, layoutfile,
                 switch=None, tree=None, extra_features=None,
                 chr_label=True, loc_label=True, pad=.05, vpad=.015,
                 scalebar=False):
        """
        Build one Region per column of the block file, connect them with
        Shade objects along the layout edges, and optionally add a scalebar
        and trees.

        fig is only used to add one axes per tree; root is the axes handed to
        Region and Shade and the one the scalebar is drawn on. datafile,
        bedfile and layoutfile are passed to BlockFile, Bed and Layout.
        switch (optional) is loaded with DictFile and forwarded to every
        Region, as are chr_label, loc_label and vpad. extra_features
        (optional) is loaded with Bed and the features overlapping each region
        are forwarded to its Region. pad is accepted but not used in this
        constructor.
        """
        bed = Bed(bedfile)
        order = bed.order
        bf = BlockFile(datafile)
        self.layout = lo = Layout(layoutfile)
        switch = DictFile(switch, delimiter="\t") if switch else None
        if extra_features:
            extra_features = Bed(extra_features)

        exts = []
        extras = []
        # range/sys.stderr.write instead of xrange/print >> so the method runs
        # unchanged on both Python 2 and Python 3
        for i in range(bf.ncols):
            ext = bf.get_extent(i, order)
            exts.append(ext)
            if not extra_features:
                continue
            start, end, _, _, seqid, _, span = ext
            ef = list(extra_features.extract(seqid, start.start, end.end))

            # Pruning removes minor features with < 0.1% of the region
            ef_pruned = [x for x in ef if x.span >= span / 1000]
            sys.stderr.write("Extracted {0} features "
                             "({1} after pruning)\n".format(len(ef), len(ef_pruned)))
            extras.append(ef_pruned)

        # The last field of an extent is its span; the widest region takes
        # .65 of the canvas width and the others are scaled accordingly.
        maxspan = max(exts, key=lambda x: x[-1])[-1]
        scale = maxspan / .65

        self.gg = gg = {}
        self.rr = []
        ymids = []
        for i, ext in enumerate(exts):
            ef = extras[i] if extras else None
            r = Region(root, ext, lo[i], bed, scale, switch,
                       chr_label=chr_label, loc_label=loc_label,
                       vpad=vpad, extra_features=ef)
            self.rr.append(r)
            # Use tid and accn to store gene positions
            gg.update(dict(((i, k), v) for k, v in r.gg.items()))
            ymids.append(r.y)

        for i, j in lo.edges:
            # Shades are anchored halfway between the two tracks
            ymid = (ymids[i] + ymids[j]) / 2
            for ga, gb, _ in bf.iter_pairs(i, j):
                a, b = gg[(i, ga)], gg[(j, gb)]
                Shade(root, a, b, ymid, fc="gainsboro", lw=0, alpha=1)

            for ga, gb, highlight in bf.iter_pairs(i, j, highlight=True):
                a, b = gg[(i, ga)], gg[(j, gb)]
                Shade(root, a, b, ymid, alpha=1, highlight=highlight, zorder=2)

        if scalebar:
            sys.stderr.write("Build scalebar (scale={})\n".format(scale))
            # Find the best length of the scalebar
            ar = [1, 2, 5]
            candidates = [1000 * x for x in ar] + [10000 * x for x in ar] + \
                         [100000 * x for x in ar]
            # Find the one that's close to an optimal canvas size
            _, candidate = min((abs(x / scale - .12), x) for x in candidates)
            dist = candidate / scale
            x, y, yp = .2, .96, .005
            a, b = x - dist / 2, x + dist / 2
            lsg = "lightslategrey"
            root.plot([a, a], [y - yp, y + yp], "-", lw=2, color=lsg)
            root.plot([b, b], [y - yp, y + yp], "-", lw=2, color=lsg)
            root.plot([a, b], [y, y], "-", lw=2, color=lsg)
            root.text(x, y + .02, human_size(candidate, precision=0),
                      ha="center", va="center")

        if tree:
            from jcvi.graphics.tree import draw_tree, read_trees

            trees = read_trees(tree)
            ntrees = len(trees)
            logging.debug("A total of {0} trees imported.".format(ntrees))
            if ntrees:
                # Trees share the canvas width equally, below the lowest track
                xiv = 1. / ntrees
                yiv = .3
                xstart = 0
                ystart = min(ymids) - .4
                for label, outgroup, tx in trees:
                    ax = fig.add_axes([xstart, ystart, xiv, yiv])
                    draw_tree(ax, tx, outgroup=outgroup, rmargin=.4, leaffont=11)
                    xstart += xiv
                    RoundLabel(ax, .5, .3, label, fill=True, fc="lavender", color="r")
Exemple #35
0
    def __init__(self, ax, ext, layout, bed, scale, switch=None,
                 chr_label=True, loc_label=True,
                 pad=.05, vpad=.015, extra_features=None):
        """
        Draw one region on `ax`: a backbone line, one glyph per gene, optional
        extra-feature glyphs, and the chromosome / location labels.

        `ext` unpacks to (start, end, si, ei, chr, orientation, span); `layout`
        supplies x, y, ratio, rotation, hidden, label, ha, va and color.
        Gene end points (as returned by `get_coordinates`) are stored in
        `self.gg`, keyed by accession. Nothing visible is drawn for hidden
        layouts except the extra features.
        """
        cx, cy = layout.x, layout.y
        scale /= layout.ratio
        self.y = cy
        angle = layout.rotation
        # Rotate around the region centre, then map into axes coordinates
        rotated = mpl.transforms.Affine2D().rotate_deg_around(cx, cy, angle) \
            + ax.transAxes
        to_axes = ax.transAxes.inverted()

        first, last, lo_idx, hi_idx, seqid, orientation, span = ext
        half_width = span / scale / 2
        xstart, xend = cx - half_width, cx + half_width
        self.xstart, self.xend = xstart, xend

        hidden = layout.hidden

        # Chromosome
        if not hidden:
            ax.plot((xstart, xend), (cy, cy), color="gray", transform=rotated,
                    lw=2, zorder=1)

        self.genes = genes = bed[lo_idx: hi_idx + 1]
        # On the minus orientation the region is read from its far end
        if orientation == '-':
            startbp, endbp = last.end, first.start
        else:
            startbp, endbp = first.start, last.end

        def cv(bp):
            # Base-pair position -> x position along the unrotated backbone
            return xstart + abs(bp - startbp) / scale

        if switch:
            seqid = switch.get(seqid, seqid)
        if layout.label:
            seqid = layout.label

        left_text = human_size(startbp, target="Mb", precision=2)[:-2]
        right_text = human_size(endbp, target="Mb", precision=2)
        label = "-".join((left_text, right_text))

        height = .012
        self.gg = {}
        # Genes
        for gene in genes:
            strand = gene.strand
            if strand == '-':
                gstart, gend = gene.end, gene.start
            else:
                gstart, gend = gene.start, gene.end
            if orientation == '-':
                strand = "+" if strand == "-" else "-"

            x1, x2, a, b = self.get_coordinates(gstart, gend, cy, cv,
                                                rotated, to_axes)
            self.gg[gene.accn] = (a, b)

            if strand == "+":
                color = forward
            else:
                color = backward
            if not hidden:
                gp = Glyph(ax, x1, x2, cy, height, gradient=False, fc=color, zorder=3)
                gp.set_transform(rotated)

        # Extra features (like repeats)
        if extra_features:
            for feature in extra_features:
                x1, x2, a, b = self.get_coordinates(feature.start, feature.end,
                                                    cy, cv, rotated, to_axes)
                gp = Glyph(ax, x1, x2, cy, height * 3 / 4, gradient=False,
                           fc='#ff7f00', zorder=2)
                gp.set_transform(rotated)

        ha, va = layout.ha, layout.va

        # A label placed to the left of the region is right-aligned against
        # it, and vice versa
        hpad = .02
        if ha == "left":
            xx, ha = xstart - hpad, "right"
        elif ha == "right":
            xx, ha = xend + hpad, "left"
        else:
            xx, ha = cx, "center"

        # Tentative solution to labels stick into glyph
        magic = 40.
        if abs(angle) > magic:
            cc = abs(angle) / magic
        else:
            cc = 1
        if va == "top":
            yy = cy + cc * pad
        elif va == "bottom":
            yy = cy - cc * pad
        else:
            yy = cy

        anchor = np.array((xx, yy))
        trans_angle = ax.transAxes.transform_angles(np.array((angle, )),
                                                    anchor.reshape((1, 2)))[0]
        lx, ly = anchor
        if hidden:
            return

        text_style = dict(ha=ha, va="center", rotation=trans_angle,
                          bbox=dict(boxstyle="round", fc='w', ec='w', alpha=.5),
                          zorder=10)

        # TODO: labels on multiple lines, each line with its own style, are
        # difficult in matplotlib. The only known recipe, if an extra dot (.)
        # is tolerable, is:
        #   chr_label = r"\noindent " + markup(chr) + r" \\ ." if chr_label else None
        #   loc_label = r"\noindent . \\ " + label if loc_label else None

        chr_label = markup(seqid) if chr_label else None
        loc_label = label if loc_label else None
        if not chr_label:
            return
        if loc_label:
            # Chromosome name above, location range below the anchor point
            ax.text(lx, ly + vpad, chr_label, color=layout.color, **text_style)
            ax.text(lx, ly - vpad, loc_label, color="lightslategrey",
                    size=10, **text_style)
        else:
            ax.text(lx, ly, chr_label, color=layout.color, **text_style)
Exemple #36
0
def simple(args):
    """
    %prog simple anchorfile --qbed=qbedfile --sbed=sbedfile [options]

    Write the block ends for each block in the anchorfile.
    GeneA1    GeneA2    GeneB1    GeneB2   +/-      score

    Optional additional columns:
    orderA1   orderA2   orderB1   orderB2  sizeA    sizeB   size    block_id

    With base coordinates (--coords):
    block_id  seqidA    startA    endA     bpSpanA  GeneA1   GeneA2  geneSpanA
    block_id  seqidB    startB    endB     bpSpanB  GeneB1   GeneB2  geneSpanB
    """
    # The docstring doubles as the OptionParser usage text, so it is kept
    # verbatim. `args` is the list of command-line tokens; the result is the
    # file `<anchorfile minus last extension>.simple` plus, with --bed,
    # `<simplefile>.bed`. Nothing is returned.
    p = OptionParser(simple.__doc__)
    p.add_option("--rich",
                 default=False,
                 action="store_true",
                 help="Output additional columns [default: %default]")
    p.add_option(
        "--coords",
        default=False,
        action="store_true",
        help="Output columns with base coordinates [default: %default]")
    p.add_option("--bed",
                 default=False,
                 action="store_true",
                 help="Generate BED file for the blocks")
    p.add_option("--noheader",
                 default=False,
                 action="store_true",
                 help="Don't output header [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    additional = opts.rich
    coords = opts.coords
    header = not opts.noheader
    bed = opts.bed
    if bed:
        # BED records need base coordinates, so --bed switches --coords on
        coords = True
        bbed = Bed()

    ac = AnchorFile(anchorfile)
    simplefile = anchorfile.rsplit(".", 1)[0] + ".simple"

    qbed, sbed, qorder, sorder, _ = check_beds(anchorfile, p, opts)
    pf = "-".join(anchorfile.split(".", 2)[:2])
    blocks = ac.blocks

    if coords:
        h = "Block|Chr|Start|End|Span|StartGene|EndGene|GeneSpan|Orientation"
    else:
        # NOTE(review): the header lists Orientation before Score but the rows
        # below are written as score, orientation. Not changed here because
        # consumers may rely on either - confirm the intended order.
        h = "StartGeneA|EndGeneA|StartGeneB|EndGeneB|Orientation|Score"
        if additional:
            h += "|StartOrderA|EndOrderA|StartOrderB|EndOrderB|" \
                 "SizeA|SizeB|Size|Block"

    nblocks = 0  # remains 0 for an anchor file without blocks
    atotalbase = btotalbase = 0
    # The context manager closes the file even when a block raises
    with open(simplefile, "w") as fws:

        def write_row(fields):
            # One tab-separated line per row; works on Python 2 and 3
            fws.write("\t".join(str(x) for x in fields) + "\n")

        if header:
            write_row(h.split("|"))

        for i, block in enumerate(blocks):
            nblocks = i + 1

            a, b, _ = zip(*block)
            # Map gene names to (rank, bed line) pairs
            a = [qorder[x] for x in a]
            b = [sorder[x] for x in b]
            ia, _ = zip(*a)
            ib, _ = zip(*b)

            astarti, aendi = min(ia), max(ia)
            bstarti, bendi = min(ib), max(ib)
            astart, aend = min(a)[1].accn, max(a)[1].accn
            bstart, bend = min(b)[1].accn, max(b)[1].accn

            sizeA = len(set(ia))
            sizeB = len(set(ib))
            size = len(block)

            # Orientation follows the sign of the rank-vs-rank regression slope
            slope, _ = np.polyfit(ia, ib, 1)
            orientation = "+" if slope >= 0 else '-'
            aspan = aendi - astarti + 1
            bspan = bendi - bstarti + 1
            # Geometric mean of the two gene-rank spans
            score = str(int((aspan * bspan)**.5))
            block_id = pf + "-block-{0}".format(i)

            if coords:
                aseqid, astartbase, aendbase = \
                    get_boundary_bases(astart, aend, qorder)
                bseqid, bstartbase, bendbase = \
                    get_boundary_bases(bstart, bend, sorder)
                abase = aendbase - astartbase + 1
                bbase = bendbase - bstartbase + 1
                atotalbase += abase
                btotalbase += bbase

                if bed:
                    bedfields = (bseqid, bstartbase - 1, bendbase,
                                 "{}:{}-{}".format(aseqid, astartbase, aendbase),
                                 size, orientation)
                    bbed.append(BedLine("\t".join(str(x) for x in bedfields)))

                # Write dual lines
                write_row([
                    block_id, aseqid, astartbase, aendbase, abase, astart,
                    aend, aspan, "+"
                ])
                write_row([
                    block_id, bseqid, bstartbase, bendbase, bbase, bstart,
                    bend, bspan, orientation
                ])
                continue

            row = [astart, aend, bstart, bend, score, orientation]
            if additional:
                row += [
                    astarti, aendi, bstarti, bendi, sizeA, sizeB, size, block_id
                ]
            write_row(row)

    logging.debug("A total of {0} blocks written to `{1}`.".format(
        nblocks, simplefile))

    if coords:
        sys.stderr.write("Total block span in {0}: {1}\n".format(
            qbed.filename, human_size(atotalbase, precision=2)))
        sys.stderr.write("Total block span in {0}: {1}\n".format(
            sbed.filename, human_size(btotalbase, precision=2)))
        smaller = min(atotalbase, btotalbase)
        if smaller > 0:
            # No ratio when one side has zero span (e.g. no blocks at all)
            sys.stderr.write("Ratio: {0:.1f}x\n".format(
                max(atotalbase, btotalbase) * 1. / smaller))

    if bed:
        bedfile = simplefile + ".bed"
        bbed.print_to_file(filename=bedfile, sorted=True)
        logging.debug("Bed file written to `{}`".format(bedfile))
Exemple #37
0
    def __init__(self,
                 ax,
                 ext,
                 layout,
                 bed,
                 scale,
                 switch=None,
                 chr_label=True,
                 pad=.04,
                 vpad=.012):
        """
        Draw one region on `ax`: a backbone line, one glyph per gene and,
        when `chr_label` is set, the chromosome name with its location range.

        `ext` unpacks to (start, end, si, ei, chr, orientation, span); `layout`
        supplies x, y, ratio, rotation, hidden, ha, va and color. The end
        points of every gene, mapped back through the inverted axes
        transform, are stored in `self.gg` keyed by accession.
        """
        cx, cy = layout.x, layout.y
        scale /= layout.ratio
        self.y = cy
        angle = layout.rotation
        # Rotate around the region centre, then map into axes coordinates
        rotated = mpl.transforms.Affine2D().rotate_deg_around(cx, cy, angle) \
            + ax.transAxes
        to_axes = ax.transAxes.inverted()

        first, last, lo_idx, hi_idx, seqid, orientation, span = ext
        half_width = span / scale / 2
        xstart, xend = cx - half_width, cx + half_width
        self.xstart, self.xend = xstart, xend

        hidden = layout.hidden

        # Chromosome
        if not hidden:
            ax.plot((xstart, xend), (cy, cy), color="gray", transform=rotated,
                    lw=2, zorder=1)

        self.genes = genes = bed[lo_idx:hi_idx + 1]
        # On the minus orientation the region is read from its far end
        if orientation == '-':
            startbp, endbp = last.end, first.start
        else:
            startbp, endbp = first.start, last.end

        def cv(bp):
            # Base-pair position -> x position along the unrotated backbone
            return xstart + abs(bp - startbp) / scale

        if switch:
            seqid = switch.get(seqid, seqid)
        left_text = human_size(startbp, target="Mb")[:-2]
        right_text = human_size(endbp, target="Mb")
        label = "-".join((left_text, right_text))

        height = .012
        self.gg = {}
        # Genes
        for gene in genes:
            strand = gene.strand
            if strand == '-':
                gstart, gend = gene.end, gene.start
            else:
                gstart, gend = gene.start, gene.end
            if orientation == '-':
                strand = "+" if strand == "-" else "-"

            x1, x2 = cv(gstart), cv(gend)
            # Rotate each end point, then express it in axes coordinates
            self.gg[gene.accn] = tuple(
                to_axes.transform(rotated.transform((px, cy)))
                for px in (x1, x2))

            if strand == "+":
                color = "b"
            else:
                color = "g"
            if not hidden:
                gp = Glyph(ax, x1, x2, cy, height,
                           gradient=False, fc=color, zorder=3)
                gp.set_transform(rotated)

        ha, va = layout.ha, layout.va

        # A label placed to the left of the region is right-aligned against
        # it, and vice versa
        hpad = .02
        if ha == "left":
            xx, ha = xstart - hpad, "right"
        elif ha == "right":
            xx, ha = xend + hpad, "left"
        else:
            xx, ha = cx, "center"

        # Tentative solution to labels stick into glyph
        magic = 40.
        if abs(angle) > magic:
            cc = abs(angle) / magic
        else:
            cc = 1
        if va == "top":
            yy = cy + cc * pad
        elif va == "bottom":
            yy = cy - cc * pad
        else:
            yy = cy

        anchor = np.array((xx, yy))
        trans_angle = ax.transAxes.transform_angles(np.array((angle, )),
                                                    anchor.reshape((1, 2)))[0]
        lx, ly = anchor
        if not hidden and chr_label:
            # Chromosome name above, location range below the anchor point
            shared = dict(ha=ha, va="center", rotation=trans_angle)
            ax.text(lx, ly + vpad, markup(seqid), color=layout.color, **shared)
            ax.text(lx, ly - vpad, label, color="k", **shared)