Example 1
def allelefreq(args):
    """
    %prog allelefreq HD,DM1,SCA1,SCA17

    Plot the allele frequencies of some STRs.
    """
    p = OptionParser(allelefreq.__doc__)
    opts, args, iopts = p.set_image_options(args, figsize="10x10")

    if len(args) != 1:
        sys.exit(not p.print_help())

    loci, = args
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(ncols=2,
                                                 nrows=2,
                                                 figsize=(iopts.w, iopts.h))
    plt.tight_layout(pad=4)
    treds, df = read_treds()
    df = df.set_index(["abbreviation"])

    for ax, locus in zip((ax1, ax2, ax3, ax4), loci.split(",")):
        plot_allelefreq(ax, df, locus)

    root = fig.add_axes([0, 0, 1, 1])
    pad = .03
    panel_labels(root, ((pad / 2, 1 - pad, "A"), (1 / 2., 1 - pad, "B"),
                        (pad / 2, 1 / 2., "C"), (1 / 2., 1 / 2., "D")))
    normalize_axes(root)

    image_name = "allelefreq." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
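The panel-label overlay used at the end of this and most other examples (an invisible root axes spanning the figure, labels placed in figure-fraction coordinates, then the overlay hidden) relies on panel_labels() and normalize_axes(), the latter imported from jcvi.graphics.base in Examples 10 and 11. A minimal self-contained sketch of the pattern, with simplified stand-in helpers rather than the library code:

import matplotlib.pyplot as plt

def panel_labels_sketch(ax, labels):
    # labels: iterable of (x, y, text) in figure-fraction coordinates
    for x, y, text in labels:
        ax.text(x, y, text, size=16, weight="bold", va="top")

def normalize_axes_sketch(ax):
    # stretch the overlay to the unit square and hide its frame entirely
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_axis_off()

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(ncols=2, nrows=2, figsize=(10, 10))
root = fig.add_axes([0, 0, 1, 1])  # overlay axes covering the whole figure
pad = .03
panel_labels_sketch(root, ((pad / 2, 1 - pad, "A"), (1 / 2., 1 - pad, "B"),
                           (pad / 2, 1 / 2., "C"), (1 / 2., 1 / 2., "D")))
normalize_axes_sketch(root)
fig.savefig("panels.png", dpi=150)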
Example 2
def compare2(args):
    """
    %prog compare2

    Compare performances of various variant callers on simulated STR datasets.
    """
    p = OptionParser(compare2.__doc__)
    p.add_option('--maxinsert',
                 default=300,
                 type="int",
                 help="Maximum number of repeats")
    add_simulate_options(p)
    opts, args, iopts = p.set_image_options(args, figsize="10x5")

    if len(args) != 0:
        sys.exit(not p.print_help())

    depth = opts.depth
    readlen = opts.readlen
    distance = opts.distance
    max_insert = opts.maxinsert
    fig, (ax1, ax2) = plt.subplots(ncols=2,
                                   nrows=1,
                                   figsize=(iopts.w, iopts.h))
    plt.tight_layout(pad=2)

    # ax1: lobSTR vs TREDPARSE with haploid model
    lobstr_results = parse_results("lobstr_results_homo.txt")
    tredparse_results = parse_results("tredparse_results_homo.txt")
    title = SIMULATED_HAPLOID + \
            r" ($D=%s\times, L=%dbp, V=%dbp$)" % (depth, readlen, distance)
    plot_compare(ax1,
                 title,
                 tredparse_results,
                 lobstr_results,
                 max_insert=max_insert)

    # ax2: lobSTR vs TREDPARSE with diploid model
    lobstr_results = parse_results("lobstr_results_het.txt", exclude=20)
    tredparse_results = parse_results("tredparse_results_het.txt", exclude=20)
    title = SIMULATED_DIPLOID + \
            r" ($D=%s\times, L=%dbp, V=%dbp$)" % (depth, readlen, distance)
    plot_compare(ax2,
                 title,
                 tredparse_results,
                 lobstr_results,
                 max_insert=max_insert)

    for ax in (ax1, ax2):
        ax.set_xlim(0, max_insert)
        ax.set_ylim(0, max_insert)

    root = fig.add_axes([0, 0, 1, 1])
    pad = .03
    panel_labels(root, ((pad / 2, 1 - pad, "A"), (1 / 2., 1 - pad, "B")))
    normalize_axes(root)

    image_name = "tredparse." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
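plot_compare() and parse_results() are helpers from the same module and are not reproduced in this listing; Example 5 below unpacks parse_results() output with zip(*results), which suggests it yields (repeats inserted, repeats called) pairs. A self-contained stand-in under that assumption (the function name, plotting style, and toy data are all assumptions, not the library code):

import matplotlib.pyplot as plt

def plot_compare_sketch(ax, title, results, max_insert=300):
    # results: iterable of (repeats_inserted, repeats_called) pairs
    x, y = zip(*sorted(results))
    ax.plot(x, y, "gx-", label="caller")
    ax.plot((0, max_insert), (0, max_insert), "k--", label="Truth")
    ax.set_xlabel("Num of CAG repeats inserted")
    ax.set_ylabel("Num of CAG repeats called")
    ax.set_title(title)
    ax.legend(loc="best")

fig, ax = plt.subplots(figsize=(5, 5))
toy_results = [(i, i + (i % 7) - 3) for i in range(10, 121)]  # made-up calls
plot_compare_sketch(ax, "stand-in comparison", toy_results, max_insert=120)
fig.savefig("compare_sketch.png", dpi=150)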
Example 3
def compare(args):
    """
    %prog compare Evaluation.csv

    Compare performances of various variant callers on simulated STR datasets.
    """
    p = OptionParser(compare.__doc__)
    opts, args, iopts = p.set_image_options(args, figsize="10x10")

    if len(args) != 1:
        sys.exit(not p.print_help())

    datafile, = args
    pf = datafile.rsplit(".", 1)[0]
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(ncols=2,
                                                 nrows=2,
                                                 figsize=(iopts.w, iopts.h))
    plt.tight_layout(pad=3)

    bbox = {'facecolor': 'tomato', 'alpha': .2, 'ec': 'w'}
    pad = 2

    # Read benchmark data
    df = pd.read_csv(datafile)
    truth = df["Truth"]
    # Huntington disease thresholds (same values as in Example 5)
    infected_thr = 40
    ref_thr = 19
    axes = (ax1, ax2, ax3, ax4)
    progs = ("Manta", "Isaac", "GATK", "lobSTR")
    markers = ("bx-", "yo-", "md-", "c+-")

    for ax, prog, marker in zip(axes, progs, markers):
        ax.plot(truth, df[prog], marker)
        ax.plot(truth, truth, 'k--')  # to show diagonal
        ax.axhline(infected_thr, color='tomato')
        ax.text(max(truth) - pad,
                infected_thr + pad,
                'Risk threshold',
                bbox=bbox,
                ha="right")
        ax.axhline(ref_thr, color='tomato')
        ax.text(max(truth) - pad,
                ref_thr - pad,
                'Reference repeat count',
                bbox=bbox,
                ha="right",
                va="top")
        ax.set_title(SIMULATED_HAPLOID)
        ax.set_xlabel(r'Num of CAG repeats inserted ($\mathit{h}$)')
        ax.set_ylabel('Num of CAG repeats called')
        ax.legend([prog, 'Truth'], loc='best')

    root = fig.add_axes([0, 0, 1, 1])
    pad = .03
    panel_labels(root, ((pad / 2, 1 - pad, "A"), (1 / 2., 1 - pad, "B"),
                        (pad / 2, 1 / 2., "C"), (1 / 2., 1 / 2., "D")))
    normalize_axes(root)

    image_name = pf + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
Example 4
def multilineplot(args):
    """
    %prog multilineplot fastafile chr1

    Combine multiple line plots in one vertical stack
    Inputs must be BED-formatted.

    --lines: traditional line plots, useful for plotting feature freq
    """
    p = OptionParser(multilineplot.__doc__)
    p.add_option("--lines",
                 help="Features to plot in lineplot [default: %default]")
    p.add_option("--colors",
                 help="List of colors matching number of input bed files")
    p.add_option("--mode", default="span", choices=("span", "count", "score"),
                 help="Accumulate feature based on [default: %default]")
    p.add_option("--binned", default=False, action="store_true",
                 help="Specify whether the input is already binned; " +
                 "if True, input files are considered to be binfiles")
    add_window_options(p)
    opts, args, iopts = p.set_image_options(args, figsize="8x5")

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, chr = args
    window, shift, subtract = check_window_options(opts)
    linebeds = []
    colors = opts.colors or ""
    if opts.lines:
        lines = opts.lines.split(",")
        assert len(colors) == len(lines), \
            "Number of chosen colors must match number of input bed files"
        linebeds = get_beds(lines, binned=opts.binned)

    linebins = get_binfiles(linebeds, fastafile, shift, mode=opts.mode, binned=opts.binned)

    clen = Sizes(fastafile).mapping[chr]
    nbins = get_nbins(clen, shift)

    plt.rcParams["xtick.major.size"] = 0
    plt.rcParams["ytick.major.size"] = 0
    plt.rcParams["figure.figsize"] = iopts.w, iopts.h

    fig, axarr = plt.subplots(nrows=len(lines))
    if len(linebeds) == 1:
        axarr = (axarr, )
    fig.suptitle(chr, color="darkslategray")

    for i, ax in enumerate(axarr):
        lineplot(ax, [linebins[i]], nbins, chr, window, shift,
                 color="{0}{1}".format(colors[i], 'r'))

    plt.subplots_adjust(hspace=0.5)

    image_name = chr + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
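get_beds(), get_binfiles() and lineplot() are jcvi helpers not shown here. The underlying idea is to count features per fixed-size window along a chromosome and draw one line per track; a self-contained sketch of that idea (window size, track names, and the random data are assumptions):

import numpy as np
import matplotlib.pyplot as plt

def bin_features(starts, clen, shift):
    # count feature start positions per window of size `shift`
    nbins = int(np.ceil(clen / float(shift)))
    counts = np.zeros(nbins)
    for s in starts:
        counts[min(int(s // shift), nbins - 1)] += 1
    return counts

clen, shift = 1000000, 10000  # chromosome length and window size (assumed)
rng = np.random.default_rng(0)
tracks = {"genes": rng.integers(0, clen, 500),
          "repeats": rng.integers(0, clen, 2000)}

fig, axarr = plt.subplots(nrows=len(tracks), sharex=True, figsize=(8, 5))
for ax, (name, starts) in zip(axarr, tracks.items()):
    counts = bin_features(starts, clen, shift)
    ax.plot(np.arange(len(counts)) * shift, counts, "-")
    ax.set_ylabel(name)
axarr[-1].set_xlabel("position (bp)")
fig.suptitle("chr1", color="darkslategray")
plt.subplots_adjust(hspace=0.5)
fig.savefig("multilineplot_sketch.png", dpi=150)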
Example 5
def compare(args):
    """
    %prog compare Evaluation.csv

    Compare performances of various variant callers on simulated STR datasets.
    """
    p = OptionParser(compare.__doc__)
    opts, args, iopts = p.set_image_options(args, figsize="15x5")

    if len(args) != 1:
        sys.exit(not p.print_help())

    datafile, = args
    pf = datafile.rsplit(".", 1)[0]
    fig, (ax1, ax2, ax3) = plt.subplots(ncols=3,
                                        nrows=1,
                                        figsize=(iopts.w, iopts.h))
    plt.tight_layout(pad=2)

    # Huntington risk allele
    infected_thr = 40
    ref_thr = 19

    # ax1: Multiple callers at lower range
    df = pd.read_csv("Evaluation.csv")
    truth = df["Truth"]

    ax1.plot(truth, df["Manta"], 'bx-')
    ax1.plot(truth, df["Isaac"], 'yo-')
    ax1.plot(truth, df["GATK"], 'md-')
    ax1.plot(truth, df["lobSTR"], 'c+-')
    ax1.plot(truth, truth, 'k--')  # to show diagonal

    bbox = {'facecolor': 'tomato', 'alpha': .2, 'ec': 'w'}
    pad = 2
    ax1.axhline(infected_thr, color='tomato')
    ax1.text(max(truth) - pad,
             infected_thr + pad,
             'Risk threshold',
             bbox=bbox,
             ha="right")
    ax1.axhline(ref_thr, color='tomato')
    ax1.text(max(truth) - pad,
             ref_thr - pad,
             'Reference repeat count',
             bbox=bbox,
             ha="right",
             va="top")

    ax1.set_xlabel(r'Num of CAG repeats inserted ($\mathit{h}$)')
    ax1.set_ylabel('Num of CAG repeats called')
    ax1.set_title(r'Simulated haploid $\mathit{h}$')
    ax1.legend(['Manta', 'Isaac', 'GATK', 'lobSTR', 'Truth'], loc='best')

    max_insert = 120
    # ax2: lobSTR vs TREDPARSE with haploid model
    lobstr_results = parse_results("lobstr_results_homo.txt")
    tredparse_results = parse_results("tredparse_results_homo.txt")
    truth = range(10, max_insert + 1)
    lx, ly = zip(*lobstr_results)
    tx, ty = zip(*tredparse_results)

    ax2.plot(lx, ly, 'c+-')
    ax2.plot(tx, ty, 'gx-')
    ax2.plot(truth, truth, 'k--')

    ax2.set_xlabel(r'Num of CAG repeats inserted ($\mathit{h}$)')
    ax2.set_ylabel('Num of CAG repeats called')
    ax2.set_title(r'Simulated haploid $\mathit{h}$')
    ax2.legend(['lobSTR', 'TREDPARSE', 'Truth'], loc='best')

    pad *= 2
    ax2.axhline(infected_thr, color='tomato')
    ax2.text(max(truth) - pad,
             infected_thr + pad,
             'Risk threshold',
             bbox=bbox,
             ha="right")
    ax2.set_xlim(10, max_insert)

    # ax3: lobSTR vs TREDPARSE with diploid model
    lobstr_results = parse_results("lobstr_results_het.txt", exclude=20)
    tredparse_results = parse_results("tredparse_results_het.txt", exclude=20)
    truth = range(10, max_insert + 1)
    lx, ly = zip(*lobstr_results)
    tx, ty = zip(*tredparse_results)

    ax3.plot(lx, ly, 'c+-')
    ax3.plot(tx, ty, 'gx-')
    ax3.plot(truth, truth, 'k--')

    ax3.set_xlabel(r'Num of CAG repeats inserted ($\mathit{h}$)')
    ax3.set_ylabel('Num of CAG repeats called')
    ax3.set_title(r'Simulated diploid $\mathit{20/h}$')
    ax3.legend(['lobSTR', 'TREDPARSE', 'Truth'], loc='best')
    ax3.axhline(infected_thr, color='tomato')
    ax3.text(max(truth) - pad,
             infected_thr + pad,
             'Risk threshold',
             bbox=bbox,
             ha="right")
    ax3.set_xlim(10, max_insert)

    root = fig.add_axes([0, 0, 1, 1])
    pad = .03
    panel_labels(root, ((pad / 2, 1 - pad, "A"), (1 / 3., 1 - pad, "B"),
                        (2 / 3., 1 - pad, "C")))
    normalize_axes(root)

    image_name = pf + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
Example 6
def seeds(args):
    """
    %prog seeds [pngfile|jpgfile]

    Extract seed metrics from [pngfile|jpgfile]. Use --rows and --cols to crop image.
    """
    p = OptionParser(seeds.__doc__)
    p.set_outfile()
    opts, args, iopts = add_seeds_options(p, args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    pngfile, = args
    pf = opts.prefix or op.basename(pngfile).rsplit(".", 1)[0]
    sigma, kernel = opts.sigma, opts.kernel
    rows, cols = opts.rows, opts.cols
    labelrows, labelcols = opts.labelrows, opts.labelcols
    ff = opts.filter
    calib = opts.calibrate
    outdir = opts.outdir
    if outdir != '.':
        mkdir(outdir)
    if calib:
        calib = json.load(must_open(calib))
        pixel_cm_ratio, tr = calib["PixelCMratio"], calib["RGBtransform"]
        tr = np.array(tr)

    resizefile, mainfile, labelfile, exif = \
                      convert_image(pngfile, pf, outdir=outdir,
                                    rotate=opts.rotate,
                                    rows=rows, cols=cols,
                                    labelrows=labelrows, labelcols=labelcols)

    oimg = load_image(resizefile)
    img = load_image(mainfile)

    fig, (ax1, ax2, ax3, ax4) = plt.subplots(ncols=4, nrows=1,
                                             figsize=(iopts.w, iopts.h))

    # Edge detection
    img_gray = rgb2gray(img)
    logging.debug("Running {0} edge detection ...".format(ff))
    if ff == "canny":
        edges = canny(img_gray, sigma=opts.sigma)
    elif ff == "roberts":
        edges = roberts(img_gray)
    elif ff == "sobel":
        edges = sobel(img_gray)
    edges = clear_border(edges, buffer_size=opts.border)
    selem = disk(kernel)
    closed = closing(edges, selem) if kernel else edges
    filled = binary_fill_holes(closed)

    # Watershed algorithm
    if opts.watershed:
        distance = distance_transform_edt(filled)
        local_maxi = peak_local_max(distance, threshold_rel=.05, indices=False)
        coordinates = peak_local_max(distance, threshold_rel=.05)
        markers, nmarkers = label(local_maxi, return_num=True)
        logging.debug("Identified {0} watershed markers".format(nmarkers))
        labels = watershed(closed, markers, mask=filled)
    else:
        labels = label(filled)

    # Object size filtering
    w, h = img_gray.shape
    canvas_size = w * h
    min_size = int(round(canvas_size * opts.minsize / 100))
    max_size = int(round(canvas_size * opts.maxsize / 100))
    logging.debug("Find objects with pixels between {0} ({1}%) and {2} ({3}%)"\
                    .format(min_size, opts.minsize, max_size, opts.maxsize))

    # Plotting
    ax1.set_title('Original picture')
    ax1.imshow(oimg)

    params = "{0}, $\sigma$={1}, $k$={2}".format(ff, sigma, kernel)
    if opts.watershed:
        params += ", watershed"
    ax2.set_title('Edge detection\n({0})'.format(params))
    closed = gray2rgb(closed)
    ax2_img = labels
    if opts.edges:
        ax2_img = closed
    elif opts.watershed:
        ax2.plot(coordinates[:, 1], coordinates[:, 0], 'g.')
    ax2.imshow(ax2_img, cmap=iopts.cmap)

    ax3.set_title('Object detection')
    ax3.imshow(img)

    filename = op.basename(pngfile)
    if labelfile:
        accession = extract_label(labelfile)
    else:
        accession = pf

    # Calculate region properties
    rp = regionprops(labels)
    rp = [x for x in rp if min_size <= x.area <= max_size]
    nb_labels = len(rp)
    logging.debug("A total of {0} objects identified.".format(nb_labels))
    objects = []
    for i, props in enumerate(rp):
        i += 1
        if i > opts.count:
            break

        y0, x0 = props.centroid
        orientation = props.orientation
        major, minor = props.major_axis_length, props.minor_axis_length
        major_dx = cos(orientation) * major / 2
        major_dy = sin(orientation) * major / 2
        minor_dx = sin(orientation) * minor / 2
        minor_dy = cos(orientation) * minor / 2
        ax2.plot((x0 - major_dx, x0 + major_dx),
                 (y0 + major_dy, y0 - major_dy), 'r-')
        ax2.plot((x0 - minor_dx, x0 + minor_dx),
                 (y0 - minor_dy, y0 + minor_dy), 'r-')

        npixels = int(props.area)
        # Sample the center of the blob for color
        d = min(int(round(minor / 2 * .35)) + 1, 50)
        x0d, y0d = int(round(x0)), int(round(y0))
        square = img[(y0d - d):(y0d + d), (x0d - d):(x0d + d)]
        pixels = []
        for row in square:
            pixels.extend(row)
        logging.debug("Seed #{0}: {1} pixels ({2} sampled) - {3:.2f}%".\
                        format(i, npixels, len(pixels), 100. * npixels / canvas_size))

        rgb = pixel_stats(pixels)
        objects.append(Seed(filename, accession, i, rgb, props, exif))
        minr, minc, maxr, maxc = props.bbox
        rect = Rectangle((minc, minr), maxc - minc, maxr - minr,
                                  fill=False, ec='w', lw=1)
        ax3.add_patch(rect)
        mc, mr = (minc + maxc) / 2, (minr + maxr) / 2
        ax3.text(mc, mr, "{0}".format(i), color='w',
                    ha="center", va="center", size=6)

    for ax in (ax2, ax3):
        ax.set_xlim(0, h)
        ax.set_ylim(w, 0)

    # Output identified seed stats
    ax4.text(.1, .92, "File: {0}".format(latex(filename)), color='g')
    ax4.text(.1, .86, "Label: {0}".format(latex(accession)), color='m')
    yy = .8
    fw = must_open(opts.outfile, "w")
    if not opts.noheader:
        print(Seed.header(calibrate=calib), file=fw)
    for o in objects:
        if calib:
            o.calibrate(pixel_cm_ratio, tr)
        print(o, file=fw)
        i = o.seedno
        if i > 7:
            continue
        ax4.text(.01, yy, str(i), va="center", bbox=dict(fc='none', ec='k'))
        ax4.text(.1, yy, o.pixeltag, va="center")
        yy -= .04
        ax4.add_patch(Rectangle((.1, yy - .025), .12, .05, lw=0,
                      fc=rgb_to_hex(o.rgb)))
        ax4.text(.27, yy, o.hashtag, va="center")
        yy -= .06
    ax4.text(.1, yy, "(A total of {0} objects displayed)".format(nb_labels),
             color="darkslategrey")
    normalize_axes(ax4)

    for ax in (ax1, ax2, ax3):
        xticklabels = [int(x) for x in ax.get_xticks()]
        yticklabels = [int(x) for x in ax.get_yticks()]
        ax.set_xticklabels(xticklabels, family='Helvetica', size=8)
        ax.set_yticklabels(yticklabels, family='Helvetica', size=8)

    image_name = op.join(outdir, pf + "." + iopts.format)
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    return objects
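The segmentation core above chains standard scikit-image/SciPy operations. A stripped-down, self-contained version of that pipeline (input file name, sigma, kernel size, and the size thresholds are placeholders):

from scipy.ndimage import binary_fill_holes
from skimage.color import rgb2gray
from skimage.feature import canny
from skimage.io import imread
from skimage.measure import label, regionprops
from skimage.morphology import closing, disk
from skimage.segmentation import clear_border

img = imread("seeds.png")[:, :, :3]          # placeholder input image
gray = rgb2gray(img)
edges = canny(gray, sigma=1)                 # edge detection
edges = clear_border(edges, buffer_size=5)   # drop edges touching the frame
closed = closing(edges, disk(2))             # bridge small gaps in outlines
filled = binary_fill_holes(closed)           # solid blobs
labels = label(filled)                       # connected components

# size filter, analogous to the min_size/max_size filter above
canvas_size = gray.size
objects = [r for r in regionprops(labels)
           if 0.001 * canvas_size <= r.area <= 0.5 * canvas_size]
for i, r in enumerate(objects, 1):
    y0, x0 = r.centroid
    print(i, int(r.area), int(round(x0)), int(round(y0)))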
Example 7
def seeds(args):
    """
    %prog seeds [pngfile|jpgfile]

    Extract seed metrics from [pngfile|jpgfile]. Use --rows and --cols to crop image.
    """
    p = OptionParser(seeds.__doc__)
    p.set_outfile()
    opts, args, iopts = add_seeds_options(p, args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (pngfile, ) = args
    pf = opts.prefix or op.basename(pngfile).rsplit(".", 1)[0]
    sigma, kernel = opts.sigma, opts.kernel
    rows, cols = opts.rows, opts.cols
    labelrows, labelcols = opts.labelrows, opts.labelcols
    ff = opts.filter
    calib = opts.calibrate
    outdir = opts.outdir
    if outdir != ".":
        mkdir(outdir)
    if calib:
        calib = json.load(must_open(calib))
        pixel_cm_ratio, tr = calib["PixelCMratio"], calib["RGBtransform"]
        tr = np.array(tr)
    nbcolor = opts.changeBackground
    pngfile = convert_background(pngfile, nbcolor)
    resizefile, mainfile, labelfile, exif = convert_image(
        pngfile,
        pf,
        outdir=outdir,
        rotate=opts.rotate,
        rows=rows,
        cols=cols,
        labelrows=labelrows,
        labelcols=labelcols,
    )
    oimg = load_image(resizefile)
    img = load_image(mainfile)

    fig, (ax1, ax2, ax3, ax4) = plt.subplots(ncols=4,
                                             nrows=1,
                                             figsize=(iopts.w, iopts.h))
    # Edge detection
    img_gray = rgb2gray(img)
    logging.debug("Running {0} edge detection ...".format(ff))
    if ff == "canny":
        edges = canny(img_gray, sigma=opts.sigma)
    elif ff == "roberts":
        edges = roberts(img_gray)
    elif ff == "sobel":
        edges = sobel(img_gray)
    edges = clear_border(edges, buffer_size=opts.border)
    selem = disk(kernel)
    closed = closing(edges, selem) if kernel else edges
    filled = binary_fill_holes(closed)

    # Watershed algorithm
    if opts.watershed:
        distance = distance_transform_edt(filled)
        local_maxi = peak_local_max(distance,
                                    threshold_rel=0.05,
                                    indices=False)
        coordinates = peak_local_max(distance, threshold_rel=0.05)
        markers, nmarkers = label(local_maxi, return_num=True)
        logging.debug("Identified {0} watershed markers".format(nmarkers))
        labels = watershed(closed, markers, mask=filled)
    else:
        labels = label(filled)

    # Object size filtering
    w, h = img_gray.shape
    canvas_size = w * h
    min_size = int(round(canvas_size * opts.minsize / 100))
    max_size = int(round(canvas_size * opts.maxsize / 100))
    logging.debug(
        "Find objects with pixels between {0} ({1}%) and {2} ({3}%)".format(
            min_size, opts.minsize, max_size, opts.maxsize))

    # Plotting
    ax1.set_title("Original picture")
    ax1.imshow(oimg)

    params = "{0}, $\sigma$={1}, $k$={2}".format(ff, sigma, kernel)
    if opts.watershed:
        params += ", watershed"
    ax2.set_title("Edge detection\n({0})".format(params))
    closed = gray2rgb(closed)
    ax2_img = labels
    if opts.edges:
        ax2_img = closed
    elif opts.watershed:
        ax2.plot(coordinates[:, 1], coordinates[:, 0], "g.")
    ax2.imshow(ax2_img, cmap=iopts.cmap)

    ax3.set_title("Object detection")
    ax3.imshow(img)

    filename = op.basename(pngfile)
    if labelfile:
        accession = extract_label(labelfile)
    else:
        accession = pf

    # Calculate region properties
    rp = regionprops(labels)
    rp = [x for x in rp if min_size <= x.area <= max_size]
    nb_labels = len(rp)
    logging.debug("A total of {0} objects identified.".format(nb_labels))
    objects = []
    for i, props in enumerate(rp):
        i += 1
        if i > opts.count:
            break

        y0, x0 = props.centroid
        orientation = props.orientation
        major, minor = props.major_axis_length, props.minor_axis_length
        major_dx = cos(orientation) * major / 2
        major_dy = sin(orientation) * major / 2
        minor_dx = sin(orientation) * minor / 2
        minor_dy = cos(orientation) * minor / 2
        ax2.plot((x0 - major_dx, x0 + major_dx),
                 (y0 + major_dy, y0 - major_dy), "r-")
        ax2.plot((x0 - minor_dx, x0 + minor_dx),
                 (y0 - minor_dy, y0 + minor_dy), "r-")

        npixels = int(props.area)
        # Sample the center of the blob for color
        d = min(int(round(minor / 2 * 0.35)) + 1, 50)
        x0d, y0d = int(round(x0)), int(round(y0))
        square = img[(y0d - d):(y0d + d), (x0d - d):(x0d + d)]
        pixels = []
        for row in square:
            pixels.extend(row)
        logging.debug("Seed #{0}: {1} pixels ({2} sampled) - {3:.2f}%".format(
            i, npixels, len(pixels), 100.0 * npixels / canvas_size))

        rgb = pixel_stats(pixels)
        objects.append(Seed(filename, accession, i, rgb, props, exif))
        minr, minc, maxr, maxc = props.bbox
        rect = Rectangle((minc, minr),
                         maxc - minc,
                         maxr - minr,
                         fill=False,
                         ec="w",
                         lw=1)
        ax3.add_patch(rect)
        mc, mr = (minc + maxc) / 2, (minr + maxr) / 2
        ax3.text(mc,
                 mr,
                 "{0}".format(i),
                 color="w",
                 ha="center",
                 va="center",
                 size=6)

    for ax in (ax2, ax3):
        ax.set_xlim(0, h)
        ax.set_ylim(w, 0)

    # Output identified seed stats
    ax4.text(0.1, 0.92, "File: {0}".format(latex(filename)), color="g")
    ax4.text(0.1, 0.86, "Label: {0}".format(latex(accession)), color="m")
    yy = 0.8
    fw = must_open(opts.outfile, "w")
    if not opts.noheader:
        print(Seed.header(calibrate=calib), file=fw)
    for o in objects:
        if calib:
            o.calibrate(pixel_cm_ratio, tr)
        print(o, file=fw)
        i = o.seedno
        if i > 7:
            continue
        ax4.text(0.01, yy, str(i), va="center", bbox=dict(fc="none", ec="k"))
        ax4.text(0.1, yy, o.pixeltag, va="center")
        yy -= 0.04
        ax4.add_patch(
            Rectangle((0.1, yy - 0.025),
                      0.12,
                      0.05,
                      lw=0,
                      fc=rgb_to_hex(o.rgb)))
        ax4.text(0.27, yy, o.hashtag, va="center")
        yy -= 0.06
    ax4.text(
        0.1,
        yy,
        "(A total of {0} objects displayed)".format(nb_labels),
        color="darkslategray",
    )
    normalize_axes(ax4)

    for ax in (ax1, ax2, ax3):
        xticklabels = [int(x) for x in ax.get_xticks()]
        yticklabels = [int(x) for x in ax.get_yticks()]
        ax.set_xticklabels(xticklabels, family="Helvetica", size=8)
        ax.set_yticklabels(yticklabels, family="Helvetica", size=8)

    image_name = op.join(outdir, pf + "." + iopts.format)
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    return objects
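The watershed branch shared by the two seeds() variants splits touching seeds with a distance transform. A standalone sketch of that step with current scikit-image (newer releases dropped peak_local_max(..., indices=False), so the markers are built from the returned coordinates; flooding the negative distance transform is a common variant of what the code above does with the closed edge map):

import numpy as np
from scipy.ndimage import distance_transform_edt
from skimage.feature import peak_local_max
from skimage.measure import label
from skimage.segmentation import watershed

def split_touching_objects(filled):
    # filled: boolean mask of solid blobs, e.g. from binary_fill_holes()
    distance = distance_transform_edt(filled)
    coords = peak_local_max(distance, min_distance=10, threshold_rel=0.05)
    peaks = np.zeros(filled.shape, dtype=bool)
    peaks[tuple(coords.T)] = True
    markers = label(peaks)
    return watershed(-distance, markers, mask=filled)

# toy mask: two overlapping disks that plain label() would see as one object
yy, xx = np.mgrid[0:80, 0:120]
mask = ((yy - 40) ** 2 + (xx - 45) ** 2 < 400) | ((yy - 40) ** 2 + (xx - 75) ** 2 < 400)
print("objects found:", split_touching_objects(mask).max())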
Example 8
def compare4(args):
    """
    %prog compare4

    Compare performances of various variant callers on simulated STR datasets.
    Adds coverage comparisons as panel C and D.
    """
    p = OptionParser(compare4.__doc__)
    p.add_option('--maxinsert',
                 default=300,
                 type="int",
                 help="Maximum number of repeats")
    add_simulate_options(p)
    opts, args, iopts = p.set_image_options(args, figsize="10x10")

    if len(args) != 0:
        sys.exit(not p.print_help())

    depth = opts.depth
    max_insert = opts.maxinsert
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(ncols=2,
                                                 nrows=2,
                                                 figsize=(iopts.w, iopts.h))
    plt.tight_layout(pad=3)

    # ax1: lobSTR vs TREDPARSE with haploid model
    lobstr_results = parse_results("lobstr_results_homo-20x-150bp-500bp.txt")
    tredparse_results = parse_results(
        "tredparse_results_homo-20x-150bp-500bp.txt")
    title = SIMULATED_HAPLOID + r" ($Depth=%s\times)" % depth
    plot_compare(ax1,
                 title,
                 tredparse_results,
                 lobstr_results,
                 max_insert=max_insert)

    # ax2: lobSTR vs TREDPARSE with diploid model (depth=20x)
    lobstr_results = parse_results("lobstr_results_het-20x-150bp-500bp.txt",
                                   exclude=20)
    tredparse_results = parse_results(
        "tredparse_results_het-20x-150bp-500bp.txt", exclude=20)
    title = SIMULATED_DIPLOID + r" ($Depth=%s\times$)" % depth
    plot_compare(ax2,
                 title,
                 tredparse_results,
                 lobstr_results,
                 max_insert=max_insert)

    # ax3: lobSTR vs TREDPARSE with diploid model (depth=5x)
    lobstr_results = parse_results("lobstr_results_het-5x-150bp-500bp.txt",
                                   exclude=20)
    tredparse_results = parse_results(
        "tredparse_results_het-5x-150bp-500bp.txt", exclude=20)
    title = SIMULATED_DIPLOID + r" ($Depth=%s\times$)" % 5
    plot_compare(ax3,
                 title,
                 tredparse_results,
                 lobstr_results,
                 max_insert=max_insert)

    # ax4: lobSTR vs TREDPARSE with diploid model (depth=80x)
    lobstr_results = parse_results("lobstr_results_het-80x-150bp-500bp.txt",
                                   exclude=20)
    tredparse_results = parse_results(
        "tredparse_results_het-80x-150bp-500bp.txt", exclude=20)
    title = SIMULATED_DIPLOID + r" ($Depth=%s\times$)" % 80
    plot_compare(ax4,
                 title,
                 tredparse_results,
                 lobstr_results,
                 max_insert=max_insert)

    for ax in (ax1, ax2, ax3, ax4):
        ax.set_xlim(0, max_insert)
        ax.set_ylim(0, max_insert)

    root = fig.add_axes([0, 0, 1, 1])
    pad = .03
    panel_labels(root, ((pad / 2, 1 - pad, "A"), (1 / 2., 1 - pad, "B"),
                        (pad / 2, 1 / 2., "C"), (1 / 2., 1 / 2., "D")))
    normalize_axes(root)

    image_name = "tredparse." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
Example 9
def compare3(args):
    """
    %prog compare3

    Compare performances of various variant callers on simulated STR datasets.
    This compares the power of various evidence types.
    """
    p = OptionParser(compare3.__doc__)
    p.add_option('--maxinsert',
                 default=300,
                 type="int",
                 help="Maximum number of repeats")
    add_simulate_options(p)
    opts, args, iopts = p.set_image_options(args, figsize="10x10")

    if len(args) != 0:
        sys.exit(not p.print_help())

    max_insert = opts.maxinsert
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(ncols=2,
                                                 nrows=2,
                                                 figsize=(iopts.w, iopts.h))
    plt.tight_layout(pad=3)

    color = "lightslategray"
    # ax1: Spanning
    tredparse_results = parse_results("tredparse_results_het-spanning.txt")
    title = SIMULATED_DIPLOID + "( Sub-model 1: Spanning reads)"
    plot_compare(ax1,
                 title,
                 tredparse_results,
                 None,
                 color=color,
                 max_insert=max_insert,
                 risk=False)

    # ax2: Partial
    tredparse_results = parse_results("tredparse_results_het-partial.txt",
                                      exclude=20)
    title = SIMULATED_DIPLOID + " (Sub-model 2: Partial reads)"
    plot_compare(ax2,
                 title,
                 tredparse_results,
                 None,
                 color=color,
                 max_insert=max_insert,
                 risk=False)

    # ax3: Repeat
    tredparse_results = parse_results("tredparse_results_het-repeat.txt",
                                      exclude=20)
    # HACK (repeat reads won't work under 50)
    tredparse_results = [x for x in tredparse_results if x[0] > 50]
    title = SIMULATED_DIPLOID + " (Sub-model 3: Repeat-only reads)"
    plot_compare(ax3,
                 title,
                 tredparse_results,
                 None,
                 color=color,
                 max_insert=max_insert,
                 risk=False)

    # ax4: Pair
    tredparse_results = parse_results("tredparse_results_het-pair.txt",
                                      exclude=20)
    title = SIMULATED_DIPLOID + " (Sub-model 4: Paired-end reads)"
    plot_compare(ax4,
                 title,
                 tredparse_results,
                 None,
                 color=color,
                 max_insert=max_insert,
                 risk=False)

    for ax in (ax1, ax2, ax3, ax4):
        ax.set_xlim(0, max_insert)
        ax.set_ylim(0, max_insert)

    root = fig.add_axes([0, 0, 1, 1])
    pad = .03
    panel_labels(root, ((pad / 2, 1 - pad, "A"), (1 / 2., 1 - pad, "B"),
                        (pad / 2, 1 / 2., "C"), (1 / 2., 1 / 2., "D")))
    normalize_axes(root)

    image_name = "tredparse." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
Example 10
def depth(args):
    """
    %prog depth anchorfile --qbed qbedfile --sbed sbedfile

    Calculate the depths in the two genomes in comparison, given in --qbed and
    --sbed. The synteny blocks will be layered on the genomes, and the
    multiplicity will be summarized to stderr.
    """
    from jcvi.utils.range import range_depth

    p = OptionParser(depth.__doc__)
    p.add_option("--depthfile",
                 help="Generate file with gene and depth [default: %default]")
    p.add_option("--histogram", default=False, action="store_true",
                 help="Plot histograms in PDF")
    p.add_option("--xmax", type="int", help="x-axis maximum to display in plot")
    p.add_option("--title", default=None, help="Title to display in plot")
    p.add_option("--quota", help="Force to use this quota, e.g. 1:1, 1:2 ...")
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    depthfile = opts.depthfile
    ac = AnchorFile(anchorfile)
    qranges = []
    sranges = []
    blocks = ac.blocks
    for ib in blocks:
        q, s, t = zip(*ib)
        q = [qorder[x] for x in q]
        s = [sorder[x] for x in s]
        qrange = (min(q)[0], max(q)[0])
        srange = (min(s)[0], max(s)[0])
        qranges.append(qrange)
        sranges.append(srange)
        if is_self:
            qranges.append(srange)

    qgenome = op.basename(qbed.filename).split(".")[0]
    sgenome = op.basename(sbed.filename).split(".")[0]
    qtag = "Genome {0} depths".format(qgenome)
    print("{}:".format(qtag), file=sys.stderr)
    dsq, details = range_depth(qranges, len(qbed))
    if depthfile:
        fw = open(depthfile, "w")
        write_details(fw, details, qbed)

    if is_self:
        return

    stag = "Genome {0} depths".format(sgenome)
    print("{}:".format(stag), file=sys.stderr)
    dss, details = range_depth(sranges, len(sbed))
    if depthfile:
        write_details(fw, details, sbed)
        fw.close()
        logging.debug("Depth written to `{0}`.".format(depthfile))

    if not opts.histogram:
        return

    from jcvi.graphics.base import plt, quickplot_ax, savefig, normalize_axes

    # Plot two histograms one for query genome, one for subject genome
    plt.figure(1, (6, 3))
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)

    xmax = opts.xmax or max(4, max(list(dsq) + list(dss)))
    if opts.quota:
        speak, qpeak = opts.quota.split(":")
        qpeak, speak = int(qpeak), int(speak)
    else:
        qpeak = find_peak(dsq)
        speak = find_peak(dss)

    qtag = "# of {} blocks per {} gene".format(sgenome, qgenome)
    stag = "# of {} blocks per {} gene".format(qgenome, sgenome)
    quickplot_ax(ax1, dss, 0, xmax, stag, ylabel="Percentage of genome",
                 highlight=range(1, speak + 1))
    quickplot_ax(ax2, dsq, 0, xmax, qtag, ylabel=None,
                 highlight=range(1, qpeak + 1))

    title = opts.title or "{} vs {} syntenic depths\n{}:{} pattern"\
                    .format(qgenome, sgenome, speak, qpeak)
    root = f.add_axes([0, 0, 1, 1])
    vs, pattern = title.split('\n')
    root.text(.5, .97, vs, ha="center", va="center", color="darkslategray")
    root.text(.5, .925, pattern, ha="center", va="center",
                                 color="tomato", size=16)
    print(title, file=sys.stderr)

    normalize_axes(root)

    pf = anchorfile.rsplit(".", 1)[0] + ".depth"
    image_name = pf + ".pdf"
    savefig(image_name)
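range_depth() from jcvi.utils.range is not reproduced here; as used above it reports, for each gene index, how many synteny blocks cover it and summarizes the distribution to stderr. A small stand-in under that reading (the interval representation and output format are assumptions):

from collections import Counter

def range_depth_sketch(ranges, total):
    # ranges: (start_idx, end_idx) gene-index intervals, inclusive
    cov = [0] * total
    for a, b in ranges:
        for i in range(a, b + 1):
            cov[i] += 1
    depths = Counter(cov)  # depth -> number of genes
    for d in sorted(depths):
        print("Depth {}: {} genes ({:.1f}%)".format(
            d, depths[d], 100.0 * depths[d] / total))
    return depths

# toy example: three blocks layered on a genome of ten genes
range_depth_sketch([(0, 4), (2, 7), (6, 9)], 10)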
Example 11
def depth(args):
    """
    %prog depth anchorfile --qbed qbedfile --sbed sbedfile

    Calculate the depths in the two genomes in comparison, given in --qbed and
    --sbed. The synteny blocks will be layered on the genomes, and the
    multiplicity will be summarized to stderr.
    """
    from jcvi.utils.range import range_depth

    p = OptionParser(depth.__doc__)
    p.add_option("--depthfile",
                 help="Generate file with gene and depth [default: %default]")
    p.add_option("--histogram", default=False, action="store_true",
                 help="Plot histograms in PDF")
    p.add_option("--xmax", type="int", help="x-axis maximum to display in plot")
    p.add_option("--title", default=None, help="Title to display in plot")
    p.add_option("--quota", help="Force to use this quota, e.g. 1:1, 1:2 ...")
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    depthfile = opts.depthfile
    ac = AnchorFile(anchorfile)
    qranges = []
    sranges = []
    blocks = ac.blocks
    for ib in blocks:
        q, s, t = zip(*ib)
        q = [qorder[x] for x in q]
        s = [sorder[x] for x in s]
        qrange = (min(q)[0], max(q)[0])
        srange = (min(s)[0], max(s)[0])
        qranges.append(qrange)
        sranges.append(srange)
        if is_self:
            qranges.append(srange)

    qgenome = op.basename(qbed.filename).split(".")[0]
    sgenome = op.basename(sbed.filename).split(".")[0]
    qtag = "Genome {0} depths".format(qgenome)
    print("{}:".format(qtag), file=sys.stderr)
    dsq, details = range_depth(qranges, len(qbed))
    if depthfile:
        fw = open(depthfile, "w")
        write_details(fw, details, qbed)

    if is_self:
        return

    stag = "Genome {0} depths".format(sgenome)
    print("{}:".format(stag), file=sys.stderr)
    dss, details = range_depth(sranges, len(sbed))
    if depthfile:
        write_details(fw, details, sbed)
        fw.close()
        logging.debug("Depth written to `{0}`.".format(depthfile))

    if not opts.histogram:
        return

    from jcvi.graphics.base import plt, quickplot_ax, savefig, normalize_axes

    # Plot two histograms one for query genome, one for subject genome
    plt.figure(1, (6, 3))
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)

    xmax = opts.xmax or max(4, max(list(dsq) + list(dss)))
    if opts.quota:
        speak, qpeak = opts.quota.split(":")
        qpeak, speak = int(qpeak), int(speak)
    else:
        qpeak = find_peak(dsq)
        speak = find_peak(dss)

    qtag = "# of {} blocks per {} gene".format(sgenome, qgenome)
    stag = "# of {} blocks per {} gene".format(qgenome, sgenome)
    quickplot_ax(ax1, dss, 0, xmax, stag, ylabel="Percentage of genome",
                 highlight=range(1, speak + 1))
    quickplot_ax(ax2, dsq, 0, xmax, qtag, ylabel=None,
                 highlight=range(1, qpeak + 1))

    title = opts.title or "{} vs {} syntenic depths\n{}:{} pattern"\
                    .format(qgenome, sgenome, speak, qpeak)
    root = f.add_axes([0, 0, 1, 1])
    vs, pattern = title.split('\n')
    root.text(.5, .97, vs, ha="center", va="center", color="darkslategray")
    root.text(.5, .925, pattern, ha="center", va="center",
                                 color="tomato", size=16)
    print(title, file=sys.stderr)

    normalize_axes(root)

    pf = anchorfile.rsplit(".", 1)[0] + ".depth"
    image_name = pf + ".pdf"
    savefig(image_name)
Example 12
def likelihood(args):
    """
    %prog likelihood

    Plot likelihood surface. Look for two files in the current folder:
    - 100_100.log, haploid model
    - 100_20.log, diploid model
    """
    p = OptionParser(likelihood.__doc__)
    opts, args, iopts = p.set_image_options(args,
                                            figsize="10x5",
                                            style="white",
                                            cmap="coolwarm")

    if len(args) != 0:
        sys.exit(not p.print_help())

    fig, (ax1, ax2) = plt.subplots(ncols=2,
                                   nrows=1,
                                   figsize=(iopts.w, iopts.h))
    plt.tight_layout(pad=4)

    # Haploid model
    LL, CI_h1, CI_h2, MLE = parse_log("100_100.log")
    data = []
    for k, v in LL.items():
        data.append((k[0], v))
    data.sort()
    x, y = zip(*data)
    x = np.array(x)
    curve, = ax1.plot(x, y, "-", color=lsg, lw=2)
    ax1.set_title("Simulated haploid ($h^{truth}=100$)")

    h_hat, max_LL = max(data, key=lambda x: x[-1])
    _, min_LL = min(data, key=lambda x: x[-1])
    ymin, ymax = ax1.get_ylim()
    ax1.set_ylim([ymin, ymax + 30])

    LL_label = "log(Likelihood)"
    ax1.plot([h_hat, h_hat], [ymin, max_LL], ":", color=lsg, lw=2)
    ax1.text(h_hat, max_LL + 10, r"$\hat{h}=%d$" % h_hat, color=lsg)
    ax1.set_xlabel(r"$h$")
    ax1.set_ylabel(LL_label)

    a, b = CI_h1
    ci = ax1.fill_between(x, [ymin] * len(x),
                          y,
                          where=(x >= a) & (x <= b),
                          color=lsg,
                          alpha=.5)
    ax1.legend([curve, ci], ["Likelihood curve", r'95$\%$ CI'], loc='best')

    # Diploid model
    LL, CI_h1, CI_h2, MLE = parse_log("100_20.log")
    h_hat, max_LL = max(data, key=lambda x: x[-1])
    _, min_LL = min(data, key=lambda x: x[-1])
    data = np.ones((301, 301)) * min_LL
    for k, v in LL.items():
        a, b = k
        data[a, b] = v
        data[b, a] = v

    data = mask_upper_triangle(data)
    ax_imshow(ax2, data, opts.cmap, LL_label, 20, 104)

    root = fig.add_axes([0, 0, 1, 1])
    pad = .04
    panel_labels(root, ((pad / 2, 1 - pad, "A"), (1 / 2., 1 - pad, "B")))
    normalize_axes(root)

    image_name = "likelihood." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
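mask_upper_triangle() and ax_imshow() are local helpers not shown in this listing. Since the diploid surface is symmetric in (h1, h2), a plausible minimal equivalent masks the redundant upper triangle with a NumPy masked array so that imshow() leaves it blank (the toy surface below is made up):

import numpy as np
import matplotlib.pyplot as plt

def mask_upper_triangle_sketch(data):
    mask = np.zeros_like(data, dtype=bool)
    mask[np.triu_indices_from(mask, k=1)] = True  # hide the strict upper triangle
    return np.ma.array(data, mask=mask)

# toy symmetric log-likelihood surface peaking near (100, 20)
n = 301
a = np.arange(n, dtype=float)
data = -((a[:, None] - 100) ** 2 + (a[None, :] - 20) ** 2) / 1000.0
data = (data + data.T) / 2

fig, ax = plt.subplots(figsize=(5, 5))
im = ax.imshow(mask_upper_triangle_sketch(data), cmap="coolwarm", origin="lower")
fig.colorbar(im, ax=ax, label="log(Likelihood)")
ax.set_xlabel(r"$h_1$")
ax.set_ylabel(r"$h_2$")
fig.savefig("likelihood_sketch.png", dpi=150)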