Beispiel #1
0
def main(args=None):
    """Plot one cumulative read-count curve per sample; optionally dump counts.

    The arguments are resolved through ``process_args``. Reads are counted per
    bin with ``countR.CountReadsPerBin``; for every column of the resulting
    matrix the sorted, cumulated and normalized counts are plotted against the
    normalized rank and the figure is saved to ``args.plotFile.name``. When
    ``args.outRawCounts`` is set, a quoted label header followed by one
    tab-separated row of integer counts per bin is written to it.

    The process exits with status 1 when no reads were counted at all.

    :param args: forwarded unchanged to ``process_args``.
    """
    # Function-scope import: ``sys`` is needed for both the message and the
    # exit call, and the module-level imports are not guaranteed to have it.
    import sys

    args = process_args(args)

    cr = countR.CountReadsPerBin(
        args.bamfiles,
        args.binSize,
        args.numberOfSamples,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose,
        region=args.region,
        extendReads=args.extendReads,
        minMappingQuality=args.minMappingQuality,
        ignoreDuplicates=args.ignoreDuplicates,
        center_read=args.centerReads,
        samFlag_include=args.samFlagInclude,
        samFlag_exclude=args.samFlagExclude)

    num_reads_per_bin = cr.run()
    if num_reads_per_bin.sum() == 0:
        sys.stderr.write(
            "\nNo reads were found in {} regions sampled. Check that the\n"
            "min mapping quality is not overly high and that the \n"
            "chromosome names between bam files are consistant.\n"
            "\n".format(num_reads_per_bin.shape[0]))
        # sys.exit instead of the interactive ``exit`` helper, which only
        # exists when the ``site`` module has been loaded.
        sys.exit(1)

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    total = num_reads_per_bin.shape[0]
    x = np.arange(total).astype('float') / total  # normalize from 0 to 1

    for idx, reads in enumerate(num_reads_per_bin.T):
        count = np.cumsum(np.sort(reads))
        # explicit float divisor so integer counts are never floor-divided
        count = count / float(count[-1])  # normalize y from 0 to 1
        plt.plot(x, count, label=args.labels[idx])
    # axis labels are per-figure, so set them once rather than per sample
    plt.xlabel('rank')
    plt.ylabel('fraction w.r.t. bin with highest coverage')
    plt.legend(loc='upper left')
    plt.suptitle(args.plotTitle)

    # a falsy format is passed as None so that savefig derives the format
    # from the file extension
    plot_format = args.plotFileFormat or None
    plt.savefig(args.plotFile.name, bbox_inches=0, format=plot_format)
    plt.close()

    if args.outRawCounts:
        args.outRawCounts.write("'" + "'\t'".join(args.labels) + "'\n")
        # one '%d' placeholder per sample column
        fmt = "\t".join(['%d'] * num_reads_per_bin.shape[1]) + "\n"
        for row in num_reads_per_bin:
            args.outRawCounts.write(fmt % tuple(row))
def main(args=None):
    """Plot one cumulative read-count curve per sample; optionally dump counts.

    The arguments are resolved through ``process_args``. Reads are counted per
    bin with ``countR.CountReadsPerBin``; each column of the resulting matrix
    is sorted, cumulated, normalized and plotted against the normalized rank.
    The figure is saved to ``args.plotFile.name`` and then closed. When
    ``args.outRawCounts`` is set, a quoted label header followed by one
    tab-separated row of integer counts per bin is written to it.

    The process exits with status 1 when no reads were counted at all.

    :param args: forwarded unchanged to ``process_args``.
    """
    # Function-scope import: ``sys`` is needed for both the message and the
    # exit call, and the module-level imports are not guaranteed to have it.
    import sys

    args = process_args(args)

    cr = countR.CountReadsPerBin(args.bamfiles,
                                 args.binSize,
                                 args.numberOfSamples,
                                 numberOfProcessors=args.numberOfProcessors,
                                 verbose=args.verbose,
                                 region=args.region,
                                 extendReads=args.extendReads,
                                 minMappingQuality=args.minMappingQuality,
                                 ignoreDuplicates=args.ignoreDuplicates,
                                 center_read=args.centerReads,
                                 samFlag_include=args.samFlagInclude,
                                 samFlag_exclude=args.samFlagExclude)

    num_reads_per_bin = cr.run()
    if num_reads_per_bin.sum() == 0:
        sys.stderr.write(
            "\nNo reads were found in {} regions sampled. Check that the\n"
            "min mapping quality is not overly high and that the \n"
            "chromosome names between bam files are consistant.\n"
            "\n".format(num_reads_per_bin.shape[0]))
        # sys.exit instead of the interactive ``exit`` helper, which only
        # exists when the ``site`` module has been loaded.
        sys.exit(1)

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    n_bins = num_reads_per_bin.shape[0]
    x = np.arange(n_bins).astype('float') / n_bins  # normalize from 0 to 1

    for idx, reads in enumerate(num_reads_per_bin.T):
        cumulative = np.cumsum(np.sort(reads))
        # explicit float divisor so integer counts are never floor-divided
        cumulative = cumulative / float(cumulative[-1])  # normalize y to 0..1
        plt.plot(x, cumulative, label=args.labels[idx])
    # axis labels are per-figure, so set them once rather than per sample
    plt.xlabel('rank')
    plt.ylabel('fraction w.r.t. bin with highest coverage')
    plt.legend(loc='upper left')
    plt.suptitle(args.plotTitle)

    # a falsy format is passed as None so that savefig derives the format
    # from the file extension
    plot_format = args.plotFileFormat or None
    plt.savefig(args.plotFile.name, bbox_inches=0, format=plot_format)
    # release the figure; otherwise it stays registered with pyplot
    plt.close()

    if args.outRawCounts:
        args.outRawCounts.write("'" + "'\t'".join(args.labels) + "'\n")
        # one '%d' placeholder per sample column
        fmt = "\t".join(['%d'] * num_reads_per_bin.shape[1]) + "\n"
        for row in num_reads_per_bin:
            args.outRawCounts.write(fmt % tuple(row))
Beispiel #3
0
def main(args=None):
    """Compute per-sample coverage statistics and optionally plot/export them.

    Steps, in order:

    1. Resolve arguments via ``process_args`` and require at least one of
       ``plotFile``, ``outRawCounts`` or ``outCoverageMetrics``.
    2. Count reads with ``countR.CountReadsPerBin`` using ``binLength=1``.
    3. If requested, write the percentage of bins reaching each coverage
       threshold to ``args.outCoverageMetrics``.
    4. If requested, prepend a header line to the raw-counts file that
       ``CountReadsPerBin`` was asked to write (``out_file_for_raw_data``).
    5. Print mean/std/min/quartiles/max per sample to stdout.
    6. If ``args.plotFile`` is set, draw the coverage histogram and the
       "fraction of bases sampled >= coverage" curve, either with plotly
       (``args.plotFileFormat == 'plotly'``) or with matplotlib.

    Exits with an error message when fewer than two bins were returned.

    :param args: forwarded unchanged to ``process_args``.
    """
    args = process_args(args)

    if not args.outRawCounts and not args.plotFile and not args.outCoverageMetrics:
        sys.exit(
            "At least one of --plotFile, --outRawCounts and --outCoverageMetrics are required.\n"
        )

    bed_regions = args.BED if 'BED' in args else None

    cr = countR.CountReadsPerBin(args.bamfiles,
                                 binLength=1,
                                 bedFile=bed_regions,
                                 numberOfSamples=args.numberOfSamples,
                                 numberOfProcessors=args.numberOfProcessors,
                                 verbose=args.verbose,
                                 region=args.region,
                                 blackListFileName=args.blackListFileName,
                                 extendReads=args.extendReads,
                                 minMappingQuality=args.minMappingQuality,
                                 ignoreDuplicates=args.ignoreDuplicates,
                                 center_read=args.centerReads,
                                 samFlag_include=args.samFlagInclude,
                                 samFlag_exclude=args.samFlagExclude,
                                 minFragmentLength=args.minFragmentLength,
                                 maxFragmentLength=args.maxFragmentLength,
                                 bed_and_bin=True,
                                 out_file_for_raw_data=args.outRawCounts)

    num_reads_per_bin = cr.run()

    if args.outCoverageMetrics and args.coverageThresholds:
        # Galaxy in particular tends to give things in a weird order
        args.coverageThresholds.sort()
        nbins = float(num_reads_per_bin.shape[0])
        # context manager so the file is closed even if a write fails
        with open(args.outCoverageMetrics, "w") as of:
            of.write("Sample\tThreshold\tPercent\n")
            for thresh in args.coverageThresholds:
                vals = np.sum(num_reads_per_bin >= thresh, axis=0)
                for lab, val in zip(args.labels, vals):
                    of.write("{}\t{}\t{:6.3f}\n".format(lab, thresh,
                                                        100. * val / nbins))

    if args.outRawCounts:
        # prepend the header (tool name and column labels) to the file that
        # was generated while counting
        header = "#plotCoverage --outRawCounts\n#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        with open(args.outRawCounts, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header + content)

    if num_reads_per_bin.shape[0] < 2:
        # sys.exit (already used above) rather than the site-provided exit()
        sys.exit("ERROR: too few non-zero bins found.\n"
                 "If using --region please check that this "
                 "region is covered by reads.\n")

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    if args.plotFile:
        if args.plotFileFormat == 'plotly':
            fig = go.Figure()
            fig['layout']['xaxis1'] = {
                'domain': [0.0, 0.48],
                'anchor': 'x1',
                'title': 'coverage (#reads per base)'
            }
            fig['layout']['xaxis2'] = {
                'domain': [0.52, 1.0],
                'anchor': 'x2',
                'title': 'coverage (#reads per base)'
            }
            fig['layout']['yaxis1'] = {
                'domain': [0.0, 1.0],
                'anchor': 'x1',
                'title': 'fraction of bases sampled'
            }
            fig['layout']['yaxis2'] = {
                'domain': [0.0, 1.0],
                'anchor': 'x2',
                'title': 'fraction of bases sampled >= coverage'
            }
            fig['layout'].update(title=args.plotTitle)
        else:
            fig, axs = plt.subplots(1,
                                    2,
                                    figsize=(args.plotWidth, args.plotHeight))
            plt.suptitle(args.plotTitle)

    # per-sample summary statistics (one value per column)
    num_reads_per_bin = num_reads_per_bin.astype(int)
    sample_mean = num_reads_per_bin.mean(axis=0)
    sample_std = num_reads_per_bin.std(axis=0)
    sample_max = num_reads_per_bin.max(axis=0)
    sample_min = num_reads_per_bin.min(axis=0)
    sample_25 = np.percentile(num_reads_per_bin, 25, axis=0)
    sample_50 = np.percentile(num_reads_per_bin, 50, axis=0)
    sample_75 = np.percentile(num_reads_per_bin, 75, axis=0)

    # use the largest 99th percentile from all samples to set the x_max value
    x_max = np.max(np.percentile(num_reads_per_bin, 99, axis=0))
    # print headers for text output
    print("sample\tmean\tstd\tmin\t25%\t50%\t75%\tmax")
    # Choosing a sensible y_max for the first plot (fraction of bases sampled
    # vs. coverage) matters because, depending on the data, the lines can
    # become very hard to see. For example, if the coverage of a sample is a
    # nice gaussian curve with a large mean of 50, a sensible y range is
    # (0, 0.02). If instead the coverage is very poor and centers close to 1,
    # a good y range is (0, 1).
    #
    # The current implementation takes, per sample, the largest coverage x for
    # which more than 50% of the bases have coverage >= x and uses the
    # fraction of bases at that coverage as the candidate y-axis limit.
    y_max = []
    data = []
    # We need to manually set the line colors so they're shared between the two plots.
    plotly_colors = [
        "#d73027", "#fc8d59", "#f33090", "#e0f3f8", "#91bfdb", "#4575b4"
    ]
    plotly_styles = sum([
        6 * ["solid"], 6 * ["dot"], 6 * ["dash"], 6 * ["longdash"],
        6 * ["dashdot"], 6 * ["longdashdot"]
    ], [])
    for idx, col in enumerate(num_reads_per_bin.T):
        if args.plotFile:
            # histogram of coverage values, computed once and reused
            cov_hist = np.bincount(col.astype(int))
            frac_reads_per_coverage = cov_hist.astype(float) / num_reads_per_bin.shape[0]
            csum = cov_hist[::-1].cumsum()
            csum_frac = csum.astype(float)[::-1] / csum.max()
            if args.plotFileFormat == 'plotly':
                color = plotly_colors[idx % len(plotly_colors)]
                dash = plotly_styles[idx % len(plotly_styles)]
                y_cov = frac_reads_per_coverage[:int(x_max)]
                y_cum = csum_frac[:int(x_max)]
                # x must have exactly as many points as y; the previous
                # arange(0, int(x_max) - 1) was one element short
                trace = go.Scatter(x=np.arange(len(y_cov)),
                                   y=y_cov,
                                   mode='lines',
                                   xaxis='x1',
                                   yaxis='y1',
                                   line=dict(color=color, dash=dash),
                                   name="{}, mean={:.1f}".format(
                                       args.labels[idx], sample_mean[idx]),
                                   legendgroup="{}".format(idx))
                data.append(trace)
                trace = go.Scatter(x=np.arange(len(y_cum)),
                                   y=y_cum,
                                   mode='lines',
                                   xaxis='x2',
                                   yaxis='y2',
                                   line=dict(color=color, dash=dash),
                                   name=args.labels[idx],
                                   showlegend=False,
                                   legendgroup="{}".format(idx))
                data.append(trace)
            else:
                axs[0].plot(frac_reads_per_coverage,
                            label="{}, mean={:.1f}".format(
                                args.labels[idx], sample_mean[idx]))
                axs[1].plot(csum_frac, label=args.labels[idx])
            # indexes (i.e. coverage values) at which more than 50% of the
            # bases are still covered; take the histogram value at the
            # largest such coverage
            y_max.append(frac_reads_per_coverage[max(
                np.flatnonzero(csum_frac > 0.5))])
        print("{}\t{:0.2f}\t{:0.2f}\t{}\t{}\t{}\t{}\t{}\t".format(
            args.labels[idx],
            sample_mean[idx],
            sample_std[idx],
            sample_min[idx],
            sample_25[idx],
            sample_50[idx],
            sample_75[idx],
            sample_max[idx],
        ))

    if args.plotFile:
        # Don't clip plots: use the largest per-sample candidate
        y_max = max(y_max)
        if args.plotFileFormat == "plotly":
            fig['data'] = data
            fig['layout']['yaxis1'].update(
                range=[0.0, min(1, y_max + (y_max * 0.10))])
            fig['layout']['yaxis2'].update(range=[0.0, 1.0])
            py.plot(fig, filename=args.plotFile, auto_open=False)
        else:
            axs[0].set_ylim(0, min(1, y_max + (y_max * 0.10)))
            axs[0].set_xlim(0, x_max)
            axs[0].set_xlabel('coverage (#reads per bp)')
            axs[0].legend(fancybox=True, framealpha=0.5)
            axs[0].set_ylabel('fraction of bases sampled')
            # plot cumulative coverage
            axs[1].set_xlim(0, x_max)
            axs[1].set_xlabel('coverage (#reads per bp)')
            axs[1].set_ylabel('fraction of bases sampled >= coverage')
            axs[1].legend(fancybox=True, framealpha=0.5)
            plt.savefig(args.plotFile, format=args.plotFileFormat)
            plt.close()
Beispiel #4
0
def main(args=None):
    """Plot fingerprint curves and optionally write raw counts / quality metrics.

    Steps, in order:

    1. Resolve arguments via ``process_args`` and require at least one of
       ``plotFile``, ``outRawCounts`` or ``outQualityMetrics``.
    2. Compute the per-bin matrix with ``sumR.SumCoveragePerBin``; exit with
       status 1 if it sums to zero.
    3. If ``args.plotFile`` is set, plot the normalized cumulative sum of the
       sorted bin values for each sample (plotly or matplotlib).
    4. If ``args.outRawCounts`` is set, write a header plus one tab-separated
       integer row per bin.
    5. If ``args.outQualityMetrics`` is set, write AUC, X-intercept and elbow
       point per sample next to the values returned by ``getExpected``, plus
       the JSD/CHANCE columns produced by ``getJSD``, ``getSyntheticJSD`` and
       ``getCHANCE``.

    :param args: forwarded unchanged to ``process_args``.
    """
    # Must be imported before the first use of ``sys``. A function-level
    # ``import sys`` placed further down makes ``sys`` a local name for the
    # whole function, so the early ``sys.stderr.write`` raised
    # UnboundLocalError.
    import sys

    args = process_args(args)

    if not args.plotFile and not args.outRawCounts and not args.outQualityMetrics:
        sys.stderr.write("\nAt least one of --plotFile, --outRawCounts or --outQualityMetrics is required.\n")
        sys.exit(1)

    cr = sumR.SumCoveragePerBin(
        args.bamfiles,
        args.binSize,
        args.numberOfSamples,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose,
        region=args.region,
        extendReads=args.extendReads,
        minMappingQuality=args.minMappingQuality,
        ignoreDuplicates=args.ignoreDuplicates,
        center_read=args.centerReads,
        samFlag_include=args.samFlagInclude,
        samFlag_exclude=args.samFlagExclude,
        minFragmentLength=args.minFragmentLength,
        maxFragmentLength=args.maxFragmentLength)

    num_reads_per_bin = cr.run()
    if num_reads_per_bin.sum() == 0:
        sys.stderr.write(
            "\nNo reads were found in {} regions sampled. Check that the\n"
            "min mapping quality is not overly high and that the \n"
            "chromosome names between bam files are consistant.\n"
            "\n".format(num_reads_per_bin.shape[0]))
        sys.exit(1)

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    total = num_reads_per_bin.shape[0]
    x = np.arange(total).astype('float') / total  # normalize from 0 to 1

    if args.plotFile is not None:
        # matplotlib won't iterate through line styles by itself
        pyplot_line_styles = sum([7 * ["-"], 7 * ["--"], 7 * ["-."], 7 * [":"]], [])
        plotly_colors = ["#d73027", "#fc8d59", "#f33090", "#e0f3f8", "#91bfdb", "#4575b4"]
        plotly_line_styles = sum([6 * ["solid"], 6 * ["dot"], 6 * ["dash"], 6 * ["longdash"], 6 * ["dashdot"], 6 * ["longdashdot"]], [])
        data = []
        for i, reads in enumerate(num_reads_per_bin.T):
            count = np.cumsum(np.sort(reads))
            # explicit float divisor, matching the metrics section below
            count = count / float(count[-1])  # normalize y from 0 to 1
            if args.plotFileFormat == 'plotly':
                trace = go.Scatter(x=x, y=count, mode='lines', name=args.labels[i])
                trace['line'].update(dash=plotly_line_styles[i % len(plotly_line_styles)],
                                     color=plotly_colors[i % len(plotly_colors)])
                data.append(trace)
            else:
                # wrap on the real list length (28); the previous ``i % 35``
                # indexed past the end for samples 28..34
                style = pyplot_line_styles[i % len(pyplot_line_styles)]
                plt.plot(x, count, label=args.labels[i], linestyle=style)
                plt.xlabel('rank')
                plt.ylabel('fraction w.r.t. bin with highest coverage')
        # set the plotFileFormat explicitly to None to trigger the
        # format from the file-extension
        if not args.plotFileFormat:
            args.plotFileFormat = None

        if args.plotFileFormat == 'plotly':
            fig = go.Figure()
            fig['data'] = data
            fig['layout'].update(title=args.plotTitle)
            fig['layout']['xaxis1'].update(title="rank")
            fig['layout']['yaxis1'].update(title="fraction w.r.t bin with highest coverage")
            py.plot(fig, filename=args.plotFile, auto_open=False)
        else:
            plt.legend(loc='upper left')
            plt.suptitle(args.plotTitle)
            plt.savefig(args.plotFile, bbox_inches=0, format=args.plotFileFormat)
            plt.close()

    if args.outRawCounts is not None:
        # context manager so the file is closed even if a write fails
        with open(args.outRawCounts, "w") as of:
            of.write("#plotFingerprint --outRawCounts\n")
            of.write("'" + "'\t'".join(args.labels) + "'\n")
            fmt = "\t".join(['%d'] * num_reads_per_bin.shape[1]) + "\n"
            for row in num_reads_per_bin:
                of.write(fmt % tuple(row))

    if args.outQualityMetrics is not None:
        with open(args.outQualityMetrics, "w") as of:
            of.write("Sample\tAUC\tSynthetic AUC\tX-intercept\tSynthetic X-intercept\tElbow Point\tSynthetic Elbow Point")
            if args.JSDsample:
                of.write("\tJS Distance\tSynthetic JS Distance\t% genome enriched\tdiff. enrichment\tCHANCE divergence")
            else:
                of.write("\tSynthetic JS Distance")
            of.write("\n")
            # diagonal reference line from 0 to 1 over the bins
            line = np.arange(num_reads_per_bin.shape[0]) / float(num_reads_per_bin.shape[0] - 1)
            for idx, reads in enumerate(num_reads_per_bin.T):
                counts = np.cumsum(np.sort(reads))
                counts = counts / float(counts[-1])
                AUC = np.sum(counts) / float(len(counts))
                # first rank at which the cumulative curve leaves zero
                XInt = (np.argmax(counts > 0) + 1) / float(counts.shape[0])
                # rank at which the curve is furthest below the diagonal
                elbow = (np.argmax(line - counts) + 1) / float(counts.shape[0])
                expected = getExpected(np.mean(reads))  # A tuple of expected (AUC, XInt, elbow)
                of.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}".format(args.labels[idx], AUC, expected[0], XInt, expected[1], elbow, expected[2]))
                if args.JSDsample:
                    JSD = getJSD(args, idx, num_reads_per_bin)
                    syntheticJSD = getSyntheticJSD(num_reads_per_bin[:, idx])
                    CHANCE = getCHANCE(args, idx, num_reads_per_bin)
                    of.write("\t{0}\t{1}\t{2}\t{3}\t{4}".format(JSD, syntheticJSD, CHANCE[0], CHANCE[1], CHANCE[2]))
                else:
                    syntheticJSD = getSyntheticJSD(num_reads_per_bin[:, idx])
                    of.write("\t{0}".format(syntheticJSD))
                of.write("\n")
Beispiel #5
0
def main(args=None):
    """Compute per-sample coverage statistics and optionally plot/export them.

    After resolving the arguments with ``process_args`` (at least one of
    ``plotFile``, ``outRawCounts`` or ``outCoverageMetrics`` is required),
    reads are counted with ``countR.CountReadsPerBin`` using ``binLength=1``.
    Depending on the arguments the function then

    * writes the percentage of bins reaching each coverage threshold to
      ``args.outCoverageMetrics``;
    * prepends a header to the raw-counts file that ``CountReadsPerBin`` was
      asked to write (``out_file_for_raw_data``);
    * prints mean/std/min/quartiles/max per sample to stdout (always);
    * draws the coverage histogram and the "fraction of bases sampled >=
      coverage" curve with plotly (``plotFileFormat == 'plotly'``) or
      matplotlib.

    Exits with an error message when fewer than two bins were returned.

    :param args: forwarded unchanged to ``process_args``.
    """
    args = process_args(args)

    if not args.outRawCounts and not args.plotFile and not args.outCoverageMetrics:
        sys.exit("At least one of --plotFile, --outRawCounts and --outCoverageMetrics are required.\n")

    bed_regions = args.BED if 'BED' in args else None

    cr = countR.CountReadsPerBin(args.bamfiles,
                                 binLength=1,
                                 bedFile=bed_regions,
                                 numberOfSamples=args.numberOfSamples,
                                 numberOfProcessors=args.numberOfProcessors,
                                 verbose=args.verbose,
                                 region=args.region,
                                 blackListFileName=args.blackListFileName,
                                 extendReads=args.extendReads,
                                 minMappingQuality=args.minMappingQuality,
                                 ignoreDuplicates=args.ignoreDuplicates,
                                 center_read=args.centerReads,
                                 samFlag_include=args.samFlagInclude,
                                 samFlag_exclude=args.samFlagExclude,
                                 minFragmentLength=args.minFragmentLength,
                                 maxFragmentLength=args.maxFragmentLength,
                                 bed_and_bin=True,
                                 out_file_for_raw_data=args.outRawCounts)

    num_reads_per_bin = cr.run()

    if args.outCoverageMetrics and args.coverageThresholds:
        args.coverageThresholds.sort()  # Galaxy in particular tends to give things in a weird order
        nbins = float(num_reads_per_bin.shape[0])
        # ``with`` guarantees the handle is closed even if a write fails
        with open(args.outCoverageMetrics, "w") as of:
            of.write("Sample\tThreshold\tPercent\n")
            for thresh in args.coverageThresholds:
                vals = np.sum(num_reads_per_bin >= thresh, axis=0)
                for lab, val in zip(args.labels, vals):
                    of.write("{}\t{}\t{:6.3f}\n".format(lab, thresh, 100. * val / nbins))

    if args.outRawCounts:
        # prepend the tool name and column labels to the generated file
        header = "#plotCoverage --outRawCounts\n#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        with open(args.outRawCounts, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header + content)

    if num_reads_per_bin.shape[0] < 2:
        # sys.exit (already used above) rather than the site-provided exit()
        sys.exit("ERROR: too few non-zero bins found.\n"
                 "If using --region please check that this "
                 "region is covered by reads.\n")

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    use_plotly = args.plotFileFormat == 'plotly'

    if args.plotFile:
        if use_plotly:
            fig = go.Figure()
            fig['layout']['xaxis1'] = {'domain': [0.0, 0.48], 'anchor': 'x1', 'title': 'coverage (#reads per base)'}
            fig['layout']['xaxis2'] = {'domain': [0.52, 1.0], 'anchor': 'x2', 'title': 'coverage (#reads per base)'}
            fig['layout']['yaxis1'] = {'domain': [0.0, 1.0], 'anchor': 'x1', 'title': 'fraction of bases sampled'}
            fig['layout']['yaxis2'] = {'domain': [0.0, 1.0], 'anchor': 'x2', 'title': 'fraction of bases sampled >= coverage'}
            fig['layout'].update(title=args.plotTitle)
        else:
            fig, axs = plt.subplots(1, 2, figsize=(args.plotWidth, args.plotHeight))
            plt.suptitle(args.plotTitle)

    # per-sample summary statistics (one value per column)
    num_reads_per_bin = num_reads_per_bin.astype(int)
    sample_mean = num_reads_per_bin.mean(axis=0)
    sample_std = num_reads_per_bin.std(axis=0)
    sample_max = num_reads_per_bin.max(axis=0)
    sample_min = num_reads_per_bin.min(axis=0)
    sample_25 = np.percentile(num_reads_per_bin, 25, axis=0)
    sample_50 = np.percentile(num_reads_per_bin, 50, axis=0)
    sample_75 = np.percentile(num_reads_per_bin, 75, axis=0)

    # use the largest 99th percentile from all samples to set the x_max value
    x_max = np.max(np.percentile(num_reads_per_bin, 99, axis=0))
    # print headers for text output
    print("sample\tmean\tstd\tmin\t25%\t50%\t75%\tmax")
    # Choosing a sensible y_max for the first plot (fraction of bases sampled
    # vs. coverage) matters because, depending on the data, the lines can
    # become very hard to see. For example, if the coverage of a sample is a
    # nice gaussian curve with a large mean of 50, a sensible y range is
    # (0, 0.02). If instead the coverage is very poor and centers close to 1,
    # a good y range is (0, 1).
    #
    # The current implementation takes, per sample, the largest coverage x for
    # which more than 50% of the bases have coverage >= x and uses the
    # fraction of bases at that coverage as the candidate y-axis limit.
    y_max = []
    data = []
    # We need to manually set the line colors so they're shared between the two plots.
    plotly_colors = ["#d73027", "#fc8d59", "#f33090", "#e0f3f8", "#91bfdb", "#4575b4"]
    plotly_styles = sum([6 * ["solid"], 6 * ["dot"], 6 * ["dash"], 6 * ["longdash"], 6 * ["dashdot"], 6 * ["longdashdot"]], [])
    for idx, col in enumerate(num_reads_per_bin.T):
        if args.plotFile:
            # histogram of coverage values, computed once and reused
            cov_hist = np.bincount(col.astype(int))
            frac_reads_per_coverage = cov_hist.astype(float) / num_reads_per_bin.shape[0]
            csum = cov_hist[::-1].cumsum()
            csum_frac = csum.astype(float)[::-1] / csum.max()
            if use_plotly:
                color = plotly_colors[idx % len(plotly_colors)]
                dash = plotly_styles[idx % len(plotly_styles)]
                y_cov = frac_reads_per_coverage[:int(x_max)]
                y_cum = csum_frac[:int(x_max)]
                # x must have exactly as many points as y; the previous
                # arange(0, int(x_max) - 1) was one element short
                trace = go.Scatter(x=np.arange(len(y_cov)),
                                   y=y_cov,
                                   mode='lines',
                                   xaxis='x1',
                                   yaxis='y1',
                                   line=dict(color=color, dash=dash),
                                   name="{}, mean={:.1f}".format(args.labels[idx], sample_mean[idx]),
                                   legendgroup="{}".format(idx))
                data.append(trace)
                trace = go.Scatter(x=np.arange(len(y_cum)),
                                   y=y_cum,
                                   mode='lines',
                                   xaxis='x2',
                                   yaxis='y2',
                                   line=dict(color=color, dash=dash),
                                   name=args.labels[idx],
                                   showlegend=False,
                                   legendgroup="{}".format(idx))
                data.append(trace)
            else:
                axs[0].plot(frac_reads_per_coverage, label="{}, mean={:.1f}".format(args.labels[idx], sample_mean[idx]))
                axs[1].plot(csum_frac, label=args.labels[idx])
            # indexes (i.e. coverage values) at which more than 50% of the
            # bases are still covered; take the histogram value at the
            # largest such coverage
            y_max.append(frac_reads_per_coverage[max(np.flatnonzero(csum_frac > 0.5))])
        print("{}\t{:0.2f}\t{:0.2f}\t{}\t{}\t{}\t{}\t{}\t".format(args.labels[idx],
                                                                  sample_mean[idx],
                                                                  sample_std[idx],
                                                                  sample_min[idx],
                                                                  sample_25[idx],
                                                                  sample_50[idx],
                                                                  sample_75[idx],
                                                                  sample_max[idx],
                                                                  ))

    if args.plotFile:
        # Don't clip plots: use the largest per-sample candidate
        y_max = max(y_max)
        if args.plotFileFormat == "plotly":
            fig['data'] = data
            fig['layout']['yaxis1'].update(range=[0.0, min(1, y_max + (y_max * 0.10))])
            fig['layout']['yaxis2'].update(range=[0.0, 1.0])
            py.plot(fig, filename=args.plotFile, auto_open=False)
        else:
            axs[0].set_ylim(0, min(1, y_max + (y_max * 0.10)))
            axs[0].set_xlim(0, x_max)
            axs[0].set_xlabel('coverage (#reads per bp)')
            axs[0].legend(fancybox=True, framealpha=0.5)
            axs[0].set_ylabel('fraction of bases sampled')
            # plot cumulative coverage
            axs[1].set_xlim(0, x_max)
            axs[1].set_xlabel('coverage (#reads per bp)')
            axs[1].set_ylabel('fraction of bases sampled >= coverage')
            axs[1].legend(fancybox=True, framealpha=0.5)
            plt.savefig(args.plotFile, format=args.plotFileFormat)
            plt.close()
def main(args=None):
    """Plot the coverage histogram and cumulative coverage for each sample.

    Arguments are resolved through ``process_args``; reads are counted with
    ``countR.CountReadsPerBin`` using ``binLength=1``. The number of returned
    bins is reported on stderr. If ``args.outRawCounts`` is set, a header line
    with the column labels is prepended to the raw-counts file that
    ``CountReadsPerBin`` was asked to write (``out_file_for_raw_data``).
    Summary statistics per sample are printed to stdout and a two-panel
    matplotlib figure is saved to ``args.plotFile``.

    Exits with an error message when fewer than two bins were returned.

    :param args: forwarded unchanged to ``process_args``.
    """
    args = process_args(args)
    cr = countR.CountReadsPerBin(args.bamfiles,
                                 binLength=1,
                                 numberOfSamples=args.numberOfSamples,
                                 numberOfProcessors=args.numberOfProcessors,
                                 verbose=args.verbose,
                                 region=args.region,
                                 blackListFileName=args.blackListFileName,
                                 extendReads=args.extendReads,
                                 minMappingQuality=args.minMappingQuality,
                                 ignoreDuplicates=args.ignoreDuplicates,
                                 center_read=args.centerReads,
                                 samFlag_include=args.samFlagInclude,
                                 samFlag_exclude=args.samFlagExclude,
                                 minFragmentLength=args.minFragmentLength,
                                 maxFragmentLength=args.maxFragmentLength,
                                 out_file_for_raw_data=args.outRawCounts)

    num_reads_per_bin = cr.run()

    sys.stderr.write("Number of non zero bins "
                     "used: {}\n".format(num_reads_per_bin.shape[0]))

    if args.outRawCounts:
        # prepend the column labels to the generated file; ``with`` makes
        # sure the handle is closed even if reading or writing fails
        header = "#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        with open(args.outRawCounts, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header + content)

    if num_reads_per_bin.shape[0] < 2:
        # sys.exit (sys is already used above) rather than the
        # site-provided exit()
        sys.exit("ERROR: too few non-zero bins found.\n"
                 "If using --region please check that this "
                 "region is covered by reads.\n")

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    fig, axs = plt.subplots(1, 2, figsize=(15, 5))
    plt.suptitle(args.plotTitle)
    # per-sample summary statistics (one value per column)
    num_reads_per_bin = num_reads_per_bin.astype(int)
    sample_mean = num_reads_per_bin.mean(axis=0)
    sample_std = num_reads_per_bin.std(axis=0)
    sample_max = num_reads_per_bin.max(axis=0)
    sample_min = num_reads_per_bin.min(axis=0)
    sample_25 = np.percentile(num_reads_per_bin, 25, axis=0)
    sample_50 = np.percentile(num_reads_per_bin, 50, axis=0)
    sample_75 = np.percentile(num_reads_per_bin, 75, axis=0)

    # use the largest 99th percentile from all samples to set the x_max value
    x_max = np.max(np.percentile(num_reads_per_bin, 99, axis=0))
    # print headers for text output
    print("sample\tmean\tstd\tmin\t25%\t50%\t75%\tmax")
    # Choosing a sensible y_max for the first plot (fraction of bases sampled
    # vs. coverage) matters because, depending on the data, the lines can
    # become very hard to see. For example, if the coverage of a sample is a
    # nice gaussian curve with a large mean of 50, a sensible y range is
    # (0, 0.02). If instead the coverage is very poor and centers close to 1,
    # a good y range is (0, 1).
    #
    # The current implementation takes, per sample, the largest coverage x for
    # which more than 50% of the bases have coverage >= x and uses the
    # fraction of bases at that coverage as the candidate y-axis limit.
    y_max = []
    for idx, col in enumerate(num_reads_per_bin.T):
        # histogram of coverage values, computed once and reused
        cov_hist = np.bincount(col.astype(int))
        frac_reads_per_coverage = cov_hist.astype(float) / num_reads_per_bin.shape[0]
        axs[0].plot(frac_reads_per_coverage, label="{}, mean={:.1f}".format(args.labels[idx], sample_mean[idx]))
        csum = cov_hist[::-1].cumsum()
        csum_frac = csum.astype(float)[::-1] / csum.max()
        axs[1].plot(csum_frac, label=args.labels[idx])
        # indexes (i.e. coverage values) at which more than 50% of the bases
        # are still covered; take the histogram value at the largest such
        # coverage
        y_max.append(frac_reads_per_coverage[max(np.flatnonzero(csum_frac > 0.5))])
        print("{}\t{:0.2f}\t{:0.2f}\t{}\t{}\t{}\t{}\t{}\t".format(args.labels[idx],
                                                                  sample_mean[idx],
                                                                  sample_std[idx],
                                                                  sample_min[idx],
                                                                  sample_25[idx],
                                                                  sample_50[idx],
                                                                  sample_75[idx],
                                                                  sample_max[idx],
                                                                  ))

    # A candidate y-axis limit is computed for each sample. The lowest value
    # is used so that distributions with a wider x-range can be seen better.
    y_max = min(y_max)
    axs[0].set_ylim(0, min(1, y_max + (y_max * 0.10)))
    axs[0].set_xlim(0, x_max)
    axs[0].set_xlabel('coverage (#reads per bp)')
    axs[0].legend(fancybox=True, framealpha=0.5)
    axs[0].set_ylabel('fraction of bases sampled')
    # plot cumulative coverage
    axs[1].set_xlim(0, x_max)
    axs[1].set_xlabel('coverage (#reads per bp)')
    axs[1].set_ylabel('fraction of bases sampled >= coverage')
    axs[1].legend(fancybox=True, framealpha=0.5)
    plt.savefig(args.plotFile, format=args.plotFileFormat)
    plt.close()
Beispiel #7
0
def main(args=None):
    """Sample per-bin coverage from the BAM files and emit the fingerprint outputs.

    Up to three outputs are produced, at least one of which must be requested:
    a plot of the cumulative, sorted per-bin coverage of every sample
    (``plotFile``), the raw per-bin counts (``outRawCounts``) and a table of
    quality metrics (``outQualityMetrics``).

    :param args: forwarded unchanged to ``process_args``.

    Exits with status 1 when no output was requested or when the sampled
    bins contain no reads at all.
    """
    # A function-level ``import sys`` turns ``sys`` into a local name for the
    # whole body, so it has to be bound before the first use below (importing
    # it later, inside a branch, raised UnboundLocalError here).
    import sys

    args = process_args(args)

    if not args.plotFile and not args.outRawCounts and not args.outQualityMetrics:
        sys.stderr.write(
            "\nAt least one of --plotFile, --outRawCounts or --outQualityMetrics is required.\n"
        )
        sys.exit(1)

    cr = sumR.SumCoveragePerBin(args.bamfiles,
                                args.binSize,
                                args.numberOfSamples,
                                blackListFileName=args.blackListFileName,
                                numberOfProcessors=args.numberOfProcessors,
                                verbose=args.verbose,
                                region=args.region,
                                extendReads=args.extendReads,
                                minMappingQuality=args.minMappingQuality,
                                ignoreDuplicates=args.ignoreDuplicates,
                                center_read=args.centerReads,
                                samFlag_include=args.samFlagInclude,
                                samFlag_exclude=args.samFlagExclude,
                                minFragmentLength=args.minFragmentLength,
                                maxFragmentLength=args.maxFragmentLength)

    # one row per sampled bin, one column per sample
    num_reads_per_bin = cr.run()
    if num_reads_per_bin.sum() == 0:
        sys.stderr.write(
            "\nNo reads were found in {} regions sampled. Check that the\n"
            "min mapping quality is not overly high and that the \n"
            "chromosome names between bam files are consistant.\n"
            "\n".format(num_reads_per_bin.shape[0]))
        sys.exit(1)

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    num_bins = num_reads_per_bin.shape[0]

    if args.plotFile:
        x = np.arange(num_bins).astype('float') / num_bins  # normalize from 0 to 1
        # matplotlib won't iterate through line styles by itself
        pyplot_line_styles = sum(
            [7 * ["-"], 7 * ["--"], 7 * ["-."], 7 * [":"]], [])
        for i, reads in enumerate(num_reads_per_bin.T):
            count = np.cumsum(np.sort(reads))
            count = count / float(count[-1])  # to normalize y from 0 to 1
            # wrap around on the real length of the style list so that any
            # number of samples can be drawn
            style = pyplot_line_styles[i % len(pyplot_line_styles)]
            plt.plot(x, count, label=args.labels[i], linestyle=style)
        plt.xlabel('rank')
        plt.ylabel('fraction w.r.t. bin with highest coverage')
        plt.legend(loc='upper left')
        plt.suptitle(args.plotTitle)
        # set the plotFileFormat explicitly to None to trigger the
        # format from the file-extension
        if not args.plotFileFormat:
            args.plotFileFormat = None

        plt.savefig(args.plotFile.name,
                    bbox_inches=0,
                    format=args.plotFileFormat)
        plt.close()

    if args.outRawCounts:
        raw_out = args.outRawCounts
        try:
            raw_out.write("'" + "'\t'".join(args.labels) + "'\n")
            fmt = "\t".join(["%d"] * num_reads_per_bin.shape[1]) + "\n"
            for row in num_reads_per_bin:
                raw_out.write(fmt % tuple(row))
        finally:
            # release the handle even if a write fails
            raw_out.close()

    if args.outQualityMetrics:
        metrics_out = args.outQualityMetrics
        try:
            metrics_out.write(
                "Sample\tAUC\tSynthetic AUC\tX-intercept\tSynthetic X-intercept\tElbow Point\tSynthetic Elbow Point"
            )
            if args.JSDsample:
                metrics_out.write(
                    "\tJS Distance\tSynthetic JS Distance\t% genome enriched\tdiff. enrichment\tCHANCE divergence"
                )
            metrics_out.write("\n")
            # diagonal from 0 to 1 used as reference for the elbow point;
            # max() avoids a zero divisor when only a single bin is left
            line = np.arange(num_bins) / float(max(num_bins - 1, 1))
            for idx, reads in enumerate(num_reads_per_bin.T):
                counts = np.cumsum(np.sort(reads))
                counts = counts / float(counts[-1])
                AUC = np.sum(counts) / float(len(counts))
                # rank (as a fraction) of the first bin with non-zero coverage
                XInt = (np.argmax(counts > 0) + 1) / float(counts.shape[0])
                # rank at which the curve is furthest below the diagonal
                elbow = (np.argmax(line - counts) + 1) / float(counts.shape[0])
                expected = getExpected(
                    np.mean(reads))  # A tuple of expected (AUC, XInt, elbow)
                metrics_out.write(
                    "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}".format(
                        args.labels[idx], AUC, expected[0], XInt, expected[1],
                        elbow, expected[2]))
                if args.JSDsample:
                    JSD = getJSD(args, idx, num_reads_per_bin)
                    syntheticJSD = getSyntheticJSD(num_reads_per_bin[:, idx])
                    CHANCE = getCHANCE(args, idx, num_reads_per_bin)
                    metrics_out.write(
                        "\t{0}\t{1}\t{2}\t{3}\t{4}".format(JSD, syntheticJSD,
                                                           CHANCE[0], CHANCE[1],
                                                           CHANCE[2]))
                metrics_out.write("\n")
        finally:
            metrics_out.close()
Beispiel #8
0
def main(args=None):
    """Plot the per-base coverage distribution of the given BAM files.

    Coverage is sampled with a bin length of 1.  Two panels are drawn: the
    fraction of sampled bases at each coverage value and the fraction of
    sampled bases with at least that coverage.  Per-sample summary statistics
    (mean, std, min, quartiles, max) are printed to stdout as a tab-separated
    table, and the figure is saved to ``args.plotFile``.

    :param args: forwarded unchanged to ``process_args``.

    Exits with an error message when fewer than two bins were sampled or
    when no bin is left after ``skipZeros`` filtering.
    """
    args = process_args(args)
    cr = countR.CountReadsPerBin(args.bamfiles,
                                 binLength=1,
                                 numberOfSamples=args.numberOfSamples,
                                 numberOfProcessors=args.numberOfProcessors,
                                 verbose=args.verbose,
                                 region=args.region,
                                 blackListFileName=args.blackListFileName,
                                 extendReads=args.extendReads,
                                 minMappingQuality=args.minMappingQuality,
                                 ignoreDuplicates=args.ignoreDuplicates,
                                 center_read=args.centerReads,
                                 samFlag_include=args.samFlagInclude,
                                 samFlag_exclude=args.samFlagExclude,
                                 minFragmentLength=args.minFragmentLength,
                                 maxFragmentLength=args.maxFragmentLength,
                                 out_file_for_raw_data=args.outRawCounts)

    num_reads_per_bin = cr.run()

    sys.stderr.write("Number of non zero bins "
                     "used: {}\n".format(num_reads_per_bin.shape[0]))

    if args.outRawCounts:
        # prepend a header line with the sample labels to the raw-count file
        # that was written while counting
        header = "#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        with open(args.outRawCounts, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header + content)

    if num_reads_per_bin.shape[0] < 2:
        sys.exit("ERROR: too few non-zero bins found.\n"
                 "If using --region please check that this "
                 "region is covered by reads.\n")

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)
        if num_reads_per_bin.shape[0] == 0:
            # the reductions below cannot be computed on an empty matrix
            sys.exit("ERROR: no bins left after applying skipZeros.\n"
                     "If using --region please check that this "
                     "region is covered by reads.\n")

    fig, axs = plt.subplots(1, 2, figsize=(15, 5))
    plt.suptitle(args.plotTitle)
    # per-sample summary statistics (one value per column)
    num_reads_per_bin = num_reads_per_bin.astype(int)
    sample_mean = num_reads_per_bin.mean(axis=0)
    sample_std = num_reads_per_bin.std(axis=0)
    sample_max = num_reads_per_bin.max(axis=0)
    sample_min = num_reads_per_bin.min(axis=0)
    sample_25 = np.percentile(num_reads_per_bin, 25, axis=0)
    sample_50 = np.percentile(num_reads_per_bin, 50, axis=0)
    sample_75 = np.percentile(num_reads_per_bin, 75, axis=0)

    # use the largest 99th percentile from all samples to set the x_max value
    x_max = np.max(np.percentile(num_reads_per_bin, 99, axis=0))
    num_bins = num_reads_per_bin.shape[0]

    # print headers for text output
    print("sample\tmean\tstd\tmin\t25%\t50%\t75%\tmax")
    # Choosing a sensible y_max for the first plot (fraction of bases sampled
    # vs. coverage) matters because, depending on the data, the lines can
    # become very hard to see.  For example, if the coverage of a sample is a
    # nice gaussian curve with a large mean of 50, a sensible y range
    # (fraction of bases having coverage=x) is (0, 0.02), which shows the
    # curve nicely.  If instead the coverage is very poor and centers close
    # to 1, a good y range is (0, 1).
    #
    # The current implementation finds, for every sample, the largest
    # coverage x for which more than 50% of the bases have coverage >= x and
    # takes the fraction of bases at that coverage as candidate for the
    # y-axis limit.
    y_max = []
    for idx, col in enumerate(num_reads_per_bin.T):
        # histogram of coverage values, computed once and reused for both panels
        coverage_hist = np.bincount(col.astype(int))
        frac_reads_per_coverage = coverage_hist.astype(float) / num_bins
        axs[0].plot(frac_reads_per_coverage,
                    label="{}, mean={:.1f}".format(args.labels[idx],
                                                   sample_mean[idx]))
        # reverse cumulative sum: number of bases with coverage >= x
        csum = coverage_hist[::-1].cumsum()
        csum_frac = csum.astype(float)[::-1] / csum.max()
        axs[1].plot(csum_frac, label=args.labels[idx])
        # csum_frac[0] is always 1, so at least one index satisfies the
        # condition and taking the maximum is safe
        median_coverage_idx = np.flatnonzero(csum_frac > 0.5).max()
        y_max.append(frac_reads_per_coverage[median_coverage_idx])
        print("{}\t{:0.2f}\t{:0.2f}\t{}\t{}\t{}\t{}\t{}\t".format(
            args.labels[idx],
            sample_mean[idx],
            sample_std[idx],
            sample_min[idx],
            sample_25[idx],
            sample_50[idx],
            sample_75[idx],
            sample_max[idx],
        ))

    # A candidate y-axis limit was computed for each sample.  The lowest one
    # is used (plus a 10% margin, capped at 1) so that flatter, wider
    # distributions remain visible.
    y_max = min(y_max)
    axs[0].set_ylim(0, min(1, y_max + (y_max * 0.10)))
    axs[0].set_xlim(0, x_max)
    axs[0].set_xlabel('coverage (#reads per bp)')
    axs[0].legend(fancybox=True, framealpha=0.5)
    axs[0].set_ylabel('fraction of bases sampled')
    # plot cumulative coverage
    axs[1].set_xlim(0, x_max)
    axs[1].set_xlabel('coverage (#reads per bp)')
    axs[1].set_ylabel('fraction of bases sampled >= coverage')
    axs[1].legend(fancybox=True, framealpha=0.5)
    plt.savefig(args.plotFile, format=args.plotFileFormat)
    plt.close()
def main(args=None):
    """Sample per-bin coverage from the BAM files and emit the fingerprint outputs.

    A plot of the cumulative, sorted per-bin coverage of every sample is
    always saved to ``args.plotFile``.  The raw per-bin counts
    (``outRawCounts``) and a table of quality metrics
    (``outQualityMetrics``) are written when the corresponding option is set.

    :param args: forwarded unchanged to ``process_args``.

    Exits with status 1 when the sampled bins contain no reads at all.
    """
    # imported up front so that ``sys`` is bound for the whole function body
    import sys

    args = process_args(args)

    cr = sumR.SumCoveragePerBin(
        args.bamfiles,
        args.binSize,
        args.numberOfSamples,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose,
        region=args.region,
        extendReads=args.extendReads,
        minMappingQuality=args.minMappingQuality,
        ignoreDuplicates=args.ignoreDuplicates,
        center_read=args.centerReads,
        samFlag_include=args.samFlagInclude,
        samFlag_exclude=args.samFlagExclude,
        minFragmentLength=args.minFragmentLength,
        maxFragmentLength=args.maxFragmentLength)

    # one row per sampled bin, one column per sample
    num_reads_per_bin = cr.run()
    if num_reads_per_bin.sum() == 0:
        sys.stderr.write(
            "\nNo reads were found in {} regions sampled. Check that the\n"
            "min mapping quality is not overly high and that the \n"
            "chromosome names between bam files are consistant.\n"
            "\n".format(num_reads_per_bin.shape[0]))
        sys.exit(1)

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    num_bins = num_reads_per_bin.shape[0]
    x = np.arange(num_bins).astype('float') / num_bins  # normalize from 0 to 1

    # matplotlib won't iterate through line styles by itself.  Only valid
    # line styles are listed: "." is a marker, not a line style, and is
    # rejected by matplotlib when passed as ``linestyle``.
    pyplot_line_styles = sum([7 * ["-"], 7 * ["--"], 7 * ["-."], 7 * [":"]], [])
    for i, reads in enumerate(num_reads_per_bin.T):
        count = np.cumsum(np.sort(reads))
        count = count / float(count[-1])  # to normalize y from 0 to 1
        # wrap around on the real length of the style list
        style = pyplot_line_styles[i % len(pyplot_line_styles)]
        plt.plot(x, count, label=args.labels[i], linestyle=style)
    plt.xlabel('rank')
    plt.ylabel('fraction w.r.t. bin with highest coverage')
    plt.legend(loc='upper left')
    plt.suptitle(args.plotTitle)
    # set the plotFileFormat explicitly to None to trigger the
    # format from the file-extension
    if not args.plotFileFormat:
        args.plotFileFormat = None

    plt.savefig(args.plotFile.name, bbox_inches=0, format=args.plotFileFormat)
    plt.close()

    if args.outRawCounts:
        raw_out = args.outRawCounts
        try:
            raw_out.write("'" + "'\t'".join(args.labels) + "'\n")
            fmt = "\t".join(["%d"] * num_reads_per_bin.shape[1]) + "\n"
            for row in num_reads_per_bin:
                raw_out.write(fmt % tuple(row))
        finally:
            # release the handle even if a write fails
            raw_out.close()

    if args.outQualityMetrics:
        metrics_out = args.outQualityMetrics
        try:
            metrics_out.write("Sample\tAUC\tSynthetic AUC\tX-intercept\tSynthetic X-intercept\tElbow Point\tSynthetic Elbow Point")
            if args.JSDsample:
                metrics_out.write("\tJS Distance\tSynthetic JS Distance\t% genome enriched\tdiff. enrichment\tCHANCE divergence")
            metrics_out.write("\n")
            # diagonal from 0 to 1 used as reference for the elbow point;
            # max() avoids a zero divisor when only a single bin is left
            line = np.arange(num_bins) / float(max(num_bins - 1, 1))
            for idx, reads in enumerate(num_reads_per_bin.T):
                counts = np.cumsum(np.sort(reads))
                counts = counts / float(counts[-1])
                AUC = np.sum(counts) / float(len(counts))
                # rank (as a fraction) of the first bin with non-zero coverage
                XInt = (np.argmax(counts > 0) + 1) / float(counts.shape[0])
                # rank at which the curve is furthest below the diagonal
                elbow = (np.argmax(line - counts) + 1) / float(counts.shape[0])
                expected = getExpected(np.mean(reads))  # A tuple of expected (AUC, XInt, elbow)
                metrics_out.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}".format(args.labels[idx], AUC, expected[0], XInt, expected[1], elbow, expected[2]))
                if args.JSDsample:
                    JSD = getJSD(args, idx, num_reads_per_bin)
                    syntheticJSD = getSyntheticJSD(num_reads_per_bin[:, idx])
                    CHANCE = getCHANCE(args, idx, num_reads_per_bin)
                    metrics_out.write("\t{0}\t{1}\t{2}\t{3}\t{4}".format(JSD, syntheticJSD, CHANCE[0], CHANCE[1], CHANCE[2]))
                metrics_out.write("\n")
        finally:
            metrics_out.close()
Beispiel #10
0
def main(args=None):
    """Sample per-bin coverage from the BAM files and emit the fingerprint outputs.

    Up to three outputs are produced, at least one of which must be requested:
    a plot of the cumulative, sorted per-bin coverage of every sample
    (``plotFile``; rendered with plotly when ``plotFileFormat`` is
    ``'plotly'``, with matplotlib otherwise), the raw per-bin counts
    (``outRawCounts``) and a table of quality metrics (``outQualityMetrics``).
    The latter two are paths that are opened for writing here.

    :param args: forwarded unchanged to ``process_args``.

    Exits with status 1 when no output was requested or when the sampled
    bins contain no reads at all.
    """
    # A function-level ``import sys`` turns ``sys`` into a local name for the
    # whole body, so it has to be bound before the first use below (importing
    # it later, inside a branch, raised UnboundLocalError here).
    import sys

    args = process_args(args)

    if not args.plotFile and not args.outRawCounts and not args.outQualityMetrics:
        sys.stderr.write("\nAt least one of --plotFile, --outRawCounts or --outQualityMetrics is required.\n")
        sys.exit(1)

    cr = sumR.SumCoveragePerBin(
        args.bamfiles,
        args.binSize,
        args.numberOfSamples,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose,
        region=args.region,
        extendReads=args.extendReads,
        minMappingQuality=args.minMappingQuality,
        ignoreDuplicates=args.ignoreDuplicates,
        center_read=args.centerReads,
        samFlag_include=args.samFlagInclude,
        samFlag_exclude=args.samFlagExclude,
        minFragmentLength=args.minFragmentLength,
        maxFragmentLength=args.maxFragmentLength)

    # one row per sampled bin, one column per sample
    num_reads_per_bin = cr.run()
    if num_reads_per_bin.sum() == 0:
        sys.stderr.write(
            "\nNo reads were found in {} regions sampled. Check that the\n"
            "min mapping quality is not overly high and that the \n"
            "chromosome names between bam files are consistant.\n"
            "For small genomes, decrease the --numberOfSamples.\n"
            "\n".format(num_reads_per_bin.shape[0]))
        sys.exit(1)

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    num_bins = num_reads_per_bin.shape[0]

    if args.plotFile is not None:
        x = np.arange(num_bins).astype('float') / num_bins  # normalize from 0 to 1
        use_plotly = args.plotFileFormat == 'plotly'
        # matplotlib won't iterate through line styles by itself
        pyplot_line_styles = sum([7 * ["-"], 7 * ["--"], 7 * ["-."], 7 * [":"]], [])
        plotly_colors = ["#d73027", "#fc8d59", "#f33090", "#e0f3f8", "#91bfdb", "#4575b4"]
        plotly_line_styles = sum([6 * ["solid"], 6 * ["dot"], 6 * ["dash"], 6 * ["longdash"], 6 * ["dashdot"], 6 * ["longdashdot"]], [])
        data = []
        for i, reads in enumerate(num_reads_per_bin.T):
            count = np.cumsum(np.sort(reads))
            count = count / float(count[-1])  # to normalize y from 0 to 1
            if use_plotly:
                trace = go.Scatter(x=x, y=count, mode='lines', name=args.labels[i])
                # wrap around on the real lengths of the style/colour lists
                trace['line'].update(dash=plotly_line_styles[i % len(plotly_line_styles)],
                                     color=plotly_colors[i % len(plotly_colors)])
                data.append(trace)
            else:
                style = pyplot_line_styles[i % len(pyplot_line_styles)]
                plt.plot(x, count, label=args.labels[i], linestyle=style)
        # set the plotFileFormat explicitly to None to trigger the
        # format from the file-extension
        if not args.plotFileFormat:
            args.plotFileFormat = None

        if use_plotly:
            fig = go.Figure()
            fig['data'] = data
            fig['layout'].update(title=args.plotTitle)
            fig['layout']['xaxis1'].update(title="rank")
            fig['layout']['yaxis1'].update(title="fraction w.r.t bin with highest coverage")
            py.plot(fig, filename=args.plotFile, auto_open=False)
        else:
            plt.xlabel('rank')
            plt.ylabel('fraction w.r.t. bin with highest coverage')
            plt.legend(loc='upper left')
            plt.suptitle(args.plotTitle)
            plt.savefig(args.plotFile, bbox_inches=0, format=args.plotFileFormat)
            plt.close()

    if args.outRawCounts is not None:
        # ``with`` guarantees the file is closed even if a write fails
        with open(args.outRawCounts, "w") as of:
            of.write("#plotFingerprint --outRawCounts\n")
            of.write("'" + "'\t'".join(args.labels) + "'\n")
            fmt = "\t".join(["%d"] * num_reads_per_bin.shape[1]) + "\n"
            for row in num_reads_per_bin:
                of.write(fmt % tuple(row))

    if args.outQualityMetrics is not None:
        with open(args.outQualityMetrics, "w") as of:
            of.write("Sample\tAUC\tSynthetic AUC\tX-intercept\tSynthetic X-intercept\tElbow Point\tSynthetic Elbow Point")
            if args.JSDsample:
                of.write("\tJS Distance\tSynthetic JS Distance\t% genome enriched\tdiff. enrichment\tCHANCE divergence")
            else:
                of.write("\tSynthetic JS Distance")
            of.write("\n")
            # diagonal from 0 to 1 used as reference for the elbow point;
            # max() avoids a zero divisor when only a single bin is left
            line = np.arange(num_bins) / float(max(num_bins - 1, 1))
            for idx, reads in enumerate(num_reads_per_bin.T):
                counts = np.cumsum(np.sort(reads))
                counts = counts / float(counts[-1])
                AUC = np.sum(counts) / float(len(counts))
                # rank (as a fraction) of the first bin with non-zero coverage
                XInt = (np.argmax(counts > 0) + 1) / float(counts.shape[0])
                # rank at which the curve is furthest below the diagonal
                elbow = (np.argmax(line - counts) + 1) / float(counts.shape[0])
                expected = getExpected(np.mean(reads))  # A tuple of expected (AUC, XInt, elbow)
                of.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}".format(args.labels[idx], AUC, expected[0], XInt, expected[1], elbow, expected[2]))
                if args.JSDsample:
                    JSD = getJSD(args, idx, num_reads_per_bin)
                    syntheticJSD = getSyntheticJSD(num_reads_per_bin[:, idx])
                    CHANCE = getCHANCE(args, idx, num_reads_per_bin)
                    of.write("\t{0}\t{1}\t{2}\t{3}\t{4}".format(JSD, syntheticJSD, CHANCE[0], CHANCE[1], CHANCE[2]))
                else:
                    syntheticJSD = getSyntheticJSD(num_reads_per_bin[:, idx])
                    of.write("\t{0}".format(syntheticJSD))
                of.write("\n")
Beispiel #11
0
def main(args=None):
    """Plot the per-base coverage distribution of the given BAM files.

    Coverage is sampled with a bin length of 1.  Two panels are drawn: the
    fraction of sampled bases at each coverage value and the fraction of
    sampled bases with at least that coverage.  Raw counts are optionally
    written to ``args.outRawCounts`` and the figure is saved under
    ``args.plotFile.name``.

    :param args: forwarded unchanged to ``process_args``.
    """
    args = process_args(args)
    cr = countR.CountReadsPerBin(args.bamfiles,
                                 binLength=1,
                                 numberOfSamples=args.numberOfSamples,
                                 numberOfProcessors=args.numberOfProcessors,
                                 verbose=args.verbose,
                                 region=args.region,
                                 extendReads=args.extendReads,
                                 minMappingQuality=args.minMappingQuality,
                                 ignoreDuplicates=args.ignoreDuplicates,
                                 center_read=args.centerReads,
                                 samFlag_include=args.samFlagInclude,
                                 samFlag_exclude=args.samFlagExclude)

    # presumably one row per sampled position and one column per sample,
    # judging by how the matrix is indexed below
    num_reads_per_bin = cr.run()

    sys.stderr.write("Number of non zero bins "
                     "used: {}\n".format(num_reads_per_bin.shape[0]))

    if num_reads_per_bin.shape[0] < 2:
        exit("ERROR: too few non zero bins found.\n"
             "If using --region please check that this "
             "region is covered by reads.\n")

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    if args.outRawCounts:
        # header line with the quoted sample labels, then one tab-separated
        # row of integer counts per bin
        # NOTE(review): the handle is not closed in the code shown here
        args.outRawCounts.write("'" + "'\t'".join(args.labels) + "'\n")
        fmt = "\t".join(np.repeat('%d', num_reads_per_bin.shape[1])) + "\n"
        for row in num_reads_per_bin:
            args.outRawCounts.write(fmt % tuple(row))

    fig, axs = plt.subplots(1, 2, figsize=(15, 5))
    plt.suptitle(args.plotTitle)
    # upper limit of the coverage axis: the largest sample mean plus three
    # times the largest per-sample standard deviation.  Despite its name,
    # y_max is applied to the x-axis of both panels below.
    sample_mean = num_reads_per_bin.mean(axis=0)
    std = max(num_reads_per_bin.std(axis=0))
    y_max = max(sample_mean) + 3 * std

    # plot coverage
    for idx, col in enumerate(num_reads_per_bin.T):
        # left panel: fraction of sampled bases at each coverage value
        axs[0].plot(np.bincount(col.astype(int)).astype(float) /
                    num_reads_per_bin.shape[0],
                    label="{}, mean={:.1f}".format(args.labels[idx],
                                                   sample_mean[idx]))
        # right panel: reverse cumulative sum of the histogram, i.e. the
        # fraction of sampled bases with coverage >= x
        csum = np.bincount(col.astype(int))[::-1].cumsum()
        axs[1].plot(csum.astype(float)[::-1] / csum.max(),
                    label=args.labels[idx])

    axs[0].set_xlim(0, y_max)
    axs[0].set_xlabel('coverage')
    axs[0].legend()
    axs[0].set_ylabel('fraction of bases sampled')
    # plot cumulative coverage
    axs[1].set_xlim(0, y_max)
    axs[1].set_xlabel('coverage')
    axs[1].set_ylabel('fraction of bases sampled >= coverage')
    axs[1].legend()
    plt.savefig(args.plotFile.name, format=args.plotFileFormat)