def _get_att_n_50_contig_length(read_lengths):
    """
    Get the n50 or 0 if n50 cannot be calculated
    :param read_lengths: sorted list
    """
    n50 = compute_n50(read_lengths)
    return Attribute(Constants.A_N50_LEN, int(n50))
Exemple #2
0
def _get_att_n_50_contig_length(read_lengths):
    """
    Get the n50 or 0 if n50 cannot be calculated
    :param read_lengths: sorted list
    """
    n50 = compute_n50(read_lengths)
    return Attribute(Constants.A_N50_LEN, int(n50))
 def test_compute_n50_from_readlenths(self):
     """
     Test getting N50 from length list
     """
     fasta = os.path.join( ROOT_DATA_DIR, 'polished_assembly', 'polished_assembly.fasta.gz')
     l = get_fasta_readlengths(fasta)
     self.assertEqual( 33586, compute_n50(l) )
Exemple #4
0
 def test_compute_n50_from_readlenths(self):
     """
     Test getting N50 from length list
     """
     fasta = os.path.join(ROOT_DATA_DIR, 'polished_assembly',
                          'polished_assembly.fasta.gz')
     l = get_fasta_readlengths(fasta)
     self.assertEqual(33586, compute_n50(l))
Exemple #5
0
def _to_read_stats_attributes(readLenDists, readQualDists):
    # Build the stats table:
    nbases = 0
    nreads = 0
    n50 = 0
    readscoretotal = 0
    readscorenumber = 0
    approx_read_lens = []

    # if a merge failed there may be more than one dist:
    for rlendist in readLenDists:
        nbases += rlendist.sampleMean * rlendist.sampleSize
        nreads += rlendist.sampleSize

        # N50:
        for i, lbin in enumerate(rlendist.bins):
            # use the average, except for the last bin
            if i != len(rlendist.bins) - 1:
                value = ((i * rlendist.binWidth) + rlendist.minBinValue +
                         rlendist.binWidth / 2)
            # for the last bin, just use the value
            else:
                value = (i * rlendist.binWidth) + rlendist.minBinValue
            approx_read_lens.extend([value] * lbin)
            # TODO(mdsmith)(2016-02-09) make sure maxOutlierValue is updated
            # during a merge /todo
            # but pop off that last value and replace it with the
            # maxOutlierValue:
            # approx_read_lens.pop()
            # approx_read_lens.append(rlendist.maxBinValue)
    n50 = int(np.round(compute_n50(approx_read_lens), decimals=0))
    for rqualdist in readQualDists:
        readscoretotal += _total_from_bins(rqualdist.bins,
                                           rqualdist.minBinValue,
                                           rqualdist.binWidth)
        readscorenumber += sum(rqualdist.bins)

    readlen = 0
    if nreads != 0:
        readlen = nbases / nreads
    readlen = int(np.round(readlen, decimals=0))
    readQuality = 0
    if readscorenumber != 0:
        readQuality = np.round(readscoretotal / readscorenumber, decimals=2)
    return [
        int(np.round(nbases, decimals=0)),
        nreads,
        readlen,
        n50,
    ]  #readQuality]
def _to_read_stats_attributes(readLenDists, readQualDists):
    # Build the stats table:
    nbases = 0
    nreads = 0
    n50 = 0
    readscoretotal = 0
    readscorenumber = 0
    approx_read_lens = []

    # if a merge failed there may be more than one dist:
    for rlendist in readLenDists:
        nbases += rlendist.sampleMean * rlendist.sampleSize
        nreads += rlendist.sampleSize

        # N50:
        for i, lbin in enumerate(rlendist.bins):
            # use the average, except for the last bin
            if i != len(rlendist.bins) - 1:
                value = ((i * rlendist.binWidth) + rlendist.minBinValue +
                         rlendist.binWidth / 2)
            # for the last bin, just use the value
            else:
                value = (i * rlendist.binWidth) + rlendist.minBinValue
            approx_read_lens.extend([value] * lbin)
            # TODO(mdsmith)(2016-02-09) make sure maxOutlierValue is updated
            # during a merge /todo
            # but pop off that last value and replace it with the
            # maxOutlierValue:
            # approx_read_lens.pop()
            # approx_read_lens.append(rlendist.maxBinValue)
    n50 = int(np.round(compute_n50(approx_read_lens), decimals=0))
    for rqualdist in readQualDists:
        readscoretotal += _total_from_bins(rqualdist.bins,
                                           rqualdist.minBinValue,
                                           rqualdist.binWidth)
        readscorenumber += sum(rqualdist.bins)

    readlen = 0
    if nreads != 0:
        readlen = nbases / nreads
    readlen = int(np.round(readlen, decimals=0))
    readQuality = 0
    if readscorenumber != 0:
        readQuality = readscoretotal / readscorenumber
    return [int(np.round(nbases, decimals=0)),
            nreads,
            readlen,
            n50,
            ]  # readQuality]
Exemple #7
0
 def n50(self):
     return compute_n50(self.values)
Exemple #8
0
 def test_02_compute_n50(self):
     x = [91, 77, 70, 69, 62, 56, 45, 29, 16, 4]
     n = compute_n50(x)
     self.assertEqual(n, 69)
Exemple #9
0
 def test_01_compute_n50(self):
     x = [6, 5, 4]
     n = compute_n50(x)
     self.assertEqual(n, 5)
Exemple #10
0
 def n50(self):
     return compute_n50(self.values)
def to_report(stats_xml, output_dir, dpi=72):
    """Main point of entry

    :type stats_xml: str
    :type output_dir: str
    :type dpi: int

    :rtype: Report
    """
    log.info("Analyzing XML {f}".format(f=stats_xml))
    # stats_xml should be a dataset:
    dset = DataSet(stats_xml)
    dataset_uuids = [dset.uuid]
    # but if it isn't, no problem:
    if not dset.metadata.summaryStats:
        dset.loadStats(stats_xml)
        # an sts file was provided which will generate a new random uuid
        dataset_uuids = []
    if not dset.metadata.summaryStats.readLenDists:
        raise RuntimeError("No Pipeline Summary Stats (sts.xml) found")

    # Build the stats table:
    nbases = 0
    nreads = 0
    n50 = 0
    readscoretotal = 0
    readscorenumber = 0
    approx_read_lens = []

    # if a merge failed there may be more than one dist:
    for rlendist in dset.metadata.summaryStats.readLenDists:
        nbases += _total_from_bins(rlendist.bins,
                                   rlendist.minBinValue,
                                   rlendist.binWidth)
        nreads += sum(rlendist.bins)

        # N50:
        for i, lbin in enumerate(rlendist.bins):
            # use the average, except for the last bin
            if i != len(rlendist.bins) - 1:
                value = ((i * rlendist.binWidth) + rlendist.minBinValue +
                         rlendist.binWidth / 2)
            # for the last bin, just use the value
            else:
                value = (i * rlendist.binWidth) + rlendist.minBinValue
            approx_read_lens.extend([value] * lbin)
            # TODO(mdsmith)(2016-02-09) make sure maxOutlierValue is updated
            # during a merge /todo
            # but pop off that last value and replace it with the
            # maxOutlierValue:
            # approx_read_lens.pop()
            # approx_read_lens.append(rlendist.maxBinValue)
    n50 = np.round(compute_n50(approx_read_lens))

    for rqualdist in dset.metadata.summaryStats.readQualDists:
        readscoretotal += _total_from_bins(rqualdist.bins,
                                           rqualdist.minBinValue,
                                           rqualdist.binWidth)
        readscorenumber += sum(rqualdist.bins)

    readlen = 0
    if nreads != 0:
        readlen = np.round(nbases / nreads, decimals=2)
    readQuality = 0
    if readscorenumber != 0:
        readQuality = np.round(readscoretotal / readscorenumber, decimals=2)
    row_names = ["Polymerase Read Bases",
                 "Polymerase Reads",
                 "Polymerase Read N50",
                 "Polymerase Read Length",
                 "Polymerase Read Quality"]
    _pre_filter = [np.round(nbases, decimals=2),
                   nreads,
                   n50,
                   readlen,
                   readQuality]

    plots = []

    # ReadLen distribution to barplot:
    for i, rlendist in enumerate(dset.metadata.summaryStats.readLenDists):
        len_fig, len_axes = get_fig_axes_lpr()
        len_axes.bar(rlendist.labels, rlendist.bins,
                     color=get_green(0), edgecolor=get_green(0),
                     width=(rlendist.binWidth * 0.75))
        len_axes.set_xlabel('Read Length')
        len_axes.set_ylabel('Reads')
        png_fn = os.path.join(output_dir, "readLenDist{i}.png".format(i=i))
        png_base, thumbnail_base = save_figure_with_thumbnail(len_fig, png_fn,
                                                              dpi=dpi)

        plots.append(Plot("filter_len_xml_plot_{i}".format(i=i),
                          os.path.relpath(png_base, output_dir),
                          thumbnail=os.path.relpath(thumbnail_base, output_dir)))

    plot_groups = [PlotGroup("filter_len_xml_plot_group",
                             title="Polymerase Read Length",
                             plots=plots,
                             thumbnail=os.path.relpath(thumbnail_base, output_dir))]

    plots = []

    # ReadQual distribution to barplot:
    for i, rqualdist in enumerate(dset.metadata.summaryStats.readQualDists):
        qual_fig, qual_axes = get_fig_axes_lpr()
        qual_axes.bar(rqualdist.labels, rqualdist.bins,
                      color=get_green(0), edgecolor=get_green(0),
                      width=(rqualdist.binWidth * 0.75))
        qual_axes.set_xlabel('Read Quality')
        qual_axes.set_ylabel('Reads')

        png_fn = os.path.join(output_dir, "readQualDist{i}.png".format(i=i))
        png_base, thumbnail_base = save_figure_with_thumbnail(qual_fig, png_fn,
                                                              dpi=dpi)

        plots.append(Plot("filter_qual_xml_plot_{i}".format(i=i),
                          os.path.relpath(png_base, output_dir),
                          thumbnail=os.path.relpath(thumbnail_base, output_dir)))

    plot_groups.append(PlotGroup("filter_qual_xml_plot_group",
                                 title="Polymerase Read Quality",
                                 plots=plots))

    # build the report:
    columns = [Column("filter_names_column", header="Metrics",
                      values=row_names)]
    columns.append(Column("filter_stats_column", header="Values",
                          values=_pre_filter))

    tables = [Table("filter_xml_table", "Filtering Statistics", columns)]

    report = Report("filtering_stats_xml_report",
                    title="Filtering stats XML report",
                    tables=tables,
                    attributes=None,
                    plotgroups=plot_groups,
                    dataset_uuids=dataset_uuids)

    return report
 def test_02_compute_n50(self):
     x = [91, 77, 70, 69, 62, 56, 45, 29, 16, 4]
     n = compute_n50(x)
     self.assertEqual(n, 69)
 def test_01_compute_n50(self):
     x = [6, 5, 4]
     n = compute_n50(x)
     self.assertEqual(n, 5)