def loadStatsXml(args):
    """Load summary stats from args.statsfile into the DataSet at
    args.infile, writing the result to args.outfile if given, else back
    to args.infile."""
    dset = DataSet(args.infile, strict=args.strict)
    dset.loadStats(args.statsfile)
    if args.outfile:
        dset.write(args.outfile, validate=False)
    else:
        dset.write(args.infile, validate=False)
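# A minimal usage sketch for loadStatsXml, assuming an argparse-style
# namespace whose attribute names mirror the ones the function reads.
# The file names here are hypothetical.
import argparse

parser = argparse.ArgumentParser(
    description="Merge an sts.xml file into a DataSet XML")
parser.add_argument("infile", help="input DataSet XML")
parser.add_argument("statsfile", help="sts.xml file to load")
parser.add_argument("--outfile", default=None,
                    help="output path (defaults to rewriting infile)")
parser.add_argument("--strict", action="store_true",
                    help="enable strict DataSet validation")

args = parser.parse_args(["movie.subreadset.xml", "movie.sts.xml"])
loadStatsXml(args)  # writes the stats-bearing XML back to movie.subreadset.xml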
def to_report(stats_xml):
    """Main point of entry

    :type stats_xml: str
    :rtype: Report
    """
    log.info("Analyzing XML {f}".format(f=stats_xml))
    dset = DataSet(stats_xml)
    if not dset.metadata.summaryStats:
        dset.loadStats(stats_xml)
    if not dset.metadata.summaryStats.prodDist:
        raise IOError("Pipeline Summary Stats (sts.xml) not found or missing "
                      "key distributions")
    dsets = [dset]
    for subdset in dset.subdatasets:
        if subdset.metadata.summaryStats:
            dsets.append(subdset)
    col_ids = [Constants.C_CONTEXT, Constants.C_ZMWS, Constants.C_PROD_0,
               Constants.C_PROD_1, Constants.C_PROD_2]
    col_values = [[], [], [], [], []]
    for dset in dsets:
        # the first row of a multi-dataset report aggregates all movies:
        if len(dsets) > 1 and len(col_values[0]) == 0:
            movie_name = "Combined"
        else:
            try:
                collection = list(dset.metadata.collections)[0]
                movie_name = collection.context
            except AttributeError:
                movie_name = "NA"
        productive_zmws = int(dset.metadata.summaryStats.numSequencingZmws)
        empty, productive, other, _ = dset.metadata.summaryStats.prodDist.bins
        prod0 = np.round(100.0 * empty / float(productive_zmws),
                         decimals=Constants.DECIMALS)
        prod1 = np.round(100.0 * productive / float(productive_zmws),
                         decimals=Constants.DECIMALS)
        prod2 = np.round(100.0 * other / float(productive_zmws),
                         decimals=Constants.DECIMALS)
        this_row = [movie_name, productive_zmws, prod0, prod1, prod2]
        for col, value in zip(col_values, this_row):
            col.append(value)
    columns = [Column(cid, values=vals)
               for cid, vals in zip(col_ids, col_values)]
    tables = [Table(Constants.T_LOADING, columns=columns)]
    # meta_rpt is the report spec, assumed to be defined at module scope
    report = Report(meta_rpt.id,
                    title=meta_rpt.title,
                    tables=tables,
                    attributes=None,
                    plotgroups=None)
    return meta_rpt.apply_view(report)
def to_report(stats_xml):
    """Main point of entry

    :type stats_xml: str
    :rtype: Report
    """
    log.info("Analyzing XML {f}".format(f=stats_xml))
    dset = DataSet(stats_xml)
    if not dset.metadata.summaryStats:
        dset.loadStats(stats_xml)
    if not dset.metadata.summaryStats.prodDist:
        raise IOError("Pipeline Summary Stats (sts.xml) not found or missing "
                      "key distributions")
    dsets = [dset]
    for subdset in dset.subdatasets:
        if subdset.metadata.summaryStats:
            dsets.append(subdset)
    col_names = ["Collection Context", "Productive ZMWs",
                 "Productivity 0 (%)", "Productivity 1 (%)",
                 "Productivity 2 (%)"]
    col_values = [[], [], [], [], []]
    for dset in dsets:
        # the first row of a multi-dataset report aggregates all movies:
        if len(dsets) > 1 and len(col_values[0]) == 0:
            movie_name = "Combined"
        else:
            try:
                collection = list(dset.metadata.collections)[0]
                movie_name = collection.context
            except AttributeError:
                movie_name = "NA"
        productive_zmws = int(dset.metadata.summaryStats.numSequencingZmws)
        empty, productive, other, _ = dset.metadata.summaryStats.prodDist.bins
        prod0 = np.round(100.0 * empty / float(productive_zmws),
                         decimals=Constants.DECIMALS)
        prod1 = np.round(100.0 * productive / float(productive_zmws),
                         decimals=Constants.DECIMALS)
        prod2 = np.round(100.0 * other / float(productive_zmws),
                         decimals=Constants.DECIMALS)
        this_row = [movie_name, productive_zmws, prod0, prod1, prod2]
        for col, value in zip(col_values, this_row):
            col.append(value)
    # derive a column id from each header, e.g.
    # "Productivity 0 (%)" -> "productivity_0"
    columns = [Column(cn.translate(str.maketrans('', '', '(%)'))
                      .strip().replace(' ', '_').lower(),
                      cn, vals)
               for cn, vals in zip(col_names, col_values)]
    tables = [Table("loading_xml_table", "Loading Statistics", columns)]
    report = Report("loading_xml_report",
                    title="Loading Report",
                    tables=tables,
                    attributes=None,
                    plotgroups=None)
    return report
def to_report(stats_xml):
    """Main point of entry

    :type stats_xml: str
    :rtype: Report
    """
    log.info("Analyzing XML {f}".format(f=stats_xml))
    dset = DataSet(stats_xml)
    if not dset.metadata.summaryStats:
        dset.loadStats(stats_xml)
    if not dset.metadata.summaryStats.prodDist:
        raise IOError("Pipeline Summary Stats (sts.xml) not found or missing "
                      "key distributions")
    dsets = [dset]
    for subdset in dset.subdatasets:
        if subdset.metadata.summaryStats:
            dsets.append(subdset)
    col_ids = [Constants.C_CONTEXT, Constants.C_ZMWS, Constants.C_PROD_0,
               Constants.C_PROD_1, Constants.C_PROD_2]
    col_values = [[], [], [], [], []]
    for dset in dsets:
        # the first row of a multi-dataset report aggregates all movies:
        if len(dsets) > 1 and len(col_values[0]) == 0:
            movie_name = "Combined"
        else:
            try:
                collection = list(dset.metadata.collections)[0]
                movie_name = collection.context
            except AttributeError:
                movie_name = "NA"
        productive_zmws = int(dset.metadata.summaryStats.numSequencingZmws)
        empty, productive, other, _ = dset.metadata.summaryStats.prodDist.bins
        prod0 = np.round(100.0 * empty / float(productive_zmws),
                         decimals=Constants.DECIMALS)
        prod1 = np.round(100.0 * productive / float(productive_zmws),
                         decimals=Constants.DECIMALS)
        prod2 = np.round(100.0 * other / float(productive_zmws),
                         decimals=Constants.DECIMALS)
        this_row = [movie_name, productive_zmws, prod0, prod1, prod2]
        for col, value in zip(col_values, this_row):
            col.append(value)
    columns = [Column(cid, values=vals)
               for cid, vals in zip(col_ids, col_values)]
    tables = [Table(Constants.T_LOADING, columns=columns)]
    report = Report(Constants.R_ID,
                    tables=tables,
                    attributes=None,
                    plotgroups=None)
    # spec is the report spec, assumed to be defined at module scope
    return spec.apply_view(report)
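# A standalone sketch of the per-movie productivity arithmetic used in the
# loading reports above. The bin order (empty, productive, other, unused)
# follows the unpacking in to_report; the sample counts are borrowed from
# the test fixture later in this section, and the helper name is hypothetical.
import numpy as np

def prod_percentages(prod_bins, num_sequencing_zmws, decimals=2):
    empty, productive, other = prod_bins[:3]
    return tuple(np.round(100.0 * n / float(num_sequencing_zmws),
                          decimals=decimals)
                 for n in (empty, productive, other))

print(prod_percentages([1576, 901, 399, 0], 2876))
# -> roughly (54.8, 31.33, 13.87)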
def to_report(stats_xml, output_dir, dpi=72):
    """Main point of entry

    :type stats_xml: str
    :type output_dir: str
    :type dpi: int
    :rtype: Report
    """
    log.info("Analyzing XML {f}".format(f=stats_xml))
    # stats_xml should be a dataset:
    dset = DataSet(stats_xml)
    dataset_uuids = [dset.uuid]
    # but if it isn't, no problem:
    if not dset.metadata.summaryStats:
        dset.loadStats(stats_xml)
        # an sts file was provided, which will generate a new random uuid
        dataset_uuids = []
    if not dset.metadata.summaryStats.readLenDists:
        raise RuntimeError("No Pipeline Summary Stats (sts.xml) found")

    # Build the stats table:
    nbases = 0
    nreads = 0
    n50 = 0
    readscoretotal = 0
    readscorenumber = 0
    approx_read_lens = []

    # if a merge failed there may be more than one dist:
    for rlendist in dset.metadata.summaryStats.readLenDists:
        nbases += _total_from_bins(rlendist.bins,
                                   rlendist.minBinValue,
                                   rlendist.binWidth)
        nreads += sum(rlendist.bins)

        # N50:
        for i, lbin in enumerate(rlendist.bins):
            # use the bin midpoint, except for the last bin
            if i != len(rlendist.bins) - 1:
                value = ((i * rlendist.binWidth) + rlendist.minBinValue +
                         rlendist.binWidth / 2)
            # for the last bin, just use the lower bound
            else:
                value = (i * rlendist.binWidth) + rlendist.minBinValue
            approx_read_lens.extend([value] * lbin)
        # TODO(mdsmith)(2016-02-09) make sure maxOutlierValue is updated
        # during a merge /todo
        # but pop off that last value and replace it with the
        # maxOutlierValue:
        # approx_read_lens.pop()
        # approx_read_lens.append(rlendist.maxBinValue)
    n50 = np.round(compute_n50(approx_read_lens))

    for rqualdist in dset.metadata.summaryStats.readQualDists:
        readscoretotal += _total_from_bins(rqualdist.bins,
                                           rqualdist.minBinValue,
                                           rqualdist.binWidth)
        readscorenumber += sum(rqualdist.bins)

    readlen = 0
    if nreads != 0:
        readlen = np.round(float(nbases) / nreads, decimals=2)
    readQuality = 0
    if readscorenumber != 0:
        readQuality = np.round(float(readscoretotal) / readscorenumber,
                               decimals=2)
    row_names = ["Polymerase Read Bases",
                 "Polymerase Reads",
                 "Polymerase Read N50",
                 "Polymerase Read Length",
                 "Polymerase Read Quality"]
    _pre_filter = [np.round(nbases, decimals=2),
                   nreads,
                   n50,
                   readlen,
                   readQuality]

    plots = []
    # ReadLen distribution to barplot:
    for i, rlendist in enumerate(dset.metadata.summaryStats.readLenDists):
        len_fig, len_axes = get_fig_axes_lpr()
        len_axes.bar(rlendist.labels, rlendist.bins,
                     color=get_green(0), edgecolor=get_green(0),
                     width=(rlendist.binWidth * 0.75))
        len_axes.set_xlabel('Read Length')
        len_axes.set_ylabel('Reads')
        png_fn = os.path.join(output_dir, "readLenDist{i}.png".format(i=i))
        png_base, thumbnail_base = save_figure_with_thumbnail(len_fig, png_fn,
                                                              dpi=dpi)
        plots.append(Plot("filter_len_xml_plot_{i}".format(i=i),
                          os.path.relpath(png_base, output_dir),
                          thumbnail=os.path.relpath(thumbnail_base,
                                                    output_dir)))
    plot_groups = [PlotGroup("filter_len_xml_plot_group",
                             title="Polymerase Read Length",
                             plots=plots,
                             thumbnail=os.path.relpath(thumbnail_base,
                                                       output_dir))]

    plots = []
    # ReadQual distribution to barplot:
    for i, rqualdist in enumerate(dset.metadata.summaryStats.readQualDists):
        qual_fig, qual_axes = get_fig_axes_lpr()
        qual_axes.bar(rqualdist.labels, rqualdist.bins,
                      color=get_green(0), edgecolor=get_green(0),
                      width=(rqualdist.binWidth * 0.75))
        qual_axes.set_xlabel('Read Quality')
        qual_axes.set_ylabel('Reads')
        png_fn = os.path.join(output_dir, "readQualDist{i}.png".format(i=i))
        png_base, thumbnail_base = save_figure_with_thumbnail(qual_fig, png_fn,
                                                              dpi=dpi)
        plots.append(Plot("filter_qual_xml_plot_{i}".format(i=i),
                          os.path.relpath(png_base, output_dir),
                          thumbnail=os.path.relpath(thumbnail_base,
                                                    output_dir)))
    plot_groups.append(PlotGroup("filter_qual_xml_plot_group",
                                 title="Polymerase Read Quality",
                                 plots=plots))

    # build the report:
    columns = [Column("filter_names_column", header="Metrics",
                      values=row_names),
               Column("filter_stats_column", header="Values",
                      values=_pre_filter)]
    tables = [Table("filter_xml_table", "Filtering Statistics", columns)]
    report = Report("filtering_stats_xml_report",
                    title="Filtering stats XML report",
                    tables=tables,
                    attributes=None,
                    plotgroups=plot_groups,
                    dataset_uuids=dataset_uuids)
    return report
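# Hedged sketches of the two helpers the filtering report relies on.
# _total_from_bins and compute_n50 are defined in the surrounding package;
# these reimplementations only illustrate the arithmetic implied by their
# call sites (the real _total_from_bins may weight by bin midpoint rather
# than lower bound).
def _total_from_bins(bins, min_val, bin_width):
    # weight each bin count by that bin's lower-bound value
    return sum(count * (min_val + i * bin_width)
               for i, count in enumerate(bins))

def compute_n50(lengths):
    # N50: the length at which half the total bases lie in reads this
    # long or longer
    total = sum(lengths)
    running = 0
    for length in sorted(lengths, reverse=True):
        running += length
        if 2 * running >= total:
            return length
    return 0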
def test_stats_metadata(self):
    ds = DataSet(data.getBam())
    ds.loadStats(data.getStats())
    self.assertEqual(ds.metadata.summaryStats.prodDist.numBins, 4)
    self.assertEqual(ds.metadata.summaryStats.prodDist.bins,
                     [1576, 901, 399, 0])
    ds1 = DataSet(data.getXml(8))
    ds1.loadStats(data.getStats())
    ds2 = DataSet(data.getXml(11))
    ds2.loadStats(data.getStats())
    ds3 = ds1 + ds2
    self.assertEqual(ds1.metadata.summaryStats.prodDist.bins,
                     [1576, 901, 399, 0])
    self.assertEqual(ds2.metadata.summaryStats.prodDist.bins,
                     [1576, 901, 399, 0])
    self.assertEqual(ds3.metadata.summaryStats.prodDist.bins,
                     [3152, 1802, 798, 0])
    self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
                     [0, 62, 39, 36, 29, 37, 19, 29, 37, 32, 32, 40, 45, 54,
                      73, 77, 97, 95, 49, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                      0, 0])
    self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
                     [0, 62, 39, 36, 29, 37, 19, 29, 37, 32, 32, 40, 45, 54,
                      73, 77, 97, 95, 49, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                      0, 0])
    self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
                     [0, 124, 78, 72, 58, 74, 38, 58, 74, 64, 64, 80, 90,
                      108, 146, 154, 194, 190, 98, 34, 4, 0, 0, 0, 0, 0, 0,
                      0, 0, 0, 0, 0])

    # Let's check some manual values
    ds1 = DataSet(data.getXml(8))
    ds1.loadStats(data.getStats())
    ds2 = DataSet(data.getXml(11))
    ds2.loadStats(data.getStats())
    ds1.metadata.summaryStats.readLenDist.bins = (
        [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
    self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
                     [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
    ds1.metadata.summaryStats.readLenDist.minBinValue = 10
    ds1.metadata.summaryStats.readLenDist.binWidth = 10
    ds2.metadata.summaryStats.readLenDist.bins = (
        [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
    self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
                     [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
    ds2.metadata.summaryStats.readLenDist.minBinValue = 20
    ds2.metadata.summaryStats.readLenDist.binWidth = 10
    ds3 = ds1 + ds2
    self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
                     [0, 10, 10, 9, 8, 7, 5, 3, 2, 1, 0, 1, 1])

    # now let's swap
    ds1 = DataSet(data.getXml(8))
    ds1.loadStats(data.getStats())
    ds2 = DataSet(data.getXml(11))
    ds2.loadStats(data.getStats())
    ds1.metadata.summaryStats.readLenDist.bins = (
        [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
    self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
                     [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
    ds1.metadata.summaryStats.readLenDist.minBinValue = 20
    ds1.metadata.summaryStats.readLenDist.binWidth = 10
    ds2.metadata.summaryStats.readLenDist.bins = (
        [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
    self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
                     [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
    ds2.metadata.summaryStats.readLenDist.minBinValue = 10
    ds2.metadata.summaryStats.readLenDist.binWidth = 10
    ds3 = ds1 + ds2
    self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
                     [0, 1, 11, 10, 9, 8, 7, 5, 3, 1, 0, 1, 1])

    # now let's do some non-overlapping
    ds1 = DataSet(data.getXml(8))
    ds1.loadStats(data.getStats())
    ds2 = DataSet(data.getXml(11))
    ds2.loadStats(data.getStats())
    ds1.metadata.summaryStats.readLenDist.bins = [1, 1, 1]
    self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
                     [1, 1, 1])
    ds1.metadata.summaryStats.readLenDist.minBinValue = 10
    ds1.metadata.summaryStats.readLenDist.binWidth = 10
    ds2.metadata.summaryStats.readLenDist.bins = [2, 2, 2]
    self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
                     [2, 2, 2])
    ds2.metadata.summaryStats.readLenDist.minBinValue = 50
    ds2.metadata.summaryStats.readLenDist.binWidth = 10
    ds3 = ds1 + ds2
    self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
                     [1, 1, 1, 0, 2, 2, 2])

    # now let's test the subdataset metadata retention:
    ss = SubreadSet(data.getXml(10))
    ss.loadStats(data.getStats(0))
    ss.loadStats(data.getStats(1))
    self.assertEqual(153168.0, ss.metadata.summaryStats.numSequencingZmws)
    self.assertEqual(
        2876.0, ss.subdatasets[0].metadata.summaryStats.numSequencingZmws)
    self.assertEqual(
        150292.0, ss.subdatasets[1].metadata.summaryStats.numSequencingZmws)
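# A pure-Python sketch of the histogram merge the assertions above exercise:
# two readLenDists with the same binWidth are aligned by minBinValue and
# their counts summed bin-by-bin (e.g. minBinValues 10 and 20 shift the
# second distribution right by one bin). The real merge lives in the DataSet
# addition machinery; this helper only reproduces the expected outputs.
def merge_dists(bins_a, min_a, bins_b, min_b, bin_width):
    base = min(min_a, min_b)
    offset_a = (min_a - base) // bin_width
    offset_b = (min_b - base) // bin_width
    merged = [0] * max(offset_a + len(bins_a), offset_b + len(bins_b))
    for i, count in enumerate(bins_a):
        merged[offset_a + i] += count
    for i, count in enumerate(bins_b):
        merged[offset_b + i] += count
    return merged

# Matches the overlapping case asserted above:
assert merge_dists([0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1], 10,
                   [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1], 20,
                   10) == [0, 10, 10, 9, 8, 7, 5, 3, 2, 1, 0, 1, 1]
# And the non-overlapping case:
assert merge_dists([1, 1, 1], 10, [2, 2, 2], 50, 10) == [1, 1, 1, 0, 2, 2, 2]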
def to_report(stats_xml, output_dir, dpi=72):  # TODO: make dpi matter
    """Main point of entry

    :type stats_xml: str
    :type output_dir: str
    :type dpi: int
    :rtype: Report
    """
    log.info("Analyzing XML {f}".format(f=stats_xml))
    dset = DataSet(stats_xml)
    if not dset.metadata.summaryStats:
        dset.loadStats(stats_xml)
    if not dset.metadata.summaryStats.medianInsertDists:
        raise RuntimeError("No Pipeline Summary Stats (sts.xml) found")

    # Pull some stats:
    adapter_dimers = np.round(
        100.0 * dset.metadata.summaryStats.adapterDimerFraction,
        decimals=2)
    short_inserts = np.round(
        100.0 * dset.metadata.summaryStats.shortInsertFraction,
        decimals=2)

    plots = []
    # Pull some histograms (may have dupes (unmergeable distributions)):
    for i, ins_len_dist in enumerate(
            dset.metadata.summaryStats.medianInsertDists):
        # make a bar chart:
        fig, ax = get_fig_axes_lpr()
        ax.bar([float(l) for l in ins_len_dist.labels], ins_len_dist.bins,
               color=get_green(0), edgecolor=get_green(0),
               width=(ins_len_dist.binWidth * 0.75))
        ax.set_xlabel('Median Distance Between Adapters')
        ax.set_ylabel('Reads')
        png_fn = os.path.join(output_dir,
                              "interAdapterDist{i}.png".format(i=i))
        png_base, thumbnail_base = save_figure_with_thumbnail(fig, png_fn,
                                                              dpi=dpi)
        # build the report:
        plots.append(Plot("adapter_xml_plot_{i}".format(i=i),
                          os.path.relpath(png_base),
                          thumbnail=os.path.relpath(thumbnail_base)))
    plot_groups = [PlotGroup("adapter_xml_plot_group",
                             title="Observed Insert Length Distribution",
                             plots=plots,
                             thumbnail=os.path.relpath(thumbnail_base))]

    columns = [Column("adaper_xml_conditions", None,
                      ('Adapter Dimers (0-10bp)',
                       'Short Inserts (11-100bp)')),
               Column("adaper_xml_results", None,
                      (adapter_dimers, short_inserts))]
    tables = [Table("adapter_xml_table", "Adapter Statistics", columns)]
    report = Report("adapter_xml_report",
                    title="Adapter Report",
                    tables=tables,
                    attributes=None,
                    plotgroups=plot_groups)
    return report
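# A usage sketch for the adapter report entry point above. The input file
# name is hypothetical, and write_json is assumed to exist on the Report
# model (hedged -- check the pbcommand version in use).
import os

out_dir = "adapter_report"
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)  # output_dir must exist before the PNGs are written
report = to_report("movie.subreadset.xml", output_dir=out_dir, dpi=72)
report.write_json(os.path.join(out_dir, "adapter.report.json"))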