def run_after(self, rtc, output_dir):
    """Verify attribute values, names, and ordering of the mock report."""
    rpt = load_report_from_json(rtc.task.output_files[0])
    expected_values = {
        "mock_attr_0": 0,
        "mock_attr_1": 2,
        "mock_attr_2": 4,
        "mock_attr_3": 6,
        "mock_attr_4": 8,
    }
    expected_names = {
        "mock_attr_0": "Attr 0",
        "mock_attr_1": "Attr 1",
        "mock_attr_2": "Attr 2",
        "mock_attr_3": "Attr 3",
        "mock_attr_4": "Attr 4",
    }
    self.assertEqual({x.id: x.value for x in rpt.attributes}, expected_values)
    self.assertEqual({x.id: x.name for x in rpt.attributes}, expected_names)
    # check attribute order
    self.assertEqual([str(x.id) for x in rpt.attributes],
                     ["mock_attr_0", "mock_attr_1", "mock_attr_2",
                      "mock_attr_3", "mock_attr_4"])
def run_after(self, rtc, output_dir):
    """Check polished HQ/LQ isoform counts in the generated report."""
    rpt = load_report_from_json(rtc.task.output_files[0])
    observed = dict((a.id, a.value) for a in rpt.attributes)
    self.assertEqual(observed, {
        "num_polished_hq_isoforms": 11701,
        "num_polished_lq_isoforms": 44,
    })
def _report_to_attributes(report_file):
    """Load report attributes, appending a derived mean consensus isoform
    length whenever at least one consensus isoform is present."""
    report = load_report_from_json(report_file)
    values = dict((a.id, a.value) for a in report.attributes)
    n_isoforms = values.get("isoseq_numconsensusisoforms", 0)
    if n_isoforms > 0:
        mean_length = values["isoseq_numtotalbases"] / n_isoforms
        report.attributes.append(
            Attribute("isoseq_average_consensus_isoform_length", mean_length,
                      name="Average consensus isoform length"))
    return report.attributes
def test_exit_code_0(self):
    """Run the SAT report CLI end-to-end and verify report attributes."""
    bam = self.getAlignmentSet()
    var_rpt = os.path.join(DATA, 'variants_report.json')
    mapping_rpt = os.path.join(DATA, 'mapping_stats_report.json')
    cmd = 'python -m pbreports.report.sat {o} {r} {c} {a} {v}'.format(
        o=self._output_dir, r='rpt.json', c=bam, a=var_rpt, v=mapping_rpt)
    o, c, m = backticks(cmd)
    log.info(cmd)
    # 'c is not 0' compared identity (an int-interning artifact), not value
    if c != 0:
        log.error(m)
        log.error(o)
        print(m)
    # assertEquals is a deprecated alias of assertEqual
    self.assertEqual(0, c)
    rpt_file = os.path.join(self._output_dir, 'rpt.json')
    rpt = load_report_from_json(rpt_file)
    self.assertEqual('sidney', rpt.get_attribute_by_id('instrument').value)
    self.assertEqual(1, rpt.get_attribute_by_id('coverage').value)
    self.assertEqual(1, rpt.get_attribute_by_id('concordance').value)
    self.assertEqual(7752, rpt.get_attribute_by_id(
        'mapped_readlength_mean').value)
    self.assertEqual(48, rpt.get_attribute_by_id('reads_in_cell').value)
    out = StringIO()
    self.assertTrue(summarize_report(rpt_file, out=out))
def test_exit_code_0(self):
    """Run the SAT report CLI end-to-end and verify report attributes."""
    bam = self.getAlignmentSet()
    var_rpt = os.path.join(DATA, 'variants_report.json')
    mapping_rpt = os.path.join(DATA, 'mapping_stats_report.json')
    cmd = 'python -m pbreports.report.sat {o} {r} {c} {a} {v}'.format(
        o=self._output_dir, r='rpt.json', c=bam, a=var_rpt, v=mapping_rpt)
    o, c, m = backticks(cmd)
    log.info(cmd)
    # 'c is not 0' compared identity (an int-interning artifact), not value
    if c != 0:
        log.error(m)
        log.error(o)
        print(m)
    # assertEquals is a deprecated alias of assertEqual
    self.assertEqual(0, c)
    rpt_file = os.path.join(self._output_dir, 'rpt.json')
    rpt = load_report_from_json(rpt_file)
    self.assertEqual('sidney', rpt.get_attribute_by_id('instrument').value)
    self.assertEqual(1, rpt.get_attribute_by_id('coverage').value)
    self.assertEqual(1, rpt.get_attribute_by_id('accuracy').value)
    self.assertEqual(1328, rpt.get_attribute_by_id(
        'mapped_readlength_mean').value)
    self.assertEqual(48, rpt.get_attribute_by_id('reads_in_cell').value)
    out = StringIO()
    self.assertTrue(summarize_report(rpt_file, out=out))
def run_after(self, rtc, output_dir):
    """Assert that avg_flnc_len was computed as expected."""
    report = load_report_from_json(rtc.task.output_files[0])
    matches = [a for a in report.attributes if a.id == "avg_flnc_len"]
    if not matches:
        self.fail("avg_flnc_len not found")
    self.assertEqual(matches[0].value, 1142)
def summarize_report(report_file, out=sys.stdout):
    """Write a coverage/concordance summary; return True when both are 1."""
    attrs = {a.id: a.value
             for a in load_report_from_json(report_file).attributes}
    cov = attrs[Constants.A_COVERAGE]
    conc = attrs[Constants.A_CONCORDANCE]
    out.write("%s:\n" % report_file)
    out.write(" {n}: {a}\n".format(
        n=meta_rpt.get_meta_attribute(Constants.A_CONCORDANCE).name, a=conc))
    out.write(" {n}: {c}\n".format(
        n=meta_rpt.get_meta_attribute(Constants.A_COVERAGE).name, c=cov))
    return cov == 1 and conc == 1
def _validate_against_spec(path):
    """Validate a report JSON file against its registered spec.

    NOTE(review): references ``self`` as a free variable, so this is
    presumably a closure inside a test case — confirm in context.
    """
    if self._specs is None:
        raise unittest.SkipTest("Can't find report specs.")
    rpt = load_report_from_json(path)
    spec = self._specs.get(rpt.id)
    if spec is None:
        self.fail("No spec found for report {r}".format(r=rpt.id))
    return spec.validate_report(rpt)
def summarize_report(report_file, out=sys.stdout):
    """Report consensus accuracy/coverage; True when both equal 1."""
    report = load_report_from_json(report_file)
    stats = dict((a.id, a.value) for a in report.attributes)
    out.write("%s:\n" % report_file)
    out.write(" CONSENSUS ACCURACY: {a}\n".format(a=stats["accuracy"]))
    out.write(" CONSENSUS COVERAGE: {c}\n".format(c=stats["coverage"]))
    return stats["coverage"] == 1 and stats["accuracy"] == 1
def _get_mapping_stats_data(mapping_stats_rpt_file):
    """
    Extract attributes from the mapping stats report.

    :param mapping_stats_rpt_file: (str) path to the mapping stats report
    :return dict: mean mapped read length keyed by its attribute ID
    """
    report = load_report_from_json(mapping_stats_rpt_file)
    mean_readlength = report.get_attribute_by_id(Constants.M_READLENGTH)
    return {Constants.M_READLENGTH: mean_readlength.value}
def summarize_report(report_file, out=sys.stdout):
    """Print coverage and concordance; True when both values equal 1."""
    by_id = {a.id: a for a in load_report_from_json(report_file).attributes}
    coverage = by_id[Constants.A_COVERAGE]
    concordance = by_id[Constants.A_CONCORDANCE]
    out.write("{f}:\n".format(f=report_file))
    for attribute in (concordance, coverage):
        out.write(" {n}: {v}\n".format(n=attribute.name, v=attribute.value))
    return coverage.value == 1 and concordance.value == 1
def makeReport(inReadsFN, inSummaryFN, outDir):
    """
    Generate a report with ID, tables, attributes and plot groups.

    inReadsFN --- an input FASTA file which has all consensus
    isoforms produced by pbtranscript.py cluster.
    This file is required to plot a read length histogram as part of
    the report: consensus_isoforms_readlength_hist.png

    inSummaryFN --- a summary TXT file with cluster attributes,
    including two attributes:
        number of consensus isoforms
        average length of consensus isoforms
    Attributes of the report are extracted from this file.
    """
    log.info("Plotting read length histogram from file: {f}".
             format(f=inReadsFN))

    # Gather the length of every consensus isoform read.
    reader = ContigSet(inReadsFN)
    read_lengths = np.array([len(rec.sequence) for rec in reader])
    reader.close()

    # Histogram plot of the collected read lengths.
    readlength_plot = create_readlength_plot(read_lengths, outDir)
    readlength_group = PlotGroup(
        Constants.PG_READLENGTH,
        title="Read Length of Consensus Isoforms Reads",
        plots=[readlength_plot],
        thumbnail=readlength_plot.thumbnail)

    log.info("Plotting summary attributes from file: {f}".
             format(f=inSummaryFN))
    dataset_uuids = [ContigSet(inReadsFN).uuid]
    if inSummaryFN.endswith(".json"):
        attributes = _report_to_attributes(inSummaryFN)
        # FIXME(nechols)(2016-03-22) not using the dataset UUIDs from these
        # reports; should we be?
        load_report_from_json(inSummaryFN)
    else:
        attributes = summaryToAttributes(inSummaryFN)

    table = attributesToTable(attributes)
    log.info(str(table))

    # A report consists of an ID, tables, attributes, and plot groups.
    return Report(Constants.R_ID,
                  title="Transcript Clustering",
                  attributes=attributes,
                  plotgroups=[readlength_group],
                  dataset_uuids=dataset_uuids)
def gather_report(json_files, output_file):
    """
    Combines statistics (usually raw counts) stored as JSON files.
    Data models: pbcommand.models.report
    """
    merged = Report.merge([load_report_from_json(f) for f in json_files])
    with open(output_file, "w") as out:
        out.write(merged.to_json())
    return output_file
def gather_report(json_files, output_file):
    """
    Combines statistics (usually raw counts) stored as JSON files.
    Data models: pbcommand.models.report
    """
    loaded = [load_report_from_json(path) for path in json_files]
    combined = Report.merge(loaded)
    with open(output_file, "w") as handle:
        handle.write(combined.to_json())
    return output_file
def _get_variants_data(variants_rpt_file):
    """
    Extract attributes from the variants report.

    :param variants_rpt_file: (str) path to the variants report
    :return dict: coverage and concordance
    """
    report = load_report_from_json(variants_rpt_file)
    return {
        Constants.A_COVERAGE:
            report.get_attribute_by_id(Constants.V_COVERAGE).value,
        Constants.A_CONCORDANCE:
            report.get_attribute_by_id(Constants.V_CONCORDANCE).value,
    }
def test_make_report(self):
    """Generated report attributes must match the summary JSON's values."""
    tmpdir = tempfile.mkdtemp()
    report = pbreports.report.isoseq_classify.make_report(
        self.input_fasta, self.output_summary_json, tmpdir)
    # exercise JSON serialization as well
    report.to_json()
    generated = {a.id: a.value for a in report.attributes}
    expected = load_report_from_json(self.output_summary_json)
    for attribute in expected.attributes:
        self.assertEqual(generated[attribute.id], attribute.value)
def _get_variants_data(variants_rpt_file):
    """
    Extract attributes from the variants report.

    :param variants_rpt_file: (str) path to the variants report
    :return dict: coverage and accuracy
    """
    report = load_report_from_json(variants_rpt_file)
    return {
        'coverage':
            report.get_attribute_by_id('weighted_mean_bases_called').value,
        'accuracy':
            report.get_attribute_by_id('weighted_mean_concordance').value,
    }
def run_after(self, rtc, output_dir):
    """Report's dataset UUIDs must match the output ContigSet UUIDs."""
    report = None
    dataset_uuids = []
    for path in rtc.task.output_files:
        if path.endswith(".json"):
            report = load_report_from_json(path)
            continue
        if path.endswith(".xml"):
            dataset_uuids.append(ContigSet(path, strict=True).uuid)
            continue
        # only JSON, XML and CSV outputs are expected
        assert path.endswith(".csv")
    self.assertEqual(sorted(report._dataset_uuids), sorted(dataset_uuids))
def makeReport(inReadsFN, inSummaryFN, outDir):
    """
    Generate a report with ID, tables, attributes and plot groups.

    inReadsFN --- an input FASTA file which has all consensus
    isoforms produced by pbtranscript.py cluster.
    This file is required to plot a read length histogram as part of
    the report: consensus_isoforms_readlength_hist.png

    inSummaryFN --- a summary TXT file with cluster attributes,
    including two attributes:
        number of consensus isoforms
        average length of consensus isoforms
    Attributes of the report are extracted from this file.
    """
    log.info("Plotting read length histogram from file: {f}".
             format(f=inReadsFN))

    # Gather the length of every consensus isoform read.
    reader = ContigSet(inReadsFN)
    read_lengths = np.array([len(rec.sequence) for rec in reader])
    reader.close()

    # Histogram plot of the collected read lengths.
    readlength_plot = create_readlength_plot(read_lengths, outDir)
    readlength_group = PlotGroup(Constants.PG_READLENGTH,
                                 plots=[readlength_plot],
                                 thumbnail=readlength_plot.thumbnail)

    log.info("Plotting summary attributes from file: {f}".
             format(f=inSummaryFN))
    dataset_uuids = [ContigSet(inReadsFN).uuid]
    attributes = _report_to_attributes(inSummaryFN)
    # FIXME(nechols)(2016-03-22) not using the dataset UUIDs from these
    # reports; should we be?
    load_report_from_json(inSummaryFN)

    table = attributesToTable(attributes)
    log.info(str(table))

    # A report consists of an ID, tables, attributes, and plot groups.
    report = Report(Constants.R_ID,
                    title=meta_rpt.title,
                    attributes=attributes,
                    plotgroups=[readlength_group],
                    dataset_uuids=dataset_uuids)
    return meta_rpt.apply_view(report)
def run_after(self, rtc, output_dir):
    """Verify CCS failure-mode counts, keyed by attribute name."""
    report = load_report_from_json(rtc.task.output_files[0])
    observed = dict((a.name, a.value) for a in report.attributes)
    expected = {
        'num_below_min_accuracy': 0,
        'num_not_converged': 0,
        'num_insert_size_too_small': 0,
        'num_too_many_unusable_subreads': 3,
        'num_no_usable_subreads': 0,
        'num_below_snr_threshold': 27,
        'num_ccs_reads': 52,
        'num_not_enough_full_passes': 58,
    }
    self.assertEqual(observed, expected)
def test_smoke(self):
    """Gathered report must sum read counts across the inputs."""
    first = get_temp_file(suffix="-stats-1.json")
    _write_stats_to_json({"n_reads": 549, "n_zmws": 100}, first)
    second = get_temp_file(suffix="-stats-2.json")
    _write_stats_to_json({"n_reads": 733, "n_zmws": 100}, second)
    gathered = get_temp_file(suffix="stats-gather.json")
    G.gather_report([first, second], gathered)
    report = load_report_from_json(gathered)
    totals = {a.id: a.value for a in report.attributes}
    self.assertEqual(totals["pb_n_reads"], 549 + 733)
    self.assertEqual(totals["pb_n_zmws"], 200)
def test_smoke(self):
    """Merging two stats files should add reads and ZMW counts."""
    stats_a = get_temp_file(suffix="-stats-1.json")
    stats_b = get_temp_file(suffix="-stats-2.json")
    _write_stats_to_json({'n_reads': 549, 'n_zmws': 100}, stats_a)
    _write_stats_to_json({'n_reads': 733, 'n_zmws': 100}, stats_b)
    merged_path = get_temp_file(suffix="stats-gather.json")
    G.gather_report([stats_a, stats_b], merged_path)
    merged = load_report_from_json(merged_path)
    combined = dict((a.id, a.value) for a in merged.attributes)
    self.assertEqual(combined['pb_n_reads'], 549 + 733)
    self.assertEqual(combined['pb_n_zmws'], 200)
def summarize_report(report_file, out=sys.stdout):
    """Summarize SAT coverage/concordance; True if both are exactly 1."""
    report = load_report_from_json(report_file)
    values = dict((a.id, a.value) for a in report.attributes)
    coverage = values[Constants.A_COVERAGE]
    concordance = values[Constants.A_CONCORDANCE]
    out.write("%s:\n" % report_file)
    conc_name = meta_rpt.get_meta_attribute(Constants.A_CONCORDANCE).name
    cov_name = meta_rpt.get_meta_attribute(Constants.A_COVERAGE).name
    out.write(" {n}: {a}\n".format(n=conc_name, a=concordance))
    out.write(" {n}: {c}\n".format(n=cov_name, c=coverage))
    return coverage == 1 and concordance == 1
def _get_variants_data(variants_rpt_file):
    """
    Extract attributes from the variants report.

    :param variants_rpt_file: (str) path to the variants report
    :return dict: coverage and concordance
    """
    rpt = load_report_from_json(variants_rpt_file)

    def value_of(attr_id):
        # helper: pull a single attribute value by ID
        return rpt.get_attribute_by_id(attr_id).value

    return {Constants.A_COVERAGE: value_of(Constants.V_COVERAGE),
            Constants.A_CONCORDANCE: value_of(Constants.V_CONCORDANCE)}
def gather_report(json_files, output_file, dataset_xml=None):
    """
    Combines statistics (usually raw counts) stored as JSON files.
    Data models: pbcommand.models.report

    When dataset_xml is supplied, the merged report's dataset UUID list
    is replaced by that dataset's UUID.
    """
    merged = Report.merge([load_report_from_json(f) for f in json_files])
    if dataset_xml is not None:
        merged._dataset_uuids = [get_dataset_metadata(dataset_xml).uuid]
    with open(output_file, "w") as out:
        out.write(merged.to_json())
    return output_file
def test_pbreports_reports_have_dataset_uuids(self):
    """Check that reports from pbreports list all input dataset UUIDs"""
    for file_id, file_info in self.datastore.get_file_dict().iteritems():
        # 'not x in y' is the non-idiomatic spelling of 'x not in y'
        if "pbreports" not in file_info.file_id:
            continue
        if file_info.file_type_id == FileTypes.REPORT.file_type_id:
            r = load_report_from_json(file_info.path)
            report_uuids = set(r._dataset_uuids)
            ds_uuids = _get_rtc_dataset_uuids(file_info.path)
            for uuid in ds_uuids:
                self.assertTrue(uuid in report_uuids,
                                "{p}: {l} not in {r}".format(p=file_info.path,
                                                             l=uuid,
                                                             r=report_uuids))
def get_report(self, report_id):
    """
    Returns one of the pbreports outputs.  report_id can be a sequence
    if the report lives under multiple names (e.g. for subreads/CCS).
    """
    # normalize a single ID to a one-element set for membership testing
    if isinstance(report_id, basestring):
        report_id = set([report_id])
    for uuid, file_info in self.data.files.iteritems():
        if file_info.file_type_id != FileTypes.REPORT.file_type_id:
            continue
        rpt = load_report_from_json(file_info.path)
        if rpt.id in report_id:
            return rpt
    raise IOError("Can't find report with ID {i}".format(
        i=" OR ".join(sorted(list(report_id)))))
def run_after(self, rtc, output_dir):
    """Check the mock report's attribute values, names, and ordering."""
    report = load_report_from_json(rtc.task.output_files[0])
    ids = [str(a.id) for a in report.attributes]
    values = [a.value for a in report.attributes]
    names = [a.name for a in report.attributes]
    self.assertEqual(dict(zip(ids, values)),
                     {"mock_attr_0": 0, "mock_attr_1": 2, "mock_attr_2": 4,
                      "mock_attr_3": 6, "mock_attr_4": 8})
    self.assertEqual(dict(zip(ids, names)),
                     {"mock_attr_0": "Attr 0", "mock_attr_1": "Attr 1",
                      "mock_attr_2": "Attr 2", "mock_attr_3": "Attr 3",
                      "mock_attr_4": "Attr 4"})
    # check attribute order
    self.assertEqual(ids, ["mock_attr_0", "mock_attr_1", "mock_attr_2",
                           "mock_attr_3", "mock_attr_4"])
def run_after(self, rtc, output_dir):
    """Verify per-category CCS read counts in the report."""
    rpt = load_report_from_json(rtc.task.output_files[0])
    by_name = {attr.name: attr.value for attr in rpt.attributes}
    self.assertEqual(by_name, {
        'num_ccs_reads': 52,
        'num_below_min_accuracy': 0,
        'num_below_snr_threshold': 27,
        'num_insert_size_too_small': 0,
        'num_no_usable_subreads': 0,
        'num_not_converged': 0,
        'num_not_enough_full_passes': 58,
        'num_too_many_unusable_subreads': 3,
    })
def summarize_report(report_file, out=sys.stdout):
    """
    Utility function to harvest statistics from an existing report
    """
    from pbcommand.pb_io.report import load_report_from_json
    report = load_report_from_json(report_file)
    by_id = {a.id: a for a in report.attributes}
    out.write("{f}:\n".format(f=report_file))
    # print the headline attributes in a fixed order
    for attr_id in (Constants.A_SUBREAD_CONCORDANCE,
                    Constants.A_NSUBREADS,
                    Constants.A_NREADS,
                    Constants.A_SUBREAD_NBASES,
                    Constants.A_READLENGTH,
                    Constants.A_SUBREAD_LENGTH):
        attribute = by_id[attr_id]
        out.write(" {n}: {v}\n".format(n=attribute.name, v=attribute.value))
def getReport(cls, report_id):
    """Fetch a report from the local datastore or from services."""
    if cls.service_access_layer is None:
        report_json = cls.datastore.get_report(report_id)
        assert report_json is not None, "Can't find %s" % report_id
        return load_report_from_json(report_json)
    # load report from services, not raw file
    for rpt_info in cls.service_access_layer.get_analysis_job_reports(
            cls.job_id):
        file_info = rpt_info['dataStoreFile']
        if file_info.file_id.split("-")[0] == report_id:
            details = cls.service_access_layer.get_analysis_job_report_details(
                cls.job_id, file_info.uuid)
            return dict_to_report(details)
    raise RuntimeError("Can't find {i} report".format(i=report_id))
def test_pbreports_reports_have_dataset_uuids(self):
    """Check that reports from pbreports list all input dataset UUIDs"""
    for file_id, file_info in self.datastore.get_file_dict().iteritems():
        # 'not x in y' is the non-idiomatic spelling of 'x not in y'
        if "pbreports" not in file_info.file_id:
            continue
        if file_info.file_type_id == FileTypes.REPORT.file_type_id:
            r = load_report_from_json(file_info.path)
            report_uuids = set(r._dataset_uuids)
            ds_uuids = _get_rtc_dataset_uuids(file_info.path)
            for uuid in ds_uuids:
                self.assertTrue(
                    uuid in report_uuids,
                    "{p}: {l} not in {r}".format(p=file_info.path,
                                                 l=uuid,
                                                 r=report_uuids))
def summarize_report(report_file, out=sys.stdout):
    """
    Utility function to harvest statistics from an existing report
    """
    from pbcommand.pb_io.report import load_report_from_json
    report = load_report_from_json(report_file)
    values = dict((a.id, a.value) for a in report.attributes)
    for line in (
            "%s:" % report_file,
            " MEAN ACCURACY: {f}".format(
                f=values[Constants.A_SUBREAD_ACCURACY]),
            " NSUBREADS: {n}".format(n=values[Constants.A_NSUBREADS]),
            " NREADS: {n}".format(n=values[Constants.A_NREADS]),
            " NBASES: {n}".format(n=values[Constants.A_SUBREAD_NBASES]),
            " READLENGTH_MEAN: {n}".format(
                n=values[Constants.A_READLENGTH]),
            " SUBREADLENGTH_MEAN: {n}".format(
                n=values[Constants.A_SUBREAD_LENGTH])):
        out.write(line + "\n")
def summarize_report(report_file, out=sys.stdout):
    """
    Utility function to harvest statistics from an existing report
    """
    from pbcommand.pb_io.report import load_report_from_json
    report = load_report_from_json(report_file)
    values = dict((a.id, a.value) for a in report.attributes)
    out.write("%s:\n" % report_file)
    # emit each headline attribute with its spec-defined label
    for attr_id in (Constants.A_SUBREAD_CONCORDANCE,
                    Constants.A_NSUBREADS,
                    Constants.A_NREADS,
                    Constants.A_SUBREAD_NBASES,
                    Constants.A_READLENGTH,
                    Constants.A_SUBREAD_LENGTH):
        label = meta_rpt.get_meta_attribute(attr_id).name.upper()
        out.write(" {n}: {a}\n".format(n=label, a=values[attr_id]))
def getReport(cls):
    """Return the target report from the datastore or from services."""
    if cls.service_access_layer is None:
        return cls.datastore.get_report(cls.REPORT_ID)
    report_id = cls.REPORT_ID
    # normalize a single ID to a one-element set for membership testing
    if isinstance(report_id, basestring):
        report_id = set([report_id])
    # load report from services, not raw file
    for rpt_info in cls.service_access_layer.get_analysis_job_reports(
            cls.job_id):
        rpt = load_report_from_json(rpt_info['dataStoreFile'].path)
        if rpt.id in report_id:
            return rpt
    raise IOError("Can't find report with ID {i}".format(
        i=" OR ".join(sorted(list(report_id)))))
def getReport(cls, report_id):
    """Load the report either from disk or via the services API."""
    sal = cls.service_access_layer
    if sal is None:
        report_json = cls.datastore.get_report(report_id)
        assert report_json is not None, "Can't find %s" % report_id
        return load_report_from_json(report_json)
    # load report from services, not raw file
    for rpt_info in sal.get_analysis_job_reports(cls.job_id):
        file_info = rpt_info['dataStoreFile']
        source_id = file_info.file_id.split("-")[0]
        if source_id == report_id:
            report_d = sal.get_analysis_job_report_details(
                cls.job_id, file_info.uuid)
            return dict_to_report(report_d)
    raise RuntimeError("Can't find {i} report".format(i=report_id))
def test_datastore_report_file_uuid(self):
    """Each report JSON in the datastore must carry its recorded UUID."""
    datastore_path = os.path.join(self.job_dir, "workflow", "datastore.json")
    with open(datastore_path, 'r') as handle:
        datastore = json.loads(handle.read())
    report_files = [f for f in datastore['files']
                    if f['fileTypeId'] == FileTypes.REPORT.file_type_id]
    if not report_files:
        raise unittest.SkipTest("No Report JSON files in datastore.")
    for file_info in report_files:
        rpt = load_report_from_json(file_info['path'])
        self.assertEqual(rpt.uuid, file_info['uniqueId'],
                         "{p}: {u1} != {u2}".format(p=file_info['path'],
                                                    u1=rpt.uuid,
                                                    u2=file_info['uniqueId']))
def test_datastore_report_file_uuid(self):
    """Datastore-recorded UUIDs must match the report files' own UUIDs."""
    datastore_json = os.path.join(self.job_dir, "workflow", "datastore.json")
    with open(datastore_json, 'r') as reader:
        contents = json.loads(reader.read())
    checked = 0
    for entry in contents['files']:
        if entry['fileTypeId'] != FileTypes.REPORT.file_type_id:
            continue
        rpt = load_report_from_json(entry['path'])
        self.assertEqual(rpt.uuid, entry['uniqueId'],
                         "{p}: {u1} != {u2}".format(p=entry['path'],
                                                    u1=rpt.uuid,
                                                    u2=entry['uniqueId']))
        checked += 1
    if checked == 0:
        raise unittest.SkipTest("No Report JSON files in datastore.")
def gather_report(json_files, output_file):
    """
    Combines statistics (usually raw counts) stored as JSON files.
    Data models: pbcommand.models.report
    """
    reports = [load_report_from_json(fn) for fn in json_files]
    merged = Report.merge(reports)
    # Recompute the weighted-average FLNC length, which a plain merge
    # cannot derive (mimicking pbtranscript.io.Summary).
    flnc_total = 0
    flnc_bases_total = 0
    for rpt in reports:
        per_report = {a.id: a.value for a in rpt.attributes}
        flnc_total += per_report["num_flnc"]
        flnc_bases_total += per_report["num_flnc_bases"]
    if flnc_total > 0:
        for attribute in merged.attributes:
            if attribute.id == "avg_flnc_len":
                attribute._value = int(flnc_bases_total / flnc_total)
                log.info("Setting avg_flnc_len = {v}".format(
                    v=attribute.value))
    with open(output_file, "w") as writer:
        writer.write(merged.to_json())
    return output_file
def gather_report(json_files, output_file):
    """
    Combines statistics (usually raw counts) stored as JSON files.
    Data models: pbcommand.models.report
    """
    reports = [load_report_from_json(fn) for fn in json_files]
    merged = Report.merge(reports)
    counts = [{a.id: a.value for a in r.attributes} for r in reports]
    num_flnc = sum(c["num_flnc"] for c in counts)
    num_flnc_bases = sum(c["num_flnc_bases"] for c in counts)
    if num_flnc > 0:
        # mimicking pbtranscript.io.Summary
        for attr in merged.attributes:
            if attr.id == "avg_flnc_len":
                attr._value = int(num_flnc_bases / num_flnc)
                log.info("Setting avg_flnc_len = {v}".format(v=attr.value))
    with open(output_file, "w") as writer:
        writer.write(merged.to_json())
    return output_file
def test_exit_code_0(self):
    """Run pbreports.report.sat end-to-end and validate its attributes."""
    bam = self.getAlignmentSet()
    var_rpt = os.path.join(DATA, "variants_report.json")
    mapping_rpt = os.path.join(DATA, "mapping_stats_report.json")
    cmd = "python -m pbreports.report.sat {o} {r} {c} {a} {v}".format(
        o=self._output_dir, r="rpt.json", c=bam, a=var_rpt, v=mapping_rpt
    )
    o, c, m = backticks(cmd)
    log.info(cmd)
    # compare exit codes by value; 'is not 0' relies on int interning
    if c != 0:
        log.error(m)
        log.error(o)
        print(m)
    # assertEquals is a deprecated alias of assertEqual
    self.assertEqual(0, c)
    rpt_file = os.path.join(self._output_dir, "rpt.json")
    rpt = load_report_from_json(rpt_file)
    self.assertEqual("sidney", rpt.get_attribute_by_id("instrument").value)
    self.assertEqual(1, rpt.get_attribute_by_id("coverage").value)
    self.assertEqual(1, rpt.get_attribute_by_id("concordance").value)
    self.assertEqual(7752, rpt.get_attribute_by_id(
        "mapped_readlength_mean").value)
    self.assertEqual(48, rpt.get_attribute_by_id("reads_in_cell").value)
    out = StringIO()
    self.assertTrue(summarize_report(rpt_file, out=out))
    validate_report_complete(self, rpt)
def test_reports_metadata_complete(self):
    """Check that metadata is non-null and nonempty"""
    check = self.is_not_none_or_empty_str
    for file_id, file_info in self.datastore.get_file_dict().iteritems():
        if file_info.file_type_id != FileTypes.REPORT.file_type_id:
            continue
        report = load_report_from_json(file_info.path)
        check(report.id, "Report ID")
        check(report.title, "Report title")
        for attr in report.attributes:
            check(attr.name, "Attribute {} name".format(attr.id))
        for table in report.tables:
            check(table.title, "Table {} title".format(table.id))
            for col in table.columns:
                check(col.header, "Column {} header".format(col.id))
        for plotgroup in report.plotGroups:
            check(plotgroup.title, "Plotgroup {} title".format(plotgroup.id))
            for plot in plotgroup.plots:
                check(plot.title, "Plot {} title".format(plot.id))
                check(plot.caption, "Plot {} caption".format(plot.id))
def _to_report(self):
    """Deserialize the report JSON file referenced by this object."""
    report_path = self.report_json
    return load_report_from_json(report_path)
def make_report(contig_set, summary_txt, output_dir):
    """
    Generate a report with ID, tables, attributes and plot groups.

    :param contig_set: an input FASTA file which has all full-length,
    non-chimeric reads produced by pbtranscript.py classify.

    This file is required to plot a read length histogram as part of
    the report: fulllength_nonchimeric_readlength_hist.png

    :param summary_txt: a summary TXT file with classify attributes,
    including 6 attributes, number of consensus reads, number of five
    prime reads, number of three prime reads, number of poly-A reads,
    number of full-length non-chimeric reads, average full-length
    non-chimeric read length. Attributes of the report are extracted
    from this file.

    :type contig_set: str
    :type summary_txt: str
    :type output_dir: str

    :rtype: Report
    """
    log.info(
        "Plotting read length histogram from file: {f}".format(f=contig_set))

    # Stream read lengths straight out of the ContigSet.
    def _iter_lengths():
        with ContigSet(contig_set) as reads:
            for read in reads:
                yield len(read.sequence)

    read_lengths = np.fromiter(_iter_lengths(), dtype=np.int64,
                               count=-1).astype(float)

    # Build the read length histogram plot group.
    readlength_plot = create_readlength_plot(read_lengths, output_dir)
    readlength_group = PlotGroup(Constants.PG_READLENGTH,
                                 plots=[readlength_plot],
                                 thumbnail=readlength_plot.thumbnail)

    log.info(
        "Plotting summary attributes from file: {f}".format(f=summary_txt))
    dataset_uuids = [ContigSet(contig_set).uuid]
    # Produce attributes based on summary.
    attributes = report_to_attributes(summary_txt)
    # FIXME(nechols)(2016-03-22) not using the dataset UUIDs from these
    # reports; should we be?
    load_report_from_json(summary_txt)

    table = attributes_to_table(attributes, Constants.T_ATTR)
    log.info(str(table))

    # A report consists of an ID, tables, attributes, and plot groups.
    report = Report(Constants.R_ID, tables=[table], attributes=attributes,
                    plotgroups=[readlength_group],
                    dataset_uuids=dataset_uuids)
    return spec.apply_view(report)
def report_to_attributes(summary_json):
    """Strip metadata, keeping only (id, value) pairs as Attributes."""
    loaded = load_report_from_json(summary_json)
    return [Attribute(a.id, a.value) for a in loaded.attributes]
def _report_to_attributes(report_file):
    """Return the attribute list from a serialized report."""
    return load_report_from_json(report_file).attributes
def make_report(contig_set, summary_txt, output_dir):
    """
    Generate a report with ID, tables, attributes and plot groups.

    :param contig_set: an input FASTA file which has all full-length,
    non-chimeric reads produced by pbtranscript.py classify.

    This file is required to plot a read length histogram as part of
    the report: fulllength_nonchimeric_readlength_hist.png

    :param summary_txt: a summary TXT file with classify attributes,
    including 6 attributes, number of consensus reads, number of five
    prime reads, number of three prime reads, number of poly-A reads,
    number of full-length non-chimeric reads, average full-length
    non-chimeric read length. Attributes of the report are extracted
    from this file.

    :type contig_set: str
    :type summary_txt: str
    :type output_dir: str

    :rtype: Report
    """
    log.info("Plotting read length histogram from file: {f}".
             format(f=contig_set))

    # Stream read lengths straight out of the ContigSet.
    def _yield_lengths():
        with ContigSet(contig_set) as reads:
            for read in reads:
                yield len(read.sequence)

    read_lengths = np.fromiter(_yield_lengths(), dtype=np.int64,
                               count=-1).astype(float)

    # Build the read length histogram plot group.
    readlength_plot = create_readlength_plot(read_lengths, output_dir)
    readlength_group = PlotGroup(Constants.PG_READLENGTH,
                                 plots=[readlength_plot],
                                 thumbnail=readlength_plot.thumbnail)

    log.info("Plotting summary attributes from file: {f}".
             format(f=summary_txt))
    dataset_uuids = [ContigSet(contig_set).uuid]
    # Produce attributes based on summary.
    attributes = _report_to_attributes(summary_txt)
    # FIXME(nechols)(2016-03-22) not using the dataset UUIDs from these
    # reports; should we be?
    load_report_from_json(summary_txt)

    table = _attributes_to_table(attributes)
    log.info(str(table))

    # A report consists of an ID, tables, attributes, and plot groups.
    report = Report(Constants.R_ID, tables=[table], attributes=attributes,
                    plotgroups=[readlength_group],
                    dataset_uuids=dataset_uuids)
    return spec.apply_view(report)
def _report_to_attributes(summary_json):
    """Rebuild bare Attributes (id/value only) from a report JSON file."""
    report = load_report_from_json(summary_json)
    return [Attribute(attribute.id, attribute.value)
            for attribute in report.attributes]
def _compare_reports(self, rpt_json1, rpt_json2):
    """Assert that two report JSON files hold identical attribute values."""
    def attr_dict(path):
        # map attribute IDs to values for one report file
        rpt = load_report_from_json(path)
        return {a.id: a.value for a in rpt.attributes}
    self.assertEqual(attr_dict(rpt_json1), attr_dict(rpt_json2))