def _create_runnable_task(task_id, input_names, output_names):
    """Allocate temp files for a task and wrap them in a runnable task.

    Each entry of ``input_names`` and ``output_names`` is used as the
    ``suffix`` of a new temp file, and one command is generated per output
    file via ``_to_cmd``. The result of ``_to_runnable_task`` is returned.
    """
    mock_payload = "MOCK DATA"

    inputs = []
    for in_name in input_names:
        inputs.append(get_temp_file(suffix=in_name))

    outputs = []
    for out_name in output_names:
        outputs.append(get_temp_file(suffix=out_name))

    task_dir = get_temp_dir("runnable-task-")

    commands = []
    for out_path in outputs:
        commands.append(_to_cmd(mock_payload, out_path))

    return _to_runnable_task(task_id, inputs, outputs, commands, task_dir)
def test_smoke(self):
    # Write two stats JSON chunks, gather them into one report and check
    # the gathered attribute values.
    chunk_specs = [
        ("-stats-1.json", {"n_reads": 549, "n_zmws": 100}),
        ("-stats-2.json", {"n_reads": 733, "n_zmws": 100}),
    ]
    chunk_paths = []
    for suffix, stats_d in chunk_specs:
        chunk_path = get_temp_file(suffix=suffix)
        _write_stats_to_json(stats_d, chunk_path)
        chunk_paths.append(chunk_path)

    gathered_path = get_temp_file(suffix="stats-gather.json")
    G.gather_report(chunk_paths, gathered_path)

    report = load_report_from_json(gathered_path)
    values_by_id = {}
    for attribute in report.attributes:
        values_by_id[attribute.id] = attribute.value

    self.assertEqual(values_by_id["pb_n_reads"], 549 + 733)
    self.assertEqual(values_by_id["pb_n_zmws"], 200)
def test_filter_dataset_bq(self):
    # Start from a dataset that already carries a "bq >= 31" requirement,
    # then run the tool with an additional length/bq filter string.
    input_xml = get_temp_file(suffix=".subreadset.xml")
    source_path = pbtestdata.get_file("barcoded-subreadset")
    subreads = SubreadSet(source_path, strict=True)
    subreads.filters.addRequirement(bq=[('>=', 31)])
    assert len(subreads) == 1
    subreads.write(input_xml)

    output_xml = get_temp_file(suffix=".subreadset.xml")
    filter_expr = "length >= 10 AND bq >= 10"
    self._check_call(self.BASE_ARGS + [input_xml, output_xml, filter_expr])

    self.run_after(output_xml, 2, "( bq >= 10 AND length >= 10 )")
def test_filter_dataset_downsample(self):
    # Run the tool with an empty filter string and "--downsample 2", then
    # check the record count, the filter string and the "downsampled" tag.
    input_xml = get_temp_file(suffix=".subreadset.xml")
    with SubreadSet(pbtestdata.get_file("subreads-xml"), strict=True) as source:
        # Sanity-check the fixture: 117 records and no filters.
        assert len(source) == 117 and len(source.filters) == 0
        source.write(input_xml)

    output_xml = get_temp_file(suffix=".subreadset.xml")
    cli_args = list(self.BASE_ARGS)
    cli_args.extend([input_xml, output_xml, "", "--downsample", "2"])
    self._check_call(cli_args)

    result = self.run_after(output_xml, 54, "( Uint32Cast(zm) % 2 == 0 )")
    assert "downsampled" in result.tags
def test_smoke(self):
    # Gather two stats JSON files and verify the values in the gathered
    # report.
    first_path = get_temp_file(suffix="-stats-1.json")
    _write_stats_to_json(dict(n_reads=549, n_zmws=100), first_path)

    second_path = get_temp_file(suffix="-stats-2.json")
    _write_stats_to_json(dict(n_reads=733, n_zmws=100), second_path)

    gathered_path = get_temp_file(suffix="stats-gather.json")
    G.gather_report([first_path, second_path], gathered_path)

    report = load_report_from_json(gathered_path)
    gathered = dict((attr.id, attr.value) for attr in report.attributes)

    expected_reads = 549 + 733
    self.assertEqual(gathered['pb_n_reads'], expected_reads)
    self.assertEqual(gathered['pb_n_zmws'], 200)
class TestFilterDataSet(PbTestApp):
    """Tool-contract test for the ``pbcoretools.tasks.filterdataset`` task.

    The input dataset is written once in ``setUpClass``; ``run_after``
    checks the filter string and record count of the first output file.
    """

    TASK_ID = "pbcoretools.tasks.filterdataset"
    DRIVER_EMIT = 'python -m pbcoretools.tasks.filters emit-tool-contract {i} '.format(
        i=TASK_ID)
    DRIVER_RESOLVE = 'python -m pbcoretools.tasks.filters run-rtc '

    # Task options as supplied and as expected after resolution.
    TASK_OPTIONS = {
        "pbcoretools.task_options.other_filters": "length <= 1400",
    }
    RESOLVED_TASK_OPTIONS = {
        "pbcoretools.task_options.other_filters": "length <= 1400",
    }

    # The temp path is allocated at class-definition time and populated
    # in setUpClass.
    INPUT_FILES = [get_temp_file(suffix=".subreadset.xml")]

    MAX_NPROC = 24
    RESOLVED_NPROC = 1
    IS_DISTRIBUTED = True
    RESOLVED_IS_DISTRIBUTED = True
    READER_CLASS = SubreadSet

    @classmethod
    def setUpClass(cls):
        """Write the test SubreadSet to the pre-allocated input path."""
        source = SubreadSet(data.getXml(10), strict=True)
        source.write(cls.INPUT_FILES[0])

    def _get_counts(self, rtc):
        """Return ``(n_expected, n_actual)`` for the first output file."""
        output_path = rtc.task.output_files[0]
        with self.READER_CLASS(output_path) as reader:
            n_actual = len(reader)
        return 18, n_actual

    def _get_filters(self, rtc):
        """Return the string form of the first output file's filters."""
        output_path = rtc.task.output_files[0]
        with self.READER_CLASS(output_path) as reader:
            filter_str = str(reader.filters)
        return filter_str

    def run_after(self, rtc, output_dir):
        """Check the output's filter string, then its record count."""
        n_expected, n_actual = self._get_counts(rtc)
        actual_filters = self._get_filters(rtc)
        self.assertEqual(actual_filters, "( length <= 1400 )")
        self.assertEqual(n_actual, n_expected)
def test_run(self):
    # Run the integration job command in a fresh output directory and
    # require a zero exit code.
    root_output_dir = self._get_root_temp_dir()
    i = random.randint(1, 10000)
    job_dir_name = "{n}_{i}".format(n=self.JOB_NAME, i=i)
    output_dir = os.path.join(root_output_dir, job_dir_name)
    os.mkdir(output_dir)

    # One mock file per entry point; the entry point's configured name
    # is used as the temp-file suffix.
    entry_point_paths = {}
    for ep_id, ep_suffix in self.ENTRY_POINTS.iteritems():
        entry_point_paths[ep_id] = get_temp_file(suffix=ep_suffix)

    for ep_id, ep_path in entry_point_paths.iteritems():
        with open(ep_path, 'w') as handle:
            handle.write("Mock data for {i} \n".format(i=ep_id))

    cmd = self.TO_CMD_FUNC(output_dir, self.WORKFLOW_XML, self.PRESET_XML,
                           entry_point_paths)

    stderr_path = os.path.join(output_dir, 'job.stderr')
    stdout_path = os.path.join(output_dir, 'job.stdout')

    log.debug(cmd)

    with open(stdout_path, 'w') as out_handle, open(stderr_path, 'w') as err_handle:
        result = run_command(cmd, out_handle, err_handle)
    rcode, stdout_results, stderr_results, run_time = result

    log.debug("Integration Job {i} state {s} in {t:.2f} sec.".format(
        i=self.JOB_NAME, s=rcode, t=run_time))

    job_failed = rcode != 0
    if job_failed:
        log.error("Integration Job {i} failed.".format(i=self.JOB_NAME))
        log.error(stdout_results)
        log.error(stderr_results)
        # Also dump whatever the job wrote to its stderr file.
        if os.path.exists(stderr_path):
            with open(stderr_path, 'r') as err_reader:
                log.error(err_reader.read())

    emsg = "Failed Integration Job {i} with exit code {r} in {d}. {w}".format(
        i=self.JOB_NAME, r=rcode, d=output_dir, w=self.WORKFLOW_XML)
    self.assertEqual(rcode, 0, emsg)
def test_run(self):
    # Run the integration job command in a fresh output directory and
    # compare its exit code against EXPECTED_EXIT_CODE.
    root_output_dir = self._get_root_temp_dir()
    i = random.randint(1, 10000)
    job_dir_name = "{n}_{i}".format(n=self.JOB_NAME, i=i)
    output_dir = os.path.join(root_output_dir, job_dir_name)
    os.mkdir(output_dir)

    # Allocate one temp file per entry point, then fill each with mock
    # content.
    entry_point_paths = dict(
        (ep_id, get_temp_file(suffix=ep_suffix))
        for ep_id, ep_suffix in self.ENTRY_POINTS.iteritems())
    for ep_id, ep_path in entry_point_paths.iteritems():
        with open(ep_path, 'w') as mock_writer:
            mock_writer.write("Mock data for {i} \n".format(i=ep_id))

    cmd = self.TO_CMD_FUNC(output_dir, self.WORKFLOW_XML, self.PRESET_JSON,
                           self.PRESET_XML, entry_point_paths)

    stderr_path = os.path.join(output_dir, 'job.stderr')
    stdout_path = os.path.join(output_dir, 'job.stdout')

    log.debug(cmd)

    with open(stdout_path, 'w') as stdout_fh, open(stderr_path, 'w') as stderr_fh:
        rcode, stdout_results, stderr_results, run_time = run_command(
            cmd, stdout_fh, stderr_fh)

    log.debug("Integration Job {i} state {s} in {t:.2f} sec.".format(
        i=self.JOB_NAME, s=rcode, t=run_time))

    # Only log failure details when the job was expected to succeed.
    unexpected_failure = (rcode != 0) and (self.EXPECTED_EXIT_CODE == 0)
    if unexpected_failure:
        log.error("Integration Job {i} failed.".format(i=self.JOB_NAME))
        log.error(stdout_results)
        log.error(stderr_results)
        if os.path.exists(stderr_path):
            with open(stderr_path, 'r') as stderr_reader:
                log.error(stderr_reader.read())

    emsg = "Failed Integration Job {i} with exit code {r} in {d}. {w}".format(
        i=self.JOB_NAME, r=rcode, d=output_dir, w=self.WORKFLOW_XML)
    self.assertEqual(rcode, self.EXPECTED_EXIT_CODE, emsg)
class _BaseTestBam2Fasta(PbTestApp):
    """Shared tool-contract test settings for ``pbcoretools.tasks.bam2fasta``.

    ``run_after`` compares the number of records in the input dataset with
    the number of records read back from the task's output file.
    """

    TASK_ID = "pbcoretools.tasks.bam2fasta"
    DRIVER_EMIT = 'python -m pbcoretools.tasks.converters emit-tool-contract {i} '.format(
        i=TASK_ID)
    DRIVER_RESOLVE = 'python -m pbcoretools.tasks.converters run-rtc '
    INPUT_FILES = [get_temp_file(suffix=".subreadset.xml")]
    MAX_NPROC = 24
    RESOLVED_NPROC = 1
    IS_DISTRIBUTED = True
    RESOLVED_IS_DISTRIBUTED = True
    READER_CLASS = FastaReader
    # When not None, run_after additionally requires this exact count.
    NRECORDS_EXPECTED = None

    def _get_output_file(self, rtc):
        """Return the path of the task's first output file."""
        return rtc.task.output_files[0]

    def _get_counts(self, rtc):
        """Return ``(n_expected, n_actual)`` record counts.

        ``n_expected`` is counted from the input dataset and ``n_actual``
        from the output file opened with ``READER_CLASS``.
        """
        with openDataSet(self.INPUT_FILES[0]) as dataset:
            n_expected = sum(1 for _ in dataset)
        output_path = self._get_output_file(rtc)
        with self.READER_CLASS(output_path) as reader:
            n_actual = sum(1 for _ in reader)
        return n_expected, n_actual

    def run_after(self, rtc, output_dir):
        """Assert that the output record count matches the expectations."""
        n_expected, n_actual = self._get_counts(rtc)
        self.assertEqual(n_actual, n_expected)
        if self.NRECORDS_EXPECTED is None:
            return
        self.assertEqual(n_actual, self.NRECORDS_EXPECTED)
class TestGatherGFF(TextRecordsGatherBase,
                    pbcommand.testkit.core.PbTestGatherApp):
    """
    Test pbcoretools.tasks.gather_gff
    """

    RECORDS = [
        "contig1\tkinModCall\tmodified_base\t1\t1\t31\t+\t.\tcoverage=169",
        "contig1\tkinModCall\tmodified_base\t2\t2\t41\t-\t.\tcoverage=170",
        "contig1\tkinModCall\tmodified_base\t3\t3\t51\t+\t.\tcoverage=168",
        "contig1\tkinModCall\tmodified_base\t4\t4\t60\t-\t.\tcoverage=173",
    ]
    RECORD_HEADER = "##gff-version 3\n##source-id ipdSummary\n"
    EXTENSION = "gff"

    DRIVER_BASE = "python -m pbcoretools.tasks.gather_gff"
    INPUT_FILES = [get_temp_file(suffix=".chunks.json")]
    CHUNK_KEY = "$chunk.gff_id"

    @classmethod
    def _get_chunk_records(cls, i_chunk):
        """Return the records for chunk ``i_chunk``.

        Chunk 0 gets the last two records and every other chunk gets the
        first two.
        """
        if i_chunk != 0:
            return cls.RECORDS[0:2]
        return cls.RECORDS[2:]

    def _get_lines(self, lines):
        """Return the stripped lines that do not start with ``#``."""
        records = []
        for line in lines:
            if line[0] != '#':
                records.append(line.strip())
        return records

    def validate_content(self, lines):
        """Check the total line count and the second header line."""
        self.assertEqual(len(lines), 6)
        second_line = lines[1].strip()
        self.assertEqual(second_line, "##source-id ipdSummary")
def setUpClass(cls):
    """Prepare class-level fixtures for the pipeline under test.

    Sets ``bindings``, ``EPOINTS_D``, ``bgraph``, ``output_dir``,
    ``workflow_options``, ``envs`` and ``cluster_engine`` on the class.
    """
    pipeline = REGISTERED_PIPELINES[cls.PB_PIPELINE_ID]
    log.debug(pipeline)

    cls.bindings = pipeline.all_bindings

    entry_points = {}
    for ep_key, ep_name in cls.EPOINTS_NAMES.iteritems():
        entry_points[ep_key] = get_temp_file(ep_name)
    cls.EPOINTS_D = entry_points

    log.debug(pprint.pformat(cls.bindings, indent=4))
    log.debug(
        "Number of registered tasks {n}".format(n=len(REGISTERED_TASKS)))

    cls.bgraph = B.binding_strs_to_binding_graph(REGISTERED_TASKS,
                                                 cls.bindings)

    # A fixed scratch location is used for one specific user; otherwise
    # tempfile picks its default directory.
    if getpass.getuser() == 'mkocher':
        base_dir = os.path.expanduser('~/scratch/tmp_pbsmrtpipe')
    else:
        base_dir = None
    cls.output_dir = tempfile.mkdtemp(prefix='job_test_', dir=base_dir)

    preset_path = os.path.join(TEST_DATA_DIR, cls.PRESET_XML)
    preset_record = IO.parse_pipeline_preset_xml(preset_path)
    cls.workflow_options = preset_record.to_workflow_level_opt()

    # leave this for now
    cls.envs = []
    cls.cluster_engine = C.load_installed_cluster_templates_by_name("sge")
def _to_test_fofn(n, file_base_name):
    """Create a FOFN that lists ``n`` new temp files and return its path.

    :param n: number of temp files to create and list in the FOFN
    :param file_base_name: base name used to build the temp-file names
    :return: path to the written FOFN (one listed path per line)
    """
    # The original joined the base name and ".fofn" with "." and produced
    # a doubled dot ("<base>..fofn"); append the extension directly.
    fofn_path = get_temp_file(suffix=file_base_name + ".fofn")

    listed_files = []
    for i in xrange(n):
        entry_name = "-".join([file_base_name, str(i)])
        listed_files.append(get_temp_file(entry_name))

    # Use a distinct name for the handle so it does not shadow a path.
    with open(fofn_path, 'w') as writer:
        writer.write("\n".join(listed_files))

    return fofn_path
def test_01(self):
    # Write a random FOFN with a known number of records and check that
    # the resolved dataset metadata reports the same count.
    nrecords = 15
    name = "example_fofn"
    f = get_temp_file(name)
    try:
        M.write_random_fofn(f, nrecords)
        ds_metadata = dispatch_metadata_resolver(self.FILE_TYPE, f)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(ds_metadata.nrecords, nrecords)
    finally:
        # Clean up even when the resolver or the assertion fails; guard
        # the removal so a missing file cannot mask the original error.
        if os.path.exists(f):
            os.remove(f)
def test_bam2fastx_filtered(self):
    # Add a "length >= 1000" requirement to the test dataset, write it to
    # a temp file and check the converter output record count.
    source_path = pbtestdata.get_file("subreads-xml")
    subreads = SubreadSet(source_path, strict=True)
    subreads.filters.addRequirement(length=[('>=', 1000)])

    filtered_xml = get_temp_file(suffix=".subreadset.xml")
    subreads.write(filtered_xml)

    self.run_and_check_fastx(filtered_xml, 13)
def test_filter_dataset_combine_filters(self):
    # Apply an rq filter on top of the input's existing length filter and
    # check the combined filter string and record count.
    input_xml, _n_input = self._set_up_combine_filters()
    output_xml = get_temp_file(suffix=".subreadset.xml")

    cli_args = list(self.BASE_ARGS)
    cli_args.extend([input_xml, output_xml, "rq >= 0.901"])
    self._check_call(cli_args)

    self.run_after(output_xml, 12, "( length >= 1000 AND rq >= 0.901 )")
def test_filter_dataset_min_rq_null(self):
    # Run with an empty filter string and "--min-qv -1"; the output is
    # expected to keep only the input's existing length filter.
    input_xml, _n_input = self._set_up_combine_filters()
    output_xml = get_temp_file(suffix=".subreadset.xml")

    extra_args = [input_xml, output_xml, "", "--min-qv", "-1"]
    self._check_call(self.BASE_ARGS + extra_args)

    self.run_after(output_xml, 13, "( length >= 1000 )")
def test_filter_dataset_nofilter(self):
    # Run with an empty filter string; the output is expected to have the
    # same record count as the input and an empty filter string.
    input_xml, n_input = self._set_up_basic()
    output_xml = get_temp_file(suffix=".subreadset.xml")

    cli_args = list(self.BASE_ARGS)
    cli_args.extend([input_xml, output_xml, ""])
    self._check_call(cli_args)

    self.run_after(output_xml, n_input, "")
def test_smoke(self):
    # Gather two CSV chunks (100 and 57 records) and check the number of
    # data rows in the gathered file.
    chunk_specs = [("-records-1.csv", 100), ("-records-2.csv", 57)]
    chunk_paths = []
    for suffix, n_rows in chunk_specs:
        chunk_path = get_temp_file(suffix=suffix)
        _write_records_to_csv(list(_to_n_records(n_rows)), chunk_path)
        chunk_paths.append(chunk_path)

    gathered_path = get_temp_file(suffix="records-gather.csv")
    G.gather_csv(chunk_paths, gathered_path)

    with open(gathered_path, "r") as handle:
        reader = csv.DictReader(handle)
        log.debug(reader.fieldnames)
        nrecords = sum(1 for _ in reader)

    self.assertEqual(nrecords, 157)
def _set_up_combine_filters(self):
    """Write a length-filtered copy of the test SubreadSet.

    Returns a ``(path, 13)`` tuple: the path of the written dataset and
    the record count asserted after adding the ``length >= 1000`` filter.
    """
    n_filtered = 13
    input_xml = get_temp_file(suffix=".subreadset.xml")
    source_path = pbtestdata.get_file("subreads-xml")
    with SubreadSet(source_path, strict=True) as subreads:
        # Sanity-check the fixture before adding the filter.
        assert len(subreads) == 117 and len(subreads.filters) == 0
        subreads.filters.addRequirement(length=[('>=', 1000)])
        assert len(subreads) == 13
        subreads.write(input_xml)
    return input_xml, n_filtered
def _set_up_basic(self):
    """Write a test SubreadSet with a parent-dataset record added.

    Returns a ``(path, n_records)`` tuple for the written dataset.
    """
    dataset_path = get_temp_file(suffix=".subreadset.xml")
    subreads = SubreadSet(data.getXml(9), strict=True)
    parent_uuid = uuid.uuid4()
    subreads.metadata.addParentDataSet(parent_uuid,
                                       subreads.datasetType,
                                       createdBy="AnalysisJob",
                                       timeStampedName="")
    subreads.write(dataset_path)
    n_records = len(subreads)
    return dataset_path, n_records
def test_smoke(self):
    # Write two CSV chunks, gather them and count the gathered data rows.
    first_csv = get_temp_file(suffix="-records-1.csv")
    first_records = list(_to_n_records(100))
    _write_records_to_csv(first_records, first_csv)

    second_csv = get_temp_file(suffix="-records-2.csv")
    second_records = list(_to_n_records(57))
    _write_records_to_csv(second_records, second_csv)

    gathered_csv = get_temp_file(suffix="records-gather.csv")
    G.gather_csv([first_csv, second_csv], gathered_csv)

    row_count = 0
    with open(gathered_csv, 'r') as csv_handle:
        rows = csv.DictReader(csv_handle)
        log.debug(rows.fieldnames)
        for _row in rows:
            row_count = row_count + 1

    self.assertEqual(row_count, 157)
def test_load_preset(self):
    # Serialize the workflow options to a preset XML file, parse it back
    # and check that the same number of options is recovered.
    xml = IO.schema_workflow_options_to_xml(self._to_opts())

    preset_xml = get_temp_file(suffix="_preset.xml")
    log.debug(preset_xml)
    with open(preset_xml, 'w') as w:
        w.write(str(xml))

    preset_record = IO.parse_pipeline_preset_xml(preset_xml)
    workflow_level_opts = preset_record.to_workflow_level_opt()

    # The original used assertTrue(a, b), which treats b as the failure
    # message and only checks that a is truthy. Compare the two lengths.
    self.assertEqual(len(workflow_level_opts), len(self._to_opts()))
def test_combine_filters_run_filter_dataset(self):
    # Call run_filter_dataset twice on the same input with different
    # arguments and check the output record count after each call.
    input_xml, _n_input = self._set_up_combine_filters()
    output_xml = get_temp_file(suffix=".subreadset.xml")

    run_filter_dataset(input_xml, output_xml, 0, "rq >= 0.901")
    with openDataSet(output_xml, strict=True) as filtered:
        assert len(filtered) == 12

    # Second run overwrites the same output path.
    run_filter_dataset(input_xml, output_xml, 500, "rq >= 0.8")
    with openDataSet(output_xml, strict=True) as filtered:
        assert len(filtered) == 48
class TestScatterContigSet(ScatterSequenceBase,
                           pbcommand.testkit.core.PbTestScatterApp):
    """
    Test pbsmrtpipe.tools_dev.scatter_contigset
    """

    # Command under test and the reader class configured for this test.
    DRIVER_BASE = "python -m pbsmrtpipe.tools_dev.scatter_contigset"
    READER_CLASS = ContigSet

    # Temp input path, allocated at class-definition time.
    INPUT_FILES = [get_temp_file(suffix=".contigset.xml")]

    # Chunking configuration.
    MAX_NCHUNKS = 12
    RESOLVED_MAX_NCHUNKS = 12
    CHUNK_KEYS = ("$chunk.contigset_id", )
class TestScatterFilterFasta(ScatterSequenceBase,
                             pbcommand.testkit.core.PbTestScatterApp):
    """
    Test pbcoretools.tasks.scatter_filter_fasta
    """

    # Command under test and the reader class configured for this test.
    DRIVER_BASE = "python -m pbcoretools.tasks.scatter_filter_fasta"
    READER_CLASS = FastaReader

    # Temp input path, allocated at class-definition time.
    INPUT_FILES = [get_temp_file(suffix=".fasta")]

    # Chunking configuration.
    MAX_NCHUNKS = 12
    RESOLVED_MAX_NCHUNKS = 12
    CHUNK_KEYS = ("$chunk.fasta_id", )
def test_all_sane(self):
    """Test that all pipelines are well defined

    Every registered pipeline is converted to a dict, validated, written
    out as JSON and Avro and loaded back from Avro. Failures are collected
    per pipeline so that one broken pipeline does not hide the others; the
    test fails at the end if any error was recorded.
    """
    errors = []
    rtasks, rfiles_types, chunk_operators, pipelines = L.load_all()

    for pipeline_id, pipeline in pipelines.items():
        emsg = "Pipeline {p} is not valid.".format(p=pipeline_id)
        log.debug("Checking Sanity of registered Pipeline {i}".format(
            i=pipeline_id))
        log.info(pipeline_id)
        log.debug(pipeline)
        try:
            # Validate with Avro
            d = pipeline_template_to_dict(pipeline, rtasks)
            validate_pipeline_template(d)
            name = pipeline_id + "_pipeline_template.avro"
            output_file = get_temp_file(suffix=name)
            log.info(
                "{p} converted to avro successfully".format(p=pipeline_id))

            bg = BU.binding_strs_to_binding_graph(rtasks,
                                                  pipeline.all_bindings)
            BU.validate_binding_graph_integrity(bg)
            BU.validate_compatible_binding_file_types(bg)
            validate_entry_points(d)

            # for debugging purposes
            output_json = output_file.replace(".avro", '.json')
            log.info("writing pipeline to {p}".format(p=output_json))
            with open(output_json, 'w') as j:
                j.write(json.dumps(d, sort_keys=True, indent=4))

            log.info(
                "writing pipeline template to {o}".format(o=output_file))

            # Test writing to avro if the pipeline is actually valid
            write_pipeline_template_to_avro(pipeline, rtasks, output_file)
            log.info("Pipeline {p} is valid.".format(p=pipeline_id))

            log.info("Loading avro {i} from {p}".format(i=pipeline_id,
                                                        p=output_file))
            pipeline_d = load_pipeline_template_from_avro(output_file)
            self.assertIsInstance(pipeline_d, dict)
        except Exception as e:
            # Deliberately broad: any failure is recorded and the loop
            # moves on. Format the exception itself rather than reading
            # e.message, which is deprecated, may be empty or non-string,
            # and does not exist on Python 3.
            m = "{m} Error: {e}".format(m=emsg, e=e)
            log.error(m)
            errors.append(m)
            log.error(emsg)
            log.error(e)

    msg = "\n".join(errors) if errors else ""
    self.assertEqual([], errors, msg)
class TestBam2FastaCCS(TestBam2FastqArchive):
    """Tool-contract test settings for ``pbcoretools.tasks.bam2fasta_ccs``.

    The input ConsensusReadSet is written once in ``setUpClass``.
    """

    TASK_ID = "pbcoretools.tasks.bam2fasta_ccs"
    DRIVER_EMIT = 'python -m pbcoretools.tasks.converters emit-tool-contract {i} '.format(
        i=TASK_ID)

    # Temp input path, allocated at class-definition time.
    INPUT_FILES = [get_temp_file(".consensusreadset.xml")]

    READER_CLASS = FastaReader
    NRECORDS_EXPECTED = None

    @classmethod
    def setUpClass(cls):
        """Write the CCS test dataset to the pre-allocated input path."""
        ccs_bam = pbcore.data.getCCSBAM()
        dataset = ConsensusReadSet(ccs_bam, strict=True)
        dataset.write(cls.INPUT_FILES[0])
class TestScatterContigSet(TestScatterFilterFasta,
                           pbcommand.testkit.core.PbTestScatterApp):
    """
    Test pbcoretools.tasks.scatter_contigset
    """

    # Command under test and the reader class configured for this test.
    DRIVER_BASE = "python -m pbcoretools.tasks.scatter_contigset"
    READER_CLASS = ContigSet

    # Temp input path, allocated at class-definition time.
    INPUT_FILES = [get_temp_file(suffix=".contigset.xml")]

    # Record and chunk count settings.
    NRECORDS = 51
    MAX_NCHUNKS = 24
    RESOLVED_MAX_NCHUNKS = 24
    NCHUNKS_EXPECTED = 17
    CHUNK_KEYS = ("$chunk.contigset_id", )
def test_write_chunks(self):
    # Write a list of pipeline chunks to JSON and check that the same
    # number of chunks is loaded back.

    def f(i):
        # Chunk keys (with the PipelineChunk prefix) -> mock fofn paths.
        return {"{c}movie_fofn_id".format(c=PipelineChunk.CHUNK_KEY_PREFIX): "/path/to_movie-{i}.fofn".format(i=i),
                "{c}region_fofn_id".format(c=PipelineChunk.CHUNK_KEY_PREFIX): "/path/rgn_{i}.fofn".format(i=i)}

    # Local functions instead of lambdas bound to names.
    def to_i(i):
        return "chunk-id-{i}".format(i=i)

    def to_p(i):
        return PipelineChunk(to_i(i), **f(i))

    nchunks = 5
    pipeline_chunks = [to_p(i) for i in xrange(nchunks)]
    log.debug(pipeline_chunks)

    tmp_name = get_temp_file("_chunk.json")
    IO.write_pipeline_chunks(pipeline_chunks, tmp_name, "Example chunk file")

    pchunks = IO.load_pipeline_chunks_from_json(tmp_name)
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(len(pchunks), nchunks)
def setUpClass(cls):
    """Set up the class-level state used by the pipeline tests.

    Looks up the registered pipeline, allocates temp files for its entry
    points, builds the binding graph, creates the job output directory and
    loads the workflow options from the preset XML.
    """
    pipeline = REGISTERED_PIPELINES[cls.PB_PIPELINE_ID]
    log.debug(pipeline)

    cls.bindings = pipeline.all_bindings
    cls.EPOINTS_D = dict(
        (ep_key, get_temp_file(ep_name))
        for ep_key, ep_name in cls.EPOINTS_NAMES.iteritems())

    log.debug(pprint.pformat(cls.bindings, indent=4))
    n_tasks = len(REGISTERED_TASKS)
    log.debug("Number of registered tasks {n}".format(n=n_tasks))

    cls.bgraph = B.binding_strs_to_binding_graph(REGISTERED_TASKS, cls.bindings)

    # Only one specific user gets a fixed scratch directory; everyone
    # else falls back to tempfile's default location.
    scratch_dir = None
    if getpass.getuser() == 'mkocher':
        scratch_dir = os.path.expanduser('~/scratch/tmp_pbsmrtpipe')
    cls.output_dir = tempfile.mkdtemp(prefix='job_test_', dir=scratch_dir)

    preset_xml_path = os.path.join(TEST_DATA_DIR, cls.PRESET_XML)
    preset_record = IO.parse_pipeline_preset_xml(preset_xml_path)
    cls.workflow_options = preset_record.to_workflow_level_opt()

    # leave this for now
    cls.envs = []
    cls.cluster_engine = C.load_installed_cluster_templates_by_name("sge")
class TestGatherCSV(TextRecordsGatherBase,
                    pbcommand.testkit.core.PbTestGatherApp):
    """
    Test pbcoretools.tasks.gather_csv
    """

    RECORD_HEADER = "contig_id,length,coverage\n"
    RECORDS = [
        "contig1,3000000,170",
        "contig2,90000,180",
        "contig3,58000,159",
        "contig4,20000,160",
    ]
    EXTENSION = "csv"

    DRIVER_BASE = "python -m pbcoretools.tasks.gather_csv"
    INPUT_FILES = [get_temp_file(suffix=".chunks.json")]
    CHUNK_KEY = "$chunk.csv_id"

    def _get_lines(self, lines):
        """Return every line after the first, stripped of whitespace."""
        stripped = []
        for line in lines[1:]:
            stripped.append(line.strip())
        return stripped
class TestGatherGFF(TextRecordsGatherBase,
                    pbcommand.testkit.core.PbTestGatherApp):
    """
    Test pbsmrtpipe.tools_dev.gather_gff
    """

    RECORD_HEADER = None
    RECORDS = [
        "contig1\tkinModCall\tmodified_base\t1\t1\t31\t+\t.\tcoverage=169",
        "contig1\tkinModCall\tmodified_base\t2\t2\t41\t-\t.\tcoverage=170",
        "contig1\tkinModCall\tmodified_base\t3\t3\t51\t+\t.\tcoverage=168",
        "contig1\tkinModCall\tmodified_base\t4\t4\t60\t-\t.\tcoverage=173",
    ]
    EXTENSION = "gff"

    DRIVER_BASE = "python -m pbsmrtpipe.tools_dev.gather_gff"
    INPUT_FILES = [get_temp_file(suffix=".chunks.json")]
    CHUNK_KEY = "$chunk.gff_id"

    def _get_lines(self, lines):
        """Return the stripped lines that do not start with ``#``."""
        kept = []
        for line in lines:
            if line[0] == '#':
                continue
            kept.append(line.strip())
        return kept
def test_write_chunks(self):
    # Round-trip a list of pipeline chunks through a JSON chunk file and
    # check that the chunk count is preserved.
    prefix = PipelineChunk.CHUNK_KEY_PREFIX

    def f(i):
        # Chunk keys (with the PipelineChunk prefix) -> mock fofn paths.
        return {
            "{c}movie_fofn_id".format(c=prefix):
            "/path/to_movie-{i}.fofn".format(i=i),
            "{c}region_fofn_id".format(c=prefix):
            "/path/rgn_{i}.fofn".format(i=i)
        }

    # Local functions instead of lambdas bound to names.
    def to_i(i):
        return "chunk-id-{i}".format(i=i)

    def to_p(i):
        return PipelineChunk(to_i(i), **f(i))

    nchunks = 5
    pipeline_chunks = [to_p(i) for i in xrange(nchunks)]
    log.debug(pipeline_chunks)

    tmp_name = get_temp_file("_chunk.json")
    IO.write_pipeline_chunks(pipeline_chunks, tmp_name,
                             "Example chunk file")

    pchunks = IO.load_pipeline_chunks_from_json(tmp_name)
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(len(pchunks), nchunks)
def test_01(self):
    # Write a mock file for this test's file type and check the metadata
    # resolved from it.
    p = get_temp_file(self.FILE_NAME)
    self._write_mock_file(p)

    ds_metadata = dispatch_metadata_resolver(self.FILE_TYPE, p)

    self.assertIsInstance(ds_metadata, DatasetMetadata)
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(ds_metadata.nrecords, self.NRECORDS)