def test_align_and_extract_umis(new_pipegraph):
    """Check that AnnotateFastqBarcodes puts XC / XM tags on the BAM reads.

    The scenario is repeated for two sample folders
    (``sample_extract_barcodes`` and ``sample_extract_barcodes_gz``), each in
    a freshly created pipegraph.
    """
    from mbf_align.post_process import AnnotateFastqBarcodes

    sample_folders = [
        get_sample_path(Path("mbf_align/sample_extract_barcodes")),
        get_sample_path(Path("mbf_align/sample_extract_barcodes_gz")),
    ]
    for folder in sample_folders:
        new_pipegraph.new_pipegraph()
        genome = get_human_22_fake_genome()
        mbf_qualitycontrol.prune_qc(lambda _: False)

        raw_sample = Sample(
            "test", str(folder), False, pairing="only_second", vid="AA123"
        )
        aligned = AlignedSample(
            "test", str(folder / "test.bam"), genome, False, "AA123"
        )
        # Tag name -> [start, stop]; presumably positions within the read
        # sequence - confirm against AnnotateFastqBarcodes.
        tag_regions = {"XC": [0, 4], "XM": [7, 7 + 4]}
        annotated = aligned.post_process(
            AnnotateFastqBarcodes(raw_sample, tag_regions)
        )
        ppg.run_pipegraph()

        bam = annotated.get_bam()
        first_read = next(bam.fetch())
        print(first_read.tags)
        assert first_read.get_tag("XC") == "AGTC"
        assert first_read.get_tag("XM") == "TGAC"
def test_pairing_invalid_value(self):
    """Sample must reject each unsupported ``pairing`` value with ValueError."""
    rejected_pairings = ("do_what_you_want", False, None, [5])
    for bad_pairing in rejected_pairings:
        with pytest.raises(ValueError):
            Sample(
                "Sample_a",
                get_sample_data(Path("mbf_align/sample_a")),
                False,
                pairing=bad_pairing,
            )
def test_lane_paired_only_second(self):
    """With pairing="only_second" a single file is saved, equal to the R2 input."""
    sample_dir = Path("mbf_align/sample_b")
    lane = Sample(
        "Sample_a",
        get_sample_data(sample_dir),
        False,
        vid="VA000",
        pairing="only_second",
    )
    assert lane.vid == "VA000"

    temp_job = lane.prepare_input()
    real_job = lane.save_input()
    ppg.run_pipegraph()

    # The temporary input must be gone; only the saved file remains.
    assert not Path(temp_job.filenames[0]).exists()
    assert len(temp_job.filenames) == 1
    assert Path(real_job.filenames[0]).exists()
    assert len(real_job.filenames) == 1
    assert "_R1_" not in real_job.filenames[0]
    assert ".fastq.gz" in real_job.filenames[0]

    second_mate_fn = get_sample_data(Path("mbf_align/sample_b") / "a_R2_.fastq.gz")
    with gzip.GzipFile(second_mate_fn, "r") as handle:
        expected = handle.read()
    with gzip.GzipFile(real_job.filenames[0], "r") as handle:
        observed = handle.read()
    assert observed == expected
def test_lane_paired_filtered(self):
    """A paired lane with an accept-everything filter reproduces both inputs."""
    keep_everything = fastq2.Paired_Filtered(lambda *args: True)
    lane = Sample(
        "Sample_a",
        get_sample_data(Path("mbf_align/sample_b")),
        False,
        vid="VA000",
        pairing="paired",
        fastq_processor=keep_everything,
    )
    assert lane.vid == "VA000"

    temp_job = lane.prepare_input()
    real_job = lane.save_input()
    ppg.run_pipegraph()

    # Temporary files are removed, the saved pair is present.
    assert not Path(temp_job.filenames[0]).exists()
    assert not Path(temp_job.filenames[1]).exists()
    assert Path(real_job.filenames[0]).exists()
    assert Path(real_job.filenames[1]).exists()
    assert "_R1_" in real_job.filenames[0]
    assert "_R2_" in real_job.filenames[1]
    assert ".fastq.gz" in real_job.filenames[0]
    assert ".fastq.gz" in real_job.filenames[1]

    source_files = [
        get_sample_data(Path("mbf_align/sample_b") / "a_R1_.fastq.gz"),
        get_sample_data(Path("mbf_align/sample_b") / "a_R2_.fastq.gz"),
    ]
    for source_fn, saved_fn in zip(source_files, real_job.filenames):
        with gzip.GzipFile(saved_fn, "r") as handle:
            observed = handle.read()
        with gzip.GzipFile(source_fn, "r") as handle:
            expected = handle.read()
        assert observed == expected
def test_paired_modes(self):
    """prepare_input() must raise PairingError for sample_b without a pairing.

    The lane is constructed outside the ``pytest.raises`` block so that only
    ``prepare_input()`` is allowed to raise; an exception from the
    constructor would otherwise make the test pass for the wrong reason.
    """
    lane = Sample(
        "Sample_a",
        get_sample_data(Path("mbf_align/sample_b")),
        False,
        vid="VA000",
    )
    with pytest.raises(PairingError):
        lane.prepare_input()
def test_fastqc(self):
    """Creating a Sample yields exactly one QC job, tied to its prepared input."""
    from mbf_qualitycontrol import get_qc_jobs

    sample_folder = get_sample_data(Path("mbf_align/sample_a"))
    lane = Sample("Sample_a", sample_folder, False, vid="VA000")

    qc_jobs = list(get_qc_jobs())
    assert len(qc_jobs) == 1

    fastqc_job = qc_jobs[0]
    assert "results/lanes/Sample_a/FASTQC/sentinel.txt" in fastqc_job.filenames
    assert lane.prepare_input() in fastqc_job.prerequisites
def test_lane_paired_missing_R2(self):
    """Requesting pairing="paired" on sample_a makes prepare_input() fail."""
    sample_folder = get_sample_data(Path("mbf_align/sample_a"))
    paired_lane = Sample(
        "Sample_a", sample_folder, False, vid="VA000", pairing="paired"
    )
    with pytest.raises(PairingError):
        paired_lane.prepare_input()
def test_lane(self):
    """A default lane saves a gzipped FASTQ and removes its temporary input."""
    sample_folder = get_sample_data(Path("mbf_align/sample_a"))
    lane = Sample("Sample_a", sample_folder, False, vid="VA000")
    assert lane.vid == "VA000"

    temp_job = lane.prepare_input()
    real_job = lane.save_input()
    ppg.run_pipegraph()

    assert not Path(temp_job.filenames[0]).exists()
    saved_fn = real_job.filenames[0]
    assert Path(saved_fn).exists()

    with gzip.GzipFile(saved_fn, "r") as handle:
        saved_lines = handle.readlines()
    # Presumably two input files of 20 lines each - confirm against sample_a.
    assert len(saved_lines) == 20 + 20
def test_lane_with_job_generating_fastq(self):
    """A Sample can take a FASTQ-producing job as its input instead of a folder."""

    def write_single_read(fn):
        # One four-line FASTQ record, written as raw bytes.
        with open(fn, "wb") as handle:
            handle.write(b"@shu\nAGTC\n+\nasdf")

    fastq_job = FileGeneratingJob("input.fastq", write_single_read)
    lane = Sample("Sample_a", fastq_job, False, vid="VA000")
    assert lane.vid == "VA000"

    temp_job = lane.prepare_input()
    # The generating job must be wired in as a prerequisite of the input job.
    assert fastq_job in temp_job.prerequisites
    real_job = lane.save_input()
    ppg.run_pipegraph()

    assert not Path(temp_job.filenames[0]).exists()
    saved_fn = real_job.filenames[0]
    assert Path(saved_fn).exists()
    with gzip.GzipFile(saved_fn, "r") as handle:
        saved_lines = handle.readlines()
    assert len(saved_lines) == 4
def test_align_parameterDependencyChecking(self, local_store):
    """lane.align() must raise JobContractError for the fake aligner below.

    The fake aligner's job deliberately does not call
    ``job.depends_on_params(...)``; that omission is what this test checks.
    """

    class FakeGenome:
        name = "FakeGenome"

        def build_index(self, aligner, fasta_to_use=None, gtf_to_use=None):
            index_job = ppg.FileGeneratingJob(
                "fake_index", lambda: Path("fake_index").write_text("hello")
            )
            index_job.output_path = "fake_index"
            return index_job

    class FakeAligner:
        name = "FakeAligner"
        version = "0.1"

        def align_job(
            self,
            input_fastq,
            paired_end_filename,
            index_basename,
            output_bam_filename,
            parameters,
        ):
            outputs = [output_bam_filename, str(output_bam_filename) + ".bai"]
            # Intentionally no job.depends_on_params("") here - that missing
            # call is the line this test is about.
            return ppg.MultiFileGeneratingJob(outputs, lambda: 5)

    lane = Sample(
        "Sample_a",
        get_sample_data(Path("mbf_align/sample_b")),
        False,
        vid="VA000",
        pairing="paired",
    )
    aligner = FakeAligner()
    genome = FakeGenome()
    params = {"shu": 123}

    with pytest.raises(ppg.JobContractError):
        lane.align(aligner, genome, params)
def test_align(self, local_store):
    """Run lane.align() end to end with a fake genome and a fake aligner.

    The fake aligner writes a JSON "BAM" holding the first 200 characters of
    each input FASTQ, the index basename and the stringified parameters, so
    the test can check what the alignment job was actually handed.
    """
    import json
    import gzip

    class FakeGenome:
        name = "FakeGenome"

        def download_genome(self):
            return []

        def job_genes(self):
            return []

        def job_transcripts(self):
            return []

        def build_index(self, aligner, fasta_to_use=None, gtf_to_use=None):
            job = ppg.FileGeneratingJob(
                "fake_index", lambda: Path("fake_index").write_text("hello")
            )
            job.output_path = "fake_index"
            return job

    class FakeAligner:
        name = "FakeAligner"
        version = "0.1"

        def align_job(
            self,
            input_fastq,
            paired_end_filename,
            index_basename,
            output_bam_filename,
            parameters,
        ):
            def align():
                # Read the inputs first, with context managers, so no file
                # handle is leaked and the output is not created (and left
                # truncated) if reading an input fails.
                with open(input_fastq) as fh:
                    first_mate = fh.read(200)
                second_mate = ""
                if paired_end_filename:
                    with open(paired_end_filename) as fh:
                        second_mate = fh.read(200)
                with open(output_bam_filename, "w") as op:
                    json.dump(
                        [first_mate, second_mate, index_basename, str(parameters)],
                        op,
                    )
                with open(str(output_bam_filename) + ".bai", "w") as op:
                    op.write("Done")

            job = ppg.MultiFileGeneratingJob(
                [output_bam_filename, str(output_bam_filename) + ".bai"], align
            )
            job.depends_on_params("")
            return job

    aligner = FakeAligner()
    lane = Sample(
        "Sample_a",
        get_sample_data(Path("mbf_align/sample_b")),
        False,
        vid="VA000",
        pairing="paired",
    )
    genome = FakeGenome()
    params = {"shu": 123}
    aligned_lane = lane.align(aligner, genome, params)
    ppg.run_pipegraph()

    assert Path("fake_index").exists()
    assert Path("fake_index").read_text() == "hello"

    # Look the alignment job up once instead of on every assertion.
    bam_job = aligned_lane.load()[0]
    assert bam_job.filenames[0].endswith(lane.name + ".bam")
    assert bam_job.filenames[1].endswith(lane.name + ".bam.bai")
    assert Path(bam_job.filenames[0]).exists()
    with open(bam_job.filenames[0]) as op:
        actual = json.load(op)

    with gzip.GzipFile(
        get_sample_data(Path("mbf_align/sample_b") / "a_R1_.fastq.gz")
    ) as op:
        should_0 = op.read(200).decode("utf-8")
    with gzip.GzipFile(
        get_sample_data(Path("mbf_align/sample_b") / "a_R2_.fastq.gz")
    ) as op:
        should_1 = op.read(200).decode("utf-8")

    assert actual[0] == should_0
    assert actual[1] == should_1
    assert actual[2] == "fake_index"
    assert actual[3] == str(params)
def test_lane_raises_on_pe_as_se(self):
    """sample_b with the default pairing must fail in prepare_input()."""
    sample_folder = get_sample_data(Path("mbf_align/sample_b"))
    default_paired_lane = Sample("Sample_a", sample_folder, False)
    with pytest.raises(PairingError):
        default_paired_lane.prepare_input()