def test_from_existing_bam(self):
    """Exercise AlignedSample built from an existing BAM wrapped in a FileInvariant.

    Covers the basic attributes, the jobs returned by ``load()``, the
    ValueError raised when the same construction is repeated, BAM access,
    the reported BAM/index file names, read counts, and that all QC jobs
    are pruned.
    """
    source_bam = get_sample_data(Path("mbf_align/ex2.bam"))
    invariant = ppg.FileInvariant(source_bam)
    placeholder_genome = object()
    sample = mbf_align.AlignedSample(
        "test_lane", invariant, placeholder_genome, is_paired=False, vid="AA123"
    )

    # Plain attributes are passed through unchanged.
    assert sample.name == "test_lane"
    assert sample.load()[0] is invariant
    # Second entry of load() - presumably the job for the .bai index.
    assert isinstance(sample.load()[1], ppg.FileInvariant)
    assert sample.genome is placeholder_genome
    assert not sample.is_paired
    assert sample.vid == "AA123"

    # Repeating the very same construction is expected to be rejected.
    with pytest.raises(ValueError):
        mbf_align.AlignedSample(
            "test_lane", invariant, placeholder_genome, is_paired=False, vid="AA123"
        )

    paired_sample = mbf_align.AlignedSample(
        "test_lane2", invariant, placeholder_genome, is_paired=True, vid="AA123"
    )
    assert paired_sample.is_paired

    # Both accessors hand back pysam Samfile objects.
    handle = sample.get_bam()
    assert isinstance(handle, pysam.Samfile)
    handle = sample.get_unique_aligned_bam()
    assert isinstance(handle, pysam.Samfile)

    assert sample.get_bam_names()[0] == source_bam
    assert sample.get_bam_names()[1] == source_bam + ".bai"

    assert sample.mapped_reads() == 8
    assert sample.unmapped_reads() == 0

    assert all(qc_job._pruned for qc_job in get_qc_jobs())
def test_subtraction_by_read(self):
    """Check SubtractOtherLane post-processing on three lane combinations.

    * ``lane - lane2`` (same BAM) must leave no mapped reads.
    * ``lane - lane3`` (different BAM) must keep every mapped read of ``lane``.
    * ``lane3 - lane3_subset`` must remove exactly the subset's mapped reads
      and must honour the custom ``result_dir``.

    The QC jobs of the "some" and "full" results are kept (everything else
    is pruned) and each one has its first output file compared through
    ``assert_image_equal``.
    """
    from mbf_sampledata import get_human_22_fake_genome

    genome = get_human_22_fake_genome()
    lane = mbf_align.AlignedSample(
        "test_lane",
        get_sample_data(Path("mbf_align/rnaseq_spliced_chr22.bam")),
        genome,
        False,
        "AA123",
    )  # index creation is automatic
    lane2 = mbf_align.AlignedSample(
        "test_lane2",
        get_sample_data(Path("mbf_align/rnaseq_spliced_chr22.bam")),
        genome,
        False,
        "AA124",
    )  # index creation is automatic
    lane3 = mbf_align.AlignedSample(
        "test_lane3",
        get_sample_data(Path("mbf_align/chipseq_chr22.bam")),
        genome,
        False,
        "AA123",
    )  # index creation is automatic
    lane3_subset = mbf_align.AlignedSample(
        "test_lane3_subset",
        get_sample_data(Path("mbf_align/chipseq_chr22_subset.bam")),
        genome,
        False,
        "AA123",
    )  # index creation is automatic

    lane_empty = lane.post_process(
        mbf_align.post_process.SubtractOtherLane(lane2), new_name="empty"
    )
    lane_full = lane.post_process(
        mbf_align.post_process.SubtractOtherLane(lane3), new_name="full"
    )
    lane_some = lane3.post_process(
        mbf_align.post_process.SubtractOtherLane(lane3_subset),
        result_dir="results/aligned/shu",
    )

    # Keep only the two QC jobs that are inspected below.
    qc_jobs = [lane_some.post_processor_qc_jobs, lane_full.post_processor_qc_jobs]
    prune_qc(lambda job: job in qc_jobs)
    ppg.run_pipegraph()

    assert Path(lane_empty.get_bam_names()[1]).exists()
    assert Path(lane_full.get_bam_names()[1]).exists()
    assert lane_empty.mapped_reads() == 0
    assert lane_full.mapped_reads() == lane.mapped_reads()
    assert lane.mapped_reads() != 0
    assert (
        lane_some.mapped_reads()
        == lane3.mapped_reads() - lane3_subset.mapped_reads()
    )
    assert lane3_subset.mapped_reads()  # make sure there was something to subtract
    assert "shu" in lane_some.get_bam_names()[0]

    # qc_jobs[0] belongs to lane_some (custom result_dir); the second argument
    # is presumably a suffix selecting its reference image - TODO confirm.
    assert_image_equal(qc_jobs[0].filenames[0], "_result_dir")
    # qc_jobs[1] belongs to lane_full. Previously qc_jobs[0] was checked a
    # second time here, leaving the lane_full QC output unverified.
    assert_image_equal(qc_jobs[1].filenames[0])
def test_to_fastq(self):
    """Check that ``to_fastq`` writes the expected reads for a single-end lane.

    The expected file content is assembled from explicit (name, sequence,
    quality) records using the four-line FASTQ layout, so that the
    comparison does not depend on how whitespace inside a multi-line
    literal survives reformatting of this file.

    Also verifies that calling ``to_fastq`` on a paired-end lane raises
    ValueError.
    """
    bam_path = get_sample_data(Path("mbf_align/ex2.bam"))
    bam_job = ppg.FileInvariant(bam_path)
    genome = object()
    lane = mbf_align.AlignedSample("test_lane", bam_job, genome, False, "AA123")
    fastq_path = "out.fastq"
    lane.to_fastq(fastq_path)
    ppg.run_pipegraph()
    assert Path(fastq_path).exists()

    # The sample data only contains two distinct sequence/quality pairs.
    seq_a = "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"
    qual_a = "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<"
    seq_b = "TGCAAGGCCGCATCGGCCAAGGCCAAGATATAGGT"
    qual_b = "<<<<7<<<<<<<<<<<<;6<<<:;7<<<<;<<<<<"
    expected_records = [
        ("read_28833_29006_6945", seq_a, qual_a),
        ("read_28701_28881_323b", seq_b, qual_b),
        ("read_28701_28881_323c", seq_b, qual_b),
        ("read_28701_28881_324a", seq_b, qual_b),
        ("read_28701_28881_324b", seq_b, qual_b),
        ("read_28701_28881_324c", seq_b, qual_b),
        ("test_clipped1", seq_a, qual_a),
        ("test_clipped1", seq_a, qual_a),
    ]
    # One FASTQ record = "@name", sequence, "+", quality - each on its own line.
    expected = "".join(
        "@{}\n{}\n+\n{}\n".format(name, seq, qual)
        for name, seq, qual in expected_records
    )
    assert Path(fastq_path).read_text() == expected

    lane2 = mbf_align.AlignedSample(
        "test_lane2", bam_job, genome, is_paired=True, vid="AA123"
    )
    with pytest.raises(ValueError):
        lane2.to_fastq("nope.fastq")  # no support for paired end data at this point
def prep_lane(self):
    """Return a single-end AlignedSample named "test_lane" on the fake human chr22 genome."""
    from mbf_sampledata import get_human_22_fake_genome

    # straight from chr22 of the human genome
    fake_genome = get_human_22_fake_genome()
    spliced_bam = get_sample_data(Path("mbf_align/rnaseq_spliced_chr22.bam"))
    return mbf_align.AlignedSample(
        "test_lane", spliced_bam, fake_genome, is_paired=False, vid="AA123"
    )
def test_chromosome_mapping(self):
    # Builds an AlignedSample on a DummyGenome and checks its basic
    # attributes, the jobs returned by load(), the ValueError raised when the
    # same construction is repeated, and that get_bam() yields a pysam Samfile.
    # NOTE(review): nothing visible here asserts on chromosome names - the
    # block appears to be cut off, see the end of the function.
    bam_path = get_sample_data(Path("mbf_align/ex2.bam"))
    bam_job = ppg.FileInvariant(bam_path)
    genome = DummyGenome()
    lane = mbf_align.AlignedSample("test_lane", bam_job, genome, False, "AA123")
    assert lane.name == "test_lane"
    assert lane.load()[0] is bam_job
    assert isinstance(lane.load()[1], ppg.FileInvariant)
    assert lane.genome is genome
    assert not lane.is_paired
    assert lane.vid == "AA123"
    # Constructing a second sample with identical arguments must fail.
    with pytest.raises(ValueError):
        mbf_align.AlignedSample("test_lane", bam_job, genome, False, "AA123")
    lane2 = mbf_align.AlignedSample("test_lane2", bam_job, genome, True, "AA123")
    assert lane2.is_paired
    b = lane.get_bam()
    assert isinstance(b, pysam.Samfile)
    # NOTE(review): the source ends on this bare expression; it looks like the
    # start of a truncated statement and is kept unchanged - confirm against
    # the full file.
    b
def gatk_test_lanes():
    """Build the two single-end GATK test samples on Ensembl Homo_sapiens rev. 96.

    Returns:
        A list of two one-element lists, each holding one AlignedSample
        ("Test1GATK" first, then "Test2GATK").
    """
    genome_human = mbf_genomes.EnsemblGenome("Homo_sapiens", 96)
    # (sample name, BAM location) in construction order.
    bam_by_sample = (
        (
            "Test1GATK",
            "/project/code/mvariants/data/base_raw_test_hg36_Subread_gatk_rg.bam",
        ),
        (
            "Test2GATK",
            "data/base_raw_test_hg3612_Subread_gatk_rg.bam",
        ),
    )
    return [
        [
            mbf_align.AlignedSample(
                sample_name,
                bam_location,
                genome_human,
                is_paired=False,
                vid=None,
            )
        ]
        for sample_name, bam_location in bam_by_sample
    ]
def test_missing_index_file(self):
    """A BAM copied without its index gets a FileGeneratingJob as second load() entry.

    Before the pipegraph has run, ``mapped_reads()`` raises
    FileNotFoundError; afterwards it reports the 8 mapped reads.
    """
    source_bam = get_sample_data(Path("mbf_align/ex2.bam"))
    local_copy = "noindex.bam"
    # Copy only the BAM so that no index file sits next to it.
    shutil.copy(source_bam, local_copy)
    placeholder_genome = object()
    sample = mbf_align.AlignedSample(
        "test_lane", local_copy, placeholder_genome, is_paired=False, vid="AA123"
    )

    assert isinstance(sample.load()[0], ppg.FileInvariant)
    assert isinstance(sample.load()[1], ppg.FileGeneratingJob)
    assert sample.load()[1].job_id != "noindex.bam.bai"
    # The second job must depend on the BAM's invariant.
    assert sample.load()[0] in sample.load()[1].prerequisites

    with pytest.raises(FileNotFoundError):
        sample.mapped_reads()

    ppg.run_pipegraph()
    assert sample.mapped_reads() == 8
def test_creating_index_for_fg_job(self):
    """A BAM produced by a FileGeneratingJob ends up with a ``.bai`` after the run."""
    ppg.util.global_pipegraph.quiet = False

    def gen():
        # Produce "sample.bam" by copying the packaged example BAM.
        shutil.copy(get_sample_data(Path("mbf_align/ex2.bam")), "sample.bam")

    producing_job = ppg.FileGeneratingJob("sample.bam", gen)
    placeholder_genome = object()
    sample = mbf_align.AlignedSample(
        "test_lane", producing_job, placeholder_genome, is_paired=False, vid="AA123"
    )

    # Second load() entry is itself a FileGeneratingJob depending on the first.
    assert isinstance(sample.load()[1], ppg.FileGeneratingJob)
    assert sample.load()[0] in sample.load()[1].prerequisites

    ppg.run_pipegraph()
    for produced in ("sample.bam", "sample.bam.bai"):
        assert Path(produced).exists()
def test_lane_invariants_on_string(self):
    """Passing the bare sample-data path makes load()[0] a FileInvariant."""
    sample = mbf_align.AlignedSample(
        "test_lane",
        get_sample_data(Path("mbf_align/ex2.bam")),
        object(),  # genome placeholder
        is_paired=False,
        vid="AA123",
    )
    first_dependency = sample.load()[0]
    assert isinstance(first_dependency, ppg.FileInvariant)
def test_lane_raises_on_multifilegeneratingJobWithNoBAM(self):
    """A MultiFileGeneratingJob whose only output is "a.sam" is rejected with ValueError."""
    placeholder_genome = object()
    sam_only_job = ppg.MultiFileGeneratingJob(["a.sam"], lambda: 5)
    with pytest.raises(ValueError):
        mbf_align.AlignedSample(
            "test_lane", sam_only_job, placeholder_genome, is_paired=False, vid="AA123"
        )
def test_lane_invariants_on_non_accepted_value(self):
    """An integer as alignment input is rejected with ValueError."""
    placeholder_genome = object()
    unsupported_input = 123
    with pytest.raises(ValueError):
        mbf_align.AlignedSample(
            "test_lane",
            unsupported_input,
            placeholder_genome,
            is_paired=False,
            vid="AA123",
        )
def test_alignment_stats(self):
    """Check where ``get_alignment_stats`` takes its numbers from.

    ``get_bam`` is replaced by a counting stub on each lane:

    * no aligner: stats come from the stub's mapped/unmapped values;
    * an aligner without ``get_alignment_stats``: same, the stub is used;
    * an aligner with ``get_alignment_stats``: its result is returned and
      the stub is not called.
    """
    from mbf_sampledata import get_human_22_fake_genome

    genome = get_human_22_fake_genome()
    calls = {"get_bam": 0}
    stats_from_stub = {"Mapped": 5, "Unmapped": 10}

    def fake_get_bam():
        # Count every invocation and hand back a minimal context-manager
        # object exposing the two attributes set below.
        calls["get_bam"] += 1

        class DummySam:
            mapped = 5
            unmapped = 10

            def __enter__(self):
                return self

            def __exit__(self, *args):
                pass

        return DummySam()

    # --- lane without any aligner -------------------------------------
    lane = mbf_align.AlignedSample(
        "test_lane",
        get_sample_data(Path("mbf_align/rnaseq_spliced_chr22.bam")),
        genome,
        False,
        "AA123",
    )  # index creation is automatic
    lane.get_bam = fake_get_bam
    assert lane.get_alignment_stats() == stats_from_stub
    assert calls["get_bam"] == 1

    # --- aligner that offers no get_alignment_stats ---------------------
    class DummyAlignerWithout:
        pass

    lane = mbf_align.AlignedSample(
        "test_lane2",
        get_sample_data(Path("mbf_align/rnaseq_spliced_chr22.bam")),
        genome,
        False,
        "AA123",
        aligner=DummyAlignerWithout(),
    )  # index creation is automatic
    lane.get_bam = fake_get_bam
    assert calls["get_bam"] == 1
    assert lane.get_alignment_stats() == stats_from_stub
    assert calls["get_bam"] == 2

    # --- aligner that provides its own stats ----------------------------
    class DummyAlignerWith:
        def get_alignment_stats(self, bam_filename):
            received = Path(bam_filename).resolve()
            wanted = get_sample_path("mbf_align/rnaseq_spliced_chr22.bam").resolve()
            assert received == wanted
            return {"Hello": 23}

    lane = mbf_align.AlignedSample(
        "test_lane3",
        get_sample_data("mbf_align/rnaseq_spliced_chr22.bam"),
        genome,
        False,
        "AA123",
        aligner=DummyAlignerWith(),
    )  # index creation is automatic
    lane.get_bam = fake_get_bam
    assert calls["get_bam"] == 2
    assert lane.get_alignment_stats() == {"Hello": 23}
    # The aligner answered, so the stub was not called again.
    assert calls["get_bam"] == 2