def test_transcripts_unique_check(self):
    """The transcripts job must fail when transcript_stable_ids are not unique.

    An extra GTF ("more_transcripts") is injected through
    ``get_additional_gene_gtfs``; running the pipegraph is then expected to
    raise, with the job's exception mentioning the non-unique ids.
    """
    genome_fasta = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
    )
    gene_gtf = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz"
    )
    cdna_fasta = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz"
    )
    protein_fasta = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
    )
    g = FileBasedGenome(
        "Candidatus_carsonella", genome_fasta, gene_gtf, cdna_fasta, protein_fasta
    )

    def additional_gtfs():
        # Looked up lazily, exactly when the genome asks for extra GTFs.
        return [
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.more_transcripts.gtf.gz"
            )
        ]

    g.get_additional_gene_gtfs = additional_gtfs
    g.download_genome()
    job = g.job_transcripts()

    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    message = str(job.exception)
    assert "transcript_stable_ids were not unique" in message
def test_pairing_invalid_value(self):
    """Constructing a Sample with an unsupported ``pairing`` raises ValueError."""
    invalid_pairings = ("do_what_you_want", False, None, [5])
    for bad_pairing in invalid_pairings:
        with pytest.raises(ValueError):
            Sample(
                "Sample_a",
                get_sample_data(Path("mbf_align/sample_a")),
                False,
                pairing=bad_pairing,
            )
def test_cdna_creation(self):
    """Build a genome without a cDNA fasta and compare the generated one.

    The cDNA argument is ``None``; after the pipegraph ran, ``cdna.fasta`` is
    compared against the bundled reference cDNA fasta.
    """
    genome_fasta = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
    )
    gene_gtf = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz"
    )
    protein_fasta = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
    )
    g = FileBasedGenome(
        "Candidatus_carsonella", genome_fasta, gene_gtf, None, protein_fasta
    )
    g.download_genome()
    g.job_transcripts()
    ppg.run_pipegraph()

    # The genome fasta and its index must both have been produced.
    assert g.find_file("genome.fasta").exists()
    assert g.find_file("genome.fasta").with_suffix(".fasta.fai").exists()

    tf = g.df_transcripts
    assert "BAF35033" in tf.index
    assert tf.loc["BAF35033"].exons == ((1313, 2816),)

    reference = dict(
        iter_fasta(
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz"
            )
        )
    )
    # Keep only the part of each header before the first space.
    should = {}
    for header, sequence in reference.items():
        should[header[: header.find(b" ")]] = sequence
    actual = dict(iter_fasta(g.find_file("cdna.fasta")))

    if actual != should:
        # they are all here, we just have more (tRNA...)
        missing = set(should.keys()).difference(set(actual.keys()))
        assert not missing
        for k in should:
            assert actual[k] == should[k]
def test_lane_paired_only_second(self):
    """With ``pairing="only_second"`` the saved input equals the R2 file."""
    sample_dir = Path("mbf_align/sample_b")
    lane = Sample(
        "Sample_a",
        get_sample_data(sample_dir),
        False,
        vid="VA000",
        pairing="only_second",
    )
    assert lane.vid == "VA000"

    temp_job = lane.prepare_input()
    real_job = lane.save_input()
    ppg.run_pipegraph()

    # The temporary file is gone, the saved one is there; one file each.
    assert not Path(temp_job.filenames[0]).exists()
    assert len(temp_job.filenames) == 1
    saved_fn = real_job.filenames[0]
    assert Path(saved_fn).exists()
    assert len(real_job.filenames) == 1
    assert "_R1_" not in saved_fn
    assert ".fastq.gz" in saved_fn

    second_read_fn = get_sample_data(sample_dir / "a_R2_.fastq.gz")
    with gzip.GzipFile(second_read_fn, "r") as op:
        should = b"" + op.read()
    with gzip.GzipFile(saved_fn, "r") as op:
        actual = op.read()
    assert actual == should
def test_lane_paired_filtered(self):
    """A paired lane with an accept-everything filter reproduces both inputs."""
    sample_dir = Path("mbf_align/sample_b")
    accept_all = fastq2.Paired_Filtered(lambda *args: True)
    lane = Sample(
        "Sample_a",
        get_sample_data(sample_dir),
        False,
        vid="VA000",
        pairing="paired",
        fastq_processor=accept_all,
    )
    assert lane.vid == "VA000"

    temp_job = lane.prepare_input()
    real_job = lane.save_input()
    ppg.run_pipegraph()

    # Temporary files were cleaned up ...
    assert not Path(temp_job.filenames[0]).exists()
    assert not Path(temp_job.filenames[1]).exists()
    # ... and the saved R1/R2 outputs exist, in that order.
    assert Path(real_job.filenames[0]).exists()
    assert Path(real_job.filenames[1]).exists()
    assert "_R1_" in real_job.filenames[0]
    assert "_R2_" in real_job.filenames[1]
    assert ".fastq.gz" in real_job.filenames[0]
    assert ".fastq.gz" in real_job.filenames[1]

    input_fns = [
        get_sample_data(sample_dir / "a_R1_.fastq.gz"),
        get_sample_data(sample_dir / "a_R2_.fastq.gz"),
    ]
    for input_fn, output_fn in zip(input_fns, real_job.filenames):
        with gzip.GzipFile(output_fn, "r") as op:
            actual = op.read()
        with gzip.GzipFile(input_fn, "r") as op:
            should = op.read()
        assert actual == should
def test_FASTQsFromPrefix():
    """FASTQsFromPrefix("…/a") yields the resolved (R1, R2) pair of sample_d."""
    sample_dir = Path("mbf_align/sample_d")
    fn1 = Path(get_sample_data(sample_dir / "a_R1_.fastq.gz"))
    fn2 = Path(get_sample_data(sample_dir / "a_R2_.fastq.gz"))
    fn_prefix = Path(get_sample_data(sample_dir / "a"))

    o = FASTQsFromPrefix(fn_prefix)
    str(o)  # string conversion must not raise
    expected = [(fn1.resolve(), fn2.resolve())]
    assert o() == expected
def test_subtraction_by_read(self):
    """Check read counts after SubtractOtherLane post-processing.

    Covers subtracting an identical lane (nothing left), an unrelated lane
    (everything left) and a subset lane (difference left, written to a custom
    result_dir).
    """
    from mbf_sampledata import get_human_22_fake_genome

    genome = get_human_22_fake_genome()

    def make_lane(name, bam, vid):
        # index creation is automatic
        return mbf_align.AlignedSample(
            name, get_sample_data(Path(bam)), genome, False, vid
        )

    lane = make_lane("test_lane", "mbf_align/rnaseq_spliced_chr22.bam", "AA123")
    lane2 = make_lane("test_lane2", "mbf_align/rnaseq_spliced_chr22.bam", "AA124")
    lane3 = make_lane("test_lane3", "mbf_align/chipseq_chr22.bam", "AA123")
    lane3_subset = make_lane(
        "test_lane3_subset", "mbf_align/chipseq_chr22_subset.bam", "AA123"
    )

    subtract = mbf_align.post_process.SubtractOtherLane
    lane_empty = lane.post_process(subtract(lane2), new_name="empty")
    lane_full = lane.post_process(subtract(lane3), new_name="full")
    lane_some = lane3.post_process(
        subtract(lane3_subset), result_dir="results/aligned/shu"
    )

    qc_jobs = [lane_some.post_processor_qc_jobs, lane_full.post_processor_qc_jobs]
    prune_qc(lambda job: job in qc_jobs)
    ppg.run_pipegraph()

    assert Path(lane_empty.get_bam_names()[1]).exists()
    assert Path(lane_full.get_bam_names()[1]).exists()
    assert lane_empty.mapped_reads() == 0
    assert lane_full.mapped_reads() == lane.mapped_reads()
    assert lane.mapped_reads() != 0
    remaining = lane3.mapped_reads() - lane3_subset.mapped_reads()
    assert lane_some.mapped_reads() == remaining
    # make sure there was something to subtract
    assert lane3_subset.mapped_reads()
    assert "shu" in lane_some.get_bam_names()[0]

    # NOTE(review): both image checks use qc_jobs[0]; qc_jobs[1] is never
    # compared. Possibly the second call was meant for qc_jobs[1] - confirm
    # against the stored reference images before changing.
    assert_image_equal(qc_jobs[0].filenames[0], "_result_dir")
    assert_image_equal(qc_jobs[0].filenames[0])
def test_FASTQsFromFilesPaired_build_strategy():
    """build_fastq_strategy on [R1, R2] yields one resolved (R1, R2) pair."""
    sample_dir = Path("mbf_align/sample_b")
    fn = Path(get_sample_data(sample_dir / "a_R1_.fastq.gz"))
    # The second path deliberately takes a detour through "..".
    detour = sample_dir / ".." / "sample_b" / "a_R2_.fastq.gz"
    fn2 = Path(get_sample_data(detour))

    o = build_fastq_strategy([fn, fn2])
    expected = [(fn.resolve(), fn2.resolve())]
    assert o() == expected
def test_FASTQsFromFilePaired():
    """FASTQsFromFile(R1, R2) yields one resolved (R1, R2) pair."""
    sample_dir = Path("mbf_align/sample_b")
    fn = Path(get_sample_data(sample_dir / "a_R1_.fastq.gz"))
    # The second path deliberately takes a detour through "..".
    detour = sample_dir / ".." / "sample_b" / "a_R2_.fastq.gz"
    fn2 = Path(get_sample_data(detour))

    o = FASTQsFromFile(fn, fn2)
    expected = [(fn.resolve(), fn2.resolve())]
    assert o() == expected
def test_fastqs_join():
    """FASTQsJoin concatenates the results of its strategies in order."""
    sample_dir = Path("mbf_align/sample_b")
    fn = Path(get_sample_data(sample_dir / "a_R1_.fastq.gz"))
    detour = sample_dir / ".." / "sample_b" / "a_R2_.fastq.gz"
    fn2 = Path(get_sample_data(detour))

    a = FASTQsFromFiles([fn, fn2])
    b = FASTQsFromFile(fn)
    c = FASTQsFromFile(fn2)
    d = FASTQsJoin([a, b, c])

    o = d()
    expected = [
        (fn.resolve(), fn2.resolve()),
        (fn.resolve(),),
        (fn2.resolve(),),
    ]
    assert o == expected
def test_job_creating_fasta(self, new_pipegraph):
    """A genome whose fasta comes from a FileGeneratingJob yields the known cDNA.

    The genome fasta is passed as a job that copies the bundled fasta to
    ``shu.fasta.gz``; the cDNA fasta argument is ``None``.
    """
    new_pipegraph.quiet = False

    def gen_fasta():
        import shutil

        source = get_sample_data(
            "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
        )
        shutil.copy(source, "shu.fasta.gz")

    fasta_job = ppg.FileGeneratingJob("shu.fasta.gz", gen_fasta)
    gene_gtf = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz"
    )
    protein_fasta = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
    )
    g = FileBasedGenome(
        "Candidatus_carsonella", fasta_job, gene_gtf, None, protein_fasta
    )
    g.download_genome()
    ppg.run_pipegraph()

    expected_cdna = (
        "ATGAATACTATATTTTCAAGAATAACACCATTAGGAAATGGTACGTTATGTGTTATAAGAAT"
        "TTCTGGAAAAAATGTAAAATTTTTAATACAAAAAATTGTAAAAAAAAATATAAAAGAAAAAATAG"
        "CTACTTTTTCTAAATTATTTTTAGATAAAGAATGTGTAGATTATGCAATGATTATTTTTTTTAAA"
        "AAACCAAATACGTTCACTGGAGAAGATATAATCGAATTTCATATTCACAATAATGAAACTATTGT"
        "AAAAAAAATAATTAATTATTTATTATTAAATAAAGCAAGATTTGCAAAAGCTGGCGAATTTTTAG"
        "AAAGACGATATTTAAATGGAAAAATTTCTTTAATAGAATGCGAATTAATAAATAATAAAATTTTA"
        "TATGATAATGAAAATATGTTTCAATTAACAAAAAATTCTGAAAAAAAAATATTTTTATGTATAAT"
        "TAAAAATTTAAAATTTAAAATAAATTCTTTAATAATTTGTATTGAAATCGCAAATTTTAATTTTA"
        "GTTTTTTTTTTTTTAATGATTTTTTATTTATAAAATATACATTTAAAAAACTATTAAAACTTTTA"
        "AAAATATTAATTGATAAAATAACTGTTATAAATTATTTAAAAAAGAATTTCACAATAATGATATT"
        "AGGTAGAAGAAATGTAGGAAAGTCTACTTTATTTAATAAAATATGTGCACAATATGACTCGATTG"
        "TAACTAATATTCCTGGTACTACAAAAAATATTATATCAAAAAAAATAAAAATTTTATCTAAAAAA"
        "ATAAAAATGATGGATACAGCAGGATTAAAAATTAGAACTAAAAATTTAATTGAAAAAATTGGAAT"
        "TATTAAAAATATAAATAAAATTTATCAAGGAAATTTAATTTTGTATATGATTGATAAATTTAATA"
        "TTAAAAATATATTTTTTAACATTCCAATAGATTTTATTGATAAAATTAAATTAAATGAATTAATA"
        "ATTTTAGTTAACAAATCAGATATTTTAGGAAAAGAAGAAGGAGTTTTTAAAATAAAAAATATATT"
        "AATAATTTTAATTTCTTCTAAAAATGGAACTTTTATAAAAAATTTAAAATGTTTTATTAATAAAA"
        "TCGTTGATAATAAAGATTTTTCTAAAAATAATTATTCTGATGTTAAAATTCTATTTAATAAATTT"
        "TCTTTTTTTTATAAAGAATTTTCATGTAACTATGATTTAGTGTTATCAAAATTAATTGATTTTCA"
        "AAAAAATATATTTAAATTAACAGGAAATTTTACTAATAAAAAAATAATAAATTCTTGTTTTAGAA"
        "ATTTTTGTATTGGTAAATGA"
    )
    assert g.get_cdna_sequence("BAF35032") == expected_cdna
def test_filtered_paired(new_pipegraph):
    """Paired_Filtered passes names/qualities to the callback and filters pairs.

    The callback records whether specific read names and quality strings were
    seen and keeps only pairs where both sequences start with ``G``. The
    outputs are then expected to contain exactly one record each (four
    newlines).
    """
    import gzip

    # Single-element lists so the nested callback can flip the flags.
    r1_name_found = [False]
    r1_qual_found = [False]
    r2_name_found = [False]
    r2_qual_found = [False]

    def f(seq1, qual1, name1, seq2, qual2, name2):
        if name1 == b"HWI-C00113:209:HJCNTBCX2:2:2206:9418:13942 1:N:0:ATCACG":
            r1_name_found[0] = True
        if qual1 == b"DDDDDIIIIIIIIIIIIIIIIIIIIII/<FHHIII<<CHHGHHIHHIIIIH":
            r1_qual_found[0] = True
        if name2 == b"HWI-C00113:209:HJCNTBCX2:2:2206:10802:17968 1:N:0:ATCACG":
            r2_name_found[0] = True
        if qual2 == b"DDDDDIIIIIIIIIIIIIIII1<FHHI/<GHIIII/<EHIIHII/<DHIID":
            r2_qual_found[0] = True
        return seq1.startswith(b"G") and seq2.startswith(b"G")

    x = fastq2.Paired_Filtered(f)
    of1 = "output_R1.fastq"
    of2 = "output_R2.fastq"
    in1 = "input_R1_.fastq"
    in2 = "input_R2_.fastq"

    # Decompress the bundled inputs into plain FASTQ files. write_bytes closes
    # the file handle, so no descriptor is left open while the processor runs.
    sample_dir = Path("mbf_align/sample_b")
    for gz_name, plain_name in (("a_R1_.fastq.gz", in1), ("a_R2_.fastq.gz", in2)):
        with gzip.GzipFile(get_sample_data(sample_dir / gz_name), "rb") as op_in:
            Path(plain_name).write_bytes(op_in.read())

    x.generate_aligner_input_paired(of1, of2, [(in1, in2)], False)

    assert r1_name_found[0]
    assert r2_name_found[0]
    assert r1_qual_found[0]
    assert r2_qual_found[0]

    actual1 = Path(of1).read_text()
    actual2 = Path(of2).read_text()
    assert actual1.count("\n") == 4
    assert "@HWI-C00113:209:HJCNTBCX2:2:2206:9559:13855 1:N:0:ATCACG" in actual1
    assert "GCCCAATGTTCGAAATTGCTATTCTACGACAAGGTGCCAGATCTCATCTGA" in actual1
    assert actual2.count("\n") == 4
    assert "@HWI-C00113:209:HJCNTBCX2:2:2206:11052:17798 1:N:0:ATCACG" in actual2
    assert "GTCGGTCCTGAGAGATGGGCGGGCGCCGTTCCGAAAGTACGGGCGATGGCC" in actual2
def test_quality_raises_on_0_return(new_pipegraph):
    """A QualityFilter callback returning 0 makes generate_aligner_input raise."""
    with pytest.raises(ValueError):
        returns_zero = fastq2.QualityFilter(lambda qual, seq: 0)
        input_files = [str(get_sample_data(Path("mbf_align/sample_a/a.fastq")))]
        returns_zero.generate_aligner_input("test.fastq", input_files, False)
def test_from_existing_bam(self):
    """AlignedSample built from an existing BAM exposes the expected attributes."""
    bam_path = get_sample_data(Path("mbf_align/ex2.bam"))
    bam_job = ppg.FileInvariant(bam_path)
    genome = object()

    lane = mbf_align.AlignedSample("test_lane", bam_job, genome, False, "AA123")
    assert lane.name == "test_lane"
    assert lane.load()[0] is bam_job
    assert isinstance(lane.load()[1], ppg.FileInvariant)
    assert lane.genome is genome
    assert not lane.is_paired
    assert lane.vid == "AA123"

    # Re-using the name "test_lane" must be rejected.
    with pytest.raises(ValueError):
        mbf_align.AlignedSample("test_lane", bam_job, genome, False, "AA123")

    lane2 = mbf_align.AlignedSample("test_lane2", bam_job, genome, True, "AA123")
    assert lane2.is_paired

    for opened in (lane.get_bam(), lane.get_unique_aligned_bam()):
        assert isinstance(opened, pysam.Samfile)

    assert lane.get_bam_names()[0] == bam_path
    assert lane.get_bam_names()[1] == bam_path + ".bai"
    assert lane.mapped_reads() == 8
    assert lane.unmapped_reads() == 0

    for job in get_qc_jobs():
        assert job._pruned
def test_iter_fasta():
    """iter_fasta gives the same records regardless of ``block_size``."""
    fn = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
    )
    default_blocks = list(iter_fasta(fn))
    first_sequence = default_blocks[0][1]
    assert len(first_sequence) == 159662

    tiny_blocks = list(iter_fasta(fn, block_size=10))
    assert default_blocks == tiny_blocks
def test_paired_modes(self):
    """Using sample_b without an explicit ``pairing`` raises PairingError."""
    # Construction and prepare_input() both stay inside the raises block.
    with pytest.raises(PairingError):
        sample_dir = get_sample_data(Path("mbf_align/sample_b"))
        lane = Sample("Sample_a", sample_dir, False, vid="VA000")
        lane.prepare_input()
def gen_fasta():
    """Copy the bundled genome fasta to ``shu.fasta.gz`` (relative path)."""
    from shutil import copy

    source = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
    )
    copy(source, "shu.fasta.gz")
def test_FASTQsFromFolder():
    """FASTQsFromFolder on sample_a yields a.fastq and b.fastq.gz as 1-tuples."""
    from pprint import pprint

    folder = Path(get_sample_data(Path("mbf_align/sample_a")))
    o = FASTQsFromFolder(folder)

    pprint(o())  # shown by pytest when the assertion below fails
    expected = [
        ((folder / "a.fastq").resolve(),),
        ((folder / "b.fastq.gz").resolve(),),
    ]
    assert o() == expected
def test_fastqc(self):
    """Creating a Sample registers one QC job that depends on prepare_input()."""
    from mbf_qualitycontrol import get_qc_jobs

    sample_dir = get_sample_data(Path("mbf_align/sample_a"))
    lane = Sample("Sample_a", sample_dir, False, vid="VA000")

    qc_jobs = list(get_qc_jobs())
    assert len(qc_jobs) == 1
    fastqc_job = qc_jobs[0]
    assert "results/lanes/Sample_a/FASTQC/sentinel.txt" in fastqc_job.filenames
    assert lane.prepare_input() in fastqc_job.prerequisites
def test_protein_creation(self):
    """Build a genome without a protein fasta and compare the generated one.

    The protein fasta argument is ``None`` and a ``ProkaryoticCode()`` instance
    is passed as the sixth argument. After the pipegraph ran, ``pep.fasta``
    must equal the bundled reference protein fasta (keyed by the header text
    before the first space).
    """
    g = FileBasedGenome(
        "Candidatus_carsonella",
        get_sample_data(
            "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
        ),
        get_sample_data(
            "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz"
        ),
        get_sample_data(
            "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz"
        ),
        None,
        ProkaryoticCode(),
    )
    g.download_genome()
    g.job_transcripts()
    ppg.run_pipegraph()

    should = dict(
        iter_fasta(
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
            )
        )
    )
    should = {k[: k.find(b" ")]: v for (k, v) in should.items()}
    actual = dict(iter_fasta(g.find_file("pep.fasta")))

    # Every reference protein has to be present in the generated file.
    missing = set(should.keys()).difference(set(actual.keys()))
    assert not missing, "proteins missing from pep.fasta: %r" % (sorted(missing),)

    # Report all differing sequences in the assertion message.
    mismatches = [
        "%r: actual len %d, expected len %d\n  actual:   %r\n  expected: %r"
        % (k, len(actual[k]), len(should[k]), actual[k], should[k])
        for k in should
        if actual[k] != should[k]
    ]
    assert not mismatches, "protein sequences differ:\n" + "\n".join(mismatches)

    # Entries beyond the reference set are a failure as well.
    extra = set(actual.keys()).difference(set(should.keys()))
    assert actual == should, "unexpected proteins in pep.fasta: %r" % (
        sorted(extra),
    )
def test_lane_paired_missing_R2(self):
    """``pairing="paired"`` on sample_a makes prepare_input() raise PairingError."""
    sample_dir = get_sample_data(Path("mbf_align/sample_a"))
    lane = Sample("Sample_a", sample_dir, False, vid="VA000", pairing="paired")
    with pytest.raises(PairingError):
        lane.prepare_input()
def test_transcript_wrong_order(self):
    """The "transcript_wrong_order" GTF makes the transcripts job fail.

    The job's exception is expected to mention ``start > stop``.
    """
    genome_fasta = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
    )
    broken_gtf = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.transcript_wrong_order.gtf.gz"
    )
    cdna_fasta = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz"
    )
    protein_fasta = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
    )
    g = FileBasedGenome(
        "Candidatus_carsonella", genome_fasta, broken_gtf, cdna_fasta, protein_fasta
    )
    job = g.job_transcripts()

    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    message = str(job.exception)
    assert "start > stop" in message
def test_transcript_transcript_outside_gene(self):
    """The "transcript_outside_gene" GTF makes the transcripts job fail.

    The job's exception must be a ValueError mentioning
    ``Transcript outside of gene``.
    """
    genome_fasta = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
    )
    broken_gtf = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.transcript_outside_gene.gtf.gz"
    )
    cdna_fasta = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz"
    )
    protein_fasta = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
    )
    g = FileBasedGenome(
        "Candidatus_carsonella", genome_fasta, broken_gtf, cdna_fasta, protein_fasta
    )
    job = g.job_transcripts()

    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    failure = job.exception
    assert "Transcript outside of gene" in str(failure)
    assert isinstance(failure, ValueError)
def test_get_gtf_using_additional_gtf(self):
    """An additional GTF becomes a prerequisite of job_genes and adds a gene.

    The extra GTF is injected through ``get_additional_gene_gtfs``; one of the
    gene job's prerequisites must list it in ``filenames``, and after the run
    ``TEST1_001`` must be in ``df_genes``.
    """
    genome_fasta = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
    )
    gene_gtf = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz"
    )
    cdna_fasta = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz"
    )
    protein_fasta = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
    )
    g = FileBasedGenome(
        "Candidatus_carsonella", genome_fasta, gene_gtf, cdna_fasta, protein_fasta
    )

    def additional_gtfs():
        return [
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.additional.gtf.gz"
            )
        ]

    g.get_additional_gene_gtfs = additional_gtfs
    g.download_genome()
    j = g.job_genes()

    found_additional = False
    for prerequisite in j.prerequisites:
        if not hasattr(prerequisite, "filenames"):
            continue
        print(prerequisite, prerequisite.filenames)
        wanted = get_sample_data(
            "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.additional.gtf.gz"
        )
        if wanted in prerequisite.filenames:
            found_additional = True
            break
    assert found_additional  # wrong preqs

    ppg.run_pipegraph()
    assert "TEST1_001" in g.df_genes.index
def test_read_creator_must_be_fastq_right_now(new_pipegraph):
    """Passing ``"fail"`` as fourth argument must raise ValueError.

    Checked for the Straight, Filtered and QualityFilter processors. The
    input path is resolved before entering ``pytest.raises`` so that only
    ``generate_aligner_input`` itself can satisfy the expectation.
    """
    # The sample file lives in mbf_align/sample_a/ (no stray ")" in the path).
    input_files = [str(get_sample_data(Path("mbf_align/sample_a/a.fastq")))]
    processors = [
        fastq2.Straight(),
        fastq2.Filtered(lambda seq, qual, name: True),
        fastq2.QualityFilter(lambda qual, seq: True),
    ]
    for processor in processors:
        with pytest.raises(ValueError):
            processor.generate_aligner_input(
                "test.fastq", input_files, False, "fail"
            )
def test_to_fastq(self):
    """to_fastq writes the reads of ex2.bam as FASTQ; paired lanes are refused.

    The expected text has four lines per record (``@name``, sequence, ``+``,
    quality), each terminated by a newline.
    """
    bam_path = get_sample_data(Path("mbf_align/ex2.bam"))
    bam_job = ppg.FileInvariant(bam_path)
    genome = object()
    lane = mbf_align.AlignedSample("test_lane", bam_job, genome, False, "AA123")
    fastq_path = "out.fastq"
    lane.to_fastq(fastq_path)
    ppg.run_pipegraph()
    assert Path(fastq_path).exists()

    # Two distinct (sequence, quality) pairs occur in the expected output.
    clipped = (
        "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG",
        "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<",
    )
    repeated = (
        "TGCAAGGCCGCATCGGCCAAGGCCAAGATATAGGT",
        "<<<<7<<<<<<<<<<<<;6<<<:;7<<<<;<<<<<",
    )
    expected_records = [
        ("read_28833_29006_6945", clipped),
        ("read_28701_28881_323b", repeated),
        ("read_28701_28881_323c", repeated),
        ("read_28701_28881_324a", repeated),
        ("read_28701_28881_324b", repeated),
        ("read_28701_28881_324c", repeated),
        ("test_clipped1", clipped),
        ("test_clipped1", clipped),
    ]
    expected = "".join(
        "@{}\n{}\n+\n{}\n".format(name, seq, qual)
        for name, (seq, qual) in expected_records
    )
    assert Path(fastq_path).read_text() == expected

    lane2 = mbf_align.AlignedSample(
        "test_lane2", bam_job, genome, is_paired=True, vid="AA123"
    )
    with pytest.raises(ValueError):
        lane2.to_fastq("nope.fastq")  # no support for paired end data at this point
def prep_lane(self):
    """Return an AlignedSample "test_lane" on the spliced chr22 RNA-seq BAM."""
    from mbf_sampledata import get_human_22_fake_genome

    # straight from chr22 of the human genome
    genome = get_human_22_fake_genome()
    bam = get_sample_data(Path("mbf_align/rnaseq_spliced_chr22.bam"))
    return mbf_align.AlignedSample("test_lane", bam, genome, False, "AA123")
def test_lane(self):
    """A default Sample on sample_a saves a gzip with 20 + 20 lines."""
    sample_dir = get_sample_data(Path("mbf_align/sample_a"))
    lane = Sample("Sample_a", sample_dir, False, vid="VA000")
    assert lane.vid == "VA000"

    temp_job = lane.prepare_input()
    real_job = lane.save_input()
    ppg.run_pipegraph()

    # The temporary input is removed, the saved one remains.
    assert not Path(temp_job.filenames[0]).exists()
    saved_fn = real_job.filenames[0]
    assert Path(saved_fn).exists()

    with gzip.GzipFile(saved_fn, "r") as op:
        lines = op.readlines()
    assert len(lines) == 20 + 20
def test_missing_index_file(self):
    """A BAM without index gets an index-generating job; counts work after the run."""
    bam_path = get_sample_data(Path("mbf_align/ex2.bam"))
    no_index = "noindex.bam"
    shutil.copy(bam_path, no_index)  # copy only the BAM, not its index
    genome = object()

    lane = mbf_align.AlignedSample("test_lane", no_index, genome, False, "AA123")
    assert isinstance(lane.load()[0], ppg.FileInvariant)
    assert isinstance(lane.load()[1], ppg.FileGeneratingJob)
    assert lane.load()[1].job_id != "noindex.bam.bai"
    assert lane.load()[0] in lane.load()[1].prerequisites

    # Before the pipegraph ran, counting reads fails with FileNotFoundError.
    with pytest.raises(FileNotFoundError):
        lane.mapped_reads()

    ppg.run_pipegraph()
    assert lane.mapped_reads() == 8
def test_multiple_fasta_files(self, new_pipegraph):
    """A genome built from two fasta files exposes both; changing one fails the rerun.

    The second fasta is a temporary file. It is managed by a ``with`` block so
    it is closed (and removed) when the test ends, however it ends.
    """
    import tempfile

    with tempfile.NamedTemporaryFile(suffix=".fasta") as tf:
        tf.write(b">Extra\nAGTC")
        tf.flush()  # make the content visible to readers of tf.name
        g = FileBasedGenome(
            "Candidatus_carsonella",
            [
                get_sample_data(
                    "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
                ),
                tf.name,
            ],
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz"
            ),
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz"
            ),
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
            ),
        )
        g.download_genome()
        ppg.run_pipegraph()
        assert g.get_genome_sequence("Extra", 0, 4) == "AGTC"
        assert g.get_chromosome_lengths() == {"Extra": 4, "Chromosome": 159662}

        # test that changing the fasta leads to an explosion
        new_pipegraph.new_pipegraph()
        tf.seek(0, 0)
        tf.write(b">Extra\nAGTCA")
        tf.flush()
        g = FileBasedGenome(
            "Candidatus_carsonella",
            [
                get_sample_data(
                    "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
                ),
                tf.name,
            ],
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz"
            ),
            None,
        )
        g.download_genome()
        with pytest.raises(UpstreamChangedError):
            ppg.run_pipegraph()