def test_transcripts_unique_check(self):
    """Check that duplicated transcript ids make the transcripts job fail.

    The genome is built from the sample dna/gtf/cdna/pep files and an extra
    "more_transcripts" GTF is injected through ``get_additional_gene_gtfs``.
    Running the pipegraph is then expected to raise, with the job's exception
    mentioning non-unique ``transcript_stable_ids``.
    """
    dna_fasta = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
    )
    gene_gtf = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz"
    )
    cdna_fasta = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz"
    )
    pep_fasta = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
    )
    genome = FileBasedGenome(
        "Candidatus_carsonella", dna_fasta, gene_gtf, cdna_fasta, pep_fasta
    )
    # Override the hook on this instance only, so the extra GTF gets picked up.
    genome.get_additional_gene_gtfs = lambda: [
        get_sample_data(
            "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.more_transcripts.gtf.gz"
        )
    ]
    genome.download_genome()
    transcripts_job = genome.job_transcripts()
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    failure_text = str(transcripts_job.exception)
    assert "transcript_stable_ids were not unique" in failure_text
Esempio n. 2
0
 def test_pairing_invalid_value(self):
     """Sample must raise ValueError for each unsupported ``pairing`` value."""
     # A string that is no known mode, two falsy values and a list.
     rejected_values = ("do_what_you_want", False, None, [5])
     for bad_pairing in rejected_values:
         with pytest.raises(ValueError):
             Sample(
                 "Sample_a",
                 get_sample_data(Path("mbf_align/sample_a")),
                 False,
                 pairing=bad_pairing,
             )
    def test_cdna_creation(self):
        """Check that a genome built without a cdna file produces cdna.fasta itself.

        ``None`` is passed in the cdna position. After the pipegraph ran, the
        genome fasta and its ``.fai`` index must exist, transcript "BAF35033"
        must have the expected exon tuple, and every entry of the reference
        cdna fasta must be present with an identical sequence in the generated
        ``cdna.fasta`` (additional entries are tolerated).
        """
        g = FileBasedGenome(
            "Candidatus_carsonella",
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
            ),
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz"
            ),
            None,
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
            ),
        )
        g.download_genome()
        g.job_transcripts()
        ppg.run_pipegraph()
        assert g.find_file("genome.fasta").exists()
        assert g.find_file("genome.fasta").with_suffix(".fasta.fai").exists()
        tf = g.df_transcripts
        assert "BAF35033" in tf.index
        assert tf.loc["BAF35033"].exons == ((1313, 2816), )

        reference = dict(
            iter_fasta(
                get_sample_data(
                    "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz"
                )))
        # Key the reference by the id in front of the first blank.
        # partition() leaves a header without any blank untouched, whereas
        # slicing with find() == -1 would silently drop its last byte.
        should = {k.partition(b" ")[0]: v for (k, v) in reference.items()}
        actual = dict(iter_fasta(g.find_file("cdna.fasta")))
        if actual != should:
            # they are all here, we just have more (tRNA...)
            assert not set(should).difference(actual)
            for k in should:
                assert actual[k] == should[k]
Esempio n. 4
0
    def test_lane_paired_only_second(self):
        """With ``pairing="only_second"`` only the R2 reads end up in the saved input.

        After the pipegraph ran, the temporary input file must be gone and the
        single saved ``.fastq.gz`` must decompress to exactly the contents of
        ``a_R2_.fastq.gz``.
        """
        sample_dir = Path("mbf_align/sample_b")
        lane = Sample(
            "Sample_a",
            get_sample_data(sample_dir),
            False,
            vid="VA000",
            pairing="only_second",
        )
        assert lane.vid == "VA000"
        temp_job = lane.prepare_input()
        real_job = lane.save_input()
        ppg.run_pipegraph()

        temp_files = temp_job.filenames
        saved_files = real_job.filenames
        assert not Path(temp_files[0]).exists()
        assert len(temp_files) == 1
        assert Path(saved_files[0]).exists()
        assert len(saved_files) == 1
        assert "_R1_" not in saved_files[0]
        assert ".fastq.gz" in saved_files[0]

        # Expected payload: the decompressed second-read file only.
        with gzip.GzipFile(
            get_sample_data(sample_dir / "a_R2_.fastq.gz"), "r"
        ) as handle:
            expected = b"" + handle.read()
        with gzip.GzipFile(saved_files[0], "r") as handle:
            produced = handle.read()
        assert produced == expected
Esempio n. 5
0
    def test_lane_paired_filtered(self):
        """A paired sample with an accept-everything filter reproduces both inputs.

        The temporary files must be removed after the run, the saved files
        must carry ``_R1_`` / ``_R2_`` and ``.fastq.gz`` in their names, and
        their decompressed contents must equal the respective input files.
        """
        sample_dir = Path("mbf_align/sample_b")
        lane = Sample(
            "Sample_a",
            get_sample_data(sample_dir),
            False,
            vid="VA000",
            pairing="paired",
            fastq_processor=fastq2.Paired_Filtered(lambda *args: True),
        )
        assert lane.vid == "VA000"
        temp_job = lane.prepare_input()
        real_job = lane.save_input()
        ppg.run_pipegraph()

        for mate_index in (0, 1):
            assert not Path(temp_job.filenames[mate_index]).exists()
        for mate_index in (0, 1):
            assert Path(real_job.filenames[mate_index]).exists()
        assert "_R1_" in real_job.filenames[0]
        assert "_R2_" in real_job.filenames[1]
        for mate_index in (0, 1):
            assert ".fastq.gz" in real_job.filenames[mate_index]

        source_files = [
            get_sample_data(sample_dir / "a_R1_.fastq.gz"),
            get_sample_data(sample_dir / "a_R2_.fastq.gz"),
        ]
        for source_fn, saved_fn in zip(source_files, real_job.filenames):
            with gzip.GzipFile(saved_fn, "r") as handle:
                produced = handle.read()
            with gzip.GzipFile(source_fn, "r") as handle:
                expected = handle.read()
            assert produced == expected
Esempio n. 6
0
def test_FASTQsFromPrefix():
    """FASTQsFromPrefix("…/a") must yield the resolved (R1, R2) pair of sample_d."""
    sample_dir = Path("mbf_align/sample_d")
    read1 = Path(get_sample_data(sample_dir / "a_R1_.fastq.gz"))
    read2 = Path(get_sample_data(sample_dir / "a_R2_.fastq.gz"))
    prefix = Path(get_sample_data(sample_dir / "a"))
    strategy = FASTQsFromPrefix(prefix)
    # The string conversion is only exercised; its result is not checked.
    str(strategy)
    found = strategy()
    assert found == [(read1.resolve(), read2.resolve())]
Esempio n. 7
0
    def test_subtraction_by_read(self):
        """Exercise the SubtractOtherLane post processor on four aligned samples.

        Expectations checked below:
        * lane minus lane2 (same BAM file) leaves no mapped reads,
        * lane minus lane3 (a different BAM file) keeps all of lane's reads,
        * lane3 minus lane3_subset leaves exactly the difference in read counts,
        * a custom ``result_dir`` shows up in the resulting BAM name.
        """
        from mbf_sampledata import get_human_22_fake_genome

        genome = get_human_22_fake_genome()
        lane = mbf_align.AlignedSample(
            "test_lane",
            get_sample_data(Path("mbf_align/rnaseq_spliced_chr22.bam")),
            genome,
            False,
            "AA123",
        )  # index creation is automatic
        # Same BAM as `lane`, registered under a different name.
        lane2 = mbf_align.AlignedSample(
            "test_lane2",
            get_sample_data(Path("mbf_align/rnaseq_spliced_chr22.bam")),
            genome,
            False,
            "AA124",
        )  # index creation is automatic
        lane3 = mbf_align.AlignedSample(
            "test_lane3",
            get_sample_data(Path("mbf_align/chipseq_chr22.bam")),
            genome,
            False,
            "AA123",
        )  # index creation is automatic
        lane3_subset = mbf_align.AlignedSample(
            "test_lane3_subset",
            get_sample_data(Path("mbf_align/chipseq_chr22_subset.bam")),
            genome,
            False,
            "AA123",
        )  # index creation is automatic

        lane_empty = lane.post_process(
            mbf_align.post_process.SubtractOtherLane(lane2), new_name="empty")
        lane_full = lane.post_process(
            mbf_align.post_process.SubtractOtherLane(lane3), new_name="full")
        lane_some = lane3.post_process(
            mbf_align.post_process.SubtractOtherLane(lane3_subset),
            result_dir="results/aligned/shu",
        )
        qc_jobs = [
            lane_some.post_processor_qc_jobs, lane_full.post_processor_qc_jobs
        ]
        # Keep only the two post processor QC jobs collected above.
        prune_qc(lambda job: job in qc_jobs)
        ppg.run_pipegraph()
        assert Path(lane_empty.get_bam_names()[1]).exists()
        assert Path(lane_full.get_bam_names()[1]).exists()
        assert lane_empty.mapped_reads() == 0
        assert lane_full.mapped_reads() == lane.mapped_reads()
        assert lane.mapped_reads() != 0
        assert (lane_some.mapped_reads() == lane3.mapped_reads() -
                lane3_subset.mapped_reads())
        assert lane3_subset.mapped_reads(
        )  # make sure there was something to subtract
        assert "shu" in lane_some.get_bam_names()[0]
        # NOTE(review): both image comparisons use qc_jobs[0]; qc_jobs[1]
        # (lane_full) is never compared. Possibly a copy-paste slip - confirm
        # against the stored reference images before changing the index.
        assert_image_equal(qc_jobs[0].filenames[0], "_result_dir")
        assert_image_equal(qc_jobs[0].filenames[0])
Esempio n. 8
0
def test_FASTQsFromFilesPaired_build_strategy():
    """build_fastq_strategy on [R1, R2] must return one pair of resolved paths."""
    sample_dir = Path("mbf_align/sample_b")
    read1 = Path(get_sample_data(sample_dir / "a_R1_.fastq.gz"))
    # The second path deliberately detours through ".." and relies on
    # resolve() to normalise it.
    detour = sample_dir / ".." / "sample_b" / "a_R2_.fastq.gz"
    read2 = Path(get_sample_data(detour))
    strategy = build_fastq_strategy([read1, read2])
    found = strategy()
    assert found == [(read1.resolve(), read2.resolve())]
Esempio n. 9
0
def test_FASTQsFromFilePaired():
    """FASTQsFromFile(R1, R2) must return one pair of resolved paths."""
    sample_dir = Path("mbf_align/sample_b")
    read1 = Path(get_sample_data(sample_dir / "a_R1_.fastq.gz"))
    # The second path deliberately detours through ".." and relies on
    # resolve() to normalise it.
    detour = sample_dir / ".." / "sample_b" / "a_R2_.fastq.gz"
    read2 = Path(get_sample_data(detour))
    strategy = FASTQsFromFile(read1, read2)
    found = strategy()
    assert found == [(read1.resolve(), read2.resolve())]
Esempio n. 10
0
def test_fastqs_join():
    """FASTQsJoin must concatenate the results of its strategies in order."""
    sample_dir = Path("mbf_align/sample_b")
    read1 = Path(get_sample_data(sample_dir / "a_R1_.fastq.gz"))
    read2 = Path(
        get_sample_data(sample_dir / ".." / "sample_b" / "a_R2_.fastq.gz")
    )
    both_files = FASTQsFromFiles([read1, read2])
    only_first = FASTQsFromFile(read1)
    only_second = FASTQsFromFile(read2)
    joined = FASTQsJoin([both_files, only_first, only_second])
    found = joined()
    expected = [
        (read1.resolve(), read2.resolve()),
        (read1.resolve(),),
        (read2.resolve(),),
    ]
    assert found == expected
    def test_job_creating_fasta(self, new_pipegraph):
        """Check that a FileBasedGenome accepts a job in place of the genome fasta.

        The fasta is produced by a ``ppg.FileGeneratingJob`` that copies the
        sample file to ``shu.fasta.gz``; ``None`` is passed in the cdna
        position. After the pipegraph ran, the cdna sequence of "BAF35032"
        must equal the literal below.
        """
        new_pipegraph.quiet = False

        def gen_fasta():
            """Copy the sample genome fasta to ``shu.fasta.gz``."""
            import shutil

            shutil.copy(
                get_sample_data(
                    "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
                ),
                "shu.fasta.gz",
            )

        fasta_job = ppg.FileGeneratingJob("shu.fasta.gz", gen_fasta)
        g = FileBasedGenome(
            "Candidatus_carsonella",
            fasta_job,
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz"
            ),
            None,
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
            ),
        )
        g.download_genome()
        ppg.run_pipegraph()
        # Expected sequence, split over adjacent string literals.
        assert (
            g.get_cdna_sequence("BAF35032") ==
            "ATGAATACTATATTTTCAAGAATAACACCATTAGGAAATGGTACGTTATGTGTTATAAGAAT"
            "TTCTGGAAAAAATGTAAAATTTTTAATACAAAAAATTGTAAAAAAAAATATAAAAGAAAAAATAG"
            "CTACTTTTTCTAAATTATTTTTAGATAAAGAATGTGTAGATTATGCAATGATTATTTTTTTTAAA"
            "AAACCAAATACGTTCACTGGAGAAGATATAATCGAATTTCATATTCACAATAATGAAACTATTGT"
            "AAAAAAAATAATTAATTATTTATTATTAAATAAAGCAAGATTTGCAAAAGCTGGCGAATTTTTAG"
            "AAAGACGATATTTAAATGGAAAAATTTCTTTAATAGAATGCGAATTAATAAATAATAAAATTTTA"
            "TATGATAATGAAAATATGTTTCAATTAACAAAAAATTCTGAAAAAAAAATATTTTTATGTATAAT"
            "TAAAAATTTAAAATTTAAAATAAATTCTTTAATAATTTGTATTGAAATCGCAAATTTTAATTTTA"
            "GTTTTTTTTTTTTTAATGATTTTTTATTTATAAAATATACATTTAAAAAACTATTAAAACTTTTA"
            "AAAATATTAATTGATAAAATAACTGTTATAAATTATTTAAAAAAGAATTTCACAATAATGATATT"
            "AGGTAGAAGAAATGTAGGAAAGTCTACTTTATTTAATAAAATATGTGCACAATATGACTCGATTG"
            "TAACTAATATTCCTGGTACTACAAAAAATATTATATCAAAAAAAATAAAAATTTTATCTAAAAAA"
            "ATAAAAATGATGGATACAGCAGGATTAAAAATTAGAACTAAAAATTTAATTGAAAAAATTGGAAT"
            "TATTAAAAATATAAATAAAATTTATCAAGGAAATTTAATTTTGTATATGATTGATAAATTTAATA"
            "TTAAAAATATATTTTTTAACATTCCAATAGATTTTATTGATAAAATTAAATTAAATGAATTAATA"
            "ATTTTAGTTAACAAATCAGATATTTTAGGAAAAGAAGAAGGAGTTTTTAAAATAAAAAATATATT"
            "AATAATTTTAATTTCTTCTAAAAATGGAACTTTTATAAAAAATTTAAAATGTTTTATTAATAAAA"
            "TCGTTGATAATAAAGATTTTTCTAAAAATAATTATTCTGATGTTAAAATTCTATTTAATAAATTT"
            "TCTTTTTTTTATAAAGAATTTTCATGTAACTATGATTTAGTGTTATCAAAATTAATTGATTTTCA"
            "AAAAAATATATTTAAATTAACAGGAAATTTTACTAATAAAAAAATAATAAATTCTTGTTTTAGAA"
            "ATTTTTGTATTGGTAAATGA")
Esempio n. 12
0
def test_filtered_paired(new_pipegraph):
    """Check that Paired_Filtered passes both mates to the callback and filters.

    The callback records whether specific read names and quality strings of
    R1 and R2 were seen, and accepts only pairs whose two sequences both start
    with ``G``. Each output file must then hold four lines containing the
    expected name and sequence.
    """
    import gzip

    # One-element lists act as mutable flags the nested callback can set.
    r1_name_found = [False]
    r1_qual_found = [False]
    r2_name_found = [False]
    r2_qual_found = [False]

    def f(seq1, qual1, name1, seq2, qual2, name2):
        if name1 == b"HWI-C00113:209:HJCNTBCX2:2:2206:9418:13942 1:N:0:ATCACG":
            r1_name_found[0] = True
        if qual1 == b"DDDDDIIIIIIIIIIIIIIIIIIIIII/<FHHIII<<CHHGHHIHHIIIIH":
            r1_qual_found[0] = True
        if name2 == b"HWI-C00113:209:HJCNTBCX2:2:2206:10802:17968 1:N:0:ATCACG":
            r2_name_found[0] = True
        if qual2 == b"DDDDDIIIIIIIIIIIIIIII1<FHHI/<GHIIII/<EHIIHII/<DHIID":
            r2_qual_found[0] = True
        return seq1.startswith(b"G") and seq2.startswith(b"G")

    x = fastq2.Paired_Filtered(f)
    of1 = "output_R1.fastq"
    of2 = "output_R2.fastq"
    in1 = "input_R1_.fastq"
    in2 = "input_R2_.fastq"
    sample_dir = Path("mbf_align/sample_b")
    # Decompress both sample files into plain fastq files. The with-blocks
    # guarantee the handles are flushed and closed before they are read again.
    for gz_name, plain_name in (("a_R1_.fastq.gz", in1), ("a_R2_.fastq.gz", in2)):
        with gzip.GzipFile(get_sample_data(sample_dir / gz_name), "rb") as op_in:
            with open(plain_name, "wb") as op_out:
                op_out.write(op_in.read())

    x.generate_aligner_input_paired(of1, of2, [(in1, in2)], False)
    assert r1_name_found[0]
    assert r2_name_found[0]
    assert r1_qual_found[0]
    assert r2_qual_found[0]
    actual1 = Path(of1).read_text()
    actual2 = Path(of2).read_text()
    assert actual1.count("\n") == 4
    assert "@HWI-C00113:209:HJCNTBCX2:2:2206:9559:13855 1:N:0:ATCACG" in actual1
    assert "GCCCAATGTTCGAAATTGCTATTCTACGACAAGGTGCCAGATCTCATCTGA" in actual1
    assert actual2.count("\n") == 4
    assert "@HWI-C00113:209:HJCNTBCX2:2:2206:11052:17798 1:N:0:ATCACG" in actual2
    assert "GTCGGTCCTGAGAGATGGGCGGGCGCCGTTCCGAAAGTACGGGCGATGGCC" in actual2
Esempio n. 13
0
def test_quality_raises_on_0_return(new_pipegraph):
    """A QualityFilter whose callback returns 0 must raise ValueError."""
    with pytest.raises(ValueError):
        quality_filter = fastq2.QualityFilter(lambda qual, seq: 0)
        input_files = [str(get_sample_data(Path("mbf_align/sample_a/a.fastq")))]
        quality_filter.generate_aligner_input("test.fastq", input_files, False)
Esempio n. 14
0
    def test_from_existing_bam(self):
        """Wrap an existing BAM (given as a FileInvariant) in an AlignedSample.

        Checks the basic attributes, the jobs returned by ``load()``, that
        constructing a second sample with identical arguments raises
        ValueError, the BAM accessors, the read counts, and that every
        registered QC job is pruned.
        """
        bam_path = get_sample_data(Path("mbf_align/ex2.bam"))
        bam_job = ppg.FileInvariant(bam_path)
        genome = object()
        single_end = mbf_align.AlignedSample(
            "test_lane", bam_job, genome, False, "AA123"
        )
        assert single_end.name == "test_lane"
        loaded_jobs = single_end.load()
        assert loaded_jobs[0] is bam_job
        assert isinstance(loaded_jobs[1], ppg.FileInvariant)
        assert single_end.genome is genome
        assert not single_end.is_paired
        assert single_end.vid == "AA123"

        with pytest.raises(ValueError):
            mbf_align.AlignedSample("test_lane", bam_job, genome, False, "AA123")
        paired_end = mbf_align.AlignedSample(
            "test_lane2", bam_job, genome, True, "AA123"
        )
        assert paired_end.is_paired

        assert isinstance(single_end.get_bam(), pysam.Samfile)
        assert isinstance(single_end.get_unique_aligned_bam(), pysam.Samfile)
        bam_names = single_end.get_bam_names()
        assert bam_names[0] == bam_path
        assert bam_names[1] == bam_path + ".bai"

        assert single_end.mapped_reads() == 8
        assert single_end.unmapped_reads() == 0
        for qc_job in get_qc_jobs():
            assert qc_job._pruned
Esempio n. 15
0
def test_iter_fasta():
    """iter_fasta must give identical records regardless of ``block_size``."""
    fasta_path = get_sample_data(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
    )
    default_records = list(iter_fasta(fasta_path))
    first_sequence = default_records[0][1]
    assert len(first_sequence) == 159662
    # A tiny block size forces many reads; the result must not change.
    small_block_records = list(iter_fasta(fasta_path, block_size=10))
    assert default_records == small_block_records
Esempio n. 16
0
 def test_paired_modes(self):
     """A sample_b Sample created without ``pairing`` must raise PairingError.

     Both construction and ``prepare_input()`` sit inside the raises block,
     so either of them may be the source of the error.
     """
     sample_dir = Path("mbf_align/sample_b")
     with pytest.raises(PairingError):
         sample = Sample(
             "Sample_a",
             get_sample_data(sample_dir),
             False,
             vid="VA000",
         )
         sample.prepare_input()
        def gen_fasta():
            """Copy the sample genome fasta to ``shu.fasta.gz``."""
            import shutil

            source_fasta = get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
            )
            shutil.copy(source_fasta, "shu.fasta.gz")
Esempio n. 18
0
def test_FASTQsFromFolder():
    """FASTQsFromFolder on sample_a must list a.fastq and b.fastq.gz as singles."""
    import pprint

    folder = Path(get_sample_data(Path("mbf_align/sample_a")))
    strategy = FASTQsFromFolder(folder)

    # Printed for inspection; the actual check follows.
    pprint.pprint(strategy())
    expected = [
        ((folder / file_name).resolve(),)
        for file_name in ("a.fastq", "b.fastq.gz")
    ]
    assert strategy() == expected
Esempio n. 19
0
    def test_fastqc(self):
        """Creating a Sample must register exactly one QC job.

        That job must list the FASTQC sentinel file among its filenames and
        must have the sample's ``prepare_input()`` job as a prerequisite.
        """
        from mbf_qualitycontrol import get_qc_jobs

        sample_dir = get_sample_data(Path("mbf_align/sample_a"))
        lane = Sample("Sample_a", sample_dir, False, vid="VA000")
        registered = list(get_qc_jobs())
        assert len(registered) == 1
        fastqc_job = registered[0]
        assert "results/lanes/Sample_a/FASTQC/sentinel.txt" in fastqc_job.filenames
        assert lane.prepare_input() in fastqc_job.prerequisites
    def test_protein_creation(self):
        """Check that a genome built without a protein file produces pep.fasta itself.

        ``None`` is passed in the protein position and a ``ProkaryoticCode``
        instance is supplied as an extra argument. After the pipegraph ran,
        the generated ``pep.fasta`` must equal the reference protein fasta
        (keyed by the id in front of the first blank of each header). Any
        difference fails the test; mismatching entries are printed first to
        ease diagnosis.
        """
        g = FileBasedGenome(
            "Candidatus_carsonella",
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
            ),
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz"
            ),
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz"
            ),
            None,
            ProkaryoticCode(),
        )
        g.download_genome()
        g.job_transcripts()
        ppg.run_pipegraph()

        reference = dict(
            iter_fasta(
                get_sample_data(
                    "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
                )))
        # partition() leaves a header without any blank untouched, whereas
        # slicing with find() == -1 would silently drop its last byte.
        should = {k.partition(b" ")[0]: v for (k, v) in reference.items()}
        actual = dict(iter_fasta(g.find_file("pep.fasta")))
        if actual != should:
            # Every reference id has to be present in the generated file.
            assert not set(should).difference(actual)
            for k in should:
                if actual[k] != should[k]:
                    print(k)
                    print(len(actual[k]))
                    print(len(should[k]))

                    print(actual[k])
                    print(should[k])
            # NOTE(review): this also fails when pep.fasta merely holds
            # additional entries - unlike the cdna test. Kept as in the
            # original; confirm that this strictness is intended.
            raise AssertionError(
                "generated pep.fasta differs from the reference protein fasta"
            )
Esempio n. 21
0
    def test_lane_paired_missing_R2(self):
        """``pairing="paired"`` on sample_a must raise PairingError in prepare_input()."""
        sample_dir = get_sample_data(Path("mbf_align/sample_a"))
        sample = Sample(
            "Sample_a", sample_dir, False, vid="VA000", pairing="paired"
        )
        with pytest.raises(PairingError):
            sample.prepare_input()
 def test_transcript_wrong_order(self):
     """The "transcript_wrong_order" GTF must fail the transcripts job.

     The pipegraph run is expected to raise and the job's exception text must
     contain "start > stop".
     """
     sample_files = [
         get_sample_data(relative_name)
         for relative_name in (
             "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz",
             "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.transcript_wrong_order.gtf.gz",
             "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz",
             "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz",
         )
     ]
     genome = FileBasedGenome("Candidatus_carsonella", *sample_files)
     transcripts_job = genome.job_transcripts()
     with pytest.raises(ppg.RuntimeError):
         ppg.run_pipegraph()
     failure_text = str(transcripts_job.exception)
     assert "start > stop" in failure_text
 def test_transcript_transcript_outside_gene(self):
     """The "transcript_outside_gene" GTF must fail the transcripts job.

     The pipegraph run is expected to raise; the job's exception must be a
     ValueError whose text contains "Transcript outside of gene".
     """
     sample_files = [
         get_sample_data(relative_name)
         for relative_name in (
             "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz",
             "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.transcript_outside_gene.gtf.gz",
             "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz",
             "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz",
         )
     ]
     genome = FileBasedGenome("Candidatus_carsonella", *sample_files)
     transcripts_job = genome.job_transcripts()
     with pytest.raises(ppg.RuntimeError):
         ppg.run_pipegraph()
     failure = transcripts_job.exception
     assert "Transcript outside of gene" in str(failure)
     assert isinstance(failure, ValueError)
 def test_get_gtf_using_additional_gtf(self):
     """An additional GTF must become a prerequisite and contribute its genes.

     The extra "additional" GTF is injected through
     ``get_additional_gene_gtfs``. The genes job must then have a
     prerequisite listing that file, and after the run gene "TEST1_001" must
     be present in ``df_genes``.
     """
     genome = FileBasedGenome(
         "Candidatus_carsonella",
         get_sample_data(
             "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
         ),
         get_sample_data(
             "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz"
         ),
         get_sample_data(
             "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz"
         ),
         get_sample_data(
             "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
         ),
     )
     genome.get_additional_gene_gtfs = lambda: [
         get_sample_data(
             "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.additional.gtf.gz"
         )
     ]
     genome.download_genome()
     genes_job = genome.job_genes()
     found_additional_gtf = False
     for prerequisite in genes_job.prerequisites:
         if not hasattr(prerequisite, "filenames"):
             continue
         print(prerequisite, prerequisite.filenames)
         wanted = get_sample_data(
             "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.additional.gtf.gz"
         )
         if wanted in prerequisite.filenames:
             found_additional_gtf = True
             break
     assert found_additional_gtf  # wrong preqs
     ppg.run_pipegraph()
     assert "TEST1_001" in genome.df_genes.index
Esempio n. 25
0
def test_read_creator_must_be_fastq_right_now(new_pipegraph):
    """Passing "fail" as fourth argument must raise ValueError in every processor.

    Going by the test name, the fourth positional argument is presumably the
    read creator. The check is repeated for ``Straight``, ``Filtered`` and
    ``QualityFilter``.
    """
    # The original path literal contained a stray ")" ("sample_a)/a.fastq").
    # The path is resolved outside the raises blocks, so only the processor
    # call itself can be the source of the expected ValueError.
    input_files = [str(get_sample_data(Path("mbf_align/sample_a/a.fastq")))]
    with pytest.raises(ValueError):
        fastq2.Straight().generate_aligner_input(
            "test.fastq",
            input_files,
            False,
            "fail",
        )
    with pytest.raises(ValueError):
        fastq2.Filtered(lambda seq, qual, name: True).generate_aligner_input(
            "test.fastq",
            input_files,
            False,
            "fail",
        )
    with pytest.raises(ValueError):
        fastq2.QualityFilter(lambda qual, seq: True).generate_aligner_input(
            "test.fastq",
            input_files,
            False,
            "fail",
        )
Esempio n. 26
0
    def test_to_fastq(self):
        """Export a single-end AlignedSample to FASTQ and compare the full text.

        The output must equal the eight-record literal below. Calling
        ``to_fastq`` on a sample created with ``is_paired=True`` must raise
        ValueError.
        """
        bam_path = get_sample_data(Path("mbf_align/ex2.bam"))
        bam_job = ppg.FileInvariant(bam_path)
        genome = object()
        lane = mbf_align.AlignedSample("test_lane", bam_job, genome, False,
                                       "AA123")
        fastq_path = "out.fastq"
        lane.to_fastq(fastq_path)
        ppg.run_pipegraph()
        assert Path(fastq_path).exists()
        # The literal starts right after the opening quotes and its lines sit
        # at column 0, so the comparison is exact down to the final newline.
        assert (Path(fastq_path).read_text() == """@read_28833_29006_6945
AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG
+
<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<
@read_28701_28881_323b
TGCAAGGCCGCATCGGCCAAGGCCAAGATATAGGT
+
<<<<7<<<<<<<<<<<<;6<<<:;7<<<<;<<<<<
@read_28701_28881_323c
TGCAAGGCCGCATCGGCCAAGGCCAAGATATAGGT
+
<<<<7<<<<<<<<<<<<;6<<<:;7<<<<;<<<<<
@read_28701_28881_324a
TGCAAGGCCGCATCGGCCAAGGCCAAGATATAGGT
+
<<<<7<<<<<<<<<<<<;6<<<:;7<<<<;<<<<<
@read_28701_28881_324b
TGCAAGGCCGCATCGGCCAAGGCCAAGATATAGGT
+
<<<<7<<<<<<<<<<<<;6<<<:;7<<<<;<<<<<
@read_28701_28881_324c
TGCAAGGCCGCATCGGCCAAGGCCAAGATATAGGT
+
<<<<7<<<<<<<<<<<<;6<<<:;7<<<<;<<<<<
@test_clipped1
AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG
+
<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<
@test_clipped1
AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG
+
<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<
""")
        lane2 = mbf_align.AlignedSample("test_lane2",
                                        bam_job,
                                        genome,
                                        is_paired=True,
                                        vid="AA123")
        with pytest.raises(ValueError):
            lane2.to_fastq(
                "nope.fastq")  # no support for paired end data at this point
Esempio n. 27
0
    def prep_lane(self):
        """Return an AlignedSample "test_lane" over the spliced chr22 RNA-seq BAM."""
        from mbf_sampledata import get_human_22_fake_genome

        # straight from chr22 of the human genome
        fake_genome = get_human_22_fake_genome()
        bam_file = get_sample_data(Path("mbf_align/rnaseq_spliced_chr22.bam"))
        return mbf_align.AlignedSample(
            "test_lane", bam_file, fake_genome, False, "AA123"
        )
Esempio n. 28
0
    def test_lane(self):
        """A sample_a Sample must save a gzipped input of 40 lines.

        The temporary input file has to be gone after the pipegraph ran.
        """
        sample_dir = get_sample_data(Path("mbf_align/sample_a"))
        lane = Sample("Sample_a", sample_dir, False, vid="VA000")
        assert lane.vid == "VA000"
        temp_job = lane.prepare_input()
        real_job = lane.save_input()
        ppg.run_pipegraph()

        temp_file = Path(temp_job.filenames[0])
        saved_file = Path(real_job.filenames[0])
        assert not temp_file.exists()
        assert saved_file.exists()
        with gzip.GzipFile(real_job.filenames[0], "r") as handle:
            line_count = len(handle.readlines())
        assert line_count == 20 + 20
Esempio n. 29
0
 def test_missing_index_file(self):
     """A BAM without an index must get an index-generating job.

     The BAM is copied to ``noindex.bam``. ``load()`` must then return a
     FileInvariant and a FileGeneratingJob depending on it, whose job id is
     not "noindex.bam.bai". Counting reads must raise FileNotFoundError
     before the pipegraph ran and give 8 afterwards.
     """
     source_bam = get_sample_data(Path("mbf_align/ex2.bam"))
     unindexed_bam = "noindex.bam"
     shutil.copy(source_bam, unindexed_bam)
     genome = object()
     lane = mbf_align.AlignedSample(
         "test_lane", unindexed_bam, genome, False, "AA123"
     )
     loaded_jobs = lane.load()
     bam_job = loaded_jobs[0]
     index_job = loaded_jobs[1]
     assert isinstance(bam_job, ppg.FileInvariant)
     assert isinstance(index_job, ppg.FileGeneratingJob)
     assert index_job.job_id != "noindex.bam.bai"
     assert bam_job in index_job.prerequisites
     with pytest.raises(FileNotFoundError):
         lane.mapped_reads()
     ppg.run_pipegraph()
     assert lane.mapped_reads() == 8
    def test_multiple_fasta_files(self, new_pipegraph):
        """Check that a genome can be built from a list of fasta files.

        A temporary fasta holding one extra sequence is combined with the
        sample genome; the sequence and both chromosome lengths must be
        available after the run. The temporary fasta is then changed and a
        second pipegraph run is expected to raise UpstreamChangedError.
        """
        import tempfile

        # The context manager closes the temporary fasta even when an
        # assertion below fails; the original never closed it.
        with tempfile.NamedTemporaryFile(suffix=".fasta") as tf:
            tf.write(b">Extra\nAGTC")
            tf.flush()
            g = FileBasedGenome(
                "Candidatus_carsonella",
                [
                    get_sample_data(
                        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
                    ),
                    tf.name,
                ],
                get_sample_data(
                    "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz"
                ),
                get_sample_data(
                    "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz"
                ),
                get_sample_data(
                    "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
                ),
            )
            g.download_genome()
            ppg.run_pipegraph()
            assert g.get_genome_sequence("Extra", 0, 4) == "AGTC"
            assert g.get_chromosome_lengths() == {"Extra": 4, "Chromosome": 159662}

            # test that changing the fasta leads to an explosion
            new_pipegraph.new_pipegraph()
            # Overwrite from the start; the new content is longer than the
            # old one, so nothing of the old content remains.
            tf.seek(0, 0)
            tf.write(b">Extra\nAGTCA")
            tf.flush()
            g = FileBasedGenome(
                "Candidatus_carsonella",
                [
                    get_sample_data(
                        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
                    ),
                    tf.name,
                ],
                get_sample_data(
                    "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz"
                ),
                None,
            )
            g.download_genome()
            with pytest.raises(UpstreamChangedError):
                ppg.run_pipegraph()