Example #1
0
def test_align_and_extract_umis(new_pipegraph):
    """Run AnnotateFastqBarcodes as a post-process step and check XC / XM tags.

    The scenario is repeated for each of the two sample folders, each time in
    a freshly created pipegraph; the first read of the resulting BAM must
    carry XC == "AGTC" and XM == "TGAC".
    """
    from mbf_align.post_process import AnnotateFastqBarcodes

    # Both folders are resolved up front, before any pipegraph is recreated.
    sample_folders = [
        get_sample_path(Path(relative))
        for relative in (
            "mbf_align/sample_extract_barcodes",
            "mbf_align/sample_extract_barcodes_gz",
        )
    ]
    umi_start = 7

    for sample_folder in sample_folders:
        new_pipegraph.new_pipegraph()
        fake_genome = get_human_22_fake_genome()
        mbf_qualitycontrol.prune_qc(lambda _: False)

        raw_sample = Sample(
            "test", str(sample_folder), False, pairing="only_second", vid="AA123"
        )
        aligned = AlignedSample(
            "test", str(sample_folder / "test.bam"), fake_genome, False, "AA123"
        )

        # tag name -> [start, stop] pair handed to AnnotateFastqBarcodes
        tag_ranges = {"XC": [0, 4], "XM": [umi_start, umi_start + 4]}
        annotated = aligned.post_process(
            AnnotateFastqBarcodes(raw_sample, tag_ranges)
        )
        ppg.run_pipegraph()

        bam = annotated.get_bam()
        first_read = next(bam.fetch())
        print(first_read.tags)
        assert first_read.get_tag("XC") == "AGTC"
        assert first_read.get_tag("XM") == "TGAC"
 def test_pairing_invalid_value(self):
     """Sample() must raise ValueError for each unsupported ``pairing`` value."""
     rejected_pairings = ("do_what_you_want", False, None, [5])
     for bad_pairing in rejected_pairings:
         with pytest.raises(ValueError):
             Sample(
                 "Sample_a",
                 get_sample_data(Path("mbf_align/sample_a")),
                 False,
                 pairing=bad_pairing,
             )
    def test_lane_paired_only_second(self):
        """With pairing="only_second" a single file is saved, matching a_R2_.fastq.gz.

        After the pipegraph ran, the temporary input must be gone and the one
        saved file must decompress to the same bytes as the R2 sample file.
        """
        sample_dir = Path("mbf_align/sample_b")
        lane = Sample(
            "Sample_a",
            get_sample_data(sample_dir),
            False,
            vid="VA000",
            pairing="only_second",
        )
        assert lane.vid == "VA000"

        prepared = lane.prepare_input()
        saved = lane.save_input()
        ppg.run_pipegraph()

        assert not Path(prepared.filenames[0]).exists()
        assert len(prepared.filenames) == 1
        assert Path(saved.filenames[0]).exists()
        assert len(saved.filenames) == 1
        assert "_R1_" not in saved.filenames[0]
        assert ".fastq.gz" in saved.filenames[0]

        # Reference content: the decompressed R2 file of the sample folder.
        reference_fn = get_sample_data(sample_dir / "a_R2_.fastq.gz")
        with gzip.GzipFile(reference_fn, "r") as handle:
            should = handle.read()
        with gzip.GzipFile(saved.filenames[0], "r") as handle:
            actual = handle.read()
        assert actual == should
    def test_lane_paired_filtered(self):
        """Paired input run through an accept-everything Paired_Filtered is unchanged.

        Both saved files must exist, be named with _R1_ / _R2_ and .fastq.gz,
        and decompress to the same bytes as the corresponding sample files.
        """
        sample_dir = Path("mbf_align/sample_b")
        keep_everything = fastq2.Paired_Filtered(lambda *args: True)
        lane = Sample(
            "Sample_a",
            get_sample_data(sample_dir),
            False,
            vid="VA000",
            pairing="paired",
            fastq_processor=keep_everything,
        )
        assert lane.vid == "VA000"

        prepared = lane.prepare_input()
        saved = lane.save_input()
        ppg.run_pipegraph()

        for mate in (0, 1):
            assert not Path(prepared.filenames[mate]).exists()
        for mate in (0, 1):
            assert Path(saved.filenames[mate]).exists()
        assert "_R1_" in saved.filenames[0]
        assert "_R2_" in saved.filenames[1]
        for mate in (0, 1):
            assert ".fastq.gz" in saved.filenames[mate]

        reference_files = [
            get_sample_data(sample_dir / mate_name)
            for mate_name in ("a_R1_.fastq.gz", "a_R2_.fastq.gz")
        ]
        for reference_fn, saved_fn in zip(reference_files, saved.filenames):
            with gzip.GzipFile(saved_fn, "r") as handle:
                actual = handle.read()
            with gzip.GzipFile(reference_fn, "r") as handle:
                should = handle.read()
            assert actual == should
 def test_paired_modes(self):
     """prepare_input() on ``sample_b`` without a ``pairing`` argument must fail.

     The Sample is built outside the ``pytest.raises`` block so that only
     ``prepare_input()`` may raise PairingError; an exception coming from
     the constructor would otherwise let the test pass for the wrong reason.
     """
     lane = Sample(
         "Sample_a",
         get_sample_data(Path("mbf_align/sample_b")),
         False,
         vid="VA000",
     )
     with pytest.raises(PairingError):
         lane.prepare_input()
    def test_fastqc(self):
        """After creating a Sample, get_qc_jobs() yields one FASTQC job.

        That job must list the FASTQC sentinel file among its filenames and
        have the sample's prepare_input() job among its prerequisites.
        """
        from mbf_qualitycontrol import get_qc_jobs

        lane = Sample(
            "Sample_a",
            get_sample_data(Path("mbf_align/sample_a")),
            False,
            vid="VA000",
        )
        registered = list(get_qc_jobs())
        assert len(registered) == 1

        fastqc_job = registered[0]
        sentinel = "results/lanes/Sample_a/FASTQC/sentinel.txt"
        assert sentinel in fastqc_job.filenames
        assert lane.prepare_input() in fastqc_job.prerequisites
    def test_lane_paired_missing_R2(self):
        """pairing="paired" on ``sample_a`` makes prepare_input() raise PairingError.

        NOTE(review): presumably ``sample_a`` contains no R2 file - confirm
        against the sample data.
        """
        positional = ("Sample_a", get_sample_data(Path("mbf_align/sample_a")), False)
        lane = Sample(*positional, vid="VA000", pairing="paired")
        with pytest.raises(PairingError):
            lane.prepare_input()
    def test_lane(self):
        """A Sample of ``sample_a`` saves one gzip file with 40 lines.

        The temporary prepare_input() file must no longer exist once the
        pipegraph has run, while the save_input() file must.
        """
        sample_folder = get_sample_data(Path("mbf_align/sample_a"))
        lane = Sample("Sample_a", sample_folder, False, vid="VA000")
        assert lane.vid == "VA000"

        prepared = lane.prepare_input()
        saved = lane.save_input()
        ppg.run_pipegraph()

        assert not Path(prepared.filenames[0]).exists()
        assert Path(saved.filenames[0]).exists()

        with gzip.GzipFile(saved.filenames[0], "r") as handle:
            line_count = len(handle.readlines())
        # NOTE(review): presumably two inputs of 20 lines each - confirm.
        expected_lines = 20 + 20
        assert line_count == expected_lines
    def test_lane_with_job_generating_fastq(self):
        """A Sample accepts a FileGeneratingJob as its input source.

        The generating job must become a prerequisite of prepare_input(), and
        the saved gzip file must contain the four lines the job wrote.
        """

        def write_single_read(destination):
            # One minimal FASTQ record, written as raw bytes.
            Path(destination).write_bytes(b"@shu\nAGTC\n+\nasdf")

        fastq_job = FileGeneratingJob("input.fastq", write_single_read)
        lane = Sample("Sample_a", fastq_job, False, vid="VA000")
        assert lane.vid == "VA000"

        prepared = lane.prepare_input()
        assert fastq_job in prepared.prerequisites
        saved = lane.save_input()
        ppg.run_pipegraph()

        assert not Path(prepared.filenames[0]).exists()
        assert Path(saved.filenames[0]).exists()
        with gzip.GzipFile(saved.filenames[0], "r") as handle:
            line_count = len(handle.readlines())
        assert line_count == 4
Example #10
0
    def test_align_parameterDependencyChecking(self, local_store):
        """lane.align() raises JobContractError if the align job lacks depends_on_params.

        The fake aligner below deliberately returns a job on which
        ``depends_on_params`` was never called.
        """

        class FakeGenome:
            name = "FakeGenome"

            def build_index(self, aligner, fasta_to_use=None, gtf_to_use=None):
                def write_index():
                    return Path("fake_index").write_text("hello")

                index_job = ppg.FileGeneratingJob("fake_index", write_index)
                index_job.output_path = "fake_index"
                return index_job

        class FakeAligner:
            name = "FakeAligner"
            version = "0.1"

            def align_job(
                self,
                input_fastq,
                paired_end_filename,
                index_basename,
                output_bam_filename,
                parameters,
            ):
                outputs = [output_bam_filename, str(output_bam_filename) + ".bai"]
                # Intentionally NO depends_on_params("") call here - its
                # absence is exactly what this test checks.
                return ppg.MultiFileGeneratingJob(outputs, lambda: 5)

        params = {"shu": 123}
        genome = FakeGenome()
        aligner = FakeAligner()
        lane = Sample(
            "Sample_a",
            get_sample_data(Path("mbf_align/sample_b")),
            False,
            vid="VA000",
            pairing="paired",
        )
        with pytest.raises(ppg.JobContractError):
            lane.align(aligner, genome, params)
Example #11
0
    def test_align(self, local_store):
        """Align a paired Sample with fake genome/aligner objects and check the output.

        The fake aligner writes a JSON list into the "BAM" file holding the
        first 200 characters of each input FASTQ, the index basename and the
        stringified parameters; the test compares that list against the
        decompressed sample files, "fake_index" and ``str(params)``.
        """
        import json
        import gzip

        class FakeGenome:
            name = "FakeGenome"

            def download_genome(self):
                return []

            def job_genes(self):
                return []

            def job_transcripts(self):
                return []

            def build_index(self, aligner, fasta_to_use=None, gtf_to_use=None):
                job = ppg.FileGeneratingJob(
                    "fake_index", lambda: Path("fake_index").write_text("hello")
                )
                job.output_path = "fake_index"
                return job

        class FakeAligner:
            name = "FakeAligner"
            version = "0.1"

            def align_job(
                self,
                input_fastq,
                paired_end_filename,
                index_basename,
                output_bam_filename,
                parameters,
            ):
                def align():
                    # Read the inputs through context managers so the file
                    # handles are closed instead of being leaked.
                    with open(input_fastq) as first_in:
                        first_head = first_in.read(200)
                    if paired_end_filename:
                        with open(paired_end_filename) as second_in:
                            second_head = second_in.read(200)
                    else:
                        second_head = ""
                    with open(output_bam_filename, "w") as op:
                        json.dump(
                            [first_head, second_head, index_basename, str(parameters)],
                            op,
                        )
                    with open(str(output_bam_filename) + ".bai", "w") as op:
                        op.write("Done")

                job = ppg.MultiFileGeneratingJob(
                    [output_bam_filename, str(output_bam_filename) + ".bai"], align
                )
                job.depends_on_params("")
                return job

        aligner = FakeAligner()
        lane = Sample(
            "Sample_a",
            get_sample_data(Path("mbf_align/sample_b")),
            False,
            vid="VA000",
            pairing="paired",
        )
        genome = FakeGenome()
        params = {"shu": 123}
        aligned_lane = lane.align(aligner, genome, params)
        ppg.run_pipegraph()
        assert Path("fake_index").exists()
        assert Path("fake_index").read_text() == "hello"

        # First job returned by load() is the one producing the .bam / .bam.bai.
        bam_job = aligned_lane.load()[0]
        assert bam_job.filenames[0].endswith(lane.name + ".bam")
        assert bam_job.filenames[1].endswith(lane.name + ".bam.bai")
        assert Path(bam_job.filenames[0]).exists()
        with open(bam_job.filenames[0]) as op:
            actual = json.load(op)
        with gzip.GzipFile(
            get_sample_data(Path("mbf_align/sample_b") / "a_R1_.fastq.gz")
        ) as op:
            should_0 = op.read(200).decode("utf-8")
        with gzip.GzipFile(
            get_sample_data(Path("mbf_align/sample_b") / "a_R2_.fastq.gz")
        ) as op:
            should_1 = op.read(200).decode("utf-8")

        assert actual[0] == should_0
        assert actual[1] == should_1
        assert actual[2] == "fake_index"
        assert actual[3] == str(params)
Example #12
0
 def test_lane_raises_on_pe_as_se(self):
     """prepare_input() raises PairingError for ``sample_b`` when no pairing is given."""
     sample_folder = get_sample_data(Path("mbf_align/sample_b"))
     lane = Sample("Sample_a", sample_folder, False)
     with pytest.raises(PairingError):
         lane.prepare_input()