def test_build_and_align_paired_end(self, new_pipegraph, per_run_store):
     """Chain an index-build job and a paired-end align job; expect a BAM.

     The genome FASTA is handed to ``build_index_job`` as a single path (not
     wrapped in a list), and ``align_job`` receives both the R1 and the R2
     sample files with ``input_type`` set to ``"rna"``.
     """
     new_pipegraph.quiet = False
     aligner = Subread()
     sample_dir = Path(__file__).parent.joinpath("sample_data")
     index_prefix = Path("subread_index_dir/srf")
     genome_fasta = sample_dir / "genome.fasta"
     reads_r1 = sample_dir / "sample_R1_.fastq"
     reads_r2 = sample_dir / "sample_R2_.fastq"
     options = {"input_type": "rna"}

     index_job = aligner.build_index_job(genome_fasta, None, index_prefix)
     bam_job = aligner.align_job(
         reads_r1, reads_r2, index_prefix, "out/out.bam", options
     )
     # Declare the index build as a prerequisite of the alignment.
     bam_job.depends_on(index_job)
     new_pipegraph.run()

     assert Path("out", "out.bam").exists()
 def test_build_and_align(self, new_pipegraph, per_run_store):
     """Chain an index-build job and a single-end align job; check the BAM.

     After the pipegraph has run, ``out/out.bam`` must exist and
     ``get_alignment_stats`` must report exactly one uniquely mapped read
     and no multi-mapping or unmapped reads.
     """
     new_pipegraph.quiet = False
     aligner = Subread()
     sample_dir = Path(__file__).parent.joinpath("sample_data")
     index_prefix = Path("subread_index_dir/srf")
     genome_fastas = [sample_dir / "genome.fasta"]
     reads = sample_dir / "sample.fastq"
     options = {"input_type": "dna"}

     index_job = aligner.build_index_job(genome_fastas, None, index_prefix)
     # ``None`` in the second slot: there is no mate file for this run.
     bam_job = aligner.align_job(
         reads, None, index_prefix, "out/out.bam", options
     )
     # Declare the index build as a prerequisite of the alignment.
     bam_job.depends_on(index_job)
     new_pipegraph.run()

     assert Path("out", "out.bam").exists()
     expected_stats = {
         "Uniquely mapped": 1,
         "Multi-mapping": 0,
         "Unmapped": 0,
     }
     assert aligner.get_alignment_stats(Path("out", "out.bam")) == expected_stats
 def test_raises_on_invalid_input_type(self, new_pipegraph, per_run_store):
     """``align_job`` must raise ``ValueError`` for unusable parameter dicts.

     Two parameter dicts are tried: one without an ``input_type`` key at
     all, and one whose ``input_type`` is the value ``"shu"``.
     """
     sample_dir = Path(__file__).parent.joinpath("sample_data")
     index_prefix = Path("subread_index_dir/srf")
     aligner = Subread()

     rejected_parameters = (
         {},
         {"input_type": "shu"},
     )
     for parameters in rejected_parameters:
         with pytest.raises(ValueError):
             aligner.align_job(
                 sample_dir / "sample.fastq",
                 None,
                 index_prefix,
                 "out/out.bam",
                 parameters,
             )
 def test_get_index_version_range(self, new_pipegraph, per_run_store):
     """Check the index version range reported for two Subread versions."""
     # (Subread version, expected ``get_index_version_range()`` result)
     cases = (
         ("1.4.3-p1", ("0.1", "1.5.99")),
         ("1.6.3", ("1.6", None)),
     )
     for subread_version, expected_range in cases:
         aligner = Subread(version=subread_version)
         assert aligner.get_index_version_range() == expected_range
 def test_cant_call_subread_run2(self, new_pipegraph, per_run_store):
     """``run`` with an explicit command list must fail the pipegraph run.

     The call to ``run`` itself is made outside the ``raises`` block; the
     ``ValueError`` is expected from ``new_pipegraph.run()``.
     """
     aligner = Subread()
     command = ["subread-align", "something"]
     aligner.run("out", command)
     with pytest.raises(ValueError):
         new_pipegraph.run()
 def test_cant_call_subread_run(self, new_pipegraph, per_run_store):
     """``run`` with ``None`` as its command must fail the pipegraph run.

     The call to ``run`` itself is made outside the ``raises`` block; the
     ``ValueError`` is expected from ``new_pipegraph.run()``.
     """
     aligner = Subread()
     command = None
     aligner.run("out", command)
     with pytest.raises(ValueError):
         new_pipegraph.run()
 def test_subread_older_download(self, new_pipegraph, per_test_store):
     """Pin Subread to 1.5.0, unpack it from the store, expect its path."""
     aligner = Subread('1.5.0')
     assert aligner.version == '1.5.0'

     store = aligner.store
     store.unpack_version(aligner.name, aligner.version)

     # ``path`` is only checked after unpacking.
     assert aligner.path.exists()
 def test_build_index(self, new_pipegraph, per_run_store):
     """Call ``build_index`` directly and check the files it leaves in ``shu``."""
     aligner = Subread('_latest')
     sample_dir = Path(__file__).parent.joinpath("sample_data")
     genome_fastas = [sample_dir / "genome.fasta"]

     aligner.build_index(genome_fastas, None, "shu")

     for produced in ("shu/stdout.txt", "shu/subread_index.reads"):
         assert Path(produced).exists()
    def test_download(self, new_pipegraph, mock_download, shared_prebuild):
        """End-to-end check of an Ensembl genome: files, indices and lookups.

        Phase 1 registers a truncated ``test.fasta`` prebuild, requests
        subread indices for it with two Subread versions, runs the pipegraph
        and then checks file checksums, index files, and the behaviour of
        ``find_file`` / ``find_prebuild`` for known and unknown names.

        Phase 2 switches to a new pipegraph, creates a second
        ``PrebuildManager`` on the same prebuilt path, and checks that
        Subread 1.5.0 resolves to the same index file as 1.4.3-p1, followed
        by the chromosome-length, sequence and gene-metadata accessors.
        """
        species = "Ashbya_gossypii"  # the smallest eukaryotic species at the time of writing this at 2.8 mb
        g = EnsemblGenome(species, "41", prebuild_manager=shared_prebuild)

        def shorten_genome_fasta(output_path):
            # Keep only the first 1024 * 100 characters of the genome so the
            # index builds below stay small. ``g`` is looked up when this
            # function is called, so it follows the rebinding of ``g`` in
            # phase 2.
            with open(g.find_file("genome.fasta")) as op:
                head = op.read(1024 * 100)
            (output_path / "test.fasta").write_text(head)

        test_fasta_job = g.prebuild_manager.prebuild(
            f"ensembl/{g.species}_{g.revision}/test_fasta",
            "1",
            [],
            ["test.fasta"],
            shorten_genome_fasta,
        )
        test_fasta_job.depends_on(g.download_genome())
        # Register the job with the genome so find_file / find_prebuild /
        # build_index can resolve "test.fasta".
        g._prebuilds.append(test_fasta_job)

        subread = Subread(version="1.6.3")
        index = g.build_index(subread, "test.fasta")
        subread_old = Subread(version="1.4.3-p1")
        index_old = g.build_index(subread_old, "test.fasta")

        new_pipegraph.run()

        # Note that these are not the checksums from the CHECKSUMS files
        # (those are for the gzipped variants). We keep the files ungzipped
        # and let the filesystem handle the compression, since we can't rely
        # on downstream tools reading gzip.
        expected_checksums = {
            "genome.fasta": "584a734589964a654c7c1dc23b0167ab",
            "cdna.fasta": "3fc1f19ab829573169cb2488abe39211",
            "genes.gtf": "8bdeec9b3db5278668dbff8b34e9d93b",
            "pep.fasta": "9580fd44832d419c38469d657f6e2484",
        }
        for filename, checksum in expected_checksums.items():
            assert checksum_file(g.find_file(filename)) == checksum, filename
        with pytest.raises(OSError):
            g.find_file("no such file")

        index_files = (
            "subread_index.reads",
            "subread_index.files",
            "subread_index.00.b.array",
        )
        for built_index in (index, index_old):
            for filename in index_files:
                assert built_index.name_file(filename).exists(), filename
        # The two Subread versions must not share an index location.
        assert index.name_file("subread_index.reads") != index_old.name_file(
            "subread_index.reads")

        assert g.find_file("test.fasta.md5sum").exists()
        # The lookup itself is expected to raise; nothing after it would run.
        with pytest.raises(OSError):
            g.find_file("test.fasta.md5sum.nosuchfile")
        assert g.find_prebuild("test.fasta") is test_fasta_job
        with pytest.raises(OSError):
            g.find_prebuild("test.fasta.md5sum.nosuchfile")
        assert g.find_file("genome.fasta.fai").exists()
        assert g.find_file("cdna.fasta.fai").exists()

        # Phase 2: new pipegraph, new PrebuildManager on the same prebuilt
        # path. No pipegraph run follows, so everything below presumably
        # relies on what phase 1 already produced -- TODO confirm.
        new_pipegraph.new_pipegraph()
        pb = PrebuildManager(shared_prebuild.prebuilt_path)
        g = EnsemblGenome(species, "41", prebuild_manager=pb)
        test_fasta_job = g.prebuild_manager.prebuild(
            f"ensembl/{g.species}_{g.revision}/test_fasta",
            "1",
            [],
            ["test.fasta"],
            shorten_genome_fasta,
        )
        g._prebuilds.append(test_fasta_job)

        # 1.5.0 must resolve to the same index file as 1.4.3-p1.
        subread_intermediate = Subread(version="1.5.0")
        index_intermediate = g.build_index(subread_intermediate, "test.fasta")
        assert index_intermediate.name_file(
            "subread_index.reads") == index_old.name_file(
                "subread_index.reads")
        # Without an explicit fasta name the index is built from a file
        # whose path contains "/genome/".
        index_genome = g.build_index(subread_intermediate)
        assert "/genome/" in str(index_genome.filenames[0])

        assert g.get_chromosome_lengths() == {
            "IV": 1_467_287,
            "MT": 23564,
            "V": 1_519_140,
            "III": 907_494,
            "II": 870_771,
            "VII": 1_800_949,
            "I": 693_414,
            "VI": 1_836_693,
        }

        assert g.get_genome_sequence("VI", 20, 30) == "ACCGCTGAGA"
        assert (g.get_cdna_sequence("EFAGOT00000000349") ==
                "GCTCGCGTGGCGTAATGGCAACGCGTCTGACTTCTAATCAGAAGATTGTGGGTTCGACCC"
                "CCACCGTGAGTG")
        assert (g.get_protein_sequence("AAS53315") ==
                "MFSTRICSLLARPFMVPIVPRFGSALLQKPLNGVVVPQFTRGFKVRTSVKKFCAHCYIVR"
                "RKGRVYVYCKSNNKHKQRQG")
        # Translating the CDS must reproduce the protein sequence above.
        assert (g.genetic_code.translate_dna(g.get_cds_sequence("AAS53315")) ==
                "MFSTRICSLLARPFMVPIVPRFGSALLQKPLNGVVVPQFTRGFKVRTSVKKFCAHCYIVR"
                "RKGRVYVYCKSNNKHKQRQG")

        # Same lookup, this time passing the protein's row explicitly.
        assert (g.genetic_code.translate_dna(
            g.get_cds_sequence("AAS53315", g.df_proteins.loc["AAS53315"])) ==
                "MFSTRICSLLARPFMVPIVPRFGSALLQKPLNGVVVPQFTRGFKVRTSVKKFCAHCYIVR"
                "RKGRVYVYCKSNNKHKQRQG")
        # A row belonging to a different protein must be rejected.
        with pytest.raises(ValueError):
            g.get_cds_sequence("AAS53315", g.df_proteins.loc["AAS53316"])

        assert (
            g.df_genes_meta.loc["AGOS_ADL186C"]["description"] ==
            "Restriction of telomere capping protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q75AV6]"
        )