コード例 #1
0
ファイル: test_resources.py プロジェクト: sanogenetics/snps
class TestResources(BaseSNPsTestCase):
    def _reset_resource(self):
        """Clear the cached state stored on ``self.resource``.

        The reference-sequence and GSA caches become empty dicts and the
        openSNP datadump filename list becomes an empty list.
        """
        resource = self.resource
        resource._reference_sequences = dict()
        resource._gsa_resources = dict()
        resource._opensnp_datadump_filenames = list()

    def run(self, result=None):
        """Run the test with a resources directory matching the download mode.

        When ``self.downloads_enabled`` is true the ``"resources"`` directory
        is used; otherwise a temporary directory is used for the duration of
        the test so that mocked resource data is discarded afterwards.

        Args:
            result: optional test result object, forwarded unchanged to the
                parent implementation.

        Returns:
            Whatever the parent ``run`` returns, so callers that rely on the
            result object being handed back keep working.
        """
        # set resources directory based on if downloads are being performed
        # https://stackoverflow.com/a/11180583
        self.resource = Resources()
        self._reset_resource()
        if self.downloads_enabled:
            self.resource._resources_dir = "resources"
            return super().run(result)

        # use a temporary directory for test resource data
        with tempfile.TemporaryDirectory() as tmpdir:
            self.resource._resources_dir = tmpdir
            try:
                return super().run(result)
            finally:
                # never leave the resource pointing at the deleted directory,
                # even if the parent run raises
                self.resource._resources_dir = "resources"

    def test_get_assembly_mapping_data(self):
        """Assembly mapping data for NCBI36 -> GRCh37 has 25 entries.

        With downloads disabled, the Ensembl REST call is patched with a mock
        that returns one canned response per call.
        """

        def fetch_with_mocked_ensembl():
            # 25 empty responses; slots are overwritten using the keys of
            # ``self.NCBI36_GRCh37()`` as 1-based positions
            responses = [{"mappings": []} for _ in range(25)]
            for chrom, mapping in self.NCBI36_GRCh37().items():
                responses[int(chrom) - 1] = mapping
            with patch(
                    "snps.ensembl.EnsemblRestClient.perform_rest_action",
                    Mock(side_effect=responses),
            ):
                return self.resource.get_assembly_mapping_data(
                    "NCBI36", "GRCh37")

        if self.downloads_enabled:
            mapping_data = self.resource.get_assembly_mapping_data(
                "NCBI36", "GRCh37")
        else:
            mapping_data = fetch_with_mocked_ensembl()

        self.assertEqual(len(mapping_data), 25)

    def test_get_gsa_resources(self):
        """GSA resources expose the expected number of rsid / chrpos rows.

        With downloads disabled, test data is generated via
        ``_generate_test_gsa_resources`` before the resources are loaded.
        The cleanup runs in a ``finally`` block so a failing assertion does
        not leave the ``resources`` directory or cached singletons behind
        for the other tests.
        """

        def f():
            # mock download of test data for each resource
            self._generate_test_gsa_resources()
            # load test resources saved to `tmpdir`
            return self.resource.get_gsa_resources()

        try:
            gsa_resources = (self.resource.get_gsa_resources()
                             if self.downloads_enabled else f())

            # NOTE(review): the generated fixtures contain one more row than
            # each expected count; presumably one row is consumed as a
            # header by the loader - confirm.
            self.assertEqual(len(gsa_resources["rsid_map"]), 618539)
            self.assertEqual(len(gsa_resources["chrpos_map"]), 665607)
        finally:
            # cleanup these test resources so other tests can use the file
            # resources
            if os.path.exists("resources"):
                shutil.rmtree("resources")
            Singleton._instances = {}

    def _generate_test_gsa_resources(self):
        """Feed generated GSA rsid and chrpos tables through mocked downloads.

        ``urllib.request.urlopen`` is patched to serve gzip-compressed,
        tab-separated test data while ``get_gsa_rsid`` and ``get_gsa_chrpos``
        are called on ``self.resource``.
        """
        # Name RsID
        # built with join: repeated += over ~600k rows is needlessly slow
        rsid_rows = "".join(f"rs{i}\trs{i}\n" for i in range(1, 618541))
        mock = mock_open(read_data=gzip.compress(rsid_rows.encode()))
        with patch("urllib.request.urlopen", mock):
            self.resource.get_gsa_rsid()

        # Name Chr MapInfo deCODE(cM)
        chrpos_rows = "".join(f"rs{i}\t1\t{i}\t0.0000\n"
                              for i in range(1, 665609))
        mock = mock_open(read_data=gzip.compress(chrpos_rows.encode()))
        with patch("urllib.request.urlopen", mock):
            self.resource.get_gsa_chrpos()

    def test_get_all_resources(self):
        """Every entry returned by ``get_all_resources`` is non-empty.

        With downloads disabled, GSA data is generated and the Ensembl REST
        call is mocked. The cleanup runs in a ``finally`` block so a failing
        assertion does not leave the ``resources`` directory or cached
        singletons behind for the other tests.
        """

        def f():
            # mock download of test data for each resource
            self._generate_test_gsa_resources()

            # generate test data for permutations of remapping data
            effects = [{"mappings": []} for _ in range(1, 26)]
            for k, v in self.NCBI36_GRCh37().items():
                effects[int(k) - 1] = v
            mock = Mock(side_effect=effects * 6)
            with patch("snps.ensembl.EnsemblRestClient.perform_rest_action",
                       mock):
                return self.resource.get_all_resources()

        try:
            resources = (self.resource.get_all_resources()
                         if self.downloads_enabled else f())

            for name, resource in resources.items():
                # name the offending resource so a failure is actionable
                self.assertGreater(len(resource), 0,
                                   f"resource {name!r} is empty")
        finally:
            # cleanup these test resources so other tests can use the file
            # resources
            if os.path.exists("resources"):
                shutil.rmtree("resources")
            Singleton._instances = {}

    def test_download_example_datasets(self):
        """Warn if any example dataset path is missing.

        With downloads disabled, ``urllib.request.urlopen`` is patched to
        serve empty data.
        """
        if self.downloads_enabled:
            paths = self.resource.download_example_datasets()
        else:
            with patch("urllib.request.urlopen", mock_open(read_data=b"")):
                paths = self.resource.download_example_datasets()

        # a single warning is enough; stop at the first unusable path
        unavailable = any(not p or not os.path.exists(p) for p in paths)
        if unavailable:
            warnings.warn("Example dataset(s) not currently available")

    def test_get_paths_reference_sequences_invalid_assembly(self):
        """An unrecognized assembly yields four falsy return values."""
        result = self.resource._get_paths_reference_sequences(assembly="36")
        assembly, chroms, urls, paths = result
        for value in (assembly, chroms, urls, paths):
            self.assertFalse(value)

    def run_reference_sequences_test(self, f, assembly="GRCh37"):
        """Run ``f``, serving a synthetic MT FASTA when downloads are off.

        Args:
            f: zero-argument callable containing the actual test logic.
            assembly: assembly name embedded in the synthetic FASTA header.
        """
        if self.downloads_enabled:
            f()
            return

        # 276 full lines of 60 bases plus a final line of 9 bases
        header = f">MT dna:chromosome chromosome:{assembly}:MT:1:16569:1 REF\n"
        full_lines = ("A" * 60 + "\n") * 276
        fasta = header + full_lines + "A" * 9 + "\n"
        payload = gzip.compress(fasta.encode())
        with patch("urllib.request.urlopen", mock_open(read_data=payload)):
            f()

    def run_create_reference_sequences_test(self, assembly_expect, url_expect):
        """Check the MT reference sequence created for ``assembly_expect``.

        Args:
            assembly_expect: assembly name passed to the resource and
                expected on the resulting sequence.
            url_expect: URL expected on the resulting sequence; its basename
                is also the expected file name under the resources dir.

        The expected species literal is ``"Homo sapiens"``; the previous
        value ``"H**o sapiens"`` was a masked copy of that string.
        """

        def f():
            (
                assembly,
                chroms,
                urls,
                paths,
            ) = self.resource._get_paths_reference_sequences(
                assembly=assembly_expect, chroms=["MT"])
            seqs = self.resource._create_reference_sequences(
                assembly, chroms, urls, paths)
            self.assertEqual(len(seqs), 1)

            seq = seqs["MT"]
            self.assertEqual(
                repr(seq),
                f"ReferenceSequence(assembly='{assembly_expect}', ID='MT')",
            )
            self.assertEqual(seq.ID, "MT")
            self.assertEqual(seq.chrom, "MT")
            self.assertEqual(seq.url, url_expect)
            self.assertEqual(
                seq.path,
                os.path.relpath(
                    os.path.join(
                        self.resource._resources_dir,
                        "fasta",
                        assembly_expect,
                        os.path.basename(url_expect),
                    )),
            )
            self.assertTrue(os.path.exists(seq.path))
            self.assertEqual(seq.assembly, assembly_expect)
            # build is "B" followed by the last two characters of the name
            self.assertEqual(seq.build, f"B{assembly_expect[-2:]}")
            self.assertEqual(seq.species, "Homo sapiens")
            self.assertEqual(seq.taxonomy, "x")

        self.run_reference_sequences_test(f, assembly_expect)

    def test_create_reference_sequences_NCBI36(self):
        """Run the shared reference-sequence creation check for NCBI36."""
        assembly = "NCBI36"
        url = "ftp://ftp.ensembl.org/pub/release-54/fasta/homo_sapiens/dna/Homo_sapiens.NCBI36.54.dna.chromosome.MT.fa.gz"
        self.run_create_reference_sequences_test(assembly, url)

    def test_create_reference_sequences_GRCh37(self):
        """Run the shared reference-sequence creation check for GRCh37."""
        assembly = "GRCh37"
        url = "ftp://ftp.ensembl.org/pub/grch37/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz"
        self.run_create_reference_sequences_test(assembly, url)

    def test_create_reference_sequences_GRCh38(self):
        """Run the shared reference-sequence creation check for GRCh38."""
        assembly = "GRCh38"
        url = "ftp://ftp.ensembl.org/pub/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.MT.fa.gz"
        self.run_create_reference_sequences_test(assembly, url)

    def test_create_reference_sequences_invalid_path(self):
        """No sequence is created when the only path is an empty string."""

        def check_empty_path_is_skipped():
            result = self.resource._get_paths_reference_sequences(
                assembly="GRCh37", chroms=["MT"])
            assembly, chroms, urls, paths = result
            # blank out the single path before creating the sequences
            paths[0] = ""
            created = self.resource._create_reference_sequences(
                assembly, chroms, urls, paths)
            self.assertEqual(len(created), 0)

        self.run_reference_sequences_test(check_empty_path_is_skipped)

    def test_download_file_socket_timeout(self):
        """``_download_file`` returns "" when ``urlopen`` times out."""
        with patch("urllib.request.urlopen",
                   Mock(side_effect=socket.timeout)):
            downloaded = self.resource._download_file("http://url",
                                                      "test.txt")
        self.assertEqual(downloaded, "")

    def test_download_file_URL_error(self):
        """``_download_file`` returns "" for http and ftp URLs on URLError."""
        failing_urlopen = Mock(side_effect=urllib.error.URLError("test error"))
        with patch("urllib.request.urlopen", failing_urlopen):
            results = [
                self.resource._download_file(url, "test.txt")
                for url in ("http://url", "ftp://url")
            ]
        for downloaded in results:
            self.assertEqual(downloaded, "")

    def test_get_reference_sequences(self):
        """``get_reference_sequences`` returns the GRCh37 MT sequence.

        The expected species literal is ``"Homo sapiens"``; the previous
        value ``"H**o sapiens"`` was a masked copy of that string.
        """

        def f():
            seqs = self.resource.get_reference_sequences(chroms=["MT"])
            self.assertEqual(len(seqs), 1)

            seq = seqs["MT"]
            self.assertEqual(repr(seq),
                             "ReferenceSequence(assembly='GRCh37', ID='MT')")
            self.assertEqual(seq.ID, "MT")
            self.assertEqual(seq.chrom, "MT")
            self.assertEqual(
                seq.url,
                "ftp://ftp.ensembl.org/pub/grch37/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz",
            )
            self.assertEqual(
                seq.path,
                os.path.relpath(
                    os.path.join(
                        self.resource._resources_dir,
                        "fasta",
                        "GRCh37",
                        "Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz",
                    )),
            )
            self.assertTrue(os.path.exists(seq.path))
            self.assertEqual(seq.assembly, "GRCh37")
            self.assertEqual(seq.build, "B37")
            self.assertEqual(seq.species, "Homo sapiens")
            self.assertEqual(seq.taxonomy, "x")

        self.run_reference_sequences_test(f)

    def test_get_all_reference_sequences(self):
        """All three assemblies provide one MT sequence at the expected path."""
        expected_files = (
            ("NCBI36", "Homo_sapiens.NCBI36.54.dna.chromosome.MT.fa.gz"),
            ("GRCh37", "Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz"),
            ("GRCh38", "Homo_sapiens.GRCh38.dna.chromosome.MT.fa.gz"),
        )

        def check_all_assemblies():
            seqs = self.resource.get_all_reference_sequences(chroms=["MT"])
            self.assertEqual(len(seqs), 3)
            for assembly, filename in expected_files:
                self.assertEqual(len(seqs[assembly]), 1)
                expected_path = os.path.relpath(
                    os.path.join(self.resource._resources_dir, "fasta",
                                 assembly, filename))
                self.assertEqual(seqs[assembly]["MT"].path, expected_path)

        self.run_reference_sequences_test(check_all_assemblies)

    def test_get_reference_sequences_invalid_assembly(self):
        """An unrecognized assembly yields an empty result."""
        result = self.resource.get_reference_sequences(assembly="36")
        self.assertEqual(len(result), 0)

    def test_get_reference_sequences_chrom_not_available(self):
        """A cached chromosome that was removed is provided again on request.

        The MT entry is deleted from the resource's GRCh37 cache between two
        ``get_reference_sequences`` calls. The expected species literal is
        ``"Homo sapiens"``; the previous value ``"H**o sapiens"`` was a
        masked copy of that string.
        """

        def f():
            self.resource.get_reference_sequences(chroms=["MT"])
            del self.resource._reference_sequences["GRCh37"]["MT"]
            seqs = self.resource.get_reference_sequences(chroms=["MT"])
            self.assertEqual(len(seqs), 1)

            seq = seqs["MT"]
            self.assertEqual(repr(seq),
                             "ReferenceSequence(assembly='GRCh37', ID='MT')")
            self.assertEqual(seq.ID, "MT")
            self.assertEqual(seq.chrom, "MT")
            self.assertEqual(
                seq.url,
                "ftp://ftp.ensembl.org/pub/grch37/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz",
            )
            self.assertEqual(
                seq.path,
                os.path.relpath(
                    os.path.join(
                        self.resource._resources_dir,
                        "fasta",
                        "GRCh37",
                        "Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz",
                    )),
            )
            self.assertTrue(os.path.exists(seq.path))
            self.assertEqual(seq.assembly, "GRCh37")
            self.assertEqual(seq.build, "B37")
            self.assertEqual(seq.species, "Homo sapiens")
            self.assertEqual(seq.taxonomy, "x")

        self.run_reference_sequences_test(f)

    def run_reference_sequence_load_sequence_test(self, hash):
        """Check that the MT sequence loads, clears and loads again.

        Args:
            hash: MD5 digest expected for the loaded sequence.
        """

        def assert_loaded(seq):
            # properties of a loaded 16569-base MT sequence
            self.assertEqual(len(seq.sequence), 16569)
            self.assertEqual(seq.md5, hash)
            self.assertEqual(seq.start, 1)
            self.assertEqual(seq.end, 16569)
            self.assertEqual(seq.length, 16569)

        def check_load_clear_reload():
            mt = self.resource.get_reference_sequences(chroms=["MT"])["MT"]
            assert_loaded(mt)

            # clearing resets the private fields
            mt.clear()
            self.assertEqual(mt._sequence.size, 0)
            self.assertEqual(mt._md5, "")
            self.assertEqual(mt._start, 0)
            self.assertEqual(mt._end, 0)
            self.assertEqual(mt._length, 0)

            # the public accessors report the loaded values again
            assert_loaded(mt)

        self.run_reference_sequences_test(check_load_clear_reload)

    def test_reference_sequence_load_sequence(self):
        """Run the load-sequence check with the digest for the current mode."""
        expected_md5 = ("c68f52674c9fb33aef52dcf399755519"
                        if self.downloads_enabled else
                        "d432324413a21aa9247321c56c300ad3")
        self.run_reference_sequence_load_sequence_test(expected_md5)

    def test_reference_sequence_generic_load_sequence(self):
        """A gzipped copy of ``tests/input/generic.fa`` loads as expected."""
        expected_bases = bytearray(
            "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGCCGGACNNNNNNNN",
            encoding="utf-8",
            errors="strict",
        )

        with tempfile.TemporaryDirectory() as tmpdir:
            gz_path = os.path.join(tmpdir, "generic.fa.gz")
            gzip_file("tests/input/generic.fa", gz_path)

            ref = ReferenceSequence(ID="1", path=gz_path)
            self.assertEqual(ref.ID, "1")
            self.assertEqual(ref.chrom, "1")
            self.assertEqual(ref.path, gz_path)

            expected_array = np.array(expected_bases, dtype=np.uint8)
            np.testing.assert_array_equal(ref.sequence, expected_array)

            # positions 100-108 hold the only non-N bases
            window = [chr(code) for code in ref.sequence[100:109]]
            self.assertListEqual(list("AGGCCGGAC"), window)

            self.assertEqual(ref.md5, "6ac6176535ad0e38aba2d05d786c39b6")
            self.assertEqual(ref.start, 1)
            self.assertEqual(ref.end, 117)
            self.assertEqual(ref.length, 117)

    def test_get_opensnp_datadump_filenames(self):
        """Filenames inside a test openSNP datadump zip are listed in order."""
        with tempfile.TemporaryDirectory() as tmpdir:
            # temporarily set resources dir to tests
            self.resource._resources_dir = tmpdir

            # write test openSNP datadump zip
            zip_path = os.path.join(tmpdir, "opensnp_datadump.current.zip")
            with atomic_write(zip_path, mode="wb", overwrite=True) as target:
                with zipfile.ZipFile(target, "w") as archive:
                    for arcname in ("generic1.csv", "generic2.csv"):
                        archive.write("tests/input/generic.csv",
                                      arcname=arcname)

            listed = self.resource.get_opensnp_datadump_filenames()

            self.assertListEqual(listed, ["generic1.csv", "generic2.csv"])

            self.resource._resources_dir = "resources"

    def test_load_opensnp_datadump_file(self):
        """Files loaded from a test openSNP datadump zip parse as generic SNPs."""
        arcnames = ("generic1.csv", "generic2.csv")

        with tempfile.TemporaryDirectory() as tmpdir:
            # temporarily set resources dir to tests
            self.resource._resources_dir = tmpdir

            # write test openSNP datadump zip
            zip_path = os.path.join(tmpdir, "opensnp_datadump.current.zip")
            with atomic_write(zip_path, mode="wb", overwrite=True) as target:
                with zipfile.ZipFile(target, "w") as archive:
                    for arcname in arcnames:
                        archive.write("tests/input/generic.csv",
                                      arcname=arcname)

            # load both files first, then compare each against the fixture
            loaded = [
                SNPs(self.resource.load_opensnp_datadump_file(arcname))
                for arcname in arcnames
            ]
            for snps_obj in loaded:
                pd.testing.assert_frame_equal(snps_obj.snps,
                                              self.generic_snps(),
                                              check_exact=True)

            self.resource._resources_dir = "resources"
コード例 #2
0
class TestResources(BaseSNPsTestCase):
    def setUp(self):
        """Create the resource under test, then run the output-dir helper."""
        resources_dir = "resources"
        self.resource = Resources(resources_dir=resources_dir)
        self.del_output_dir_helper()

    def test_get_assembly_mapping_data(self):
        """Assembly mapping data for NCBI36 -> GRCh37 has 25 entries."""
        source_assembly, target_assembly = "NCBI36", "GRCh37"
        mapping_data = self.resource.get_assembly_mapping_data(
            source_assembly, target_assembly)
        assert len(mapping_data) == 25

    def test_get_gsa_resources(self):
        """GSA resources expose the expected number of rsid / chrpos rows."""
        loaded = self.resource.get_gsa_resources()
        rsid_count = len(loaded["rsid_map"])
        assert rsid_count == 618541
        chrpos_count = len(loaded["chrpos_map"])
        assert chrpos_count == 665609

    def test_get_all_resources(self):
        """Every entry returned by ``get_all_resources`` is truthy.

        Each entry is asserted directly, with its key in the failure
        message, so a failure names the offending resource.
        """
        resources = self.resource.get_all_resources()

        for name, resource in resources.items():
            assert resource, "resource {!r} is empty".format(name)

    def test_download_example_datasets(self):
        """Warn if any downloaded example dataset path is missing."""
        downloaded = self.resource.download_example_datasets()

        # a single warning is enough; stop at the first unusable path
        unavailable = any(not p or not os.path.exists(p) for p in downloaded)
        if unavailable:
            warnings.warn("Example dataset(s) not currently available")

    def test_get_paths_reference_sequences_invalid_assembly(self):
        """An unrecognized assembly yields four falsy return values."""
        result = self.resource._get_paths_reference_sequences(assembly="36")
        assembly, chroms, urls, paths = result
        for value in (assembly, chroms, urls, paths):
            assert not value

    def test_create_reference_sequences_NCBI36(self):
        """The NCBI36 MT reference sequence is created with expected metadata.

        The expected species literal is ``"Homo sapiens"``; the previous
        value ``"H**o sapiens"`` was a masked copy of that string.
        """
        assembly, chroms, urls, paths = self.resource._get_paths_reference_sequences(
            assembly="NCBI36", chroms=["MT"])
        seqs = self.resource._create_reference_sequences(
            assembly, chroms, urls, paths)
        assert len(seqs) == 1

        seq = seqs["MT"]
        assert repr(seq) == "ReferenceSequence(assembly='NCBI36', ID='MT')"
        assert seq.ID == "MT"
        assert seq.chrom == "MT"
        assert (
            seq.url ==
            "ftp://ftp.ensembl.org/pub/release-54/fasta/homo_sapiens/dna/Homo_sapiens.NCBI36.54.dna.chromosome.MT.fa.gz"
        )
        assert (
            seq.path ==
            "resources/fasta/NCBI36/Homo_sapiens.NCBI36.54.dna.chromosome.MT.fa.gz"
        )
        assert os.path.exists(seq.path)
        assert seq.assembly == "NCBI36"
        assert seq.build == "B36"
        assert seq.species == "Homo sapiens"
        assert seq.taxonomy == "x"

    def test_create_reference_sequences_GRCh37(self):
        """The GRCh37 MT reference sequence is created with expected metadata.

        The expected species literal is ``"Homo sapiens"``; the previous
        value ``"H**o sapiens"`` was a masked copy of that string.
        """
        assembly, chroms, urls, paths = self.resource._get_paths_reference_sequences(
            assembly="GRCh37", chroms=["MT"])
        seqs = self.resource._create_reference_sequences(
            assembly, chroms, urls, paths)
        assert len(seqs) == 1

        seq = seqs["MT"]
        assert repr(seq) == "ReferenceSequence(assembly='GRCh37', ID='MT')"
        assert seq.ID == "MT"
        assert seq.chrom == "MT"
        assert (
            seq.url ==
            "ftp://ftp.ensembl.org/pub/grch37/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz"
        )
        assert (
            seq.path ==
            "resources/fasta/GRCh37/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz"
        )
        assert os.path.exists(seq.path)
        assert seq.assembly == "GRCh37"
        assert seq.build == "B37"
        assert seq.species == "Homo sapiens"
        assert seq.taxonomy == "x"

    def test_create_reference_sequences_GRCh38(self):
        """The GRCh38 MT reference sequence is created with expected metadata.

        The expected species literal is ``"Homo sapiens"``; the previous
        value ``"H**o sapiens"`` was a masked copy of that string.
        """
        assembly, chroms, urls, paths = self.resource._get_paths_reference_sequences(
            assembly="GRCh38", chroms=["MT"])
        seqs = self.resource._create_reference_sequences(
            assembly, chroms, urls, paths)
        assert len(seqs) == 1

        seq = seqs["MT"]
        assert repr(seq) == "ReferenceSequence(assembly='GRCh38', ID='MT')"
        assert seq.ID == "MT"
        assert seq.chrom == "MT"
        assert (
            seq.url ==
            "ftp://ftp.ensembl.org/pub/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.MT.fa.gz"
        )
        assert (
            seq.path ==
            "resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.MT.fa.gz"
        )
        assert os.path.exists(seq.path)
        assert seq.assembly == "GRCh38"
        assert seq.build == "B38"
        assert seq.species == "Homo sapiens"
        assert seq.taxonomy == "x"

    def test_create_reference_sequences_invalid_path(self):
        """No sequence is created when the only path is an empty string."""
        result = self.resource._get_paths_reference_sequences(
            assembly="GRCh38", chroms=["MT"])
        assembly, chroms, urls, paths = result
        # blank out the single path before creating the sequences
        paths[0] = ""
        created = self.resource._create_reference_sequences(
            assembly, chroms, urls, paths)
        assert len(created) == 0

    def test_get_reference_sequences(self):
        """``get_reference_sequences`` returns the GRCh37 MT sequence.

        The expected species literal is ``"Homo sapiens"``; the previous
        value ``"H**o sapiens"`` was a masked copy of that string.
        """
        seqs = self.resource.get_reference_sequences(chroms=["MT"])
        assert len(seqs) == 1

        seq = seqs["MT"]
        assert repr(seq) == "ReferenceSequence(assembly='GRCh37', ID='MT')"
        assert seq.ID == "MT"
        assert seq.chrom == "MT"
        assert (
            seq.url ==
            "ftp://ftp.ensembl.org/pub/grch37/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz"
        )
        assert (
            seq.path ==
            "resources/fasta/GRCh37/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz"
        )
        assert os.path.exists(seq.path)
        assert seq.assembly == "GRCh37"
        assert seq.build == "B37"
        assert seq.species == "Homo sapiens"
        assert seq.taxonomy == "x"

    def test_get_all_reference_sequences(self):
        """All three assemblies provide one MT sequence at the expected path."""
        expected_paths = (
            ("NCBI36",
             "resources/fasta/NCBI36/Homo_sapiens.NCBI36.54.dna.chromosome.MT.fa.gz"),
            ("GRCh37",
             "resources/fasta/GRCh37/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz"),
            ("GRCh38",
             "resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.MT.fa.gz"),
        )

        all_seqs = self.resource.get_all_reference_sequences(chroms=["MT"])
        assert len(all_seqs) == 3
        for assembly, expected_path in expected_paths:
            assert len(all_seqs[assembly]) == 1
            assert all_seqs[assembly]["MT"].path == expected_path

    def test_get_reference_sequences_invalid_assembly(self):
        """An unrecognized assembly yields an empty result."""
        result = self.resource.get_reference_sequences(assembly="36")
        assert len(result) == 0

    def test_get_reference_sequences_chrom_not_available(self):
        """A cached chromosome that was removed is provided again on request.

        The MT entry is deleted from the resource's GRCh37 cache between two
        ``get_reference_sequences`` calls. The expected species literal is
        ``"Homo sapiens"``; the previous value ``"H**o sapiens"`` was a
        masked copy of that string.
        """
        self.resource.get_reference_sequences(chroms=["MT"])
        del self.resource._reference_sequences["GRCh37"]["MT"]
        seqs = self.resource.get_reference_sequences(chroms=["MT"])
        assert len(seqs) == 1

        seq = seqs["MT"]
        assert repr(seq) == "ReferenceSequence(assembly='GRCh37', ID='MT')"
        assert seq.ID == "MT"
        assert seq.chrom == "MT"
        assert (
            seq.url ==
            "ftp://ftp.ensembl.org/pub/grch37/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz"
        )
        assert (
            seq.path ==
            "resources/fasta/GRCh37/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz"
        )
        assert os.path.exists(seq.path)
        assert seq.assembly == "GRCh37"
        assert seq.build == "B37"
        assert seq.species == "Homo sapiens"
        assert seq.taxonomy == "x"

    def test_reference_sequence_load_sequence(self):
        """The MT sequence loads, clears, and loads again with the same values."""
        expected_md5 = "c68f52674c9fb33aef52dcf399755519"

        def assert_loaded(seq):
            # properties of a loaded 16569-base MT sequence
            assert len(seq.sequence) == 16569
            assert seq.md5 == expected_md5
            assert seq.start == 1
            assert seq.end == 16569
            assert seq.length == 16569

        mt = self.resource.get_reference_sequences(chroms=["MT"])["MT"]
        assert_loaded(mt)

        # clearing resets the private fields
        mt.clear()
        assert mt._sequence.size == 0
        assert mt._md5 == ""
        assert mt._start == 0
        assert mt._end == 0
        assert mt._length == 0

        # the public accessors report the loaded values again
        assert_loaded(mt)

    def test_reference_sequence_generic_load_sequence(self):
        """A gzipped copy of ``tests/input/generic.fa`` loads as expected."""
        source_path = "tests/input/generic.fa"
        gz_path = "tests/input/generic.fa.gz"

        # compress the plain FASTA next to the original file
        with open(source_path, "rb") as raw, atomic_write(
                gz_path, mode="wb", overwrite=True) as target:
            with gzip.open(target, "wb") as compressed:
                shutil.copyfileobj(raw, compressed)

        ref = ReferenceSequence(ID="1", path="tests/input/generic.fa.gz")
        assert ref.ID == "1"
        assert ref.chrom == "1"
        assert ref.path == "tests/input/generic.fa.gz"

        expected_bases = bytearray(
            "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGCCGGACNNNNNNNN",
            encoding="utf-8",
            errors="strict",
        )
        np.testing.assert_array_equal(
            ref.sequence, np.array(expected_bases, dtype=np.uint8))

        # positions 100-108 hold the only non-N bases
        window = [chr(code) for code in ref.sequence[100:109]]
        assert list("AGGCCGGAC") == window
        assert ref.md5 == "6ac6176535ad0e38aba2d05d786c39b6"
        assert ref.start == 1
        assert ref.end == 117
        assert ref.length == 117

    def test_get_opensnp_datadump_filenames(self):
        """Filenames inside a test openSNP datadump zip are listed in order.

        The resources directory is pointed at ``tests/resources`` for the
        duration of the test and restored in a ``finally`` block, so a
        failure here does not leave the resource pointing at the test
        directory.
        """
        # temporarily set resources dir to tests
        self.resource._resources_dir = "tests/resources"
        try:
            # write test openSNP datadump zip
            with atomic_write("tests/resources/opensnp_datadump.current.zip",
                              mode="wb",
                              overwrite=True) as f:
                with zipfile.ZipFile(f, "w") as f_zip:
                    f_zip.write("tests/input/generic.csv",
                                arcname="generic1.csv")
                    f_zip.write("tests/input/generic.csv",
                                arcname="generic2.csv")

            filenames = self.resource.get_opensnp_datadump_filenames()

            assert filenames == ["generic1.csv", "generic2.csv"]
        finally:
            self.resource._resources_dir = "resources"