Ejemplo n.º 1
0
    def test_load_opensnp_datadump_file(self):
        """Load individual members from a locally built openSNP datadump zip.

        A zip holding two copies of ``tests/input/generic.csv`` (stored as
        ``generic1.csv`` and ``generic2.csv``) is written into a temporary
        resources directory. Each member is then loaded through
        ``load_opensnp_datadump_file`` and compared to ``generic_snps()``.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            # temporarily point the resources dir at the scratch directory
            self.resource._resources_dir = tmpdir
            try:
                # write test openSNP datadump zip
                with atomic_write(
                        os.path.join(tmpdir, "opensnp_datadump.current.zip"),
                        mode="wb",
                        overwrite=True,
                ) as f:
                    with zipfile.ZipFile(f, "w") as f_zip:
                        for arcname in ("generic1.csv", "generic2.csv"):
                            f_zip.write("tests/input/generic.csv",
                                        arcname=arcname)

                snps1 = SNPs(
                    self.resource.load_opensnp_datadump_file("generic1.csv"))
                snps2 = SNPs(
                    self.resource.load_opensnp_datadump_file("generic2.csv"))

                pd.testing.assert_frame_equal(snps1.snps,
                                              self.generic_snps(),
                                              check_exact=True)
                pd.testing.assert_frame_equal(snps2.snps,
                                              self.generic_snps(),
                                              check_exact=True)
            finally:
                # Restore the resources dir even if loading or an assertion
                # fails, so the deleted temporary directory is never left
                # configured on ``self.resource``.
                self.resource._resources_dir = "resources"
Ejemplo n.º 2
0
    def test_save_snps_vcf_phased(self):
        """Save phased VCF data as VCF and check it round-trips as phased.

        A gzipped copy of ``tests/input/generic.fa`` is registered as the
        GRCh37 reference sequence for chromosome "1" before saving.
        """
        # read phased data
        s = SNPs("tests/input/testvcf_phased.vcf")

        # setup resource to use test FASTA reference sequence
        # NOTE(review): presumably ``Resources()`` shares state with the
        # instance used by ``save_snps`` — confirm.
        r = Resources()
        r._reference_sequences["GRCh37"] = {}
        with open("tests/input/generic.fa", "rb") as f_in:
            with atomic_write("tests/input/generic.fa.gz",
                              mode="wb",
                              overwrite=True) as f_out:
                with gzip.open(f_out, "wb") as f_gzip:
                    shutil.copyfileobj(f_in, f_gzip)

        seq = ReferenceSequence(ID="1", path="tests/input/generic.fa.gz")

        r._reference_sequences["GRCh37"]["1"] = seq

        # save phased data to VCF; ``os.path.relpath`` returns a path using
        # the platform separator, so the expected value is built with
        # ``os.path.join`` instead of a hard-coded "/"
        assert os.path.relpath(s.save_snps(vcf=True)) == os.path.join(
            "output", "vcf_GRCh37.vcf")
        # read saved VCF
        s = SNPs("output/vcf_GRCh37.vcf")
        assert s.phased
        pd.testing.assert_frame_equal(s.snps, self.generic_snps_vcf())
Ejemplo n.º 3
0
 def test_remap_invalid_assembly(self):
     """Remapping to the invalid target ``-1`` leaves build 37 in place."""
     loaded = SNPs("tests/input/GRCh37.csv")
     remapped, not_remapped = loaded.remap(-1)

     # build and assembly are expected to be unchanged
     self.assertEqual(loaded.build, 37)
     self.assertEqual(loaded.assembly, "GRCh37")

     # nothing reported as remapped; two chromosomes reported as not remapped
     self.assertEqual(len(remapped), 0)
     self.assertEqual(len(not_remapped), 2)
Ejemplo n.º 4
0
 def f():
     """Save generic CSV data comma-separated and re-parse the output file.

     ``self`` is not a parameter here; it is resolved from the enclosing scope.
     """
     source = SNPs("tests/input/generic.csv")
     saved_path = os.path.relpath(source.save_snps(sep=","))
     expected_path = f"output{os.sep}generic_GRCh37.csv"
     self.assertEqual(saved_path, expected_path)

     # the written file must parse back as "generic" data
     self.run_parsing_tests("output/generic_GRCh37.csv", "generic")
Ejemplo n.º 5
0
 def f():
     """``not_null_snps()`` should equal the generic SNPs without ``rs5``.

     ``self`` is not a parameter here; it is resolved from the enclosing scope.
     """
     loaded = SNPs("tests/input/generic.csv")

     # expected frame: the generic SNPs with the rs5 row dropped in place
     expected = self.generic_snps()
     expected.drop("rs5", inplace=True)

     actual = loaded.not_null_snps()
     pd.testing.assert_frame_equal(actual, expected, check_exact=True)
Ejemplo n.º 6
0
 def test__lookup_build_with_snp_pos_None(self):
     """``detect_build()`` is falsy for a single rs3094315 SNP at position 1."""
     empty = SNPs()
     single_snp = self.create_snp_df(
         rsid=["rs3094315"],
         chrom=["1"],
         pos=[1],
         genotype=["AA"],
     )
     # inject the frame directly, bypassing file parsing
     empty._snps = single_snp

     detected = empty.detect_build()
     self.assertFalse(detected)
Ejemplo n.º 7
0
    def test_save_snps_vcf_discrepant_pos(self):
        """Saving to VCF records SNPs whose positions were moved to 0 and 118.

        ``rs1`` and ``rs17`` are given new positions before saving. They are
        expected to appear in ``discrepant_vcf_position`` and to be absent
        from the written VCF.
        """
        s = SNPs("tests/input/testvcf.vcf")

        resources = Resources()
        resources._reference_sequences["GRCh37"] = {}

        with tempfile.TemporaryDirectory() as tmpdir:
            # gzip the test FASTA into the scratch dir and register it as chrom 1
            fasta_gz = os.path.join(tmpdir, "generic.fa.gz")
            gzip_file("tests/input/generic.fa", fasta_gz)
            reference = ReferenceSequence(ID="1", path=fasta_gz)
            resources._reference_sequences["GRCh37"]["1"] = reference

            # create discrepant SNPs by setting positions outside reference sequence
            for rsid, new_pos in (("rs1", 0), ("rs17", 118)):
                s._snps.loc[rsid, "pos"] = new_pos

            saved_path = os.path.relpath(s.save(vcf=True))
            self.assertEqual(saved_path, f"output{os.sep}vcf_GRCh37.vcf")

        discrepant = s.discrepant_vcf_position
        expected_discrepant = self.create_snp_df(
            rsid=["rs1", "rs17"],
            chrom=["1", "1"],
            pos=[0, 118],
            genotype=["AA", np.nan],
        )
        pd.testing.assert_frame_equal(discrepant,
                                      expected_discrepant,
                                      check_exact=True)

        # the saved VCF should hold every generic SNP except the two discrepant ones
        remaining = self.generic_snps_vcf().drop(["rs1", "rs17"])
        self.run_parsing_tests_vcf("output/vcf_GRCh37.vcf", snps_df=remaining)
Ejemplo n.º 8
0
 def test_merge_non_existent_file(self):
     """Merging a missing file then a valid one keeps only the valid data."""
     s = SNPs()
     to_merge = [
         SNPs("tests/input/non_existent_file.csv"),
         SNPs("tests/input/GRCh37.csv"),
     ]
     results = s.merge(to_merge)

     pd.testing.assert_frame_equal(s.snps, self.snps_GRCh37(), check_exact=True)
     # first entry (missing file) has an empty result; second one merged
     expected_results = [{}, {"merged": True}]
     self.assert_results(results, expected_results)
Ejemplo n.º 9
0
 def test_merge_invalid_file(self):
     """Merging a valid file then an empty one keeps only the valid data."""
     s = SNPs()
     to_merge = [
         SNPs("tests/input/GRCh37.csv"),
         SNPs("tests/input/empty.txt"),
     ]
     results = s.merge(to_merge)

     pd.testing.assert_frame_equal(s.snps, self.snps_GRCh37(), check_exact=True)
     # first entry merged; second one (empty.txt) has an empty result
     expected_results = [{"merged": True}, {}]
     self.assert_results(results, expected_results)
Ejemplo n.º 10
0
 def test_merge_list(self):
     """Merging the same GRCh37 file twice reports four common rsids."""
     s = SNPs()
     to_merge = [
         SNPs("tests/input/GRCh37.csv"),
         SNPs("tests/input/GRCh37.csv"),
     ]
     results = s.merge(to_merge)

     pd.testing.assert_frame_equal(s.snps,
                                   self.snps_GRCh37(),
                                   check_exact=True)

     # both merged objects contribute a "generic" source entry
     self.assertEqual(s.source, "generic, generic")
     self.assertListEqual(s._source, ["generic", "generic"])

     # the second merge shares every rsid with the first
     common_rsids = pd.Index(
         ["rs3094315", "rs2500347", "rsIndelTest", "rs11928389"],
         name="rsid",
     )
     expected_results = [
         {"merged": True},
         {"merged": True, "common_rsids": common_rsids},
     ]
     self.assert_results(results, expected_results)
Ejemplo n.º 11
0
 def test_save_snps_csv_filename(self):
     """Save with an explicit filename and comma separator, then re-parse."""
     source = SNPs("tests/input/generic.csv")
     saved_path = os.path.relpath(source.save("generic.csv", sep=","))
     expected_path = f"output{os.sep}generic.csv"
     self.assertEqual(saved_path, expected_path)

     # the written file must parse back as "generic" data
     self.run_parsing_tests("output/generic.csv", "generic")
Ejemplo n.º 12
0
 def test_save_snps_phased(self):
     """Save phased VCF data with default options and re-parse it as phased."""
     # read phased data
     s = SNPs("tests/input/testvcf_phased.vcf")
     # save phased data to TSV; ``os.path.relpath`` yields the platform
     # separator, so the expected path is built with ``os.sep`` rather than a
     # hard-coded "/"
     self.assertEqual(os.path.relpath(s.save()),
                      f"output{os.sep}vcf_GRCh37.txt")
     # read saved TSV
     self.run_parsing_tests_vcf("output/vcf_GRCh37.txt", phased=True)
Ejemplo n.º 13
0
    def setUp(self):
        """Build the SNPs fixtures used by the tests.

        Creates SNPs objects from file paths, from ``None``, and from raw
        bytes (plain, zipped, and gzipped) of ``tests/input/chromosomes.csv``.
        The zip and gzip archives are written next to the input file only long
        enough to read their bytes back.
        """
        self.snps_GRCh38 = SNPs("tests/input/GRCh38.csv")
        self.snps = SNPs("tests/input/chromosomes.csv")
        self.snps_only_detect_source = SNPs("tests/input/chromosomes.csv",
                                            only_detect_source=True)
        self.snps_none = SNPs(None)

        with open("tests/input/chromosomes.csv", "r") as f:
            self.snps_buffer = SNPs(f.read().encode("utf-8"))

        zip_path = "tests/input/chromosomes.csv.zip"
        try:
            with atomic_write(zip_path, mode="wb", overwrite=True) as f:
                with zipfile.ZipFile(f, "w") as f_zip:
                    f_zip.write("tests/input/chromosomes.csv",
                                arcname="chromosomes.csv")

            with open(zip_path, "rb") as f:
                data = f.read()
                self.snps_buffer_zip = SNPs(data)
        finally:
            # remove the scratch archive even if reading or parsing raised,
            # so a failure does not leave stray files in tests/input
            if os.path.exists(zip_path):
                os.remove(zip_path)

        gz_path = "tests/input/chromosomes.csv.gz"
        try:
            with open("tests/input/chromosomes.csv", "rb") as f_in:
                with atomic_write(gz_path, mode="wb",
                                  overwrite=True) as f_out:
                    with gzip.open(f_out, "wb") as f_gzip:
                        shutil.copyfileobj(f_in, f_gzip)

            with open(gz_path, "rb") as f:
                data = f.read()
                self.snps_buffer_gz = SNPs(data)
        finally:
            # same cleanup guarantee as for the zip archive above
            if os.path.exists(gz_path):
                os.remove(gz_path)
Ejemplo n.º 14
0
    def test_save_snps_vcf_false_positive_build(self):
        """Re-parse a saved VCF whose ``##source`` line has a fixed version.

        After saving, every line containing ``"snps v"`` is replaced with a
        fixed ``##source`` header before the file is parsed again.
        NOTE(review): presumably the "b38" inside the pinned version string is
        what could be mistaken for a build number — confirm.
        """
        with tempfile.TemporaryDirectory() as tmpdir1:
            snps = SNPs("tests/input/testvcf.vcf", output_dir=tmpdir1)

            r = Resources()
            r._reference_sequences["GRCh37"] = {}

            output = os.path.join(tmpdir1, "vcf_GRCh37.vcf")
            with tempfile.TemporaryDirectory() as tmpdir2:
                dest = os.path.join(tmpdir2, "generic.fa.gz")
                gzip_file("tests/input/generic.fa", dest)

                seq = ReferenceSequence(ID="1", path=dest)

                r._reference_sequences["GRCh37"]["1"] = seq

                self.assertEqual(snps.save(vcf=True), output)

                pinned_source = '##source="vcf; snps v1.2.3.post85.dev0+gb386302; https://pypi.org/project/snps/"\n'
                # rebuild the file content with a single join instead of
                # repeated string concatenation
                with open(output, "r") as f:
                    patched = "".join(
                        pinned_source if "snps v" in line else line
                        for line in f)

                with open(output, "w") as f:
                    f.write(patched)

            self.run_parsing_tests_vcf(output)
Ejemplo n.º 15
0
 def test_remap_snps_invalid_assembly(self):
     """``remap_snps(-1)`` leaves build 37 / GRCh37 in place."""
     loaded = SNPs("tests/input/GRCh37.csv")
     remapped, not_remapped = loaded.remap_snps(-1)

     # build and assembly are expected to be unchanged
     assert loaded.build == 37
     assert loaded.assembly == "GRCh37"

     # nothing reported as remapped; two chromosomes reported as not remapped
     assert len(remapped) == 0
     assert len(not_remapped) == 2
Ejemplo n.º 16
0
 def test_remap_37_to_37(self):
     """Remapping build-37 data to 37 reports nothing remapped, data intact."""
     loaded = SNPs("tests/input/GRCh37.csv")
     remapped, not_remapped = loaded.remap(37)

     self.assertEqual(loaded.build, 37)
     self.assertEqual(loaded.assembly, "GRCh37")

     # no chromosome remapped; both reported as not remapped
     self.assertEqual(len(remapped), 0)
     self.assertEqual(len(not_remapped), 2)

     pd.testing.assert_frame_equal(
         loaded.snps,
         self.snps_GRCh37(),
         check_exact=True,
     )
Ejemplo n.º 17
0
 def test_remap_snps_37_to_36(self):
     """``remap_snps(36)`` converts GRCh37 data to NCBI36 coordinates."""
     loaded = SNPs("tests/input/GRCh37.csv")
     remapped, not_remapped = loaded.remap_snps(36)

     assert loaded.build == 36
     assert loaded.assembly == "NCBI36"

     # both chromosomes remapped; none left over
     assert len(remapped) == 2
     assert len(not_remapped) == 0

     expected = self.snps_NCBI36()
     pd.testing.assert_frame_equal(loaded.snps, expected)
Ejemplo n.º 18
0
 def f():
     """Remap GRCh37 data to build 36 and compare with the NCBI36 fixture.

     ``self`` is not a parameter here; it is resolved from the enclosing scope.
     """
     loaded = SNPs("tests/input/GRCh37.csv")
     remapped, not_remapped = loaded.remap(36)

     self.assertEqual(loaded.build, 36)
     self.assertEqual(loaded.assembly, "NCBI36")

     # both chromosomes remapped; none left over
     self.assertEqual(len(remapped), 2)
     self.assertEqual(len(not_remapped), 0)

     pd.testing.assert_frame_equal(
         loaded.snps,
         self.snps_NCBI36(),
         check_exact=True,
     )
Ejemplo n.º 19
0
 def f():
     """Remap NCBI36 data (loaded with ``parallelize=True``) to build 37.

     ``self`` is not a parameter here; it is resolved from the enclosing scope.
     """
     loaded = SNPs("tests/input/NCBI36.csv", parallelize=True)
     remapped, not_remapped = loaded.remap(37)

     self.assertEqual(loaded.build, 37)
     self.assertEqual(loaded.assembly, "GRCh37")

     # both chromosomes remapped; none left over
     self.assertEqual(len(remapped), 2)
     self.assertEqual(len(not_remapped), 0)

     pd.testing.assert_frame_equal(
         loaded.snps,
         self.snps_GRCh37(),
         check_exact=True,
     )
Ejemplo n.º 20
0
 def test_remap_snps_37_to_37(self):
     """``remap_snps(37)`` on build-37 data remaps nothing, data intact."""
     loaded = SNPs("tests/input/GRCh37.csv")
     remapped, not_remapped = loaded.remap_snps(37)

     assert loaded.build == 37
     assert loaded.assembly == "GRCh37"

     # no chromosome remapped; both reported as not remapped
     assert len(remapped) == 0
     assert len(not_remapped) == 2

     expected = self.snps_GRCh37()
     pd.testing.assert_frame_equal(loaded.snps, expected)
Ejemplo n.º 21
0
 def test_remap_snps_36_to_37_multiprocessing(self):
     """``remap_snps(37)`` on NCBI36 data loaded with ``parallelize=True``."""
     loaded = SNPs("tests/input/NCBI36.csv", parallelize=True)
     remapped, not_remapped = loaded.remap_snps(37)

     assert loaded.build == 37
     assert loaded.assembly == "GRCh37"

     # both chromosomes remapped; none left over
     assert len(remapped) == 2
     assert len(not_remapped) == 0

     expected = self.snps_GRCh37()
     pd.testing.assert_frame_equal(loaded.snps, expected)
Ejemplo n.º 22
0
 def test_save_snps_tsv_filename(self):
     """Save TSV data under an explicit filename into a temporary output dir."""
     with tempfile.TemporaryDirectory() as tmpdir:
         source = SNPs("tests/input/generic.tsv", output_dir=tmpdir)
         expected_path = os.path.join(tmpdir, "generic.tsv")

         saved_path = source.save("generic.tsv", sep="\t")
         self.assertEqual(saved_path, expected_path)

         # the written file must parse back as "generic" data
         self.run_parsing_tests(expected_path, "generic")
Ejemplo n.º 23
0
 def test_save_source(self):
     """A saved GRCh38 file reloads with build 38 and a "generic" source."""
     original = SNPs("tests/input/GRCh38.csv")
     saved_path = os.path.relpath(original.save())
     self.assertEqual(saved_path, f"output{os.sep}generic_GRCh38.txt")

     # reload the written file and check its metadata
     reloaded = SNPs("output/generic_GRCh38.txt")
     self.assertEqual(reloaded.build, 38)
     self.assertTrue(reloaded.build_detected)
     self.assertEqual(reloaded.source, "generic")
     self.assertListEqual(reloaded._source, ["generic"])

     pd.testing.assert_frame_equal(
         reloaded.snps,
         self.snps_GRCh38(),
         check_exact=True,
     )
Ejemplo n.º 24
0
 def test_save_snps_phased(self):
     """Save phased VCF data into a temporary dir and re-parse it as phased."""
     with tempfile.TemporaryDirectory() as tmpdir:
         # load the phased input, directing output to the scratch directory
         phased = SNPs("tests/input/testvcf_phased.vcf", output_dir=tmpdir)
         expected_path = os.path.join(tmpdir, "vcf_GRCh37.txt")

         # default save (no arguments) should land at the expected path
         saved_path = phased.save()
         self.assertEqual(saved_path, expected_path)

         # the written file must parse back as phased data
         self.run_parsing_tests_vcf(expected_path, phased=True)
Ejemplo n.º 25
0
 def test_save_snps_csv_phased(self):
     """Save phased VCF data as CSV and check it reloads as phased."""
     # read phased data
     s = SNPs("tests/input/testvcf_phased.vcf")
     # save phased data to CSV; ``os.path.relpath`` yields the platform
     # separator, so the expected path is built with ``os.path.join`` rather
     # than a hard-coded "/"
     assert os.path.relpath(s.save_snps()) == os.path.join(
         "output", "vcf_GRCh37.csv")
     # read saved CSV
     s = SNPs("output/vcf_GRCh37.csv")
     assert s.phased
     pd.testing.assert_frame_equal(s.snps, self.generic_snps_vcf())
Ejemplo n.º 26
0
 def f():
     """Save generic CSV data into a temporary dir and re-parse the output.

     ``self`` is not a parameter here; it is resolved from the enclosing scope.
     """
     with tempfile.TemporaryDirectory() as tmpdir:
         source = SNPs("tests/input/generic.csv", output_dir=tmpdir)
         expected_path = os.path.join(tmpdir, "generic_GRCh37.csv")

         saved_path = source.save_snps(sep=",")
         self.assertEqual(saved_path, expected_path)

         # the written file must parse back as "generic" data
         self.run_parsing_tests(expected_path, "generic")
Ejemplo n.º 27
0
    def test_merge_exceed_discrepant_genotypes_threshold(self):
        """A merge with one differing genotype and threshold 0 changes nothing."""
        base = SNPs("tests/input/generic.csv")
        other = SNPs("tests/input/generic.csv")
        # introduce a single genotype difference between the two objects
        other._snps.loc["rs1", "genotype"] = "CC"

        results = base.merge([other], discrepant_genotypes_threshold=0)

        # none of the discrepancy tables should have been populated
        for attr in (
            "discrepant_merge_positions",
            "discrepant_merge_genotypes",
            "discrepant_merge_positions_genotypes",
        ):
            self.assertEqual(len(getattr(base, attr)), 0)

        pd.testing.assert_frame_equal(
            base.snps,
            self.generic_snps(),
            check_exact=True,
        )
        self.assert_results(results, [{}])
Ejemplo n.º 28
0
 def test_heterozygous_snps(self):
     """``heterozygous_snps()`` returns rs6, rs7 and rs8 from the generic file."""
     loaded = SNPs("tests/input/generic.csv")
     actual = loaded.heterozygous_snps()

     expected = self.create_snp_df(
         rsid=["rs6", "rs7", "rs8"],
         chrom=["1", "1", "1"],
         pos=[106, 107, 108],
         genotype=["GC", "TC", "AT"],
     )
     pd.testing.assert_frame_equal(actual, expected)
Ejemplo n.º 29
0
 def f():
     """Merge GRCh37 into NCBI36 without remapping and check discrepant SNPs.

     ``self`` is not a parameter here; it is resolved from the enclosing scope.
     """
     base = SNPs("tests/input/NCBI36.csv")
     other = SNPs("tests/input/GRCh37.csv")
     results = base.merge([other], remap=False)

     discrepant = base.discrepant_snps
     self.assertEqual(len(discrepant), 4)

     # the discrepant index must match the rsids reported by the merge result
     reported_rsids = results[0]["discrepant_position_rsids"]
     pd.testing.assert_index_equal(
         discrepant.index,
         reported_rsids,
         check_exact=True,
         check_names=True,
     )
Ejemplo n.º 30
0
 def test_homozygous_chrom(self):
     """``homozygous("1")`` returns rs1 through rs4 from the generic file."""
     loaded = SNPs("tests/input/generic.csv")
     actual = loaded.homozygous("1")

     expected = self.create_snp_df(
         rsid=["rs1", "rs2", "rs3", "rs4"],
         chrom=["1", "1", "1", "1"],
         pos=[101, 102, 103, 104],
         genotype=["AA", "CC", "GG", "TT"],
     )
     pd.testing.assert_frame_equal(actual, expected, check_exact=True)