def test_load_opensnp_datadump_file(self):
    """Load two members from a locally built openSNP datadump zip.

    A zip holding two copies of ``tests/input/generic.csv`` is written to a
    temporary directory that temporarily acts as the resources directory;
    both members must parse to the generic SNP fixture.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        # temporarily set resources dir to the scratch directory
        self.resource._resources_dir = tmpdir
        try:
            # write test openSNP datadump zip
            with atomic_write(
                os.path.join(tmpdir, "opensnp_datadump.current.zip"),
                mode="wb",
                overwrite=True,
            ) as f:
                with zipfile.ZipFile(f, "w") as f_zip:
                    f_zip.write("tests/input/generic.csv", arcname="generic1.csv")
                    f_zip.write("tests/input/generic.csv", arcname="generic2.csv")

            snps1 = SNPs(self.resource.load_opensnp_datadump_file("generic1.csv"))
            snps2 = SNPs(self.resource.load_opensnp_datadump_file("generic2.csv"))

            pd.testing.assert_frame_equal(
                snps1.snps, self.generic_snps(), check_exact=True
            )
            pd.testing.assert_frame_equal(
                snps2.snps, self.generic_snps(), check_exact=True
            )
        finally:
            # Restore even when an assertion fails so the resource object is
            # never left pointing at a directory that is about to be deleted.
            self.resource._resources_dir = "resources"
def test_save_snps_vcf_phased(self):
    """Save phased VCF input back to VCF and check it reloads as phased."""
    # read phased data
    s = SNPs("tests/input/testvcf_phased.vcf")

    # setup resource to use test FASTA reference sequence
    r = Resources()
    r._reference_sequences["GRCh37"] = {}
    with open("tests/input/generic.fa", "rb") as f_in:
        with atomic_write(
            "tests/input/generic.fa.gz", mode="wb", overwrite=True
        ) as f_out:
            with gzip.open(f_out, "wb") as f_gzip:
                shutil.copyfileobj(f_in, f_gzip)

    seq = ReferenceSequence(ID="1", path="tests/input/generic.fa.gz")
    r._reference_sequences["GRCh37"]["1"] = seq

    # save phased data to VCF; os.path.relpath yields the platform separator,
    # so the expected path is built with os.sep instead of a literal "/"
    assert os.path.relpath(s.save_snps(vcf=True)) == f"output{os.sep}vcf_GRCh37.vcf"

    # read saved VCF
    s = SNPs("output/vcf_GRCh37.vcf")
    assert s.phased
    pd.testing.assert_frame_equal(s.snps, self.generic_snps_vcf())
def test_remap_invalid_assembly(self):
    """Remapping to assembly ``-1`` must leave the GRCh37 data untouched."""
    snps = SNPs("tests/input/GRCh37.csv")

    remapped, not_remapped = snps.remap(-1)

    # build / assembly stay at their original values
    self.assertEqual(snps.build, 37)
    self.assertEqual(snps.assembly, "GRCh37")
    # nothing was remapped; both chromosomes are reported as not remapped
    self.assertEqual(len(remapped), 0)
    self.assertEqual(len(not_remapped), 2)
def f():
    """Save the generic CSV with a comma separator and re-parse the output.

    ``self`` comes from the enclosing scope, which is outside this view.
    """
    loaded = SNPs("tests/input/generic.csv")
    saved_path = os.path.relpath(loaded.save_snps(sep=","))
    self.assertEqual(saved_path, f"output{os.sep}generic_GRCh37.csv")
    self.run_parsing_tests("output/generic_GRCh37.csv", "generic")
def f():
    """Check ``not_null_snps()`` equals the generic fixture without ``rs5``.

    ``self`` comes from the enclosing scope, which is outside this view.
    """
    loaded = SNPs("tests/input/generic.csv")

    expected = self.generic_snps()
    expected.drop("rs5", inplace=True)

    pd.testing.assert_frame_equal(
        loaded.not_null_snps(),
        expected,
        check_exact=True,
    )
def test__lookup_build_with_snp_pos_None(self):
    """``detect_build()`` is falsy for a lone rs3094315 placed at position 1."""
    empty = SNPs()
    empty._snps = self.create_snp_df(
        rsid=["rs3094315"],
        chrom=["1"],
        pos=[1],
        genotype=["AA"],
    )
    detected = empty.detect_build()
    self.assertFalse(detected)
def test_save_snps_vcf_discrepant_pos(self):
    """SNPs moved to positions 0 and 118 are reported as discrepant on VCF save.

    The saved VCF is then re-parsed and must match the generic VCF fixture
    with ``rs1`` and ``rs17`` dropped.
    """
    s = SNPs("tests/input/testvcf.vcf")

    resources = Resources()
    resources._reference_sequences["GRCh37"] = {}

    with tempfile.TemporaryDirectory() as tmpdir:
        # gzip the test FASTA into the scratch dir and register it for chrom 1
        fasta_gz = os.path.join(tmpdir, "generic.fa.gz")
        gzip_file("tests/input/generic.fa", fasta_gz)
        resources._reference_sequences["GRCh37"]["1"] = ReferenceSequence(
            ID="1", path=fasta_gz
        )

        # create discrepant SNPs by setting positions outside reference sequence
        s._snps.loc["rs1", "pos"] = 0
        s._snps.loc["rs17", "pos"] = 118

        saved_path = os.path.relpath(s.save(vcf=True))
        self.assertEqual(saved_path, f"output{os.sep}vcf_GRCh37.vcf")

        pd.testing.assert_frame_equal(
            s.discrepant_vcf_position,
            self.create_snp_df(
                rsid=["rs1", "rs17"],
                chrom=["1", "1"],
                pos=[0, 118],
                genotype=["AA", np.nan],
            ),
            check_exact=True,
        )

        remaining = self.generic_snps_vcf().drop(["rs1", "rs17"])
        self.run_parsing_tests_vcf("output/vcf_GRCh37.vcf", snps_df=remaining)
def test_merge_non_existent_file(self):
    """Merging a missing file followed by GRCh37 merges only the latter."""
    target = SNPs()
    to_merge = [
        SNPs("tests/input/non_existent_file.csv"),
        SNPs("tests/input/GRCh37.csv"),
    ]

    outcome = target.merge(to_merge)

    pd.testing.assert_frame_equal(
        target.snps, self.snps_GRCh37(), check_exact=True
    )
    # first entry produced no result; second reports a successful merge
    self.assert_results(outcome, [{}, {"merged": True}])
def test_merge_invalid_file(self):
    """Merging GRCh37 followed by an empty file merges only the former."""
    target = SNPs()
    to_merge = [
        SNPs("tests/input/GRCh37.csv"),
        SNPs("tests/input/empty.txt"),
    ]

    outcome = target.merge(to_merge)

    pd.testing.assert_frame_equal(
        target.snps, self.snps_GRCh37(), check_exact=True
    )
    # first entry reports a successful merge; second produced no result
    self.assert_results(outcome, [{"merged": True}, {}])
def test_merge_list(self):
    """Merging the same GRCh37 file twice records both sources and common rsids."""
    target = SNPs()
    to_merge = [SNPs("tests/input/GRCh37.csv"), SNPs("tests/input/GRCh37.csv")]

    outcome = target.merge(to_merge)

    pd.testing.assert_frame_equal(
        target.snps, self.snps_GRCh37(), check_exact=True
    )
    self.assertEqual(target.source, "generic, generic")
    self.assertListEqual(target._source, ["generic", "generic"])

    # the second merge shares every rsid with the first
    shared_rsids = pd.Index(
        ["rs3094315", "rs2500347", "rsIndelTest", "rs11928389"],
        name="rsid",
    )
    first_expected = {"merged": True}
    second_expected = {"merged": True, "common_rsids": shared_rsids}
    self.assert_results(outcome, [first_expected, second_expected])
def test_save_snps_csv_filename(self):
    """Save under an explicit CSV filename and re-parse the written file."""
    loaded = SNPs("tests/input/generic.csv")
    saved_path = os.path.relpath(loaded.save("generic.csv", sep=","))
    self.assertEqual(saved_path, f"output{os.sep}generic.csv")
    self.run_parsing_tests("output/generic.csv", "generic")
def test_save_snps_phased(self):
    """Save phased VCF input with default options and re-parse it as phased."""
    # read phased data
    s = SNPs("tests/input/testvcf_phased.vcf")

    # save phased data to TSV; os.path.relpath yields the platform separator,
    # so the expected path is built with os.sep instead of a literal "/"
    self.assertEqual(os.path.relpath(s.save()), f"output{os.sep}vcf_GRCh37.txt")

    # read saved TSV
    self.run_parsing_tests_vcf("output/vcf_GRCh37.txt", phased=True)
def setUp(self):
    """Build the SNPs fixtures used by the tests.

    Besides path-based fixtures, ``chromosomes.csv`` is also loaded from raw
    bytes, from zip-compressed bytes and from gzip-compressed bytes. The
    compressed variants are written next to the input file, read back and
    then deleted.
    """
    self.snps_GRCh38 = SNPs("tests/input/GRCh38.csv")
    self.snps = SNPs("tests/input/chromosomes.csv")
    self.snps_only_detect_source = SNPs(
        "tests/input/chromosomes.csv", only_detect_source=True
    )
    self.snps_none = SNPs(None)

    with open("tests/input/chromosomes.csv", "r") as f:
        self.snps_buffer = SNPs(f.read().encode("utf-8"))

    # zip-compressed buffer
    with atomic_write(
        "tests/input/chromosomes.csv.zip", mode="wb", overwrite=True
    ) as f:
        with zipfile.ZipFile(f, "w") as f_zip:
            f_zip.write("tests/input/chromosomes.csv", arcname="chromosomes.csv")
    try:
        with open("tests/input/chromosomes.csv.zip", "rb") as f:
            data = f.read()
        self.snps_buffer_zip = SNPs(data)
    finally:
        # always remove the scratch archive, even if reading/parsing raises
        os.remove("tests/input/chromosomes.csv.zip")

    # gzip-compressed buffer
    with open("tests/input/chromosomes.csv", "rb") as f_in:
        with atomic_write(
            "tests/input/chromosomes.csv.gz", mode="wb", overwrite=True
        ) as f_out:
            with gzip.open(f_out, "wb") as f_gzip:
                shutil.copyfileobj(f_in, f_gzip)
    try:
        with open("tests/input/chromosomes.csv.gz", "rb") as f:
            data = f.read()
        self.snps_buffer_gz = SNPs(data)
    finally:
        # always remove the scratch archive, even if reading/parsing raises
        os.remove("tests/input/chromosomes.csv.gz")
def test_save_snps_vcf_false_positive_build(self):
    """Re-parse a saved VCF whose ``##source`` line embeds a version hash.

    After saving, every line containing ``"snps v"`` is replaced with a
    fixed source line carrying the version ``1.2.3.post85.dev0+gb386302``,
    and the rewritten file must still pass the VCF parsing checks.
    """
    source_line = (
        '##source="vcf; snps v1.2.3.post85.dev0+gb386302; '
        'https://pypi.org/project/snps/"\n'
    )

    with tempfile.TemporaryDirectory() as tmpdir1:
        snps = SNPs("tests/input/testvcf.vcf", output_dir=tmpdir1)

        r = Resources()
        r._reference_sequences["GRCh37"] = {}

        output = os.path.join(tmpdir1, "vcf_GRCh37.vcf")

        with tempfile.TemporaryDirectory() as tmpdir2:
            # gzip the test FASTA and register it as chromosome 1
            dest = os.path.join(tmpdir2, "generic.fa.gz")
            gzip_file("tests/input/generic.fa", dest)
            seq = ReferenceSequence(ID="1", path=dest)
            r._reference_sequences["GRCh37"]["1"] = seq

            self.assertEqual(snps.save(vcf=True), output)

            # Rewrite the source header; join once instead of repeated "+=".
            with open(output, "r") as f:
                rewritten = "".join(
                    source_line if "snps v" in line else line for line in f
                )

            with open(output, "w") as f:
                f.write(rewritten)

            self.run_parsing_tests_vcf(output)
def test_remap_snps_invalid_assembly(self):
    """``remap_snps(-1)`` must leave the GRCh37 data untouched."""
    snps = SNPs("tests/input/GRCh37.csv")

    remapped, not_remapped = snps.remap_snps(-1)

    # build / assembly stay at their original values
    assert snps.build == 37
    assert snps.assembly == "GRCh37"
    # nothing remapped; both chromosomes reported as not remapped
    assert len(remapped) == 0
    assert len(not_remapped) == 2
def test_remap_37_to_37(self):
    """Remapping GRCh37 data to build 37 changes nothing."""
    snps = SNPs("tests/input/GRCh37.csv")

    remapped, not_remapped = snps.remap(37)

    self.assertEqual(snps.build, 37)
    self.assertEqual(snps.assembly, "GRCh37")
    self.assertEqual(len(remapped), 0)
    self.assertEqual(len(not_remapped), 2)
    pd.testing.assert_frame_equal(
        snps.snps,
        self.snps_GRCh37(),
        check_exact=True,
    )
def test_remap_snps_37_to_36(self):
    """``remap_snps(36)`` converts GRCh37 data to the NCBI36 fixture."""
    snps = SNPs("tests/input/GRCh37.csv")

    remapped, not_remapped = snps.remap_snps(36)

    assert snps.build == 36
    assert snps.assembly == "NCBI36"
    # both chromosomes were remapped, none left behind
    assert len(remapped) == 2
    assert len(not_remapped) == 0
    pd.testing.assert_frame_equal(snps.snps, self.snps_NCBI36())
def f():
    """Remap GRCh37 data to build 36 and compare with the NCBI36 fixture.

    ``self`` comes from the enclosing scope, which is outside this view.
    """
    snps = SNPs("tests/input/GRCh37.csv")

    remapped, not_remapped = snps.remap(36)

    self.assertEqual(snps.build, 36)
    self.assertEqual(snps.assembly, "NCBI36")
    # both chromosomes were remapped, none left behind
    self.assertEqual(len(remapped), 2)
    self.assertEqual(len(not_remapped), 0)
    pd.testing.assert_frame_equal(
        snps.snps,
        self.snps_NCBI36(),
        check_exact=True,
    )
def f():
    """Remap NCBI36 data (loaded with ``parallelize=True``) to build 37.

    ``self`` comes from the enclosing scope, which is outside this view.
    """
    snps = SNPs("tests/input/NCBI36.csv", parallelize=True)

    remapped, not_remapped = snps.remap(37)

    self.assertEqual(snps.build, 37)
    self.assertEqual(snps.assembly, "GRCh37")
    # both chromosomes were remapped, none left behind
    self.assertEqual(len(remapped), 2)
    self.assertEqual(len(not_remapped), 0)
    pd.testing.assert_frame_equal(
        snps.snps,
        self.snps_GRCh37(),
        check_exact=True,
    )
def test_remap_snps_37_to_37(self):
    """``remap_snps(37)`` on GRCh37 data changes nothing."""
    snps = SNPs("tests/input/GRCh37.csv")

    remapped, not_remapped = snps.remap_snps(37)

    assert snps.build == 37
    assert snps.assembly == "GRCh37"
    assert len(remapped) == 0
    assert len(not_remapped) == 2
    pd.testing.assert_frame_equal(snps.snps, self.snps_GRCh37())
def test_remap_snps_36_to_37_multiprocessing(self):
    """``remap_snps(37)`` on NCBI36 data loaded with ``parallelize=True``."""
    snps = SNPs("tests/input/NCBI36.csv", parallelize=True)

    remapped, not_remapped = snps.remap_snps(37)

    assert snps.build == 37
    assert snps.assembly == "GRCh37"
    # both chromosomes were remapped, none left behind
    assert len(remapped) == 2
    assert len(not_remapped) == 0
    pd.testing.assert_frame_equal(snps.snps, self.snps_GRCh37())
def test_save_snps_tsv_filename(self):
    """Save under an explicit TSV filename into a temp dir and re-parse it."""
    with tempfile.TemporaryDirectory() as tmpdir:
        loaded = SNPs("tests/input/generic.tsv", output_dir=tmpdir)
        expected_path = os.path.join(tmpdir, "generic.tsv")

        saved_path = loaded.save("generic.tsv", sep="\t")

        self.assertEqual(saved_path, expected_path)
        self.run_parsing_tests(expected_path, "generic")
def test_save_source(self):
    """A saved GRCh38 file reloads with build 38 and source ``generic``."""
    original = SNPs("tests/input/GRCh38.csv")
    saved_path = os.path.relpath(original.save())
    self.assertEqual(saved_path, f"output{os.sep}generic_GRCh38.txt")

    reloaded = SNPs("output/generic_GRCh38.txt")

    self.assertEqual(reloaded.build, 38)
    self.assertTrue(reloaded.build_detected)
    self.assertEqual(reloaded.source, "generic")
    self.assertListEqual(reloaded._source, ["generic"])
    pd.testing.assert_frame_equal(
        reloaded.snps,
        self.snps_GRCh38(),
        check_exact=True,
    )
def test_save_snps_phased(self):
    """Save phased VCF input into a temp dir and re-parse it as phased."""
    with tempfile.TemporaryDirectory() as tmpdir:
        # load the phased input, directing output to the scratch directory
        phased = SNPs("tests/input/testvcf_phased.vcf", output_dir=tmpdir)
        expected_path = os.path.join(tmpdir, "vcf_GRCh37.txt")

        # write the phased data out as TSV
        saved_path = phased.save()
        self.assertEqual(saved_path, expected_path)

        # parse the file that was just written
        self.run_parsing_tests_vcf(expected_path, phased=True)
def test_save_snps_csv_phased(self):
    """Save phased VCF input to CSV and check it reloads as phased."""
    # read phased data
    s = SNPs("tests/input/testvcf_phased.vcf")

    # save phased data to CSV; os.path.relpath yields the platform separator,
    # so the expected path is built with os.sep instead of a literal "/"
    assert os.path.relpath(s.save_snps()) == f"output{os.sep}vcf_GRCh37.csv"

    # read saved CSV
    s = SNPs("output/vcf_GRCh37.csv")
    assert s.phased
    pd.testing.assert_frame_equal(s.snps, self.generic_snps_vcf())
def f():
    """Save the generic CSV into a temp dir with ``sep=","`` and re-parse it.

    ``self`` comes from the enclosing scope, which is outside this view.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        loaded = SNPs("tests/input/generic.csv", output_dir=tmpdir)
        expected_path = os.path.join(tmpdir, "generic_GRCh37.csv")

        saved_path = loaded.save_snps(sep=",")

        self.assertEqual(saved_path, expected_path)
        self.run_parsing_tests(expected_path, "generic")
def test_merge_exceed_discrepant_genotypes_threshold(self):
    """A merge with threshold 0 and one differing genotype leaves data unchanged."""
    base = SNPs("tests/input/generic.csv")
    other = SNPs("tests/input/generic.csv")
    # introduce a single genotype difference between the two objects
    other._snps.loc["rs1", "genotype"] = "CC"

    outcome = base.merge([other], discrepant_genotypes_threshold=0)

    # no discrepancies recorded on the target
    self.assertEqual(len(base.discrepant_merge_positions), 0)
    self.assertEqual(len(base.discrepant_merge_genotypes), 0)
    self.assertEqual(len(base.discrepant_merge_positions_genotypes), 0)
    pd.testing.assert_frame_equal(
        base.snps,
        self.generic_snps(),
        check_exact=True,
    )
    self.assert_results(outcome, [{}])
def test_heterozygous_snps(self):
    """``heterozygous_snps()`` returns rs6, rs7 and rs8 from the generic file."""
    loaded = SNPs("tests/input/generic.csv")

    actual = loaded.heterozygous_snps()
    expected = self.create_snp_df(
        rsid=["rs6", "rs7", "rs8"],
        chrom=["1", "1", "1"],
        pos=[106, 107, 108],
        genotype=["GC", "TC", "AT"],
    )

    pd.testing.assert_frame_equal(actual, expected)
def f():
    """Merge GRCh37 into NCBI36 without remapping and check discrepant SNPs.

    ``self`` comes from the enclosing scope, which is outside this view.
    """
    target = SNPs("tests/input/NCBI36.csv")
    outcome = target.merge([SNPs("tests/input/GRCh37.csv")], remap=False)

    discrepant = target.discrepant_snps
    self.assertEqual(len(discrepant), 4)

    # the discrepant index must match the rsids reported by the merge result
    pd.testing.assert_index_equal(
        discrepant.index,
        outcome[0]["discrepant_position_rsids"],
        check_exact=True,
        check_names=True,
    )
def test_homozygous_chrom(self):
    """``homozygous("1")`` returns rs1 through rs4 from the generic file."""
    loaded = SNPs("tests/input/generic.csv")

    actual = loaded.homozygous("1")
    expected = self.create_snp_df(
        rsid=["rs1", "rs2", "rs3", "rs4"],
        chrom=["1", "1", "1", "1"],
        pos=[101, 102, 103, 104],
        genotype=["AA", "CC", "GG", "TT"],
    )

    pd.testing.assert_frame_equal(actual, expected, check_exact=True)