def test_reference_sequence_generic_load_sequence(self):
    # Gzip the generic FASTA fixture into a scratch directory, load it through
    # ReferenceSequence, and check every attribute the object exposes.
    expected_text = "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGCCGGACNNNNNNNN"
    with tempfile.TemporaryDirectory() as scratch:
        fasta_gz = os.path.join(scratch, "generic.fa.gz")
        gzip_file("tests/input/generic.fa", fasta_gz)

        ref = ReferenceSequence(ID="1", path=fasta_gz)

        # identifying attributes
        self.assertEqual(ref.ID, "1")
        self.assertEqual(ref.chrom, "1")
        self.assertEqual(ref.path, fasta_gz)

        # the sequence is compared as an array of uint8 character codes
        expected_codes = np.array(
            bytearray(expected_text, encoding="utf-8", errors="strict"),
            dtype=np.uint8,
        )
        np.testing.assert_array_equal(ref.sequence, expected_codes)

        # the only non-N bases sit at 0-based indices 100..108
        non_n_bases = [chr(code) for code in ref.sequence[100:109]]
        self.assertListEqual(list("AGGCCGGAC"), non_n_bases)

        # checksum and coordinates
        self.assertEqual(ref.md5, "6ac6176535ad0e38aba2d05d786c39b6")
        self.assertEqual(ref.start, 1)
        self.assertEqual(ref.end, 117)
        self.assertEqual(ref.length, 117)
def test_save_snps_vcf_discrepant_pos(self):
    # Save a VCF after moving two SNPs outside the reference sequence; those
    # SNPs must be reported as discrepant and be absent from the written file.
    snps = SNPs("tests/input/testvcf.vcf")

    resources = Resources()
    resources._reference_sequences["GRCh37"] = {}

    with tempfile.TemporaryDirectory() as scratch:
        fasta_gz = os.path.join(scratch, "generic.fa.gz")
        gzip_file("tests/input/generic.fa", fasta_gz)
        ref = ReferenceSequence(ID="1", path=fasta_gz)
        resources._reference_sequences["GRCh37"]["1"] = ref

        # create discrepant SNPs by setting positions outside reference sequence
        snps._snps.loc["rs1", "pos"] = 0
        snps._snps.loc["rs17", "pos"] = 118

        saved_path = os.path.relpath(snps.save(vcf=True))
        self.assertEqual(saved_path, f"output{os.sep}vcf_GRCh37.vcf")

    discrepant = snps.discrepant_vcf_position
    expected_discrepant = self.create_snp_df(
        rsid=["rs1", "rs17"],
        chrom=["1", "1"],
        pos=[0, 118],
        genotype=["AA", np.nan],
    )
    pd.testing.assert_frame_equal(discrepant, expected_discrepant, check_exact=True)

    # the saved VCF should hold everything except the two discrepant SNPs
    remaining = self.generic_snps_vcf().drop(["rs1", "rs17"])
    self.run_parsing_tests_vcf("output/vcf_GRCh37.vcf", snps_df=remaining)
def test_save_snps_vcf_false_positive_build(self):
    # Save a VCF, replace its "snps v..." source header with a fixed version
    # string, and check that the file still parses with the default build
    # expectations.
    # NOTE(review): presumably the "b38" inside the commit hash of the
    # replacement string is what could be mistaken for a build -- confirm.
    replacement = '##source="vcf; snps v1.2.3.post85.dev0+gb386302; https://pypi.org/project/snps/"\n'

    with tempfile.TemporaryDirectory() as tmpdir1:
        snps = SNPs("tests/input/testvcf.vcf", output_dir=tmpdir1)

        r = Resources()
        r._reference_sequences["GRCh37"] = {}

        output = os.path.join(tmpdir1, "vcf_GRCh37.vcf")
        with tempfile.TemporaryDirectory() as tmpdir2:
            dest = os.path.join(tmpdir2, "generic.fa.gz")
            gzip_file("tests/input/generic.fa", dest)

            seq = ReferenceSequence(ID="1", path=dest)
            r._reference_sequences["GRCh37"]["1"] = seq

            self.assertEqual(snps.save(vcf=True), output)

        # Swap out every line mentioning "snps v"; iterate the file directly
        # and collect lines in a list instead of growing a string with +=.
        with open(output, "r") as f:
            lines = [replacement if "snps v" in line else line for line in f]

        with open(output, "w") as f:
            f.writelines(lines)

        self.run_parsing_tests_vcf(output)
def test_read_ftdna_concat_gzip_extra_data(self):
    # https://www.familytreedna.com
    # Build a file made of two concatenated gzip members followed by trailing
    # non-gzip bytes, then check that parsing it (from a path and from bytes)
    # yields the SNPs of both members.
    total_snps1 = 10
    total_snps2 = 10
    total_snps = total_snps1 + total_snps2

    header = "RSID,CHROMOSOME,POSITION,RESULT\r\n"

    def csv_content(first, count):
        # SNP number n is named rs<n> and sits at position 100 + n, so the
        # second file simply continues where the first one stopped
        rows = (
            f'"rs{n}","1","{100 + n}","AA"\r\n'
            for n in range(first, first + count)
        )
        return header + "".join(rows)

    # generate content of first and second file
    s1 = csv_content(1, total_snps1)
    s2 = csv_content(total_snps1 + 1, total_snps2)

    snps_df = self.create_snp_df(
        rsid=[f"rs{n}" for n in range(1, total_snps + 1)],
        chrom="1",
        pos=[100 + n for n in range(1, total_snps + 1)],
        genotype="AA",
    )

    with tempfile.TemporaryDirectory() as tmpdir:
        file1 = os.path.join(tmpdir, "ftdna_concat_gzip1.csv")
        file1_gz = f"{file1}.gz"
        file2 = os.path.join(tmpdir, "ftdna_concat_gzip2.csv")
        file2_gz = f"{file2}.gz"
        path = os.path.join(tmpdir, "ftdna_concat_gzip.csv.gz")

        # write and compress the individual files
        for csv_path, gz_path, content in (
            (file1, file1_gz, s1),
            (file2, file2_gz, s2),
        ):
            with open(csv_path, "w") as f:
                f.write(content)
            gzip_file(csv_path, gz_path)

        # concatenate gzips and add extra data
        chunks = []
        for gz_path in (file1_gz, file2_gz):
            with open(gz_path, "rb") as f:
                chunks.append(f.read())
        chunks.append(b"extra data")

        # write file with concatenated gzips and extra data
        with open(path, "wb") as f:
            f.write(b"".join(chunks))

        self.make_parsing_assertions(
            self.parse_file(path), "FTDNA", False, 37, False, snps_df
        )
        self.make_parsing_assertions(
            self.parse_bytes(path), "FTDNA", False, 37, False, snps_df
        )
def _setup_gsa_test(resources_dir):
    """Point ``Resources`` at ``resources_dir`` and stage gzipped GSA fixtures there.

    The GSA resource cache on the ``Resources`` object is cleared first, and
    the rsid and chrpos map fixtures are gzipped into ``resources_dir``.
    """
    # reset resource if already loaded
    resources = Resources()
    resources._resources_dir = resources_dir
    resources._gsa_resources = {}

    # (fixture source, gzipped file name inside resources_dir)
    fixtures = (
        ("tests/resources/gsa_rsid_map.txt", "gsa_rsid_map.txt.gz"),
        ("tests/resources/gsa_chrpos_map.txt", "gsa_chrpos_map.txt.gz"),
    )
    for source_path, gz_name in fixtures:
        gzip_file(source_path, os.path.join(resources_dir, gz_name))
def run_parsing_tests(self, file, source, phased=False, build=37, build_detected=False, snps_df=None):
    """Run the parsing assertions on ``file`` and on compressed copies of it.

    The file is checked as given, as a ``.gz`` copy, as that gzip copy with
    the extension removed, as a ``.zip`` copy, and as that zip copy with the
    extension removed. Copies that keep their extension are parsed both via
    ``parse_file`` and ``parse_bytes``; the extension-less copies are parsed
    via ``parse_file`` only. ``source``, ``phased``, ``build``,
    ``build_detected`` and ``snps_df`` are forwarded unchanged to
    ``make_parsing_assertions``.
    """
    expectations = (source, phased, build, build_detected, snps_df)

    def check(path, *parsers):
        # apply each parser in turn and assert on what it returns
        for parse in parsers:
            self.make_parsing_assertions(parse(path), *expectations)

    check(file, self.parse_file, self.parse_bytes)

    with tempfile.TemporaryDirectory() as tmpdir:
        base = os.path.basename(file)

        gz_path = os.path.join(tmpdir, f"{base}.gz")
        gzip_file(file, gz_path)
        check(gz_path, self.parse_file, self.parse_bytes)

        # remove .gz extension
        gz_without_ext = gz_path[:-3]
        shutil.move(gz_path, gz_without_ext)
        check(gz_without_ext, self.parse_file)

        zip_path = os.path.join(tmpdir, f"{base}.zip")
        zip_file(file, zip_path, base)
        check(zip_path, self.parse_file, self.parse_bytes)

        # remove .zip extension
        zip_without_ext = zip_path[:-4]
        shutil.move(zip_path, zip_without_ext)
        check(zip_without_ext, self.parse_file)
def test_save_snps_vcf(self):
    # Save the test VCF against the generic reference sequence and re-parse
    # the file that was written.
    snps = SNPs("tests/input/testvcf.vcf")

    resources = Resources()
    resources._reference_sequences["GRCh37"] = {}

    with tempfile.TemporaryDirectory() as scratch:
        fasta_gz = os.path.join(scratch, "generic.fa.gz")
        gzip_file("tests/input/generic.fa", fasta_gz)
        ref = ReferenceSequence(ID="1", path=fasta_gz)
        resources._reference_sequences["GRCh37"]["1"] = ref

        saved_path = os.path.relpath(snps.save(vcf=True))
        self.assertEqual(saved_path, f"output{os.sep}vcf_GRCh37.vcf")

    self.run_parsing_tests_vcf("output/vcf_GRCh37.vcf")
def test_save_snps_vcf(self):
    # Save the test VCF into a temporary output directory, using the generic
    # reference sequence, and re-parse the file that was written.
    with tempfile.TemporaryDirectory() as out_dir:
        snps = SNPs("tests/input/testvcf.vcf", output_dir=out_dir)

        resources = Resources()
        resources._reference_sequences["GRCh37"] = {}

        expected_path = os.path.join(out_dir, "vcf_GRCh37.vcf")

        with tempfile.TemporaryDirectory() as fasta_dir:
            fasta_gz = os.path.join(fasta_dir, "generic.fa.gz")
            gzip_file("tests/input/generic.fa", fasta_gz)
            ref = ReferenceSequence(ID="1", path=fasta_gz)
            resources._reference_sequences["GRCh37"]["1"] = ref

            self.assertEqual(snps.save(vcf=True), expected_path)

        self.run_parsing_tests_vcf(expected_path)
def test_save_snps_vcf_phased(self):
    # read phased data
    phased_snps = SNPs("tests/input/testvcf_phased.vcf")

    # setup resource to use test FASTA reference sequence
    resources = Resources()
    resources._reference_sequences["GRCh37"] = {}

    with tempfile.TemporaryDirectory() as scratch:
        fasta_gz = os.path.join(scratch, "generic.fa.gz")
        gzip_file("tests/input/generic.fa", fasta_gz)
        ref = ReferenceSequence(ID="1", path=fasta_gz)
        resources._reference_sequences["GRCh37"]["1"] = ref

        # save phased data to VCF
        saved_path = os.path.relpath(phased_snps.save(vcf=True))
        self.assertEqual(saved_path, f"output{os.sep}vcf_GRCh37.vcf")

    # read saved VCF
    self.run_parsing_tests_vcf("output/vcf_GRCh37.vcf", phased=True)
def test_save_snps_vcf_discrepant_pos(self):
    # Save a VCF into a temporary output directory after moving two SNPs
    # outside the reference sequence; those SNPs must be reported as
    # discrepant and be absent from the written file.
    with tempfile.TemporaryDirectory() as out_dir:
        snps = SNPs("tests/input/testvcf.vcf", output_dir=out_dir)

        resources = Resources()
        resources._reference_sequences["GRCh37"] = {}

        expected_path = os.path.join(out_dir, "vcf_GRCh37.vcf")

        with tempfile.TemporaryDirectory() as fasta_dir:
            fasta_gz = os.path.join(fasta_dir, "generic.fa.gz")
            gzip_file("tests/input/generic.fa", fasta_gz)
            ref = ReferenceSequence(ID="1", path=fasta_gz)
            resources._reference_sequences["GRCh37"]["1"] = ref

            # create discrepant SNPs by setting positions outside reference sequence
            snps._snps.loc["rs1", "pos"] = 0
            snps._snps.loc["rs17", "pos"] = 118

            # ensure this is the right type after manual tweaking
            snps._snps = snps._snps.astype({"pos": np.uint32})

            self.assertEqual(snps.save(vcf=True), expected_path)

        discrepant = snps.discrepant_vcf_position
        expected_discrepant = self.create_snp_df(
            rsid=["rs1", "rs17"],
            chrom=["1", "1"],
            pos=[0, 118],
            genotype=["AA", np.nan],
        )
        pd.testing.assert_frame_equal(
            discrepant, expected_discrepant, check_exact=True
        )

        # the saved VCF should hold everything except the two discrepant SNPs
        remaining = self.generic_snps_vcf().drop(["rs1", "rs17"])
        self.run_parsing_tests_vcf(expected_path, snps_df=remaining)
def _setup_gsa_test(resources_dir):
    """Point ``Resources`` at ``resources_dir`` and stage gzipped GSA fixtures there.

    The resource attributes on the ``Resources`` object are re-initialised
    first, then the rsid map, chrpos map and dbSNP reverse fixtures are
    gzipped into ``resources_dir``.
    """
    # reset resource if already loaded
    resources = Resources()
    resources._resources_dir = resources_dir
    resources._init_resource_attributes()

    # (fixture source, gzipped file name inside resources_dir)
    fixtures = (
        ("tests/resources/gsa_rsid_map.txt", "gsa_rsid_map.txt.gz"),
        ("tests/resources/gsa_chrpos_map.txt", "gsa_chrpos_map.txt.gz"),
        ("tests/resources/dbsnp_151_37_reverse.txt", "dbsnp_151_37_reverse.txt.gz"),
    )
    for source_path, gz_name in fixtures:
        gzip_file(source_path, os.path.join(resources_dir, gz_name))
def run_parsing_tests_vcf(
    self,
    file,
    source="vcf",
    phased=False,
    unannotated=False,
    rsids=(),
    build=37,
    build_detected=False,
    snps_df=None,
):
    """Run the VCF parsing assertions on ``file`` and on gzipped copies of it.

    The file is checked as given, as a ``.gz`` copy, and as that gzip copy
    with the extension removed. The first two are parsed both via
    ``parse_file`` and ``parse_bytes``; the extension-less copy is parsed via
    ``parse_file`` only. ``rsids`` is handed to the parser, and it is also
    forwarded, with every other keyword, to ``make_parsing_assertions_vcf``.
    """
    # https://samtools.github.io/hts-specs/VCFv4.2.pdf
    # this tests for homozygous snps, heterozygous snps, multiallelic snps,
    # phased snps, and snps with missing rsID
    expectations = (
        source,
        phased,
        unannotated,
        rsids,
        build,
        build_detected,
        snps_df,
    )

    def check(path, *parsers):
        # apply each parser in turn and assert on what it returns
        for parse in parsers:
            self.make_parsing_assertions_vcf(parse(path, rsids), *expectations)

    check(file, self.parse_file, self.parse_bytes)

    with tempfile.TemporaryDirectory() as tmpdir:
        base = os.path.basename(file)

        gz_path = os.path.join(tmpdir, f"{base}.gz")
        gzip_file(file, gz_path)
        check(gz_path, self.parse_file, self.parse_bytes)

        # remove .gz extension
        gz_without_ext = gz_path[:-3]
        shutil.move(gz_path, gz_without_ext)
        check(gz_without_ext, self.parse_file)