def test_merge_chrom(self):
    # Merge restricted to chrom="Y": differences on other chromosomes must be
    # ignored, while Y discrepancies are detected and reported.
    s1 = SNPs("tests/input/generic.csv")

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat(
        [
            s1.snps,
            self.create_snp_df(
                rsid=["rs100", "rs101", "rs102", "rs103"],
                chrom=["Y", "Y", "Y", "Y"],
                pos=[100, 101, 102, 103],
                genotype=["A", np.nan, "A", "A"],
            ),
        ]
    )

    s1._snps = df.copy()
    s2 = SNPs()
    s2._build = 37
    s2._snps = df.copy()

    # set values for chrom that will be ignored (that would otherwise result in
    # identification of discrepant SNPs or updating genotype)
    s2._snps.loc["rs3", "pos"] = 1003  # discrepant position
    s2._snps.loc["rs4", "genotype"] = "AA"  # discrepant genotype
    s2._snps.loc["rs5", "genotype"] = "AA"

    # set values for chrom to be merged
    s2._snps.loc["rs100", "genotype"] = "T"  # discrepant genotype
    s2._snps.loc["rs101", "genotype"] = "A"
    s2._snps.loc["rs102", "pos"] = 1002  # discrepant position

    # set expected values for merge result
    df.loc["rs100", "genotype"] = np.nan  # discrepant genotype sets to np.nan
    df.loc["rs101", "genotype"] = "A"  # updates np.nan

    results = s1.merge([s2], chrom="Y")

    pd.testing.assert_frame_equal(s1.snps, df, check_exact=True)

    self.assert_results(
        results,
        [
            {
                "merged": True,
                "common_rsids": pd.Index(
                    ["rs100", "rs101", "rs102", "rs103"], name="rsid"
                ),
                "discrepant_position_rsids": pd.Index(["rs102"], name="rsid"),
                "discrepant_genotype_rsids": pd.Index(["rs100"], name="rsid"),
            }
        ],
    )

    # exactly one discrepant position (rs102) and one discrepant genotype (rs100)
    self.assertEqual(len(s1.discrepant_merge_positions), 1)
    self.assertEqual(len(s1.discrepant_merge_genotypes), 1)
def test_appending_dfs(self):
    # Merging an object with itself is expected to leave two identical rows in
    # the `duplicate` and `discrepant_XY` frames, while `heterozygous_MT` and
    # `discrepant_vcf_position` stay empty.
    def single_snp():
        # a fresh one-row frame on every call so the attributes do not share data
        return self.create_snp_df(rsid=["rs1"], chrom=["1"], pos=[1], genotype=["AA"])

    snps_obj = SNPs()
    for attr in ("_snps", "_duplicate", "_discrepant_XY"):
        setattr(snps_obj, attr, single_snp())

    snps_obj.merge([snps_obj])

    doubled = self.create_snp_df(
        rsid=["rs1", "rs1"],
        chrom=["1", "1"],
        pos=[1, 1],
        genotype=["AA", "AA"],
    )
    for prop in ("duplicate", "discrepant_XY"):
        pd.testing.assert_frame_equal(
            getattr(snps_obj, prop), doubled, check_exact=True
        )

    for prop in ("heterozygous_MT", "discrepant_vcf_position"):
        pd.testing.assert_frame_equal(
            getattr(snps_obj, prop), get_empty_snps_dataframe(), check_exact=True
        )
def test__lookup_build_with_snp_pos_None(self):
    # A lone rs3094315 placed at position 1 is expected to give a falsy result
    # from detect_build().
    snps_obj = SNPs()
    snps_obj._snps = self.create_snp_df(
        rsid=["rs3094315"],
        chrom=["1"],
        pos=[1],
        genotype=["AA"],
    )
    detected = snps_obj.detect_build()
    self.assertFalse(detected)
def simulate_snps(
    self,
    chrom="1",
    pos_start=1,
    pos_max=248140902,
    pos_step=100,
    genotype="AA",
    insert_nulls=True,
    null_snp_step=101,
    complement_genotype_one_chrom=False,
    complement_genotype_two_chroms=False,
    complement_snp_step=50,
):
    """Build a ``SNPs`` object (``_build`` 37) holding synthetic SNPs.

    Positions are ``np.arange(pos_start, pos_max, pos_step)`` as ``uint32`` and
    rsids are numbered ``rs1``, ``rs2``, ... in position order.

    Parameters
    ----------
    chrom : value stored in the ``chrom`` column of every row
    pos_start, pos_max, pos_step : arguments of the position range
        (``pos_max`` is exclusive)
    genotype : value stored in the ``genotype`` column of every row
    insert_nulls : if true, every ``null_snp_step``-th row (starting with the
        first) gets a ``np.nan`` genotype
    null_snp_step : row stride used for the null genotypes
    complement_genotype_one_chrom : if true (and the two-chrom flag is false),
        ``self.complement_one_chrom`` is applied to the selected genotypes
    complement_genotype_two_chroms : if true, ``self.complement_two_chroms`` is
        applied to the selected genotypes; takes priority over the one-chrom flag
    complement_snp_step : row stride (starting with the first row) selecting the
        genotypes that are complemented

    Returns
    -------
    SNPs
        object whose ``_snps`` is the simulated frame
    """
    positions = np.arange(pos_start, pos_max, pos_step, dtype=np.uint32)
    rsids = pd.Index(
        [f"rs{number}" for number in range(1, len(positions) + 1)], name="rsid"
    )

    frame = pd.DataFrame({"chrom": chrom}, index=rsids)
    frame["pos"] = positions
    frame["genotype"] = genotype

    if insert_nulls:
        null_rows = frame.index[0::null_snp_step]
        frame.loc[null_rows, "genotype"] = np.nan

    # computed unconditionally, as the stride is validated even when unused
    complement_rows = frame.index[0::complement_snp_step]

    if complement_genotype_two_chroms:
        complement = self.complement_two_chroms
    elif complement_genotype_one_chrom:
        complement = self.complement_one_chrom
    else:
        complement = None

    if complement is not None:
        # nulls were inserted first, so the complement function also sees them
        frame.loc[complement_rows, "genotype"] = frame.loc[
            complement_rows, "genotype"
        ].apply(complement)

    simulated = SNPs()
    simulated._build = 37
    simulated._snps = frame
    return simulated
def test_save_snps_vcf_discrepant_pos(self):
    # Saving as VCF with two SNP positions changed to 0 and 118 is expected to
    # report those SNPs in `discrepant_vcf_position` and leave them out of the
    # written file.
    with tempfile.TemporaryDirectory() as tmpdir1:
        snps_obj = SNPs("tests/input/testvcf.vcf", output_dir=tmpdir1)

        resources = Resources()
        resources._reference_sequences["GRCh37"] = {}

        expected_path = os.path.join(tmpdir1, "vcf_GRCh37.vcf")

        with tempfile.TemporaryDirectory() as tmpdir2:
            gz_path = os.path.join(tmpdir2, "generic.fa.gz")
            gzip_file("tests/input/generic.fa", gz_path)

            resources._reference_sequences["GRCh37"]["1"] = ReferenceSequence(
                ID="1", path=gz_path
            )

            # create discrepant SNPs by setting positions outside reference sequence
            for rsid, new_pos in (("rs1", 0), ("rs17", 118)):
                snps_obj._snps.loc[rsid, "pos"] = new_pos

            # ensure this is the right type after manual tweaking
            snps_obj._snps = snps_obj._snps.astype({"pos": np.uint32})

            self.assertEqual(snps_obj.save(vcf=True), expected_path)

            expected_discrepant = self.create_snp_df(
                rsid=["rs1", "rs17"],
                chrom=["1", "1"],
                pos=[0, 118],
                genotype=["AA", np.nan],
            )
            pd.testing.assert_frame_equal(
                snps_obj.discrepant_vcf_position,
                expected_discrepant,
                check_exact=True,
            )

            # the saved file should hold every generic SNP except the two above
            remaining = self.generic_snps_vcf().drop(["rs1", "rs17"])
            self.run_parsing_tests_vcf(expected_path, snps_df=remaining)
def test_merging_files_discrepant_snps(self):
    # Split the discrepant_snps.csv fixture into two per-file CSVs, merge them,
    # and compare the merge outcome with the expectations stored in the fixture.
    with tempfile.TemporaryDirectory() as tmpdir:
        dest1 = os.path.join(tmpdir, "discrepant_snps1.csv")
        dest2 = os.path.join(tmpdir, "discrepant_snps2.csv")

        df = pd.read_csv(
            "tests/input/discrepant_snps.csv",
            skiprows=1,
            na_values="--",
            names=[
                "rsid",
                "chrom",
                "pos_file1",
                "pos_file2",
                "genotype_file1",
                "genotype_file2",
                "discrepant_position",
                "discrepant_genotype",
                "expected_position",
                "expected_genotype",
            ],
            index_col=0,
            dtype={
                "chrom": object,
                "pos_file1": np.uint32,
                "pos_file2": np.uint32,
                "discrepant_position": bool,
                "discrepant_genotype": bool,
                "expected_position": np.uint32,
            },
        )

        # one input file per "fileN" column pair
        df1 = df[["chrom", "pos_file1", "genotype_file1"]]
        df2 = df[["chrom", "pos_file2", "genotype_file2"]]

        df1.to_csv(dest1, na_rep="--", header=["chromosome", "position", "genotype"])
        df2.to_csv(dest2, na_rep="--", header=["chromosome", "position", "genotype"])

        s = SNPs()
        s.merge([SNPs(dest1), SNPs(dest2)])

        expected = df[
            [
                "chrom",
                "discrepant_position",
                "discrepant_genotype",
                "expected_position",
                "expected_genotype",
            ]
        ]
        expected = expected.rename(
            columns={"expected_position": "pos", "expected_genotype": "genotype"}
        )

        # sort the expectations through a SNPs object so the row order is
        # comparable with the merged result
        expected_snps = SNPs()
        expected_snps._snps = expected
        expected_snps.sort()
        expected = expected_snps.snps

        # the flag columns are read as bool, so they are used directly as masks
        # rather than being compared with `== True`
        pd.testing.assert_index_equal(
            s.discrepant_merge_positions.index,
            expected.loc[expected["discrepant_position"]].index,
        )
        pd.testing.assert_index_equal(
            s.discrepant_merge_genotypes.index,
            expected.loc[expected["discrepant_genotype"]].index,
        )

        pd.testing.assert_series_equal(s.snps["pos"], expected["pos"])
        pd.testing.assert_series_equal(s.snps["genotype"], expected["genotype"])
def test__lookup_build_with_snp_pos_None(self):
    # With the frame returned by snps_discrepant_pos(), detect_build() is
    # expected to give a falsy result.
    snps_obj = SNPs()
    snps_obj._snps = self.snps_discrepant_pos()
    build_detected = snps_obj.detect_build()
    assert not build_detected
def test_merging_files_discrepant_snps(self):
    # Split the discrepant_snps.csv fixture into two per-file CSVs, load both
    # through SNPsCollection, and compare the outcome with the expectations
    # stored in the fixture.
    df = pd.read_csv(
        "tests/input/discrepant_snps.csv",
        skiprows=1,
        na_values="--",
        names=[
            "rsid",
            "chrom",
            "pos_file1",
            "pos_file2",
            "genotype_file1",
            "genotype_file2",
            "discrepant_position",
            "discrepant_genotype",
            "expected_position",
            "expected_genotype",
        ],
        index_col=0,
        dtype={
            "chrom": object,
            "pos_file1": np.int64,
            "pos_file2": np.int64,
            "discrepant_position": bool,
            "discrepant_genotype": bool,
        },
    )

    # one input file per "fileN" column pair
    df1 = df[["chrom", "pos_file1", "genotype_file1"]]
    df2 = df[["chrom", "pos_file2", "genotype_file2"]]

    # NOTE(review): these generated CSVs are written into tests/input and are
    # not removed by this test - consider a temporary directory if no other
    # test relies on them.
    df1.to_csv(
        "tests/input/discrepant_snps1.csv",
        na_rep="--",
        header=["chromosome", "position", "genotype"],
    )
    df2.to_csv(
        "tests/input/discrepant_snps2.csv",
        na_rep="--",
        header=["chromosome", "position", "genotype"],
    )

    sc = SNPsCollection(
        ["tests/input/discrepant_snps1.csv", "tests/input/discrepant_snps2.csv"]
    )

    expected = df[
        [
            "chrom",
            "discrepant_position",
            "discrepant_genotype",
            "expected_position",
            "expected_genotype",
        ]
    ]
    expected = expected.rename(
        columns={"expected_position": "pos", "expected_genotype": "genotype"}
    )

    # sort the expectations through a SNPs object so the row order is
    # comparable with the collection's result
    expected_snps = SNPs()
    expected_snps._snps = expected
    expected_snps.sort_snps()
    expected = expected_snps.snps

    # the flag columns are read as bool, so they are used directly as masks
    # rather than being compared with `== True`
    pd.testing.assert_index_equal(
        sc.discrepant_positions.index,
        expected.loc[expected["discrepant_position"]].index,
    )
    pd.testing.assert_index_equal(
        sc.discrepant_genotypes.index,
        expected.loc[expected["discrepant_genotype"]].index,
    )

    pd.testing.assert_series_equal(sc.snps["pos"], expected["pos"])
    pd.testing.assert_series_equal(sc.snps["genotype"], expected["genotype"])