def test_read_var_length_filters(tmp_path):
    """Ingest a VCF with variable-length FILTER lists and verify they round-trip.

    NOTE(review): this name is duplicated later in the file; the later
    definition shadows this one under pytest collection.
    """
    uri = os.path.join(tmp_path, 'dataset')
    ds = tiledbvcf.Dataset(uri, mode='w')
    samples = [
        os.path.join(TESTS_INPUT_DIR, s) for s in ['varLenFilter.vcf.gz']
    ]
    ds.ingest_samples(samples)

    ds = tiledbvcf.Dataset(uri, mode='r')
    df = ds.read(['pos_start', 'filters'])
    expected_df = pd.DataFrame({
        'pos_start':
        pd.Series([
            12141, 12546, 13354, 13375, 13396, 13414, 13452, 13520, 13545,
            17319, 17480
        ], dtype=np.int32),
        'filters':
        pd.Series(
            # `np.object` was deprecated in NumPy 1.20 and removed in 1.24;
            # the builtin `object` is the equivalent dtype.
            map(lambda lst: np.array(lst, dtype=object),
                [['PASS'], ['PASS'], ['ANEUPLOID', 'LowQual'], ['PASS'],
                 ['PASS'], ['ANEUPLOID', 'LOWQ', 'LowQual'], ['PASS'],
                 ['PASS'], ['PASS'], ['LowQual'], ['PASS']]))
    })
    _check_dfs(expected_df, df)
def test_read_multiple_alleles(tmp_path):
    """Ingest two samples and verify multi-allele, id, and filter columns."""
    uri = os.path.join(tmp_path, "dataset")
    ds = tiledbvcf.Dataset(uri, mode="w")
    samples = [os.path.join(TESTS_INPUT_DIR, s) for s in ["small3.bcf", "small.bcf"]]
    ds.create_dataset()
    ds.ingest_samples(samples)

    ds = tiledbvcf.Dataset(uri, mode="r")
    df = ds.read(
        attrs=["sample_name", "pos_start", "alleles", "id", "filters"],
        regions=["1:70100-1300000"],
    )
    expected_df = pd.DataFrame(
        {
            "sample_name": pd.Series(["HG00280", "HG00280"]),
            "pos_start": pd.Series([866511, 1289367], dtype=np.int32),
            "alleles": pd.Series(
                map(
                    # `np.object` was removed in NumPy 1.24; the builtin
                    # `object` is the equivalent dtype.
                    lambda lst: np.array(lst, dtype=object),
                    [["T", "CCCCTCCCT", "C", "CCCCTCCCTCCCT", "CCCCT"], ["CTG", "C"]],
                )
            ),
            "id": pd.Series([".", "rs1497816"]),
            "filters": pd.Series(
                map(
                    lambda lst: np.array(lst, dtype=object),
                    [["LowQual"], ["LowQual"]],
                )
            ),
        }
    ).sort_values(ignore_index=True, by=["sample_name", "pos_start"])
    _check_dfs(
        expected_df, df.sort_values(ignore_index=True, by=["sample_name", "pos_start"])
    )
def test_sample_and_region_partitioned_read():
    """Check reads partitioned over both region space and sample space."""
    array_uri = os.path.join(TESTS_INPUT_DIR, 'arrays/v3/ingested_2samples')
    query_attrs = ['sample_name', 'pos_start', 'pos_end']
    query_regions = ['1:12000-13000', '1:17000-18000']

    # (region partition, sample partition, expected rows, expected sample)
    cases = [
        ((0, 2), (0, 2), 2, 'HG00280'),
        ((0, 2), (1, 2), 2, 'HG01762'),
        ((1, 2), (0, 2), 2, 'HG00280'),
        ((1, 2), (1, 2), 0, None),
    ]
    for region_part, sample_part, expected_len, expected_sample in cases:
        cfg = tiledbvcf.ReadConfig(region_partition=region_part,
                                   sample_partition=sample_part)
        dataset = tiledbvcf.Dataset(array_uri, mode='r', cfg=cfg)
        result = dataset.read(attrs=query_attrs, regions=query_regions)
        assert len(result) == expected_len
        if expected_sample is not None:
            assert (result.sample_name == expected_sample).all()
def test_read_multiple_alleles(tmp_path):
    """Ingest two samples and verify multi-allele, id, and filter columns.

    NOTE(review): this name is duplicated earlier in the file; under pytest
    collection this later definition shadows the earlier one.
    """
    uri = os.path.join(tmp_path, 'dataset')
    ds = tiledbvcf.Dataset(uri, mode='w')
    samples = [
        os.path.join(TESTS_INPUT_DIR, s) for s in ['small3.bcf', 'small.bcf']
    ]
    ds.ingest_samples(samples)

    ds = tiledbvcf.Dataset(uri, mode='r')
    df = ds.read(
        attrs=['sample_name', 'pos_start', 'alleles', 'id', 'filters'],
        regions=['1:70100-1300000'])
    expected_df = pd.DataFrame({
        'sample_name':
        pd.Series(['HG00280', 'HG00280']),
        'pos_start':
        pd.Series([866511, 1289367], dtype=np.int32),
        'alleles':
        pd.Series(
            # `np.object` was removed in NumPy 1.24; the builtin `object`
            # is the equivalent dtype.
            map(lambda lst: np.array(lst, dtype=object),
                [['T', 'CCCCTCCCT', 'C', 'CCCCTCCCTCCCT', 'CCCCT'],
                 ['CTG', 'C']])),
        'id':
        pd.Series(['.', 'rs1497816']),
        'filters':
        pd.Series(
            map(lambda lst: np.array(lst, dtype=object),
                [['LowQual'], ['LowQual']]))
    })
    _check_dfs(expected_df, df)
def test_region_partitioned_read():
    """Reads split across region-space partitions."""
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")
    attrs = ["sample_name", "pos_start", "pos_end"]
    regions = ["1:12000-13000", "1:17000-18000"]

    # (partition, expected record count); a partition count larger than the
    # number of regions still produces results.
    for partition, n_expected in [((0, 2), 4), ((1, 2), 2), ((1, 3), 2)]:
        ds = tiledbvcf.Dataset(
            uri, mode="r", cfg=tiledbvcf.ReadConfig(region_partition=partition)
        )
        result = ds.read(attrs=attrs, regions=regions)
        assert len(result) == n_expected

    # Error: index >= num partitions
    with pytest.raises(RuntimeError):
        tiledbvcf.Dataset(
            uri, mode="r", cfg=tiledbvcf.ReadConfig(region_partition=(2, 2))
        )
def test_sample_partitioned_read():
    """Reads split across sample-space partitions."""
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")
    attrs = ["sample_name", "pos_start", "pos_end"]
    regions = ["1:12000-18000"]

    # Each sample partition should return only its own sample's records.
    for partition, n_expected, sample in [((0, 2), 11, "HG00280"),
                                          ((1, 2), 3, "HG01762")]:
        ds = tiledbvcf.Dataset(
            uri, mode="r", cfg=tiledbvcf.ReadConfig(sample_partition=partition)
        )
        result = ds.read(attrs=attrs, regions=regions)
        assert len(result) == n_expected
        assert (result.sample_name == sample).all()

    # Error: too many partitions
    ds = tiledbvcf.Dataset(
        uri, mode="r", cfg=tiledbvcf.ReadConfig(sample_partition=(1, 3))
    )
    with pytest.raises(RuntimeError):
        ds.read(attrs=attrs, regions=regions)

    # Error: index >= num partitions
    with pytest.raises(RuntimeError):
        tiledbvcf.Dataset(
            uri, mode="r", cfg=tiledbvcf.ReadConfig(sample_partition=(2, 2))
        )
def test_tbb_threads_config():
    """Re-opening with a different sm.num_tbb_threads value must fail."""
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")

    first_cfg = tiledbvcf.ReadConfig(tiledb_config=["sm.num_tbb_threads=3"])
    tiledbvcf.Dataset(uri, mode="r", cfg=first_cfg)

    # TBB thread count can only be set once per process.
    second_cfg = tiledbvcf.ReadConfig(tiledb_config=["sm.num_tbb_threads=4"])
    with pytest.raises(RuntimeError):
        tiledbvcf.Dataset(uri, mode="r", cfg=second_cfg)
def test_incremental_ingest(tmp_path):
    """Two successive single-sample ingests accumulate into one dataset.

    NOTE(review): this name is duplicated below; the later definition
    shadows this one under pytest collection.
    """
    uri = os.path.join(tmp_path, 'dataset')
    writer = tiledbvcf.Dataset(uri, mode='w')
    for sample in ('small.bcf', 'small2.bcf'):
        writer.ingest_samples([os.path.join(TESTS_INPUT_DIR, sample)])

    # Open it back in read mode and check some queries
    reader = tiledbvcf.Dataset(uri, mode='r')
    assert reader.count() == 14
    assert reader.count(regions=['1:12700-13400']) == 6
    assert reader.count(samples=['HG00280'], regions=['1:12700-13400']) == 4
def test_incremental_ingest(tmp_path):
    """Two successive single-sample ingests accumulate into one dataset."""
    dataset_uri = os.path.join(tmp_path, "dataset")
    writer = tiledbvcf.Dataset(dataset_uri, mode="w")
    writer.create_dataset()
    for name in ("small.bcf", "small2.bcf"):
        writer.ingest_samples([os.path.join(TESTS_INPUT_DIR, name)])

    # Open it back in read mode and check some queries
    reader = tiledbvcf.Dataset(dataset_uri, mode="r")
    assert reader.count() == 14
    assert reader.count(regions=["1:12700-13400"]) == 6
    assert reader.count(samples=["HG00280"], regions=["1:12700-13400"]) == 4
def test_read_var_length_filters(tmp_path):
    """Ingest a VCF with variable-length FILTER lists and verify they round-trip."""
    uri = os.path.join(tmp_path, "dataset")
    ds = tiledbvcf.Dataset(uri, mode="w")
    samples = [
        os.path.join(TESTS_INPUT_DIR, s) for s in ["varLenFilter.vcf.gz"]
    ]
    ds.create_dataset()
    ds.ingest_samples(samples)

    ds = tiledbvcf.Dataset(uri, mode="r")
    df = ds.read(["pos_start", "filters"])
    expected_df = pd.DataFrame({
        "pos_start": pd.Series(
            [
                12141,
                12546,
                13354,
                13375,
                13396,
                13414,
                13452,
                13520,
                13545,
                17319,
                17480,
            ],
            dtype=np.int32,
        ),
        "filters": pd.Series(
            map(
                # `np.object` was removed in NumPy 1.24; the builtin
                # `object` is the equivalent dtype.
                lambda lst: np.array(lst, dtype=object),
                [
                    ["PASS"],
                    ["PASS"],
                    ["ANEUPLOID", "LowQual"],
                    ["PASS"],
                    ["PASS"],
                    ["ANEUPLOID", "LOWQ", "LowQual"],
                    ["PASS"],
                    ["PASS"],
                    ["PASS"],
                    ["LowQual"],
                    ["PASS"],
                ],
            )),
    }).sort_values(ignore_index=True, by=["pos_start"])
    _check_dfs(expected_df, df.sort_values(ignore_index=True, by=["pos_start"]))
def test_read_config():
    """ReadConfig construction: defaults, explicit options, unknown kwarg.

    NOTE(review): this name is duplicated below; the later definition
    shadows this one under pytest collection.
    """
    array_uri = os.path.join(TESTS_INPUT_DIR, 'arrays/v3/ingested_2samples')

    default_cfg = tiledbvcf.ReadConfig()
    tiledbvcf.Dataset(array_uri, mode='r', cfg=default_cfg)

    tuned_cfg = tiledbvcf.ReadConfig(
        memory_budget_mb=512,
        region_partition=(0, 3),
        tiledb_config=['sm.tile_cache_size=0', 'sm.num_reader_threads=1'])
    tiledbvcf.Dataset(array_uri, mode='r', cfg=tuned_cfg)

    # Unknown keyword arguments are rejected.
    with pytest.raises(TypeError):
        tiledbvcf.ReadConfig(abc=123)
def test_basic_ingest(tmp_path):
    """End-to-end: create a dataset, ingest two samples, count records."""
    # Create the dataset
    dataset_uri = os.path.join(tmp_path, "dataset")
    writer = tiledbvcf.Dataset(dataset_uri, mode="w")
    bcf_paths = [
        os.path.join(TESTS_INPUT_DIR, name) for name in ["small.bcf", "small2.bcf"]
    ]
    writer.create_dataset()
    writer.ingest_samples(bcf_paths)

    # Open it back in read mode and check some queries
    reader = tiledbvcf.Dataset(dataset_uri, mode="r")
    assert reader.count() == 14
    assert reader.count(regions=["1:12700-13400"]) == 6
    assert reader.count(samples=["HG00280"], regions=["1:12700-13400"]) == 4
def test_read_null_attrs(tmp_path):
    """Missing INFO/FMT values must come back as None per record.

    NOTE(review): this name is duplicated below; the later definition
    shadows this one under pytest collection.
    """
    uri = os.path.join(tmp_path, 'dataset')
    writer = tiledbvcf.Dataset(uri, mode='w')
    sample_paths = [
        os.path.join(TESTS_INPUT_DIR, s) for s in ['small3.bcf', 'small.bcf']
    ]
    writer.ingest_samples(sample_paths)

    reader = tiledbvcf.Dataset(uri, mode='r')
    df = reader.read(
        attrs=[
            'sample_name', 'pos_start', 'pos_end', 'info_BaseQRankSum',
            'info_DP', 'fmt_DP', 'fmt_MIN_DP'
        ],
        regions=['1:12700-13400', '1:69500-69800'])

    expected_df = pd.DataFrame({
        'sample_name':
        pd.Series([
            'HG00280', 'HG01762', 'HG00280', 'HG01762', 'HG00280', 'HG00280',
            'HG00280', 'HG00280', 'HG00280', 'HG00280', 'HG00280', 'HG00280'
        ]),
        'pos_start':
        pd.Series([
            12546, 12546, 13354, 13354, 13375, 13396, 69371, 69511, 69512,
            69761, 69762, 69771
        ], dtype=np.int32),
        'pos_end':
        pd.Series([
            12771, 12771, 13374, 13389, 13395, 13413, 69510, 69511, 69760,
            69761, 69770, 69834
        ], dtype=np.int32),
        # Records without the INFO field present yield None.
        'info_BaseQRankSum':
        pd.Series([
            None, None, None, None, None, None, None,
            np.array([-0.787], dtype=np.float32), None,
            np.array([1.97], dtype=np.float32), None, None
        ]),
        'info_DP':
        pd.Series([
            None, None, None, None, None, None, None,
            np.array([89], dtype=np.int32), None,
            np.array([24], dtype=np.int32), None, None
        ]),
        'fmt_DP':
        pd.Series([0, 0, 15, 64, 6, 2, 180, 88, 97, 24, 23, 21],
                  dtype=np.int32),
        'fmt_MIN_DP':
        pd.Series([0, 0, 14, 30, 3, 1, 20, None, 24, None, 23, 19])
    })
    _check_dfs(expected_df, df)
def test_basic_ingest(tmp_path):
    """End-to-end: ingest two samples and count records.

    NOTE(review): this name is duplicated earlier in the file; this later
    definition shadows the earlier one under pytest collection.
    """
    # Create the dataset
    dataset_uri = os.path.join(tmp_path, 'dataset')
    writer = tiledbvcf.Dataset(dataset_uri, mode='w')
    bcf_paths = [
        os.path.join(TESTS_INPUT_DIR, name)
        for name in ['small.bcf', 'small2.bcf']
    ]
    writer.ingest_samples(bcf_paths)

    # Open it back in read mode and check some queries
    reader = tiledbvcf.Dataset(dataset_uri, mode='r')
    assert reader.count() == 14
    assert reader.count(regions=['1:12700-13400']) == 6
    assert reader.count(samples=['HG00280'], regions=['1:12700-13400']) == 4
def test_read_write_mode_exceptions():
    """Write-only operations fail on read datasets and vice versa.

    NOTE(review): this name is duplicated below; the later definition
    shadows this one under pytest collection.
    """
    array_uri = os.path.join(TESTS_INPUT_DIR, 'arrays/v3/ingested_2samples')
    reader = tiledbvcf.Dataset(array_uri)
    bcf_paths = [
        os.path.join(TESTS_INPUT_DIR, s) for s in ['small.bcf', 'small2.bcf']
    ]
    # Ingesting into a dataset opened for reading must raise.
    with pytest.raises(Exception):
        reader.ingest_samples(bcf_paths)

    # Counting on a dataset opened for writing must raise.
    writer = tiledbvcf.Dataset(
        os.path.join(TESTS_INPUT_DIR, 'arrays/v3/ingested_2samples'),
        mode='w')
    with pytest.raises(Exception):
        writer.count()
def test_read_config():
    """ReadConfig construction: defaults, explicit options, unknown kwarg."""
    array_uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")

    tiledbvcf.Dataset(array_uri, mode="r", cfg=tiledbvcf.ReadConfig())

    tuned_cfg = tiledbvcf.ReadConfig(
        memory_budget_mb=512,
        region_partition=(0, 3),
        tiledb_config=["sm.tile_cache_size=0", "sm.compute_concurrency_level=1"],
    )
    tiledbvcf.Dataset(array_uri, mode="r", cfg=tuned_cfg)

    # Unknown keyword arguments are rejected.
    with pytest.raises(TypeError):
        tiledbvcf.ReadConfig(abc=123)
def test_ingest_merging_separate(tmp_path):
    """Ingest with chr1 kept as its own fragment; verify record counts."""
    # Create the dataset
    dataset_uri = os.path.join(tmp_path, "dataset_merging_separate")
    writer = tiledbvcf.Dataset(dataset_uri, mode="w")
    vcf_paths = [
        os.path.join(TESTS_INPUT_DIR, s)
        for s in ["v2-DjrIAzkP-downsampled.vcf.gz"]
    ]
    writer.create_dataset()
    writer.ingest_samples(vcf_paths, contigs_to_keep_separate=["chr1"])

    # Open it back in read mode and check some queries
    reader = tiledbvcf.Dataset(dataset_uri, mode="r")
    assert reader.count() == 246
    assert reader.count(regions=["chrX:9032893-9032893"]) == 1
def test_read_write_mode_exceptions():
    """Write-only operations fail on read datasets and vice versa."""
    array_uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")
    reader = tiledbvcf.Dataset(array_uri)
    bcf_paths = [os.path.join(TESTS_INPUT_DIR, s) for s in ["small.bcf", "small2.bcf"]]

    # Creating or ingesting into a dataset opened for reading must raise.
    with pytest.raises(Exception):
        reader.create_dataset()
    with pytest.raises(Exception):
        reader.ingest_samples(bcf_paths)

    # Counting on a dataset opened for writing must raise.
    writer = tiledbvcf.Dataset(
        os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples"), mode="w"
    )
    with pytest.raises(Exception):
        writer.count()
def test_incomplete_reads():
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")
    ds = tiledbvcf.Dataset(
        uri, mode="r", cfg=tiledbvcf.ReadConfig(memory_budget_mb=0)
    )

    # Each batch should hold exactly two records; only the last batch
    # completes the read.
    expected_batches = [
        ([12771, 12771], False),
        ([13374, 13389], False),
        ([13395, 13413], True),
    ]
    batch = ds.read(attrs=["pos_end"], regions=["1:12700-13400"])
    for i, (pos_ends, is_complete) in enumerate(expected_batches):
        if i > 0:
            batch = ds.continue_read()
        assert bool(ds.read_completed()) == is_complete
        assert len(batch) == 2
        _check_dfs(
            pd.DataFrame.from_dict(
                {"pos_end": np.array(pos_ends, dtype=np.int32)}),
            batch,
        )
def test_large_export_correctness():
    """Spot-check row counts for a large S3-hosted export."""
    uri = "s3://tiledb-inc-demo-data/tiledbvcf-arrays/v4/vcf-samples-20"
    dataset = tiledbvcf.Dataset(uri, mode="r", verbose=True)
    df = dataset.read(
        attrs=[
            "sample_name",
            "contig",
            "pos_start",
            "pos_end",
            "query_bed_start",
            "query_bed_end",
        ],
        samples=["v2-DjrIAzkP", "v2-YMaDHIoW", "v2-usVwJUmo", "v2-ZVudhauk"],
        bed_file=os.path.join(
            TESTS_INPUT_DIR, "E001_15_coreMarks_dense_filtered.bed.gz"
        ),
    )

    # total number of exported records
    assert df.shape[0] == 1172081
    # number of unique exported records
    unique_records = df[["sample_name", "contig", "pos_start"]].drop_duplicates()
    assert unique_records.shape[0] == 1168430
def test_read_limit():
    """The `limit` config option caps the number of returned records.

    NOTE(review): this name is duplicated below; the later definition
    shadows this one under pytest collection.
    """
    array_uri = os.path.join(TESTS_INPUT_DIR, 'arrays/v3/ingested_2samples')
    limited = tiledbvcf.Dataset(
        array_uri, mode='r', cfg=tiledbvcf.ReadConfig(limit=3))
    result = limited.read(
        attrs=['sample_name', 'pos_start', 'pos_end', 'fmt_DP', 'fmt_PL'],
        regions=['1:12100-13360', '1:13500-17350'])
    assert len(result) == 3
def test_read_limit():
    """The `limit` config option caps the number of returned records."""
    array_uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")
    limited = tiledbvcf.Dataset(
        array_uri, mode="r", cfg=tiledbvcf.ReadConfig(limit=3)
    )
    result = limited.read(
        attrs=["sample_name", "pos_start", "pos_end", "fmt_DP", "fmt_PL"],
        regions=["1:12100-13360", "1:13500-17350"],
    )
    assert len(result) == 3
def test_sample_and_region_partitioned_read():
    """Check reads partitioned over both region space and sample space."""
    array_uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")
    query_attrs = ["sample_name", "pos_start", "pos_end"]
    query_regions = ["1:12000-13000", "1:17000-18000"]

    # (region partition, sample partition, expected rows, expected sample)
    cases = [
        ((0, 2), (0, 2), 2, "HG00280"),
        ((0, 2), (1, 2), 2, "HG01762"),
        ((1, 2), (0, 2), 2, "HG00280"),
        ((1, 2), (1, 2), 0, None),
    ]
    for region_part, sample_part, expected_len, expected_sample in cases:
        cfg = tiledbvcf.ReadConfig(
            region_partition=region_part, sample_partition=sample_part
        )
        dataset = tiledbvcf.Dataset(array_uri, mode="r", cfg=cfg)
        result = dataset.read(attrs=query_attrs, regions=query_regions)
        assert len(result) == expected_len
        if expected_sample is not None:
            assert (result.sample_name == expected_sample).all()
def test_ingest_disable_merging(tmp_path):
    """An unmerged ingest must match a keep-chr1-separate ingest exactly."""
    # Create the dataset with contig fragment merging disabled
    uri = os.path.join(tmp_path, "dataset_disable_merging")
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=1024)
    attrs = ["sample_name", "contig", "pos_start", "pos_end"]

    ds = tiledbvcf.Dataset(uri, mode="w")
    samples = [
        os.path.join(TESTS_INPUT_DIR, s) for s in ["v2-DjrIAzkP-downsampled.vcf.gz"]
    ]
    ds.create_dataset()
    ds.ingest_samples(samples, contig_fragment_merging=False)

    # Open it back in read mode and check some queries
    ds = tiledbvcf.Dataset(uri, cfg=cfg, mode="r", verbose=False)
    df = ds.read(attrs=attrs)
    assert ds.count() == 246
    assert ds.count(regions=["chrX:9032893-9032893"]) == 1

    # Create a second dataset with chr1 kept in a separate fragment
    uri = os.path.join(tmp_path, "dataset_merging_separate")
    ds2 = tiledbvcf.Dataset(uri, mode="w", verbose=True)
    samples = [
        os.path.join(TESTS_INPUT_DIR, s) for s in ["v2-DjrIAzkP-downsampled.vcf.gz"]
    ]
    ds2.create_dataset()
    ds2.ingest_samples(samples, contigs_to_keep_separate=["chr1"])

    # Open it back in read mode; both ingestion strategies must yield
    # identical data. (Removed a leftover debug print of df.equals(df2).)
    ds2 = tiledbvcf.Dataset(uri, cfg=cfg, mode="r", verbose=True)
    df2 = ds2.read(attrs=attrs)
    assert df.equals(df2)
    # Fix: these counts previously re-checked `ds` (already verified above)
    # instead of the second dataset.
    assert ds2.count() == 246
    assert ds2.count(regions=["chrX:9032893-9032893"]) == 1
def test_incomplete_reads():
    # NOTE(review): this definition duplicates (and therefore shadows) the
    # earlier `test_incomplete_reads`; it actually exercises the dask-based
    # readers and should probably be renamed.
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/v3/ingested_2samples')
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.Dataset(uri, mode='r', cfg=cfg)
    attrs = ['sample_name', 'pos_start', 'pos_end']

    expected_df = pd.DataFrame({
        'sample_name':
        pd.Series(['HG00280', 'HG01762', 'HG00280', 'HG01762', 'HG00280',
                   'HG01762', 'HG00280', 'HG00280', 'HG00280', 'HG00280',
                   'HG00280', 'HG00280', 'HG00280', 'HG00280']),
        'pos_start':
        pd.Series([12141, 12141, 12546, 12546, 13354, 13354, 13375, 13396,
                   13414, 13452, 13520, 13545, 17319, 17480],
                  dtype=np.int32),
        'pos_end':
        pd.Series([12277, 12277, 12771, 12771, 13374, 13389, 13395, 13413,
                   13451, 13519, 13544, 13689, 17479, 17486],
                  dtype=np.int32)
    })

    def by_sample(frame):
        # Sample-partitioned results arrive unordered; normalize for compare.
        return frame.sort_values('sample_name').reset_index(drop=True)

    # Region partitions
    df = test_ds.read_dask(attrs=attrs, region_partitions=10).compute()
    _check_dfs(expected_df, df)

    # Sample partitions (we have to sort to check the result)
    df = by_sample(test_ds.read_dask(attrs=attrs, sample_partitions=2).compute())
    _check_dfs(by_sample(expected_df), df)

    # Both
    df = by_sample(
        test_ds.read_dask(attrs=attrs, region_partitions=10,
                          sample_partitions=2).compute())
    _check_dfs(by_sample(expected_df), df)

    # No partitioning
    df = test_ds.read_dask(attrs=attrs).compute()
    _check_dfs(expected_df, df)

    # Subset of partitions (limit_partitions)
    dask_df = test_ds.read_dask(attrs=attrs, region_partitions=10,
                                sample_partitions=2, limit_partitions=2)
    assert dask_df.npartitions == 2
def test_map_incomplete():
    """map_dask applies a filter across incomplete-read batches.

    NOTE(review): this name is duplicated below; the later definition
    shadows this one under pytest collection.
    """
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/v3/ingested_2samples')
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    dataset = tiledbvcf.Dataset(uri, mode='r', cfg=cfg)

    expected_df = pd.DataFrame({
        'sample_name': pd.Series(['HG00280', 'HG01762']),
        'pos_start': pd.Series([12141, 12141], dtype=np.int32),
        'pos_end': pd.Series([12277, 12277], dtype=np.int32)
    })

    # Region partitions: keep only records with doubled start below 25000.
    mapped = dataset.map_dask(
        lambda df: df[df.pos_start * 2 < 25000],
        attrs=['sample_name', 'pos_start', 'pos_end'],
        region_partitions=10)
    _check_dfs(expected_df, mapped.compute())
def test_incomplete_read_generator():
    """read_iter yields partial batches that together cover the full result."""
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/v3/ingested_2samples')
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.Dataset(uri, mode='r', cfg=cfg)

    # `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
    # collect the batches and concatenate once instead.
    batches = list(
        test_ds.read_iter(attrs=['pos_end'], regions=['1:12700-13400']))
    overall_df = pd.concat(batches, ignore_index=True)
    assert len(overall_df) == 6
    _check_dfs(
        pd.DataFrame.from_dict({
            'pos_end':
            np.array([12771, 12771, 13374, 13389, 13395, 13413],
                     dtype=np.int32)
        }), overall_df)
def test_map_incomplete():
    """map_dask applies a filter across incomplete-read batches."""
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    dataset = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)

    expected_df = pd.DataFrame({
        "sample_name": pd.Series(["HG00280", "HG01762"]),
        "pos_start": pd.Series([12141, 12141], dtype=np.int32),
        "pos_end": pd.Series([12277, 12277], dtype=np.int32),
    })

    # Region partitions: keep only records with doubled start below 25000.
    mapped = dataset.map_dask(  # pylint:disable=no-member
        lambda df: df[df.pos_start * 2 < 25000],
        attrs=["sample_name", "pos_start", "pos_end"],
        region_partitions=10,
    )
    _check_dfs(expected_df, mapped.compute())
def test_read_null_attrs(tmp_path):
    """Missing INFO/FMT values must come back as None per record."""
    uri = os.path.join(tmp_path, "dataset")
    writer = tiledbvcf.Dataset(uri, mode="w")
    sample_paths = [
        os.path.join(TESTS_INPUT_DIR, s) for s in ["small3.bcf", "small.bcf"]
    ]
    writer.create_dataset()
    writer.ingest_samples(sample_paths)

    reader = tiledbvcf.Dataset(uri, mode="r")
    df = reader.read(
        attrs=[
            "sample_name",
            "pos_start",
            "pos_end",
            "info_BaseQRankSum",
            "info_DP",
            "fmt_DP",
            "fmt_MIN_DP",
        ],
        regions=["1:12700-13400", "1:69500-69800"],
    )

    sort_keys = ["sample_name", "pos_start"]
    expected_df = pd.DataFrame({
        "sample_name": pd.Series([
            "HG00280", "HG00280", "HG00280", "HG00280", "HG01762", "HG01762",
            "HG00280", "HG00280", "HG00280", "HG00280", "HG00280", "HG00280",
        ]),
        "pos_start": pd.Series(
            [12546, 13354, 13375, 13396, 12546, 13354,
             69371, 69511, 69512, 69761, 69762, 69771],
            dtype=np.int32,
        ),
        "pos_end": pd.Series(
            [12771, 13374, 13395, 13413, 12771, 13389,
             69510, 69511, 69760, 69761, 69770, 69834],
            dtype=np.int32,
        ),
        # Records without the INFO field present yield None.
        "info_BaseQRankSum": pd.Series([
            None, None, None, None, None, None, None,
            np.array([-0.787], dtype=np.float32), None,
            np.array([1.97], dtype=np.float32), None, None,
        ]),
        "info_DP": pd.Series([
            None, None, None, None, None, None, None,
            np.array([89], dtype=np.int32), None,
            np.array([24], dtype=np.int32), None, None,
        ]),
        "fmt_DP": pd.Series(
            [0, 15, 6, 2, 0, 64, 180, 88, 97, 24, 23, 21], dtype=np.int32
        ),
        "fmt_MIN_DP": pd.Series(
            [0, 14, 3, 1, 0, 30, 20, None, 24, None, 23, 19]
        ),
    }).sort_values(ignore_index=True, by=sort_keys)
    _check_dfs(expected_df, df.sort_values(ignore_index=True, by=sort_keys))
def test_ds_attrs():
    # NOTE(review): pytest ignores (and newer versions warn about) values
    # returned from tests; this function makes no assertions and only opens
    # the dataset — it may have been intended as a fixture. Confirm intent.
    array_uri = os.path.join(
        TESTS_INPUT_DIR, "arrays/v3/ingested_2samples_GT_DP_PL"
    )
    return tiledbvcf.Dataset(array_uri)