def test_region_partitioned_read():
    """Check the row count returned for each region partition of a read.

    The same two regions are read under several ``region_partition``
    settings. Asking for more partitions than there are regions is still
    expected to return rows, while a partition index equal to the partition
    count must fail when the dataset is opened.
    """
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")

    # ((partition index, partition count), expected number of rows)
    expected_rows = (
        ((0, 2), 4),
        ((1, 2), 2),
        # Too many partitions still produces results
        ((1, 3), 2),
    )
    for partition, row_count in expected_rows:
        cfg = tiledbvcf.ReadConfig(region_partition=partition)
        ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)
        df = ds.read(
            attrs=["sample_name", "pos_start", "pos_end"],
            regions=["1:12000-13000", "1:17000-18000"],
        )
        assert len(df) == row_count

    # Error: index >= num partitions
    cfg = tiledbvcf.ReadConfig(region_partition=(2, 2))
    with pytest.raises(RuntimeError):
        tiledbvcf.Dataset(uri, mode="r", cfg=cfg)
def test_sample_partitioned_read():
    """Read one region under each sample partition and check the rows.

    With two partitions, partition 0 is expected to yield 11 rows that all
    belong to HG00280 and partition 1 three rows that all belong to HG01762.
    Requesting partition 1 of 3 must fail at read time, and a partition index
    equal to the partition count must fail when the dataset is opened.
    """
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')

    # ((partition index, partition count), expected rows, expected sample)
    expectations = (
        ((0, 2), 11, 'HG00280'),
        ((1, 2), 3, 'HG01762'),
    )
    for partition, row_count, sample in expectations:
        cfg = tiledbvcf.ReadConfig(sample_partition=partition)
        ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)
        df = ds.read(
            attrs=['sample_name', 'pos_start', 'pos_end'],
            regions=['1:12000-18000'],
        )
        assert len(df) == row_count
        assert (df.sample_name == sample).all()

    # Error: too many partitions (opening succeeds, the read itself fails)
    cfg = tiledbvcf.ReadConfig(sample_partition=(1, 3))
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)
    with pytest.raises(RuntimeError):
        ds.read(
            attrs=['sample_name', 'pos_start', 'pos_end'],
            regions=['1:12000-18000'],
        )

    # Error: index >= num partitions
    cfg = tiledbvcf.ReadConfig(sample_partition=(2, 2))
    with pytest.raises(RuntimeError):
        tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)
def test_sample_partitioned_read():
    """Read one region under each sample partition and check the rows.

    With two partitions, partition 0 is expected to yield 11 rows that all
    belong to HG00280 and partition 1 three rows that all belong to HG01762.
    Requesting partition 1 of 3 must fail at read time, and a partition index
    equal to the partition count must fail when the dataset is opened.
    """
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")

    # ((partition index, partition count), expected rows, expected sample)
    expectations = (
        ((0, 2), 11, "HG00280"),
        ((1, 2), 3, "HG01762"),
    )
    for partition, row_count, sample in expectations:
        cfg = tiledbvcf.ReadConfig(sample_partition=partition)
        ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)
        df = ds.read(
            attrs=["sample_name", "pos_start", "pos_end"],
            regions=["1:12000-18000"],
        )
        assert len(df) == row_count
        assert (df.sample_name == sample).all()

    # Error: too many partitions (opening succeeds, the read itself fails)
    cfg = tiledbvcf.ReadConfig(sample_partition=(1, 3))
    ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)
    with pytest.raises(RuntimeError):
        ds.read(
            attrs=["sample_name", "pos_start", "pos_end"],
            regions=["1:12000-18000"],
        )

    # Error: index >= num partitions
    cfg = tiledbvcf.ReadConfig(sample_partition=(2, 2))
    with pytest.raises(RuntimeError):
        tiledbvcf.Dataset(uri, mode="r", cfg=cfg)
def test_region_partitioned_read():
    """Check the row count returned for each region partition of a read.

    The same two regions are read under several ``region_partition``
    settings. Asking for more partitions than there are regions is still
    expected to return rows, while a partition index equal to the partition
    count must fail when the dataset is opened.
    """
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')

    # ((partition index, partition count), expected number of rows)
    expected_rows = (
        ((0, 2), 4),
        ((1, 2), 2),
        # Too many partitions still produces results
        ((1, 3), 2),
    )
    for partition, row_count in expected_rows:
        cfg = tiledbvcf.ReadConfig(region_partition=partition)
        ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)
        df = ds.read(
            attrs=['sample_name', 'pos_start', 'pos_end'],
            regions=['1:12000-13000', '1:17000-18000'],
        )
        assert len(df) == row_count

    # Error: index >= num partitions
    cfg = tiledbvcf.ReadConfig(region_partition=(2, 2))
    with pytest.raises(RuntimeError):
        tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)
def test_tbb_threads_config():
    """Open the dataset with one TBB thread count, then reject a different one.

    Opening with ``sm.num_tbb_threads=3`` must succeed; a subsequent open
    with ``sm.num_tbb_threads=4`` must raise ``RuntimeError``.
    """
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")

    three_threads = tiledbvcf.ReadConfig(tiledb_config=["sm.num_tbb_threads=3"])
    # Keep the first dataset referenced while the second open is attempted.
    first_ds = tiledbvcf.Dataset(uri, mode="r", cfg=three_threads)

    four_threads = tiledbvcf.ReadConfig(tiledb_config=["sm.num_tbb_threads=4"])
    with pytest.raises(RuntimeError):
        tiledbvcf.Dataset(uri, mode="r", cfg=four_threads)
def test_tbb_threads_config():
    """Open the dataset with one TBB thread count, then reject a different one.

    Opening with ``sm.num_tbb_threads=3`` must succeed; a subsequent open
    with ``sm.num_tbb_threads=4`` must raise ``RuntimeError``.
    """
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')

    three_threads = tiledbvcf.ReadConfig(tiledb_config=['sm.num_tbb_threads=3'])
    # Keep the first dataset referenced while the second open is attempted.
    first_ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=three_threads)

    four_threads = tiledbvcf.ReadConfig(tiledb_config=['sm.num_tbb_threads=4'])
    with pytest.raises(RuntimeError):
        tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=four_threads)
def test_read_config():
    """Open the dataset with default and customised read configurations.

    A default ``ReadConfig`` and one with a memory budget, region partition
    and TileDB settings must both be accepted when opening the dataset.
    An unknown keyword argument to ``ReadConfig`` must raise ``TypeError``.
    """
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')

    default_cfg = tiledbvcf.ReadConfig()
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=default_cfg)

    tiledb_settings = ['sm.tile_cache_size=0', 'sm.num_reader_threads=1']
    custom_cfg = tiledbvcf.ReadConfig(
        memory_budget_mb=512,
        region_partition=(0, 3),
        tiledb_config=tiledb_settings,
    )
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=custom_cfg)

    with pytest.raises(TypeError):
        tiledbvcf.ReadConfig(abc=123)
def test_read_config():
    """Open the dataset with default and customised read configurations.

    A default ``ReadConfig`` and one with a memory budget, region partition
    and TileDB settings must both be accepted when opening the dataset.
    An unknown keyword argument to ``ReadConfig`` must raise ``TypeError``.
    """
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")

    default_cfg = tiledbvcf.ReadConfig()
    ds = tiledbvcf.Dataset(uri, mode="r", cfg=default_cfg)

    tiledb_settings = ["sm.tile_cache_size=0", "sm.compute_concurrency_level=1"]
    custom_cfg = tiledbvcf.ReadConfig(
        memory_budget_mb=512,
        region_partition=(0, 3),
        tiledb_config=tiledb_settings,
    )
    ds = tiledbvcf.Dataset(uri, mode="r", cfg=custom_cfg)

    with pytest.raises(TypeError):
        tiledbvcf.ReadConfig(abc=123)
def test_incomplete_reads():
    """Step through an incomplete read batch by batch with ``continue_read``.

    The read is expected to arrive in three batches of two rows each; only
    after the last batch should ``read_completed()`` report completion.
    """
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)

    # (expected pos_end values of the batch, whether the read is finished)
    batches = (
        ([12771, 12771], False),
        ([13374, 13389], False),
        ([13395, 13413], True),
    )
    for step, (pos_end, finished) in enumerate(batches):
        if step == 0:
            # The first batch comes from the initial read call.
            df = test_ds.read(attrs=["pos_end"], regions=["1:12700-13400"])
        else:
            df = test_ds.continue_read()

        if finished:
            assert test_ds.read_completed()
        else:
            assert not test_ds.read_completed()
        assert len(df) == 2

        expected = pd.DataFrame.from_dict(
            {"pos_end": np.array(pos_end, dtype=np.int32)}
        )
        _check_dfs(expected, df)
def test_read_limit():
    """Check that ``ReadConfig(limit=3)`` caps a two-region read at 3 rows."""
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')
    limited_cfg = tiledbvcf.ReadConfig(limit=3)
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=limited_cfg)

    wanted_attrs = ['sample_name', 'pos_start', 'pos_end', 'fmt_DP', 'fmt_PL']
    wanted_regions = ['1:12100-13360', '1:13500-17350']
    df = ds.read(attrs=wanted_attrs, regions=wanted_regions)

    assert len(df) == 3
def test_read_limit():
    """Check that ``ReadConfig(limit=3)`` caps a two-region read at 3 rows."""
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")
    limited_cfg = tiledbvcf.ReadConfig(limit=3)
    ds = tiledbvcf.Dataset(uri, mode="r", cfg=limited_cfg)

    wanted_attrs = ["sample_name", "pos_start", "pos_end", "fmt_DP", "fmt_PL"]
    wanted_regions = ["1:12100-13360", "1:13500-17350"]
    df = ds.read(attrs=wanted_attrs, regions=wanted_regions)

    assert len(df) == 3
def test_sample_and_region_partitioned_read():
    """Combine region and sample partitioning and check each combination.

    All four (region partition, sample partition) pairs of a 2x2 split are
    read over the same two regions. Three of them are expected to return two
    rows for a single sample; the last one is expected to be empty.
    """
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")

    # (region partition, sample partition, expected rows, expected sample);
    # the sample is None where the original test makes no sample assertion.
    cases = (
        ((0, 2), (0, 2), 2, "HG00280"),
        ((0, 2), (1, 2), 2, "HG01762"),
        ((1, 2), (0, 2), 2, "HG00280"),
        ((1, 2), (1, 2), 0, None),
    )
    for region_part, sample_part, row_count, sample in cases:
        cfg = tiledbvcf.ReadConfig(
            region_partition=region_part, sample_partition=sample_part
        )
        ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)
        df = ds.read(
            attrs=["sample_name", "pos_start", "pos_end"],
            regions=["1:12000-13000", "1:17000-18000"],
        )
        assert len(df) == row_count
        if sample is not None:
            assert (df.sample_name == sample).all()
def test_incomplete_reads():
    """Read the whole dataset through ``read_dask`` under several partitionings.

    Region-only, sample-only, combined and unpartitioned reads must all
    reproduce the same expected frame; results involving sample partitions
    are sorted by sample name before comparison. ``limit_partitions`` must
    cap the number of dask partitions.
    """
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/v3/ingested_2samples')
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.Dataset(uri, mode='r', cfg=cfg)

    sample_names = pd.Series([
        'HG00280', 'HG01762', 'HG00280', 'HG01762', 'HG00280', 'HG01762',
        'HG00280', 'HG00280', 'HG00280', 'HG00280', 'HG00280', 'HG00280',
        'HG00280', 'HG00280',
    ])
    pos_starts = pd.Series(
        [12141, 12141, 12546, 12546, 13354, 13354, 13375,
         13396, 13414, 13452, 13520, 13545, 17319, 17480],
        dtype=np.int32,
    )
    pos_ends = pd.Series(
        [12277, 12277, 12771, 12771, 13374, 13389, 13395,
         13413, 13451, 13519, 13544, 13689, 17479, 17486],
        dtype=np.int32,
    )
    expected_df = pd.DataFrame({
        'sample_name': sample_names,
        'pos_start': pos_starts,
        'pos_end': pos_ends,
    })

    def by_sample(frame):
        # Order rows by sample name and renumber the index for comparison.
        return frame.sort_values('sample_name').reset_index(drop=True)

    # Region partitions
    dask_df = test_ds.read_dask(
        attrs=['sample_name', 'pos_start', 'pos_end'],
        region_partitions=10,
    )
    df = dask_df.compute()
    _check_dfs(expected_df, df)

    # Sample partitions (we have to sort to check the result)
    dask_df = test_ds.read_dask(
        attrs=['sample_name', 'pos_start', 'pos_end'],
        sample_partitions=2,
    )
    df = by_sample(dask_df.compute())
    _check_dfs(by_sample(expected_df), df)

    # Both
    dask_df = test_ds.read_dask(
        attrs=['sample_name', 'pos_start', 'pos_end'],
        region_partitions=10,
        sample_partitions=2,
    )
    df = by_sample(dask_df.compute())
    _check_dfs(by_sample(expected_df), df)

    # No partitioning
    dask_df = test_ds.read_dask(attrs=['sample_name', 'pos_start', 'pos_end'])
    df = dask_df.compute()
    _check_dfs(expected_df, df)

    # Subset of partitions (limit_partitions)
    dask_df = test_ds.read_dask(
        attrs=['sample_name', 'pos_start', 'pos_end'],
        region_partitions=10,
        sample_partitions=2,
        limit_partitions=2,
    )
    assert dask_df.npartitions == 2
def test_incomplete_read_generator():
    """Collect all batches yielded by ``read_iter`` and check the combined rows.

    The batches produced for a single region are concatenated into one
    frame, which must hold the six expected ``pos_end`` values in order.
    """
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)

    batches = list(
        test_ds.read_iter(attrs=['pos_end'], regions=['1:12700-13400'])
    )
    assert batches, 'read_iter yielded no batches'

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to stitch the batches together.
    overall_df = pd.concat(batches, ignore_index=True)

    assert len(overall_df) == 6
    _check_dfs(
        pd.DataFrame.from_dict({
            'pos_end': np.array(
                [12771, 12771, 13374, 13389, 13395, 13413], dtype=np.int32
            ),
        }),
        overall_df,
    )
def test_map_incomplete():
    """Apply a row filter through ``map_dask`` over region partitions.

    The mapped function keeps rows whose doubled ``pos_start`` is below
    25000; the computed result must equal the two expected rows.
    """
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)

    sample_names = pd.Series(['HG00280', 'HG01762'])
    pos_starts = pd.Series([12141, 12141], dtype=np.int32)
    pos_ends = pd.Series([12277, 12277], dtype=np.int32)
    expected_df = pd.DataFrame({
        'sample_name': sample_names,
        'pos_start': pos_starts,
        'pos_end': pos_ends,
    })

    # Region partitions
    dask_df = test_ds.map_dask(
        lambda part: part[part.pos_start * 2 < 25000],
        attrs=['sample_name', 'pos_start', 'pos_end'],
        region_partitions=10,
    )
    result = dask_df.compute()
    _check_dfs(expected_df, result)
def test_map_incomplete():
    """Apply a row filter through ``map_dask`` over region partitions.

    The mapped function keeps rows whose doubled ``pos_start`` is below
    25000; the computed result must equal the two expected rows.
    """
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)

    sample_names = pd.Series(["HG00280", "HG01762"])
    pos_starts = pd.Series([12141, 12141], dtype=np.int32)
    pos_ends = pd.Series([12277, 12277], dtype=np.int32)
    expected_df = pd.DataFrame({
        "sample_name": sample_names,
        "pos_start": pos_starts,
        "pos_end": pos_ends,
    })

    # Region partitions
    dask_df = test_ds.map_dask(
        lambda part: part[part.pos_start * 2 < 25000],
        attrs=["sample_name", "pos_start", "pos_end"],
        region_partitions=10,
    )
    result = dask_df.compute()
    _check_dfs(expected_df, result)
def test_ingest_disable_merging(tmp_path):
    """Ingest one sample with two contig-merging settings and compare reads.

    The same VCF is ingested once with ``contig_fragment_merging=False`` and
    once with ``contigs_to_keep_separate=["chr1"]``. Both datasets must
    return identical frames and the same record counts.

    Args:
        tmp_path: Directory under which both datasets are created.
    """
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=1024)
    attrs = ["sample_name", "contig", "pos_start", "pos_end"]
    samples = [
        os.path.join(TESTS_INPUT_DIR, s) for s in ["v2-DjrIAzkP-downsampled.vcf.gz"]
    ]

    # Create the dataset with contig fragment merging disabled
    uri = os.path.join(tmp_path, "dataset_disable_merging")
    ds = tiledbvcf.Dataset(uri, mode="w")
    ds.create_dataset()
    ds.ingest_samples(samples, contig_fragment_merging=False)

    # Open it back in read mode and check some queries
    ds = tiledbvcf.Dataset(uri, cfg=cfg, mode="r", verbose=False)
    df = ds.read(attrs=attrs)
    assert ds.count() == 246
    assert ds.count(regions=["chrX:9032893-9032893"]) == 1

    # Create the dataset with chr1 kept separate
    uri = os.path.join(tmp_path, "dataset_merging_separate")
    ds2 = tiledbvcf.Dataset(uri, mode="w", verbose=True)
    ds2.create_dataset()
    ds2.ingest_samples(samples, contigs_to_keep_separate=["chr1"])

    # Open it back in read mode and check some queries
    ds2 = tiledbvcf.Dataset(uri, cfg=cfg, mode="r", verbose=True)
    df2 = ds2.read(attrs=attrs)
    assert df.equals(df2)

    # Check the second dataset here; re-checking ``ds`` would only repeat
    # the assertions already made above.
    assert ds2.count() == 246
    assert ds2.count(regions=["chrX:9032893-9032893"]) == 1
def test_incomplete_reads():
    """Read the whole dataset through ``read_dask`` under several partitionings.

    Region-only, sample-only, combined and unpartitioned reads must all
    reproduce the same expected frame; results involving sample partitions
    are sorted by sample name before comparison. ``limit_partitions`` must
    cap the number of dask partitions.
    """
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)

    sample_names = pd.Series([
        "HG00280", "HG01762", "HG00280", "HG01762", "HG00280", "HG01762",
        "HG00280", "HG00280", "HG00280", "HG00280", "HG00280", "HG00280",
        "HG00280", "HG00280",
    ])
    pos_starts = pd.Series(
        [12141, 12141, 12546, 12546, 13354, 13354, 13375,
         13396, 13414, 13452, 13520, 13545, 17319, 17480],
        dtype=np.int32,
    )
    pos_ends = pd.Series(
        [12277, 12277, 12771, 12771, 13374, 13389, 13395,
         13413, 13451, 13519, 13544, 13689, 17479, 17486],
        dtype=np.int32,
    )
    expected_df = pd.DataFrame({
        "sample_name": sample_names,
        "pos_start": pos_starts,
        "pos_end": pos_ends,
    })

    def by_sample(frame):
        # Order rows by sample name and renumber the index for comparison.
        return frame.sort_values("sample_name").reset_index(drop=True)

    # Region partitions
    dask_df = test_ds.read_dask(  # pylint:disable=no-member
        attrs=["sample_name", "pos_start", "pos_end"],
        region_partitions=10,
    )
    df = dask_df.compute()
    _check_dfs(expected_df, df)

    # Sample partitions (we have to sort to check the result)
    dask_df = test_ds.read_dask(  # pylint:disable=no-member
        attrs=["sample_name", "pos_start", "pos_end"],
        sample_partitions=2,
    )
    df = by_sample(dask_df.compute())
    _check_dfs(by_sample(expected_df), df)

    # Both
    dask_df = test_ds.read_dask(  # pylint:disable=no-member
        attrs=["sample_name", "pos_start", "pos_end"],
        region_partitions=10,
        sample_partitions=2,
    )
    df = by_sample(dask_df.compute())
    _check_dfs(by_sample(expected_df), df)

    # No partitioning
    dask_df = test_ds.read_dask(  # pylint:disable=no-member
        attrs=["sample_name", "pos_start", "pos_end"]
    )
    df = dask_df.compute()
    _check_dfs(expected_df, df)

    # Subset of partitions (limit_partitions)
    dask_df = test_ds.read_dask(  # pylint:disable=no-member
        attrs=["sample_name", "pos_start", "pos_end"],
        region_partitions=10,
        sample_partitions=2,
        limit_partitions=2,
    )
    assert dask_df.npartitions == 2
def test_sample_and_region_partitioned_read():
    """Combine region and sample partitioning and check each combination.

    All four (region partition, sample partition) pairs of a 2x2 split are
    read over the same two regions. Three of them are expected to return two
    rows for a single sample; the last one is expected to be empty.
    """
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')

    # (region partition, sample partition, expected rows, expected sample);
    # the sample is None where the original test makes no sample assertion.
    cases = (
        ((0, 2), (0, 2), 2, 'HG00280'),
        ((0, 2), (1, 2), 2, 'HG01762'),
        ((1, 2), (0, 2), 2, 'HG00280'),
        ((1, 2), (1, 2), 0, None),
    )
    for region_part, sample_part, row_count, sample in cases:
        cfg = tiledbvcf.ReadConfig(
            region_partition=region_part, sample_partition=sample_part
        )
        ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)
        df = ds.read(
            attrs=['sample_name', 'pos_start', 'pos_end'],
            regions=['1:12000-13000', '1:17000-18000'],
        )
        assert len(df) == row_count
        if sample is not None:
            assert (df.sample_name == sample).all()