Esempio n. 1
0
def test_read_var_length_filters(tmp_path):
    """Ingest a VCF and read back its variable-length ``filters`` attribute.

    The dataset is written under ``tmp_path``, reopened in read mode, and the
    ``pos_start`` / ``filters`` columns are compared against the expected
    frame with ``_check_dfs``.
    """
    uri = os.path.join(tmp_path, 'dataset')
    ds = tiledbvcf.Dataset(uri, mode='w')
    samples = [os.path.join(TESTS_INPUT_DIR, 'varLenFilter.vcf.gz')]
    ds.ingest_samples(samples)

    ds = tiledbvcf.Dataset(uri, mode='r')
    df = ds.read(['pos_start', 'filters'])

    expected_positions = [
        12141, 12546, 13354, 13375, 13396, 13414, 13452, 13520, 13545,
        17319, 17480
    ]
    # One list of filter names per record; lengths differ between records.
    expected_filters = [
        ['PASS'], ['PASS'], ['ANEUPLOID', 'LowQual'], ['PASS'],
        ['PASS'], ['ANEUPLOID', 'LOWQ', 'LowQual'], ['PASS'],
        ['PASS'], ['PASS'], ['LowQual'], ['PASS']
    ]
    expected_df = pd.DataFrame({
        'pos_start':
        pd.Series(expected_positions, dtype=np.int32),
        # The builtin ``object`` replaces the ``np.object`` alias, which was
        # deprecated in NumPy 1.20 and removed in NumPy 1.24.
        'filters':
        pd.Series(
            [np.array(names, dtype=object) for names in expected_filters])
    })

    _check_dfs(expected_df, df)
Esempio n. 2
0
def test_read_multiple_alleles(tmp_path):
    """Ingest two BCF files and read records that carry several alleles.

    Both the expected and the actual frames are sorted by
    ``sample_name`` / ``pos_start`` before being compared with ``_check_dfs``.
    """
    uri = os.path.join(tmp_path, "dataset")
    ds = tiledbvcf.Dataset(uri, mode="w")
    samples = [os.path.join(TESTS_INPUT_DIR, s) for s in ["small3.bcf", "small.bcf"]]
    ds.create_dataset()
    ds.ingest_samples(samples)

    ds = tiledbvcf.Dataset(uri, mode="r")
    df = ds.read(
        attrs=["sample_name", "pos_start", "alleles", "id", "filters"],
        regions=["1:70100-1300000"],
    )

    def as_object_arrays(rows):
        # The builtin ``object`` replaces the ``np.object`` alias, which was
        # deprecated in NumPy 1.20 and removed in NumPy 1.24.
        return [np.array(row, dtype=object) for row in rows]

    sort_keys = ["sample_name", "pos_start"]
    expected_df = pd.DataFrame(
        {
            "sample_name": pd.Series(["HG00280", "HG00280"]),
            "pos_start": pd.Series([866511, 1289367], dtype=np.int32),
            "alleles": pd.Series(
                as_object_arrays(
                    [["T", "CCCCTCCCT", "C", "CCCCTCCCTCCCT", "CCCCT"], ["CTG", "C"]]
                )
            ),
            "id": pd.Series([".", "rs1497816"]),
            "filters": pd.Series(as_object_arrays([["LowQual"], ["LowQual"]])),
        }
    ).sort_values(ignore_index=True, by=sort_keys)
    _check_dfs(expected_df, df.sort_values(ignore_index=True, by=sort_keys))
Esempio n. 3
0
def test_sample_and_region_partitioned_read():
    """Read with every combination of a 2-way region and 2-way sample split.

    Each case opens the pre-ingested two-sample array with its own
    ``ReadConfig`` and checks the row count and, where rows are expected,
    that they all belong to a single sample.
    """
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/v3/ingested_2samples')

    # (region_partition, sample_partition, expected rows, expected sample)
    cases = [
        ((0, 2), (0, 2), 2, 'HG00280'),
        ((0, 2), (1, 2), 2, 'HG01762'),
        ((1, 2), (0, 2), 2, 'HG00280'),
        ((1, 2), (1, 2), 0, None),
    ]
    for region_part, sample_part, n_rows, sample in cases:
        cfg = tiledbvcf.ReadConfig(region_partition=region_part,
                                   sample_partition=sample_part)
        ds = tiledbvcf.Dataset(uri, mode='r', cfg=cfg)
        df = ds.read(attrs=['sample_name', 'pos_start', 'pos_end'],
                     regions=['1:12000-13000', '1:17000-18000'])
        assert len(df) == n_rows
        # The empty partition has no sample to check.
        if sample is not None:
            assert (df.sample_name == sample).all()
Esempio n. 4
0
def test_read_multiple_alleles(tmp_path):
    """Ingest two BCF files and read records that carry several alleles.

    The frame returned by ``read`` is compared against the expected frame
    with ``_check_dfs``.
    """
    uri = os.path.join(tmp_path, 'dataset')
    ds = tiledbvcf.Dataset(uri, mode='w')
    samples = [
        os.path.join(TESTS_INPUT_DIR, s) for s in ['small3.bcf', 'small.bcf']
    ]
    ds.ingest_samples(samples)

    ds = tiledbvcf.Dataset(uri, mode='r')
    df = ds.read(
        attrs=['sample_name', 'pos_start', 'alleles', 'id', 'filters'],
        regions=['1:70100-1300000'])

    expected_alleles = [['T', 'CCCCTCCCT', 'C', 'CCCCTCCCTCCCT', 'CCCCT'],
                        ['CTG', 'C']]
    expected_filters = [['LowQual'], ['LowQual']]
    # The builtin ``object`` replaces the ``np.object`` alias, which was
    # deprecated in NumPy 1.20 and removed in NumPy 1.24.
    expected_df = pd.DataFrame({
        'sample_name':
        pd.Series(['HG00280', 'HG00280']),
        'pos_start':
        pd.Series([866511, 1289367], dtype=np.int32),
        'alleles':
        pd.Series([np.array(a, dtype=object) for a in expected_alleles]),
        'id':
        pd.Series(['.', 'rs1497816']),
        'filters':
        pd.Series([np.array(f, dtype=object) for f in expected_filters])
    })
    _check_dfs(expected_df, df)
Esempio n. 5
0
def test_region_partitioned_read():
    """Check row counts for region-partitioned reads of the two-sample array.

    Also checks that a partition index equal to the partition count makes
    opening the dataset raise ``RuntimeError``.
    """
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")

    def row_count(partition):
        """Open the array with the given region partition and count rows."""
        config = tiledbvcf.ReadConfig(region_partition=partition)
        dataset = tiledbvcf.Dataset(uri, mode="r", cfg=config)
        frame = dataset.read(
            attrs=["sample_name", "pos_start", "pos_end"],
            regions=["1:12000-13000", "1:17000-18000"],
        )
        return len(frame)

    assert row_count((0, 2)) == 4
    assert row_count((1, 2)) == 2

    # Too many partitions still produces results
    assert row_count((1, 3)) == 2

    # Error: index >= num partitions
    bad_config = tiledbvcf.ReadConfig(region_partition=(2, 2))
    with pytest.raises(RuntimeError):
        tiledbvcf.Dataset(uri, mode="r", cfg=bad_config)
Esempio n. 6
0
def test_sample_partitioned_read():
    """Check sample-partitioned reads of the two-sample array.

    Each valid partition must return rows for exactly one sample. An
    over-partitioned config raises ``RuntimeError`` at read time, and an
    out-of-range partition index raises when the dataset is opened.
    """
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")
    read_kwargs = dict(attrs=["sample_name", "pos_start", "pos_end"],
                       regions=["1:12000-18000"])

    # (sample_partition, expected rows, expected sample)
    for partition, n_rows, sample in [((0, 2), 11, "HG00280"),
                                      ((1, 2), 3, "HG01762")]:
        config = tiledbvcf.ReadConfig(sample_partition=partition)
        dataset = tiledbvcf.Dataset(uri, mode="r", cfg=config)
        frame = dataset.read(**read_kwargs)
        assert len(frame) == n_rows
        assert (frame.sample_name == sample).all()

    # Error: too many partitions
    config = tiledbvcf.ReadConfig(sample_partition=(1, 3))
    dataset = tiledbvcf.Dataset(uri, mode="r", cfg=config)
    with pytest.raises(RuntimeError):
        dataset.read(**read_kwargs)

    # Error: index >= num partitions
    config = tiledbvcf.ReadConfig(sample_partition=(2, 2))
    with pytest.raises(RuntimeError):
        tiledbvcf.Dataset(uri, mode="r", cfg=config)
Esempio n. 7
0
def test_tbb_threads_config():
    """Open a dataset with 3 TBB threads, then expect 4 to be rejected.

    NOTE(review): presumably the thread count cannot be changed after the
    first open in this process -- confirm against the library documentation.
    """
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")

    first_cfg = tiledbvcf.ReadConfig(tiledb_config=["sm.num_tbb_threads=3"])
    ds = tiledbvcf.Dataset(uri, mode="r", cfg=first_cfg)

    second_cfg = tiledbvcf.ReadConfig(tiledb_config=["sm.num_tbb_threads=4"])
    with pytest.raises(RuntimeError):
        ds = tiledbvcf.Dataset(uri, mode="r", cfg=second_cfg)
Esempio n. 8
0
def test_incremental_ingest(tmp_path):
    """Ingest two BCF files in separate calls and verify record counts."""
    uri = os.path.join(tmp_path, 'dataset')
    writer = tiledbvcf.Dataset(uri, mode='w')
    # One ingest_samples call per file, in this order.
    for bcf_name in ('small.bcf', 'small2.bcf'):
        writer.ingest_samples([os.path.join(TESTS_INPUT_DIR, bcf_name)])

    # Open it back in read mode and check some queries
    reader = tiledbvcf.Dataset(uri, mode='r')
    region = ['1:12700-13400']
    assert reader.count() == 14
    assert reader.count(regions=region) == 6
    assert reader.count(samples=['HG00280'], regions=region) == 4
Esempio n. 9
0
def test_incremental_ingest(tmp_path):
    """Create a dataset, ingest two BCF files one call at a time, and count."""
    uri = os.path.join(tmp_path, "dataset")
    writer = tiledbvcf.Dataset(uri, mode="w")
    writer.create_dataset()
    # One ingest_samples call per file, in this order.
    for bcf_name in ("small.bcf", "small2.bcf"):
        writer.ingest_samples([os.path.join(TESTS_INPUT_DIR, bcf_name)])

    # Open it back in read mode and check some queries
    reader = tiledbvcf.Dataset(uri, mode="r")
    region = ["1:12700-13400"]
    assert reader.count() == 14
    assert reader.count(regions=region) == 6
    assert reader.count(samples=["HG00280"], regions=region) == 4
Esempio n. 10
0
def test_read_var_length_filters(tmp_path):
    """Ingest a VCF and read back its variable-length ``filters`` attribute.

    Both the expected and the actual frames are sorted by ``pos_start``
    before being compared with ``_check_dfs``.
    """
    uri = os.path.join(tmp_path, "dataset")
    ds = tiledbvcf.Dataset(uri, mode="w")
    samples = [os.path.join(TESTS_INPUT_DIR, "varLenFilter.vcf.gz")]
    ds.create_dataset()
    ds.ingest_samples(samples)

    ds = tiledbvcf.Dataset(uri, mode="r")
    df = ds.read(["pos_start", "filters"])

    expected_positions = [
        12141, 12546, 13354, 13375, 13396, 13414, 13452, 13520, 13545,
        17319, 17480,
    ]
    # One list of filter names per record; lengths differ between records.
    expected_filters = [
        ["PASS"],
        ["PASS"],
        ["ANEUPLOID", "LowQual"],
        ["PASS"],
        ["PASS"],
        ["ANEUPLOID", "LOWQ", "LowQual"],
        ["PASS"],
        ["PASS"],
        ["PASS"],
        ["LowQual"],
        ["PASS"],
    ]
    expected_df = pd.DataFrame({
        "pos_start":
        pd.Series(expected_positions, dtype=np.int32),
        # The builtin ``object`` replaces the ``np.object`` alias, which was
        # deprecated in NumPy 1.20 and removed in NumPy 1.24.
        "filters":
        pd.Series(
            [np.array(names, dtype=object) for names in expected_filters]),
    }).sort_values(ignore_index=True, by=["pos_start"])

    _check_dfs(expected_df, df.sort_values(ignore_index=True,
                                           by=["pos_start"]))
Esempio n. 11
0
def test_read_config():
    """Open a dataset with default and custom ``ReadConfig`` objects.

    An unknown keyword argument to ``ReadConfig`` must raise ``TypeError``.
    """
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/v3/ingested_2samples')

    # Default configuration.
    default_cfg = tiledbvcf.ReadConfig()
    ds = tiledbvcf.Dataset(uri, mode='r', cfg=default_cfg)

    # Explicit configuration.
    options = dict(
        memory_budget_mb=512,
        region_partition=(0, 3),
        tiledb_config=['sm.tile_cache_size=0', 'sm.num_reader_threads=1'])
    custom_cfg = tiledbvcf.ReadConfig(**options)
    ds = tiledbvcf.Dataset(uri, mode='r', cfg=custom_cfg)

    with pytest.raises(TypeError):
        tiledbvcf.ReadConfig(abc=123)
Esempio n. 12
0
def test_basic_ingest(tmp_path):
    """Create a dataset, ingest two BCF files in one call, and count records."""
    # Create the dataset
    uri = os.path.join(tmp_path, "dataset")
    writer = tiledbvcf.Dataset(uri, mode="w")
    bcf_paths = []
    for name in ["small.bcf", "small2.bcf"]:
        bcf_paths.append(os.path.join(TESTS_INPUT_DIR, name))
    writer.create_dataset()
    writer.ingest_samples(bcf_paths)

    # Open it back in read mode and check some queries
    reader = tiledbvcf.Dataset(uri, mode="r")
    region = ["1:12700-13400"]
    assert reader.count() == 14
    assert reader.count(regions=region) == 6
    assert reader.count(samples=["HG00280"], regions=region) == 4
Esempio n. 13
0
def test_read_null_attrs(tmp_path):
    """Read attributes that are missing for some records.

    Two BCF files are ingested and two regions are read back. The expected
    frame uses ``None`` where an attribute has no value for a record.
    """
    uri = os.path.join(tmp_path, 'dataset')
    writer = tiledbvcf.Dataset(uri, mode='w')
    writer.ingest_samples([
        os.path.join(TESTS_INPUT_DIR, name)
        for name in ['small3.bcf', 'small.bcf']
    ])

    reader = tiledbvcf.Dataset(uri, mode='r')
    wanted = [
        'sample_name', 'pos_start', 'pos_end', 'info_BaseQRankSum', 'info_DP',
        'fmt_DP', 'fmt_MIN_DP'
    ]
    df = reader.read(attrs=wanted,
                     regions=['1:12700-13400', '1:69500-69800'])

    # Expected columns, built one at a time in the same order as ``wanted``.
    sample_name = pd.Series([
        'HG00280', 'HG01762', 'HG00280', 'HG01762', 'HG00280', 'HG00280',
        'HG00280', 'HG00280', 'HG00280', 'HG00280', 'HG00280', 'HG00280'
    ])
    pos_start = pd.Series([
        12546, 12546, 13354, 13354, 13375, 13396, 69371, 69511, 69512,
        69761, 69762, 69771
    ], dtype=np.int32)
    pos_end = pd.Series([
        12771, 12771, 13374, 13389, 13395, 13413, 69510, 69511, 69760,
        69761, 69770, 69834
    ], dtype=np.int32)
    # Only the records at index 7 and 9 carry INFO values.
    info_base_q_rank_sum = pd.Series(
        [None] * 7 +
        [np.array([-0.787], dtype=np.float32), None,
         np.array([1.97], dtype=np.float32), None, None])
    info_dp = pd.Series(
        [None] * 7 +
        [np.array([89], dtype=np.int32), None,
         np.array([24], dtype=np.int32), None, None])
    fmt_dp = pd.Series([0, 0, 15, 64, 6, 2, 180, 88, 97, 24, 23, 21],
                       dtype=np.int32)
    fmt_min_dp = pd.Series([0, 0, 14, 30, 3, 1, 20, None, 24, None, 23, 19])

    expected_df = pd.DataFrame({
        'sample_name': sample_name,
        'pos_start': pos_start,
        'pos_end': pos_end,
        'info_BaseQRankSum': info_base_q_rank_sum,
        'info_DP': info_dp,
        'fmt_DP': fmt_dp,
        'fmt_MIN_DP': fmt_min_dp,
    })
    _check_dfs(expected_df, df)
Esempio n. 14
0
def test_basic_ingest(tmp_path):
    """Ingest two BCF files in a single call and verify record counts."""
    # Create the dataset
    uri = os.path.join(tmp_path, 'dataset')
    writer = tiledbvcf.Dataset(uri, mode='w')
    bcf_paths = []
    for name in ['small.bcf', 'small2.bcf']:
        bcf_paths.append(os.path.join(TESTS_INPUT_DIR, name))
    writer.ingest_samples(bcf_paths)

    # Open it back in read mode and check some queries
    reader = tiledbvcf.Dataset(uri, mode='r')
    region = ['1:12700-13400']
    assert reader.count() == 14
    assert reader.count(regions=region) == 6
    assert reader.count(samples=['HG00280'], regions=region) == 4
Esempio n. 15
0
def test_read_write_mode_exceptions():
    """Check that operations used in the wrong open mode raise.

    Ingesting through a dataset opened without an explicit mode must raise,
    and so must counting through a dataset opened with ``mode='w'``.
    """
    array_uri = os.path.join(TESTS_INPUT_DIR, 'arrays/v3/ingested_2samples')
    bcf_paths = [
        os.path.join(TESTS_INPUT_DIR, name)
        for name in ['small.bcf', 'small2.bcf']
    ]

    default_mode_ds = tiledbvcf.Dataset(array_uri)
    with pytest.raises(Exception):
        default_mode_ds.ingest_samples(bcf_paths)

    write_mode_ds = tiledbvcf.Dataset(array_uri, mode='w')
    with pytest.raises(Exception):
        write_mode_ds.count()
Esempio n. 16
0
def test_read_config():
    """Open a dataset with default and custom ``ReadConfig`` objects.

    An unknown keyword argument to ``ReadConfig`` must raise ``TypeError``.
    """
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")

    # Default configuration.
    default_cfg = tiledbvcf.ReadConfig()
    ds = tiledbvcf.Dataset(uri, mode="r", cfg=default_cfg)

    # Explicit configuration.
    options = dict(
        memory_budget_mb=512,
        region_partition=(0, 3),
        tiledb_config=["sm.tile_cache_size=0", "sm.compute_concurrency_level=1"],
    )
    custom_cfg = tiledbvcf.ReadConfig(**options)
    ds = tiledbvcf.Dataset(uri, mode="r", cfg=custom_cfg)

    with pytest.raises(TypeError):
        tiledbvcf.ReadConfig(abc=123)
Esempio n. 17
0
def test_ingest_merging_separate(tmp_path):
    """Ingest one VCF with ``chr1`` kept separate and verify record counts."""
    # Create the dataset
    uri = os.path.join(tmp_path, "dataset_merging_separate")
    writer = tiledbvcf.Dataset(uri, mode="w")
    vcf_path = os.path.join(TESTS_INPUT_DIR, "v2-DjrIAzkP-downsampled.vcf.gz")
    writer.create_dataset()
    writer.ingest_samples([vcf_path], contigs_to_keep_separate=["chr1"])

    # Open it back in read mode and check some queries
    reader = tiledbvcf.Dataset(uri, mode="r")
    total = reader.count()
    assert total == 246
    single_position = reader.count(regions=["chrX:9032893-9032893"])
    assert single_position == 1
Esempio n. 18
0
def test_read_write_mode_exceptions():
    """Check that operations used in the wrong open mode raise.

    Creating or ingesting through a dataset opened without an explicit mode
    must raise, and so must counting through a dataset opened with
    ``mode="w"``.
    """
    array_uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")
    bcf_paths = [
        os.path.join(TESTS_INPUT_DIR, name) for name in ["small.bcf", "small2.bcf"]
    ]

    default_mode_ds = tiledbvcf.Dataset(array_uri)

    with pytest.raises(Exception):
        default_mode_ds.create_dataset()

    with pytest.raises(Exception):
        default_mode_ds.ingest_samples(bcf_paths)

    write_mode_ds = tiledbvcf.Dataset(array_uri, mode="w")
    with pytest.raises(Exception):
        write_mode_ds.count()
Esempio n. 19
0
def test_incomplete_reads():
    """Step through a read that needs three batches to complete.

    Each batch must hold two rows, and ``read_completed`` must only report
    completion after the third batch.
    """
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)

    def expected(pos_end_values):
        """Build the single-column frame a batch should equal."""
        column = np.array(pos_end_values, dtype=np.int32)
        return pd.DataFrame.from_dict({"pos_end": column})

    # First batch.
    batch = test_ds.read(attrs=["pos_end"], regions=["1:12700-13400"])
    assert not test_ds.read_completed()
    assert len(batch) == 2
    _check_dfs(expected([12771, 12771]), batch)

    # Second batch.
    batch = test_ds.continue_read()
    assert not test_ds.read_completed()
    assert len(batch) == 2
    _check_dfs(expected([13374, 13389]), batch)

    # Third and final batch.
    batch = test_ds.continue_read()
    assert test_ds.read_completed()
    assert len(batch) == 2
    _check_dfs(expected([13395, 13413]), batch)
Esempio n. 20
0
def test_large_export_correctness():
    """Read four samples from an S3-hosted array using a BED file.

    Checks the total number of rows returned and the number of rows that are
    unique by ``sample_name`` / ``contig`` / ``pos_start``.
    """
    uri = "s3://tiledb-inc-demo-data/tiledbvcf-arrays/v4/vcf-samples-20"
    bed_path = os.path.join(
        TESTS_INPUT_DIR, "E001_15_coreMarks_dense_filtered.bed.gz"
    )
    columns = [
        "sample_name",
        "contig",
        "pos_start",
        "pos_end",
        "query_bed_start",
        "query_bed_end",
    ]
    sample_ids = ["v2-DjrIAzkP", "v2-YMaDHIoW", "v2-usVwJUmo", "v2-ZVudhauk"]

    ds = tiledbvcf.Dataset(uri, mode="r", verbose=True)
    df = ds.read(attrs=columns, samples=sample_ids, bed_file=bed_path)

    # total number of exported records
    total_rows = df.shape[0]
    assert total_rows == 1172081

    # number of unique exported records
    unique_rows = df[["sample_name", "contig", "pos_start"]].drop_duplicates()
    assert unique_rows.shape[0] == 1168430
Esempio n. 21
0
def test_read_limit():
    """Check that ``ReadConfig(limit=3)`` yields a three-row result."""
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/v3/ingested_2samples')
    limited_cfg = tiledbvcf.ReadConfig(limit=3)
    reader = tiledbvcf.Dataset(uri, mode='r', cfg=limited_cfg)

    columns = ['sample_name', 'pos_start', 'pos_end', 'fmt_DP', 'fmt_PL']
    query_regions = ['1:12100-13360', '1:13500-17350']
    result = reader.read(attrs=columns, regions=query_regions)
    assert len(result) == 3
Esempio n. 22
0
def test_read_limit():
    """Check that ``ReadConfig(limit=3)`` yields a three-row result."""
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")
    limited_cfg = tiledbvcf.ReadConfig(limit=3)
    reader = tiledbvcf.Dataset(uri, mode="r", cfg=limited_cfg)

    columns = ["sample_name", "pos_start", "pos_end", "fmt_DP", "fmt_PL"]
    query_regions = ["1:12100-13360", "1:13500-17350"]
    result = reader.read(attrs=columns, regions=query_regions)
    assert len(result) == 3
Esempio n. 23
0
def test_sample_and_region_partitioned_read():
    """Read with every combination of a 2-way region and 2-way sample split.

    Three combinations must return two rows of a single sample each; the
    last combination must return no rows.
    """
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")

    def read_partition(region_part, sample_part):
        """Open the array with the given partitions and read both regions."""
        config = tiledbvcf.ReadConfig(region_partition=region_part,
                                      sample_partition=sample_part)
        dataset = tiledbvcf.Dataset(uri, mode="r", cfg=config)
        return dataset.read(
            attrs=["sample_name", "pos_start", "pos_end"],
            regions=["1:12000-13000", "1:17000-18000"],
        )

    frame = read_partition((0, 2), (0, 2))
    assert len(frame) == 2
    assert (frame.sample_name == "HG00280").all()

    frame = read_partition((0, 2), (1, 2))
    assert len(frame) == 2
    assert (frame.sample_name == "HG01762").all()

    frame = read_partition((1, 2), (0, 2))
    assert len(frame) == 2
    assert (frame.sample_name == "HG00280").all()

    frame = read_partition((1, 2), (1, 2))
    assert len(frame) == 0
Esempio n. 24
0
def test_ingest_disable_merging(tmp_path):
    """Compare ingestion with merging disabled against separate-contig merging.

    The same VCF is ingested twice: once with ``contig_fragment_merging=False``
    and once with ``contigs_to_keep_separate=["chr1"]``. Both datasets must
    return identical frames and the same record counts.
    """
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=1024)
    attrs = ["sample_name", "contig", "pos_start", "pos_end"]
    samples = [
        os.path.join(TESTS_INPUT_DIR, s) for s in ["v2-DjrIAzkP-downsampled.vcf.gz"]
    ]

    # Create the dataset with fragment merging disabled
    uri = os.path.join(tmp_path, "dataset_disable_merging")
    ds = tiledbvcf.Dataset(uri, mode="w")
    ds.create_dataset()
    ds.ingest_samples(samples, contig_fragment_merging=False)

    # Open it back in read mode and check some queries
    ds = tiledbvcf.Dataset(uri, cfg=cfg, mode="r", verbose=False)
    df = ds.read(attrs=attrs)
    assert ds.count() == 246
    assert ds.count(regions=["chrX:9032893-9032893"]) == 1

    # Create the dataset with chr1 kept separate
    uri = os.path.join(tmp_path, "dataset_merging_separate")
    ds2 = tiledbvcf.Dataset(uri, mode="w", verbose=True)
    ds2.create_dataset()
    ds2.ingest_samples(samples, contigs_to_keep_separate=["chr1"])

    # Open it back in read mode and check some queries
    ds2 = tiledbvcf.Dataset(uri, cfg=cfg, mode="r", verbose=True)
    df2 = ds2.read(attrs=attrs)
    assert df.equals(df2)

    # Check the second dataset here; the first was already verified above.
    assert ds2.count() == 246
    assert ds2.count(regions=["chrX:9032893-9032893"]) == 1
Esempio n. 25
0
def test_incomplete_reads():
    """Exercise ``read_dask`` with several partitioning options.

    Region, sample, combined and no partitioning must all produce the same
    14 expected rows. ``limit_partitions=2`` must give a dask frame with two
    partitions.
    """
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/v3/ingested_2samples')
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.Dataset(uri, mode='r', cfg=cfg)

    sample_name = pd.Series(
        ['HG00280', 'HG01762', 'HG00280', 'HG01762', 'HG00280',
         'HG01762', 'HG00280', 'HG00280', 'HG00280', 'HG00280',
         'HG00280', 'HG00280', 'HG00280', 'HG00280'])
    pos_start = pd.Series(
        [12141, 12141, 12546, 12546, 13354, 13354, 13375, 13396,
         13414, 13452, 13520, 13545, 17319, 17480], dtype=np.int32)
    pos_end = pd.Series(
        [12277, 12277, 12771, 12771, 13374, 13389, 13395, 13413,
         13451, 13519, 13544, 13689, 17479, 17486], dtype=np.int32)
    expected_df = pd.DataFrame({'sample_name': sample_name,
                                'pos_start': pos_start,
                                'pos_end': pos_end})

    def by_sample(frame):
        """Sort by sample name and renumber the index from zero."""
        return frame.sort_values('sample_name').reset_index(drop=True)

    # Region partitions
    lazy = test_ds.read_dask(attrs=['sample_name', 'pos_start', 'pos_end'],
                             region_partitions=10)
    _check_dfs(expected_df, lazy.compute())

    # Sample partitions (we have to sort to check the result)
    lazy = test_ds.read_dask(attrs=['sample_name', 'pos_start', 'pos_end'],
                             sample_partitions=2)
    actual = by_sample(lazy.compute())
    _check_dfs(by_sample(expected_df), actual)

    # Both
    lazy = test_ds.read_dask(attrs=['sample_name', 'pos_start', 'pos_end'],
                             region_partitions=10, sample_partitions=2)
    actual = by_sample(lazy.compute())
    _check_dfs(by_sample(expected_df), actual)

    # No partitioning
    lazy = test_ds.read_dask(attrs=['sample_name', 'pos_start', 'pos_end'])
    _check_dfs(expected_df, lazy.compute())

    # Subset of partitions (limit_partitions)
    lazy = test_ds.read_dask(attrs=['sample_name', 'pos_start', 'pos_end'],
                             region_partitions=10, sample_partitions=2,
                             limit_partitions=2)
    assert lazy.npartitions == 2
Esempio n. 26
0
def test_map_incomplete():
    """Apply a row filter through ``map_dask`` and check the surviving rows."""
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/v3/ingested_2samples')
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.Dataset(uri, mode='r', cfg=cfg)

    expected_df = pd.DataFrame({
        'sample_name': pd.Series(['HG00280', 'HG01762']),
        'pos_start': pd.Series([12141, 12141], dtype=np.int32),
        'pos_end': pd.Series([12277, 12277], dtype=np.int32),
    })

    def keep_early_positions(df):
        """Keep rows whose doubled ``pos_start`` is below 25000."""
        return df[df.pos_start * 2 < 25000]

    # Region partitions
    lazy = test_ds.map_dask(keep_early_positions,
                            attrs=['sample_name', 'pos_start', 'pos_end'],
                            region_partitions=10)
    _check_dfs(expected_df, lazy.compute())
Esempio n. 27
0
def test_incomplete_read_generator():
    """Collect every batch yielded by ``read_iter`` and check the total.

    The batches are concatenated into one frame, which must hold the six
    expected ``pos_end`` values in order.
    """
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/v3/ingested_2samples')
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.Dataset(uri, mode='r', cfg=cfg)

    batches = list(
        test_ds.read_iter(attrs=['pos_end'], regions=['1:12700-13400']))
    assert batches, 'read_iter yielded no batches'

    # ``DataFrame.append`` was removed in pandas 2.0; ``pd.concat`` is the
    # supported way to stitch the batches together.
    overall_df = pd.concat(batches, ignore_index=True)

    assert len(overall_df) == 6
    _check_dfs(
        pd.DataFrame.from_dict({
            'pos_end':
            np.array([12771, 12771, 13374, 13389, 13395, 13413],
                     dtype=np.int32)
        }), overall_df)
Esempio n. 28
0
def test_map_incomplete():
    """Apply a row filter through ``map_dask`` and check the surviving rows."""
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)

    expected_columns = {
        "sample_name": pd.Series(["HG00280", "HG01762"]),
        "pos_start": pd.Series([12141, 12141], dtype=np.int32),
        "pos_end": pd.Series([12277, 12277], dtype=np.int32),
    }
    expected_df = pd.DataFrame(expected_columns)

    def keep_early_positions(df):
        """Keep rows whose doubled ``pos_start`` is below 25000."""
        return df[df.pos_start * 2 < 25000]

    # Region partitions
    lazy = test_ds.map_dask(  # pylint:disable=no-member
        keep_early_positions,
        attrs=["sample_name", "pos_start", "pos_end"],
        region_partitions=10,
    )
    _check_dfs(expected_df, lazy.compute())
Esempio n. 29
0
def test_read_null_attrs(tmp_path):
    """Read attributes that are missing for some records.

    Two BCF files are ingested and two regions are read back. The expected
    frame uses ``None`` where an attribute has no value for a record. Both
    frames are sorted by ``sample_name`` / ``pos_start`` before comparison.
    """
    uri = os.path.join(tmp_path, "dataset")
    writer = tiledbvcf.Dataset(uri, mode="w")
    bcf_paths = [
        os.path.join(TESTS_INPUT_DIR, name)
        for name in ["small3.bcf", "small.bcf"]
    ]
    writer.create_dataset()
    writer.ingest_samples(bcf_paths)

    reader = tiledbvcf.Dataset(uri, mode="r")
    wanted = [
        "sample_name", "pos_start", "pos_end", "info_BaseQRankSum",
        "info_DP", "fmt_DP", "fmt_MIN_DP",
    ]
    df = reader.read(attrs=wanted,
                     regions=["1:12700-13400", "1:69500-69800"])

    # Expected columns, built one at a time in the same order as ``wanted``.
    sample_name = pd.Series(
        ["HG00280"] * 4 + ["HG01762"] * 2 + ["HG00280"] * 6)
    pos_start = pd.Series(
        [12546, 13354, 13375, 13396, 12546, 13354,
         69371, 69511, 69512, 69761, 69762, 69771],
        dtype=np.int32,
    )
    pos_end = pd.Series(
        [12771, 13374, 13395, 13413, 12771, 13389,
         69510, 69511, 69760, 69761, 69770, 69834],
        dtype=np.int32,
    )
    # Only the records at index 7 and 9 carry INFO values.
    info_base_q_rank_sum = pd.Series(
        [None] * 7 + [
            np.array([-0.787], dtype=np.float32),
            None,
            np.array([1.97], dtype=np.float32),
            None,
            None,
        ])
    info_dp = pd.Series(
        [None] * 7 + [
            np.array([89], dtype=np.int32),
            None,
            np.array([24], dtype=np.int32),
            None,
            None,
        ])
    fmt_dp = pd.Series([0, 15, 6, 2, 0, 64, 180, 88, 97, 24, 23, 21],
                       dtype=np.int32)
    fmt_min_dp = pd.Series([0, 14, 3, 1, 0, 30, 20, None, 24, None, 23, 19])

    sort_keys = ["sample_name", "pos_start"]
    expected_df = pd.DataFrame({
        "sample_name": sample_name,
        "pos_start": pos_start,
        "pos_end": pos_end,
        "info_BaseQRankSum": info_base_q_rank_sum,
        "info_DP": info_dp,
        "fmt_DP": fmt_dp,
        "fmt_MIN_DP": fmt_min_dp,
    }).sort_values(ignore_index=True, by=sort_keys)
    _check_dfs(expected_df, df.sort_values(ignore_index=True, by=sort_keys))
Esempio n. 30
0
def test_ds_attrs():
    """Open the pre-ingested ``ingested_2samples_GT_DP_PL`` array.

    NOTE(review): this returns a dataset rather than asserting anything, so
    it is presumably meant to be a pytest fixture whose decorator is not
    visible here -- confirm.
    """
    array_uri = os.path.join(TESTS_INPUT_DIR,
                             "arrays/v3/ingested_2samples_GT_DP_PL")
    return tiledbvcf.Dataset(array_uri)