Example #1
def test_multiprocess_table(ms, nprocs):
    import time
    import threading
    import dask.threaded as dt

    # Don't fork while threads are alive
    # https://rachelbythebay.com/w/2011/06/07/forked/
    # Close and clean up the default dask threadpools
    with dt.pools_lock:
        if dt.default_pool is not None:
            dt.default_pool.close()
            dt.default_pool = None

        for thread in list(dt.pools.keys()):
            for p in dt.pools.pop(thread).values():
                p.close()

    # No TableProxies or Executors (with ThreadPools) live
    assert_liveness(0, 0)

    # Wait for other threads to die
    time.sleep(0.1)

    # Only main thread is alive
    assert len(threading.enumerate()) == 1

    from multiprocessing import Pool
    pool = Pool(nprocs)

    try:
        args = [(ms, i) for i in range(nprocs)]
        assert all(pool.map(_proc_map_fn, args))
    finally:
        pool.close()
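
`_proc_map_fn` is not shown in these examples. A minimal sketch of what such a worker might look like, assuming `pt` is `pyrap.tables` as elsewhere in these tests and that the helper simply re-opens the Measurement Set inside the child process and reads a column (the body below is hypothetical, not dask-ms code):

def _proc_map_fn(args):
    # Hypothetical worker passed to pool.map: unpack the (ms, index)
    # tuple, open the table in this child process and read a column to
    # show that table access works after the fork.
    import pyrap.tables as pt

    ms, _ = args

    with pt.table(ms, ack=False, readonly=True) as T:
        return T.getcol("ANTENNA1") is not None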
Example #2
def test_dataset_add_column(ms, dtype):
    datasets = read_datasets(ms, [], [], [])
    assert len(datasets) == 1
    ds = datasets[0]

    # Create the dask array
    bitflag = da.zeros_like(ds.DATA.data, dtype=dtype)
    # Assign keyword attribute
    col_kw = {
        "BITFLAG": {
            'FLAGSETS': 'legacy,cubical',
            'FLAGSET_legacy': 1,
            'FLAGSET_cubical': 2
        }
    }
    # Assign variable onto the dataset
    nds = ds.assign(BITFLAG=(("row", "chan", "corr"), bitflag))
    writes = write_datasets(ms,
                            nds, ["BITFLAG"],
                            descriptor='ratt_ms',
                            column_keywords=col_kw)

    dask.compute(writes)

    del datasets, ds, writes, nds
    assert_liveness(0, 0)

    with pt.table(ms, readonly=False, ack=False, lockoptions='auto') as T:
        bf = T.getcol("BITFLAG")
        assert T.getcoldesc("BITFLAG")['keywords'] == col_kw['BITFLAG']
        assert bf.dtype == dtype
Example #3
def test_embedding_table_proxy_in_taql(ms, reverse):
    """ Test using a TableProxy to create a TAQL TableProxy """
    proxy = TableProxy(pt.table, ms, ack=False, readonly=True)
    query = "SELECT UNIQUE ANTENNA1 FROM $1"
    taql_proxy = TableProxy(taql_factory, query, tables=[proxy])
    assert_array_equal(taql_proxy.getcol("ANTENNA1").result(), [0, 1, 2])

    # TAQL and original table
    assert_liveness(2, 1)

    if reverse:
        del proxy
        # TAQL still references original table
        assert_liveness(2, 1)

        # Removing the TAQL proxy now results in everything clearing up
        del taql_proxy
        assert_liveness(0, 0)
    else:
        # Removing the TAQL proxy should leave the original table
        del taql_proxy
        assert_liveness(1, 1)

        # Removing the original proxy removes the last table
        del proxy
        assert_liveness(0, 0)
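
assert_liveness is used throughout these examples but never defined here. One way such a helper could be implemented, assuming the library keeps weak-reference registries of live TableProxy and Executor objects (the registry names below are hypothetical, not dask-ms internals):

import gc
import weakref

# Hypothetical registries; a real implementation would add each new
# TableProxy/Executor to these on construction.
_live_table_proxies = weakref.WeakSet()
_live_executors = weakref.WeakSet()


def assert_liveness(table_proxies, executors):
    # Collect cyclic garbage so recently deleted objects drop out
    # of the weak sets before counting them
    gc.collect()
    assert len(_live_table_proxies) == table_proxies
    assert len(_live_executors) == executors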
Example #4
def test_taql_proxy_pickling(ms):
    """ Test taql pickling """
    proxy = TableProxy(pt.taql, f"SELECT UNIQUE ANTENNA1 FROM '{ms}'")
    proxy2 = pickle.loads(pickle.dumps(proxy))

    assert_liveness(1, 1)

    assert proxy is proxy2
    assert tokenize(proxy) == tokenize(proxy2)

    del proxy, proxy2
    assert_liveness(0, 0)
Example #5
def test_multiprocess_table(ms, nprocs):
    # Check liveness here so that we don't fork while threads are alive
    # https://rachelbythebay.com/w/2011/06/07/forked/
    assert_liveness(0, 0)

    from multiprocessing import Pool
    pool = Pool(nprocs)

    try:
        args = [(ms, i) for i in range(nprocs)]
        assert all(pool.map(_proc_map_fn, args))
    finally:
        pool.close()
Example #6
def test_table_proxy_pickling(ms):
    """ Test table pickling """
    proxy = TableProxy(pt.table, ms, ack=False, readonly=False)
    proxy2 = pickle.loads(pickle.dumps(proxy))

    assert_liveness(1, 1)

    # Same object and tokens
    assert proxy is proxy2
    assert tokenize(proxy) == tokenize(proxy2)

    del proxy, proxy2

    assert_liveness(0, 0)
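
The `proxy is proxy2` assertions in Examples #4 and #6 imply that unpickling resolves to an already-cached instance rather than constructing a new object. A small self-contained sketch of that caching pattern (illustrative only, not the dask-ms implementation):

import pickle
import weakref

_instance_cache = weakref.WeakValueDictionary()


class CachedProxy:
    """Toy stand-in for an instance-cached table proxy."""
    def __new__(cls, *args):
        # Return the existing instance for these arguments, if any
        try:
            return _instance_cache[args]
        except KeyError:
            instance = object.__new__(cls)
            instance._args = args
            _instance_cache[args] = instance
            return instance

    def __reduce__(self):
        # Unpickling re-invokes the constructor, which hits the cache,
        # so a pickle round-trip yields the same live object
        return (CachedProxy, self._args)


p1 = CachedProxy("test.ms")
p2 = pickle.loads(pickle.dumps(p1))
assert p1 is p2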
Example #7
def test_row_grouping(spw_table, spw_chan_freqs, chunks):
    """ Test grouping on single rows """
    datasets = read_datasets(spw_table, [], ["__row__"], [], chunks=chunks)

    assert_liveness(2, 1)

    assert len(datasets) == len(spw_chan_freqs)

    for i, chan_freq in enumerate(spw_chan_freqs):
        assert_array_equal(datasets[i].CHAN_FREQ.data[0], chan_freq)
        assert_array_equal(datasets[i].NUM_CHAN.data[0], chan_freq.shape[0])

    del datasets
    assert_liveness(0, 0)
Example #8
def test_ordering_multiple_groups(ms, group_cols, index_cols):
    group_taql = group_ordering_taql(table_proxy(ms), group_cols, index_cols)
    assert_liveness(2, 1)
    orders = group_row_ordering(group_taql, group_cols, index_cols, [{
        'row': 2
    }])
    assert_liveness(2, 1)
    first_rows = group_taql.getcol("__firstrow__").result()
    assert_liveness(2, 1)

    assert len(first_rows) == len(orders) == 6

    assert_array_equal(first_rows, [0, 1, 3, 4, 7, 8])

    rowid_arrays = tuple(o[0] for o in orders)
    rowids = dask.compute(rowid_arrays)[0]

    assert_array_equal(rowids[0], [2, 0])
    assert_array_equal(rowids[1], [1])
    assert_array_equal(rowids[2], [5, 3])
    assert_array_equal(rowids[3], [6, 4])
    assert_array_equal(rowids[4], [9, 7])
    assert_array_equal(rowids[5], [8])

    del first_rows, orders, rowid_arrays, group_taql
    assert_liveness(0, 0)
Example #9
def test_row_ordering_multiple_groups(ms, group_cols, index_cols, chunks):
    group_taql = group_ordering_taql(table_proxy(ms), group_cols, index_cols)
    assert_liveness(2, 1)
    orders = group_row_ordering(group_taql, group_cols, index_cols, chunks)
    assert_liveness(2, 1)
    first_rows = group_taql.getcol("__firstrow__").result()
    assert_liveness(2, 1)

    # We get two groups out
    assert len(orders) == len(first_rows) == 2
    assert_array_equal(first_rows, [0, 7])

    rowid_arrays = tuple(o[0] for o in orders)
    rowids = dask.compute(rowid_arrays)[0]

    # Check the two resulting groups

    # Normalise chunks to match that of the output array
    row_chunks = chunks[0]['row']
    expected_chunks = da.core.normalize_chunks(row_chunks, (7, ))
    assert_array_equal(rowids[0], [6, 5, 4, 3, 2, 1, 0])
    assert rowid_arrays[0].chunks == expected_chunks

    # If chunks are only supplied for the first group, re-use its chunking
    row_chunks = chunks[0]['row'] if len(chunks) == 1 else chunks[1]['row']
    expected_chunks = da.core.normalize_chunks(row_chunks, (3, ))
    assert_array_equal(rowids[1], [9, 8, 7])
    assert rowid_arrays[1].chunks == expected_chunks

    del first_rows, orders, rowid_arrays, group_taql
    assert_liveness(0, 0)
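
For reference, da.core.normalize_chunks expands a scalar or partial chunk specification into explicit per-dimension chunk widths for a given shape, which is what the expected_chunks comparisons above rely on. A quick illustration:

import dask.array as da

# A chunk size of 2 over 7 rows becomes explicit chunk widths
assert da.core.normalize_chunks(2, (7,)) == ((2, 2, 2, 1),)

# Explicit chunk widths are passed through unchanged
assert da.core.normalize_chunks((3, 4), (7,)) == ((3, 4),)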
Example #10
def test_table_proxy(ms):
    """ Base table proxy test """
    tp = TableProxy(pt.table, ms, ack=False, readonly=False)
    tq = TableProxy(pt.taql, f"SELECT UNIQUE ANTENNA1 FROM '{ms}'")

    assert_liveness(2, 1)

    assert tp.nrows().result() == 10
    assert tq.nrows().result() == 3

    # Different tokens
    assert tokenize(tp) != tokenize(tq)

    del tp, tq

    assert_liveness(0, 0)
Example #11
def test_dataset_assign(ms):
    """ Test dataset assignment """
    datasets = read_datasets(ms, [], [], [])

    assert len(datasets) == 1
    ds = datasets[0]

    # Assign on an existing column is easier because we can
    # infer the dimension schema from it
    nds = ds.assign(TIME=(ds.TIME.dims, ds.TIME.data + 1))
    assert ds.DATA.data is nds.DATA.data
    assert ds.TIME.data is not nds.TIME.data
    assert_array_equal(nds.TIME.data, ds.TIME.data + 1)

    # We have to explicitly supply a dimension schema
    nds = ds.assign(ANTENNA3=(("row", ), ds.ANTENNA1.data + 3))
    assert_array_equal(ds.ANTENNA1.data + 3, nds.ANTENNA3.data)

    dims = ds.dims
    chunks = ds.chunks

    if have_xarray:
        match = "'row': length 9 on 'ANTENNA4'"
    else:
        match = ("Existing dimension size 9 for dimension 'row' "
                 "is inconsistent with same dimension 10 of array ANTENNA4")

    with pytest.raises(ValueError, match=match):
        array = da.zeros(dims['row'] - 1, chunks['row'])
        nds = ds.assign(ANTENNA4=(("row", ), array))
        nds.dims

    assert chunks['row'] == (10, )

    if have_xarray:
        match = "Object has inconsistent chunks along dimension row."
    else:
        match = r"chunking \(4, 4, 2\) for dim"

    with pytest.raises(ValueError, match=match):
        array = da.zeros(dims['row'], chunks=4)
        nds = ds.assign(ANTENNA4=(("row", ), array))
        nds.chunks

    del datasets, ds, nds
    assert_liveness(0, 0)
Example #12
def test_column_metadata(ms, column, shape, chunks, table_schema, dtype):
    table_proxy = TableProxy(pt.table, ms, readonly=True, ack=False)
    assert_liveness(1, 1)

    try:
        dims = table_schema[column]['dims']
    except KeyError:
        dims = tuple("%s-%d" % (column, i) for i in range(1, len(shape) + 1))

    meta = column_metadata(column, table_proxy, table_schema, dict(chunks))

    assert meta.shape == shape
    assert meta.dims == dims
    assert meta.chunks == [c[1] for c in chunks[:len(meta.shape)]]
    assert meta.dtype == dtype

    del table_proxy
    assert_liveness(0, 0)
Example #13
def test_dataset(ms, select_cols, group_cols, index_cols, shapes, chunks):
    """ Test dataset creation """
    datasets = read_datasets(ms,
                             select_cols,
                             group_cols,
                             index_cols,
                             chunks=chunks)
    # (1) Read-only TableProxy
    # (2) Read-only TAQL TableProxy
    assert_liveness(2, 1)

    chans = shapes['chan']
    corrs = shapes['corr']

    # Expected output chunks
    echunks = {
        'chan': normalize_chunks(chunks.get('chan', chans),
                                 shape=(chans, ))[0],
        'corr': normalize_chunks(chunks.get('corr', corrs), shape=(corrs, ))[0]
    }

    for ds in datasets:
        compute_dict = {}

        for k, v in ds.data_vars.items():
            compute_dict[k] = v.data
            assert v.dtype == v.data.dtype

        res = dask.compute(compute_dict)[0]

        assert res['DATA'].shape[1:] == (chans, corrs)
        assert 'STATE_ID' in res
        assert 'TIME' in res

        chunks = ds.chunks
        assert chunks["chan"] == echunks['chan']
        assert chunks["corr"] == echunks['corr']

        dims = ds.dims
        dims.pop('row')  # row changes
        assert dims == {"chan": shapes['chan'], "corr": shapes['corr']}

    del ds, datasets, compute_dict, v
    assert_liveness(0, 0)
Example #14
def test_dataset_add_string_column(ms):
    datasets = read_datasets(ms, [], [], [])
    assert len(datasets) == 1
    ds = datasets[0]
    dims = ds.dims

    name_list = ["BOB"] * dims['row']
    names = np.asarray(name_list, dtype=object)
    names = da.from_array(names, chunks=ds.TIME.chunks)

    nds = ds.assign(NAMES=(("row", ), names))

    writes = write_datasets(ms, nds, ["NAMES"])
    dask.compute(writes)

    del datasets, ds, writes, nds
    assert_liveness(0, 0)

    with pt.table(ms, readonly=False, ack=False, lockoptions='auto') as T:
        assert name_list == T.getcol("NAMES")
Example #15
def test_proxy_finalization(tmpdir_factory, epochs, iterations):
    """
    Test that we can create many TableProxy objects
    associated with multiple Executors
    in multiple threads, get some data and that
    they, as well as their associated executor are
    correctly finalized
    """

    data_path = tmpdir_factory.mktemp('data')
    ascii_desc = data_path.join('ascii.txt')

    with open(str(ascii_desc), 'w') as f:
        f.write(ASCII_TABLE)

    futures = []

    def _getcol(tp, column):
        return tp.result().getcol(column)

    with cf.ThreadPoolExecutor(8) as pool:
        for e in range(epochs):
            # Iteration
            for i in range(iterations):
                path = data_path.join("CASA-%d-%d.table" % (e, i))

                tab_fut = pool.submit(TableProxy,
                                      pt.tablefromascii,
                                      str(path),
                                      str(ascii_desc),
                                      ack=False,
                                      __executor_key__="epoch-%d" % i)
                data = pool.submit(_getcol, tab_fut, "DATA")
                u = pool.submit(_getcol, tab_fut, "U")
                futures.append(data)
                futures.append(u)

        futures, _ = cf.wait(futures)

    del futures, data, u, tab_fut
    assert_liveness(0, 0)
Example #16
def test_proxy_dask_embedding(ms):
    """
    Test that an embedded proxy in the graph stays alive
    and dies at the appropriate times
    """
    def _ant1_factory(ms):
        proxy = TableProxy(pt.table, ms, ack=False, readonly=False)
        nrows = proxy.nrows().result()

        name = 'ant1'
        row_chunk = 2
        layers = {}
        chunks = []

        for c, sr in enumerate(range(0, nrows, row_chunk)):
            er = min(sr + row_chunk, nrows)
            chunk_size = er - sr
            chunks.append(chunk_size)
            layers[(name, c)] = (proxy.getcol, "ANTENNA1", sr, chunk_size)

        # Create array
        graph = HighLevelGraph.from_collections(name, layers, [])
        ant1 = da.Array(graph, name, (tuple(chunks), ), dtype=np.int32)
        # Evaluate futures
        return ant1.map_blocks(lambda f: f.result(), dtype=ant1.dtype)

    ant1 = _ant1_factory(ms)

    # Proxy and executor are embedded in the graph
    assert_liveness(1, 1)

    a1 = ant1.compute()

    with pt.table(ms, readonly=False, ack=False) as T:
        assert_array_equal(a1, T.getcol("ANTENNA1"))

    # Delete the graph
    del ant1

    # Caches are now clear
    assert_liveness(0, 0)
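
The key trick in _ant1_factory above is that each graph task yields a concurrent.futures Future, and the trailing map_blocks call resolves those futures into ndarrays. A stripped-down, self-contained sketch of the same pattern without dask-ms (all names here are illustrative):

import concurrent.futures as cf

import dask.array as da
import numpy as np
from dask.highlevelgraph import HighLevelGraph

executor = cf.ThreadPoolExecutor(1)


def _read_chunk(start, size):
    # Stand-in for proxy.getcol: returns a Future resolving to a chunk
    return executor.submit(lambda: np.arange(start, start + size))


name = "demo-read"
layers = {(name, 0): (_read_chunk, 0, 2),
          (name, 1): (_read_chunk, 2, 3)}
graph = HighLevelGraph.from_collections(name, layers, [])
future_blocks = da.Array(graph, name, chunks=((2, 3),), dtype=np.int64)

# Resolve the embedded futures into concrete chunks
values = future_blocks.map_blocks(lambda f: f.result(), dtype=np.int64)
assert np.array_equal(values.compute(), np.arange(5))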
Example #17
def test_dataset_multidim_string_column(tmp_path, chunks):
    row = sum(chunks['row'])

    name_list = [["X-%d" % i, "Y-%d" % i, "Z-%d" % i] for i in range(row)]
    np_names = np.array(name_list, dtype=object)
    names = da.from_array(np_names, chunks=(chunks['row'], np_names.shape[1]))

    ds = Dataset({"POLARIZATION_TYPE": (("row", "xyz"), names)})
    table_name = str(tmp_path / "test.table")
    writes = write_datasets(table_name, ds, ["POLARIZATION_TYPE"])
    dask.compute(writes)

    del writes
    assert_liveness(0, 0)

    datasets = read_datasets(table_name, [], [], [],
                             chunks={'row': chunks['row']})
    assert len(datasets) == 1
    assert_array_equal(datasets[0].POLARIZATION_TYPE.data, np_names)
    del datasets
    assert_liveness(0, 0)
Example #18
def test_dataset_updates(ms, select_cols,
                         group_cols, index_cols,
                         shapes, chunks):
    """ Test dataset writes """

    # Get original STATE_ID and DATA
    with pt.table(ms, ack=False, readonly=True, lockoptions='auto') as T:
        original_state_id = T.getcol("STATE_ID")
        original_data = T.getcol("DATA")

    try:
        datasets = read_datasets(ms, select_cols, group_cols,
                                 index_cols, chunks=chunks)
        assert_liveness(2, 1)

        # Test writes
        writes = []
        states = []
        datas = []

        # Create write operations and execute them
        for i, ds in enumerate(datasets):
            state_var = (("row",), ds.STATE_ID.data + 1)
            data_var = (("row", "chan", "corr"), ds.DATA.data + 1, {})
            states.append(state_var[1])
            datas.append(data_var[1])
            new_ds = ds.assign(STATE_ID=state_var, DATA=data_var)
            writes.append(write_datasets(ms, new_ds, ["STATE_ID", "DATA"]))

        _, states, datas = dask.compute(writes, states, datas)

        # NOTE(sjperkins)
        # Interesting behaviour here. If these objects are not
        # cleared up at this point, attempts to re-open the table below
        # can fail, reproducing https://github.com/ska-sa/dask-ms/issues/26
        # Adding auto-locking to the table opening command seems to fix
        # this somehow
        del ds, new_ds, datasets, writes, state_var, data_var
        assert_liveness(0, 0)

        datasets = read_datasets(ms, select_cols, group_cols,
                                 index_cols, chunks=chunks)

        for i, (ds, state, data) in enumerate(zip(datasets, states, datas)):
            assert_array_equal(ds.STATE_ID.data, state)
            assert_array_equal(ds.DATA.data, data)

        del ds, datasets
        assert_liveness(0, 0)
    finally:
        # Restore original STATE_ID
        with pt.table(ms, ack=False, readonly=False, lockoptions='auto') as T:
            state_id = T.getcol("STATE_ID")
            data = T.getcol("DATA")
            T.putcol("STATE_ID", original_state_id)
            T.putcol("DATA", original_data)

    # Compare against expected result
    assert_array_equal(original_state_id + 1, state_id)
    assert_array_equal(original_data + 1, data)
Example #19
def test_dataset_assign(ms):
    """ Test dataset assignment """
    datasets = read_datasets(ms, [], [], [])

    assert len(datasets) == 1
    ds = datasets[0]

    # Assign on an existing column is easier because we can
    # infer the dimension schema from it
    nds = ds.assign(TIME=ds.TIME.data + 1)
    assert ds.DATA.data is nds.DATA.data
    assert ds.TIME.data is not nds.TIME.data
    assert_array_equal(nds.TIME.data, ds.TIME.data + 1)

    # This doesn't work for new columns
    with pytest.raises(ValueError, match="Couldn't find existing dimension"):
        ds.assign(ANTENNA3=ds.ANTENNA1.data + 3)

    # We have to explicitly supply a dimension schema
    nds = ds.assign(ANTENNA3=(("row", ), ds.ANTENNA1.data + 3))
    assert_array_equal(ds.ANTENNA1.data + 3, nds.ANTENNA3.data)

    dims = ds.dims
    chunks = ds.chunks

    with pytest.raises(ValueError, match="size 9 for dimension 'row'"):
        array = da.zeros(dims['row'] - 1, chunks['row'])
        nds = ds.assign(ANTENNA4=(("row", ), array))
        nds.dims

    assert chunks['row'] == (10, )

    with pytest.raises(ValueError, match=r"chunking \(4, 4, 2\) for dim"):
        array = da.zeros(dims['row'], chunks=4)
        nds = ds.assign(ANTENNA4=(("row", ), array))
        nds.chunks

    del datasets, ds, nds
    assert_liveness(0, 0)
Example #20
def test_row_ordering_no_group(ms, index_cols, chunks):
    order_taql = ordering_taql(table_proxy(ms), index_cols)
    assert_liveness(2, 1)
    orders = row_ordering(order_taql, index_cols, chunks)
    assert_liveness(2, 1)

    # Normalise chunks to match that of the output array
    expected_chunks = da.core.normalize_chunks(chunks['row'], (10, ))

    assert orders[0].chunks == expected_chunks

    rowids = dask.compute(orders[0])[0]
    assert_array_equal(rowids, [9, 8, 7, 6, 5, 4, 3, 2, 1, 0])

    del orders, order_taql
    assert_liveness(0, 0)