def test_dataset_updates(ms, select_cols, group_cols,
                         index_cols, shapes, chunks):
    """ Test dataset writes """

    # Get original STATE_ID and DATA
    with pt.table(ms, ack=False, readonly=True, lockoptions='auto') as T:
        original_state_id = T.getcol("STATE_ID")
        original_data = T.getcol("DATA")

    try:
        datasets = read_datasets(ms, select_cols, group_cols,
                                 index_cols, chunks=chunks)
        assert_liveness(2, 1)

        # Test writes
        writes = []
        states = []
        datas = []

        # Create write operations and execute them
        for ds in datasets:
            state_var = (("row",), ds.STATE_ID.data + 1)
            data_var = (("row", "chan", "corr"), ds.DATA.data + 1, {})
            states.append(state_var[1])
            datas.append(data_var[1])
            new_ds = ds.assign(STATE_ID=state_var, DATA=data_var)
            writes.append(write_datasets(ms, new_ds, ["STATE_ID", "DATA"]))

        _, states, datas = dask.compute(writes, states, datas)

        # NOTE(sjperkins)
        # Interesting behaviour here. If these objects are not
        # cleared up at this point, attempts to re-open the table below
        # can fail, reproducing https://github.com/ska-sa/dask-ms/issues/26
        # Adding auto-locking to the table opening command seems to fix
        # this somehow
        del ds, new_ds, datasets, writes, state_var, data_var
        assert_liveness(0, 0)

        datasets = read_datasets(ms, select_cols, group_cols,
                                 index_cols, chunks=chunks)

        for ds, state, data in zip(datasets, states, datas):
            assert_array_equal(ds.STATE_ID.data, state)
            assert_array_equal(ds.DATA.data, data)

        del ds, datasets
        assert_liveness(0, 0)
    finally:
        # Restore original STATE_ID and DATA
        with pt.table(ms, ack=False, readonly=False,
                      lockoptions='auto') as T:
            state_id = T.getcol("STATE_ID")
            data = T.getcol("DATA")
            T.putcol("STATE_ID", original_state_id)
            T.putcol("DATA", original_data)

    # Compare the mutated columns against the expected result
    assert_array_equal(original_state_id + 1, state_id)
    assert_array_equal(original_data + 1, data)


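# A condensed sketch of the read/update/write round trip exercised above,
# assuming the same read_datasets/write_datasets API. It is not collected
# as a test and reuses the fixtures' Measurement Set path.
def _update_round_trip_sketch(ms):
    (ds,) = read_datasets(ms, ["STATE_ID"], [], [])
    nds = ds.assign(STATE_ID=(("row",), ds.STATE_ID.data + 1))
    dask.compute(write_datasets(ms, nds, ["STATE_ID"]))

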
def test_dataset_add_column(ms, dtype):
    datasets = read_datasets(ms, [], [], [])
    assert len(datasets) == 1
    ds = datasets[0]

    # Create the dask array
    bitflag = da.zeros_like(ds.DATA.data, dtype=dtype)

    # Column keywords to attach on write
    col_kw = {
        "BITFLAG": {
            'FLAGSETS': 'legacy,cubical',
            'FLAGSET_legacy': 1,
            'FLAGSET_cubical': 2,
        }
    }

    # Assign variable onto the dataset
    nds = ds.assign(BITFLAG=(("row", "chan", "corr"), bitflag))
    writes = write_datasets(ms, nds, ["BITFLAG"],
                            descriptor='ratt_ms',
                            column_keywords=col_kw)
    dask.compute(writes)

    del datasets, ds, writes, nds
    assert_liveness(0, 0)

    with pt.table(ms, readonly=False, ack=False, lockoptions='auto') as T:
        bf = T.getcol("BITFLAG")
        assert T.getcoldesc("BITFLAG")['keywords'] == col_kw['BITFLAG']
        assert bf.dtype == dtype


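# A hedged aside, not part of the test run: the FLAGSET_* keyword values
# written above (1 and 2) look like bitmask bits, which is how
# CubiCal-style BITFLAG columns are conventionally interpreted. The names
# below are illustrative assumptions, not dask-ms API.
def _bitflag_interpretation_sketch():
    LEGACY_BIT = 1       # from FLAGSET_legacy
    CUBICAL_BIT = 2      # from FLAGSET_cubical
    bitflag = LEGACY_BIT | CUBICAL_BIT   # visibility flagged by both sets
    assert bitflag & LEGACY_BIT          # membership tested with bitwise AND
    assert bitflag & CUBICAL_BIT

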
def test_dataset_schema(ms):
    datasets = read_datasets(ms, [], [], [])
    assert len(datasets) == 1
    ds = datasets[0]

    row, chan, corr = (ds.dims[d] for d in ("row", "chan", "corr"))
    cdata = np.random.random((row, chan, corr)).astype(np.complex64)
    ds = ds.assign(**{"CORRECTED_DATA": (("row", "chan", "corr"), cdata)})
    ds = ds.assign_coords(**{
        "row": ("row", np.arange(row)),
        "chan": ("chan", np.arange(chan)),
        "corr": ("corr", np.arange(corr)),
    })

    # We can shift between object and dict representations
    ds_schema = DatasetSchema.from_dataset(ds)
    assert DatasetSchema.from_dict(ds_schema.to_dict()) == ds_schema

    # And the dict representation can go through JSON, although
    # we don't compare because JSON converts tuples to lists
    serialized = json.dumps(ds_schema.to_dict()).encode()
    DatasetSchema.from_dict(json.loads(serialized.decode()))


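# A minimal illustration (not dask-ms specific) of why the JSON round
# trip above isn't compared for equality: JSON has no tuple type, so
# dimension tuples come back as lists.
def _json_tuple_sketch():
    schema_fragment = {"dims": ("row", "chan", "corr")}
    round_tripped = json.loads(json.dumps(schema_fragment))
    assert round_tripped == {"dims": ["row", "chan", "corr"]}

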
def test_dataset_numpy(ms):
    datasets = read_datasets(ms, [], [], [])
    assert len(datasets) == 1
    ds = datasets[0]

    row, chan, corr = (ds.dims[d] for d in ("row", "chan", "corr"))
    cdata = np.random.random((row, chan, corr)).astype(np.complex64)
    row_coord = np.arange(row)
    chan_coord = np.arange(chan)
    corr_coord = np.arange(corr)

    ds = ds.assign(**{"CORRECTED_DATA": (("row", "chan", "corr"), cdata)})
    ds = ds.assign_coords(**{
        "row": ("row", row_coord),
        "chan": ("chan", chan_coord),
        "corr": ("corr", corr_coord),
    })

    assert isinstance(ds.CORRECTED_DATA.data, np.ndarray)
    assert_array_equal(ds.CORRECTED_DATA.values, cdata)
    assert isinstance(ds.row.data, np.ndarray)
    assert_array_equal(ds.row.values, row_coord)
    assert isinstance(ds.chan.data, np.ndarray)
    assert_array_equal(ds.chan.values, chan_coord)
    assert isinstance(ds.corr.data, np.ndarray)
    assert_array_equal(ds.corr.values, corr_coord)

    # compute, dask.compute and dask.persist should all pass
    # numpy-backed variables and coordinates through unchanged
    nds = ds.compute()

    for k, v in nds.data_vars.items():
        assert_array_equal(v.data, getattr(ds, k).data)
    for k, v in nds.coords.items():
        assert_array_equal(v.data, getattr(ds, k).data)

    nds, = dask.compute(ds)

    for k, v in nds.data_vars.items():
        assert_array_equal(v.data, getattr(ds, k).data)
    for k, v in nds.coords.items():
        assert_array_equal(v.data, getattr(ds, k).data)

    nds, = dask.persist(ds)

    for k, v in nds.data_vars.items():
        assert_array_equal(v.data, getattr(ds, k).data)
    for k, v in nds.coords.items():
        assert_array_equal(v.data, getattr(ds, k).data)


def test_row_grouping(spw_table, spw_chan_freqs, chunks):
    """ Test grouping on single rows """
    datasets = read_datasets(spw_table, [], ["__row__"], [], chunks=chunks)
    assert_liveness(2, 1)
    assert len(datasets) == len(spw_chan_freqs)

    for i, chan_freq in enumerate(spw_chan_freqs):
        assert_array_equal(datasets[i].CHAN_FREQ.data[0], chan_freq)
        assert_array_equal(datasets[i].NUM_CHAN.data[0], chan_freq.shape[0])

    del datasets
    assert_liveness(0, 0)


def test_stress(big_ms, iterations, chunks):
    datasets = read_datasets(big_ms, ["TIME", "DATA"],
                             ["FIELD_ID", "DATA_DESC_ID"], [],
                             chunks=chunks)
    assert len(datasets) == 1
    ds = datasets[0]

    writes = []

    for i in range(iterations):
        nds = ds.assign(TIME=(("row",), ds.TIME.data + i),
                        DATA=(("row", "chan", "corr"), ds.DATA.data + i))
        writes.append(write_datasets(big_ms, nds, ["TIME", "DATA"]))

    dask.compute(writes)


def test_dataset_computes_and_values(ms):
    datasets = read_datasets(ms, [], [], [])
    assert len(datasets) == 1
    ds = datasets[0]

    # All dask arrays
    for k, v in ds.data_vars.items():
        assert isinstance(v.data, da.Array)

    nds = ds.compute()

    # Now we have numpy arrays that match the original data
    for k, v in nds.data_vars.items():
        assert isinstance(v.data, np.ndarray)
        assert_array_equal(v.data, ds.data_vars[k].data)
        assert_array_equal(v.values, ds.data_vars[k].data)


def test_dataset_assign(ms):
    """ Test dataset assignment """
    datasets = read_datasets(ms, [], [], [])
    assert len(datasets) == 1
    ds = datasets[0]

    # Assigning to an existing column is easier because we can
    # infer the dimension schema from it
    nds = ds.assign(TIME=(ds.TIME.dims, ds.TIME.data + 1))
    assert ds.DATA.data is nds.DATA.data
    assert ds.TIME.data is not nds.TIME.data
    assert_array_equal(nds.TIME.data, ds.TIME.data + 1)

    # For a new column we have to explicitly supply a dimension schema
    nds = ds.assign(ANTENNA3=(("row",), ds.ANTENNA1.data + 3))
    assert_array_equal(ds.ANTENNA1.data + 3, nds.ANTENNA3.data)

    dims = ds.dims
    chunks = ds.chunks

    if have_xarray:
        match = "'row': length 9 on 'ANTENNA4'"
    else:
        match = ("Existing dimension size 9 for dimension 'row' "
                 "is inconsistent with same dimension 10 of array ANTENNA4")

    # Assigning an array with a mismatched dimension size fails
    with pytest.raises(ValueError, match=match):
        array = da.zeros(dims['row'] - 1, chunks=dims['row'] - 1)
        nds = ds.assign(ANTENNA4=(("row",), array))
        nds.dims  # accessing dims forces validation

    assert chunks['row'] == (10,)

    if have_xarray:
        match = "Object has inconsistent chunks along dimension row."
    else:
        match = r"chunking \(4, 4, 2\) for dim"

    # As does assigning an array with inconsistent chunking
    with pytest.raises(ValueError, match=match):
        array = da.zeros(dims['row'], chunks=4)
        nds = ds.assign(ANTENNA4=(("row",), array))
        nds.chunks  # accessing chunks forces validation

    del datasets, ds, nds
    assert_liveness(0, 0)


def test_dataset_dask(ms):
    datasets = read_datasets(ms, [], [], [])
    assert len(datasets) == 1
    ds = datasets[0]

    # All dask arrays
    for k, v in ds.data_vars.items():
        assert isinstance(v.data, da.Array)

    # Test variable compute
    v2 = dask.compute(v)[0]
    assert isinstance(v2, xr.DataArray if have_xarray else Variable)
    assert isinstance(v2.data, np.ndarray)

    # Test variable persist
    v3 = dask.persist(v)[0]
    assert isinstance(v3, xr.DataArray if have_xarray else Variable)
    # Now have a single numpy array in the graph
    assert len(v3.data.__dask_keys__()) == 1
    data = next(iter(v3.__dask_graph__().values()))
    assert isinstance(data, np.ndarray)
    assert_array_equal(v2.data, v3.data)

    # Test dataset compute
    nds = dask.compute(ds)[0]

    for k, v in nds.data_vars.items():
        assert isinstance(v.data, np.ndarray)
        cdata = getattr(ds, k).data
        assert_array_equal(cdata, v.data)

    # Test dataset persist
    nds = dask.persist(ds)[0]

    for k, v in nds.data_vars.items():
        assert isinstance(v.data, da.Array)
        # Now have a single numpy array in the graph
        assert len(v.data.__dask_keys__()) == 1
        data = next(iter(v.data.__dask_graph__().values()))
        assert isinstance(data, np.ndarray)
        cdata = getattr(ds, k).data
        assert_array_equal(cdata, v.data)


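# A condensed sketch of the compute/persist contrast exercised above:
# dask.compute materialises collections down to numpy arrays, while
# dask.persist keeps dask arrays whose graphs now hold the concrete
# chunks. Not collected as a test; assumes the same Dataset fixtures.
def _compute_vs_persist_sketch(ds):
    (computed,) = dask.compute(ds)   # data_vars now hold np.ndarray
    (persisted,) = dask.persist(ds)  # data_vars still hold da.Array
    return computed, persisted

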
def test_dataset(ms, select_cols, group_cols, index_cols, shapes, chunks):
    """ Test dataset creation """
    datasets = read_datasets(ms, select_cols, group_cols,
                             index_cols, chunks=chunks)

    # (1) Read-only TableProxy
    # (2) Read-only TAQL TableProxy
    assert_liveness(2, 1)

    chans = shapes['chan']
    corrs = shapes['corr']

    # Expected output chunks
    echunks = {
        'chan': normalize_chunks(chunks.get('chan', chans),
                                 shape=(chans,))[0],
        'corr': normalize_chunks(chunks.get('corr', corrs),
                                 shape=(corrs,))[0],
    }

    for ds in datasets:
        compute_dict = {}

        for k, v in ds.data_vars.items():
            compute_dict[k] = v.data
            assert v.dtype == v.data.dtype

        res = dask.compute(compute_dict)[0]

        assert res['DATA'].shape[1:] == (chans, corrs)
        assert 'STATE_ID' in res
        assert 'TIME' in res

        chunks = ds.chunks
        assert chunks["chan"] == echunks['chan']
        assert chunks["corr"] == echunks['corr']

        dims = ds.dims
        dims.pop('row')  # row changes
        assert dims == {"chan": shapes['chan'], "corr": shapes['corr']}

    del ds, datasets, compute_dict, v
    assert_liveness(0, 0)


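# A small illustration of the normalize_chunks call used above to build
# the expected chunks: a scalar chunk size is expanded across the full
# dimension, with a remainder chunk at the end.
def _normalize_chunks_sketch():
    assert normalize_chunks(4, shape=(10,)) == ((4, 4, 2),)
    assert normalize_chunks((4, 4, 2), shape=(10,)) == ((4, 4, 2),)

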
def test_antenna_table_string_names(ant_table, wsrt_antenna_positions):
    ds = read_datasets(ant_table, [], [], None)
    assert len(ds) == 1
    ds = ds[0]

    names = ["ANTENNA-%d" % i
             for i in range(wsrt_antenna_positions.shape[0])]

    assert_array_equal(ds.POSITION.data, wsrt_antenna_positions)
    assert_array_equal(ds.NAME.data, names)

    names = ds.NAME.data.compute()

    # Test that writing string ndarrays back works, as
    # they must be converted from ndarrays to lists
    # of strings internally
    write_cols = set(ds.data_vars.keys()) - set(["ROWID"])
    writes = write_datasets(ant_table, ds, write_cols)
    dask.compute(writes)


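# Illustrative only: the internal ndarray-to-list conversion referred to
# above amounts to something like the following before the strings reach
# casacore's putcol. This is a sketch of the idea, not dask-ms internals.
def _string_column_conversion_sketch():
    names = np.asarray(["ANTENNA-0", "ANTENNA-1"], dtype=object)
    assert names.tolist() == ["ANTENNA-0", "ANTENNA-1"]

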
def test_dataset_add_string_column(ms):
    datasets = read_datasets(ms, [], [], [])
    assert len(datasets) == 1
    ds = datasets[0]
    dims = ds.dims

    name_list = ["BOB"] * dims['row']
    names = np.asarray(name_list, dtype=object)
    names = da.from_array(names, chunks=ds.TIME.chunks)

    nds = ds.assign(NAMES=(("row",), names))

    writes = write_datasets(ms, nds, ["NAMES"])
    dask.compute(writes)

    del datasets, ds, writes, nds
    assert_liveness(0, 0)

    with pt.table(ms, readonly=False, ack=False, lockoptions='auto') as T:
        assert name_list == T.getcol("NAMES")


def test_dataset_multidim_string_column(tmp_path, chunks):
    row = sum(chunks['row'])
    name_list = [["X-%d" % i, "Y-%d" % i, "Z-%d" % i] for i in range(row)]
    np_names = np.array(name_list, dtype=object)
    names = da.from_array(np_names,
                          chunks=(chunks['row'], np_names.shape[1]))
    ds = Dataset({"POLARIZATION_TYPE": (("row", "xyz"), names)})

    table_name = str(tmp_path / "test.table")
    writes = write_datasets(table_name, ds, ["POLARIZATION_TYPE"])
    dask.compute(writes)
    del writes
    assert_liveness(0, 0)

    datasets = read_datasets(table_name, [], [], [],
                             chunks={'row': chunks['row']})
    assert len(datasets) == 1
    assert_array_equal(datasets[0].POLARIZATION_TYPE.data, np_names)
    del datasets
    assert_liveness(0, 0)


def test_dataset_table_schemas(ms):
    """ Test that we can pass table schemas """
    data_dims = ("mychan", "mycorr")
    table_schema = ["MS", {"DATA": {"dims": data_dims}}]
    datasets = read_datasets(ms, [], [], [], table_schema=table_schema)
    assert datasets[0].data_vars["DATA"].dims == ("row",) + data_dims
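

# A hedged sketch of how the schema override above composes: the "MS"
# entry selects the registered Measurement Set schema and the dict
# overrides per-column dimension names on top of it. The extra "UVW"
# override below is an illustrative assumption, not taken from the test.
def _table_schema_override_sketch(ms):
    schema = ["MS", {"DATA": {"dims": ("mychan", "mycorr")},
                     "UVW": {"dims": ("uvw-comp",)}}]
    return read_datasets(ms, [], [], [], table_schema=schema)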