Example #1
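This first example, like all those that follow, is a test function exercising dask-ms's dataset read/write API, shown without its enclosing test module. The preamble sketched below lists the imports the examples appear to assume; the exact daskms module paths (in particular for assert_liveness and DatasetSchema) are guesses rather than confirmed locations.

import json

import dask
import dask.array as da
import numpy as np
import pytest
import pyrap.tables as pt
from dask.array.core import normalize_chunks
from numpy.testing import assert_array_equal

# Assumed locations of the dask-ms helpers used throughout
from daskms.reads import read_datasets
from daskms.writes import write_datasets
from daskms.dataset import Dataset, Variable
from daskms.dataset_schema import DatasetSchema  # assumption
from daskms.utils import assert_liveness         # assumption

# Some examples branch on an optional xarray installation
try:
    import xarray as xr
    have_xarray = True
except ImportError:
    have_xarray = False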
def test_dataset_updates(ms, select_cols,
                         group_cols, index_cols,
                         shapes, chunks):
    """ Test dataset writes """

    # Get original STATE_ID and DATA
    with pt.table(ms, ack=False, readonly=True, lockoptions='auto') as T:
        original_state_id = T.getcol("STATE_ID")
        original_data = T.getcol("DATA")

    try:
        datasets = read_datasets(ms, select_cols, group_cols,
                                 index_cols, chunks=chunks)
        assert_liveness(2, 1)

        # Test writes
        writes = []
        states = []
        datas = []

        # Create write operations and execute them
        for ds in datasets:
            state_var = (("row",), ds.STATE_ID.data + 1)
            data_var = (("row", "chan", "corr"), ds.DATA.data + 1, {})
            states.append(state_var[1])
            datas.append(data_var[1])
            new_ds = ds.assign(STATE_ID=state_var, DATA=data_var)
            writes.append(write_datasets(ms, new_ds, ["STATE_ID", "DATA"]))

        _, states, datas = dask.compute(writes, states, datas)

        # NOTE(sjperkins)
        # Interesting behaviour here. If these objects are not
        # cleared up at this point, attempts to re-open the table below
        # can fail, reproducing https://github.com/ska-sa/dask-ms/issues/26
        # Adding auto-locking to the table opening command seems to fix
        # this somehow
        del ds, new_ds, datasets, writes, state_var, data_var
        assert_liveness(0, 0)

        datasets = read_datasets(ms, select_cols, group_cols,
                                 index_cols, chunks=chunks)

        for ds, state, data in zip(datasets, states, datas):
            assert_array_equal(ds.STATE_ID.data, state)
            assert_array_equal(ds.DATA.data, data)

        del ds, datasets
        assert_liveness(0, 0)
    finally:
        # Capture the modified values for the final assertions,
        # then restore the original STATE_ID and DATA
        with pt.table(ms, ack=False, readonly=False, lockoptions='auto') as T:
            state_id = T.getcol("STATE_ID")
            data = T.getcol("DATA")
            T.putcol("STATE_ID", original_state_id)
            T.putcol("DATA", original_data)

    # Compare against expected result
    assert_array_equal(original_state_id + 1, state_id)
    assert_array_equal(original_data + 1, data)
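Distilled from the test above, a minimal sketch of the update round-trip, assuming a writable Measurement Set at `ms` and no grouping (so a single dataset is returned):

(ds,) = read_datasets(ms, ["STATE_ID"], [], [])
# Assign a new version of the column and write it back to the table
nds = ds.assign(STATE_ID=(("row",), ds.STATE_ID.data + 1))
dask.compute(write_datasets(ms, nds, ["STATE_ID"]))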
Example #2
def test_dataset_add_column(ms, dtype):
    datasets = read_datasets(ms, [], [], [])
    assert len(datasets) == 1
    ds = datasets[0]

    # Create the dask array
    bitflag = da.zeros_like(ds.DATA.data, dtype=dtype)
    # Column keywords to attach to the new BITFLAG column
    col_kw = {
        "BITFLAG": {
            'FLAGSETS': 'legacy,cubical',
            'FLAGSET_legacy': 1,
            'FLAGSET_cubical': 2
        }
    }
    # Assign variable onto the dataset
    nds = ds.assign(BITFLAG=(("row", "chan", "corr"), bitflag))
    writes = write_datasets(ms,
                            nds, ["BITFLAG"],
                            descriptor='ratt_ms',
                            column_keywords=col_kw)

    dask.compute(writes)

    del datasets, ds, writes, nds
    assert_liveness(0, 0)

    with pt.table(ms, readonly=False, ack=False, lockoptions='auto') as T:
        bf = T.getcol("BITFLAG")
        assert T.getcoldesc("BITFLAG")['keywords'] == col_kw['BITFLAG']
        assert bf.dtype == dtype
Example #3
def test_dataset_schema(ms):
    datasets = read_datasets(ms, [], [], [])
    assert len(datasets) == 1
    ds = datasets[0]

    row, chan, corr = (ds.dims[d] for d in ("row", "chan", "corr"))
    cdata = np.random.random((row, chan, corr)).astype(np.complex64)

    ds = ds.assign(**{"CORRECTED_DATA": (("row", "chan", "corr"), cdata)})

    ds = ds.assign_coords(
        **{
            "row": ("row", np.arange(row)),
            "chan": ("chan", np.arange(chan)),
            "corr": ("corr", np.arange(corr)),
        })

    # We can shift between objects and dict representation
    ds_schema = DatasetSchema.from_dataset(ds)
    assert DatasetSchema.from_dict(ds_schema.to_dict()) == ds_schema

    # And the dict repr can go through JSON, although
    # we don't compare because JSON converts tuples to lists
    serialized = json.dumps(ds_schema.to_dict()).encode()
    DatasetSchema.from_dict(json.loads(serialized.decode()))
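Since the dict representation survives JSON, a schema can also be persisted to disk and restored later; a small sketch under that assumption:

from pathlib import Path

schema_path = Path("schema.json")
schema_path.write_text(json.dumps(ds_schema.to_dict()))
# NB: don't compare the result with the original for equality;
# JSON turns tuples into lists
restored = DatasetSchema.from_dict(json.loads(schema_path.read_text()))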
Example #4
def test_dataset_numpy(ms):
    datasets = read_datasets(ms, [], [], [])
    assert len(datasets) == 1
    ds = datasets[0]

    row, chan, corr = (ds.dims[d] for d in ("row", "chan", "corr"))

    cdata = np.random.random((row, chan, corr)).astype(np.complex64)
    row_coord = np.arange(row)
    chan_coord = np.arange(chan)
    corr_coord = np.arange(corr)

    ds = ds.assign(**{"CORRECTED_DATA": (("row", "chan", "corr"), cdata)})

    ds = ds.assign_coords(
        **{
            "row": ("row", row_coord),
            "chan": ("chan", chan_coord),
            "corr": ("corr", corr_coord),
        })

    assert isinstance(ds.CORRECTED_DATA.data, np.ndarray)
    assert_array_equal(ds.CORRECTED_DATA.values, cdata)

    assert isinstance(ds.row.data, np.ndarray)
    assert_array_equal(ds.row.values, row_coord)
    assert isinstance(ds.chan.data, np.ndarray)
    assert_array_equal(ds.chan.values, chan_coord)
    assert isinstance(ds.corr.data, np.ndarray)
    assert_array_equal(ds.corr.values, corr_coord)

    nds = ds.compute()

    for k, v in nds.data_vars.items():
        assert_array_equal(v.data, getattr(ds, k).data)

    for k, v in nds.coords.items():
        assert_array_equal(v.data, getattr(ds, k).data)

    nds, = dask.compute(ds)

    for k, v in nds.data_vars.items():
        assert_array_equal(v.data, getattr(ds, k).data)

    for k, v in nds.coords.items():
        assert_array_equal(v.data, getattr(ds, k).data)

    nds, = dask.persist(ds)

    for k, v in nds.data_vars.items():
        assert_array_equal(v.data, getattr(ds, k).data)

    for k, v in nds.coords.items():
        assert_array_equal(v.data, getattr(ds, k).data)
Example #5
def test_row_grouping(spw_table, spw_chan_freqs, chunks):
    """ Test grouping on single rows """
    datasets = read_datasets(spw_table, [], ["__row__"], [], chunks=chunks)

    assert_liveness(2, 1)

    assert len(datasets) == len(spw_chan_freqs)

    for i, chan_freq in enumerate(spw_chan_freqs):
        assert_array_equal(datasets[i].CHAN_FREQ.data[0], chan_freq)
        assert_array_equal(datasets[i].NUM_CHAN.data[0], chan_freq.shape[0])

    del datasets
    assert_liveness(0, 0)
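Grouping on "__row__" therefore yields one single-row dataset per row of the subtable; a minimal sketch of that invariant, under the same fixtures:

datasets = read_datasets(spw_table, [], ["__row__"], [], chunks=chunks)
assert all(ds.dims["row"] == 1 for ds in datasets)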
Example #6
def test_stress(big_ms, iterations, chunks):
    datasets = read_datasets(big_ms, ["TIME", "DATA"],
                             ["FIELD_ID", "DATA_DESC_ID"], [],
                             chunks=chunks)

    assert len(datasets) == 1
    ds = datasets[0]

    writes = []

    for i in range(iterations):
        nds = ds.assign(TIME=(("row", ), ds.TIME.data + i),
                        DATA=(("row", "chan", "corr"), ds.DATA.data + i))
        writes.append(write_datasets(big_ms, nds, ["TIME", "DATA"]))

    dask.compute(writes)
Example #7
def test_dataset_computes_and_values(ms):
    datasets = read_datasets(ms, [], [], [])
    assert len(datasets) == 1
    ds = datasets[0]

    # All dask arrays
    for k, v in ds.data_vars.items():
        assert isinstance(v.data, da.Array)

    nds = ds.compute()

    # Now we have numpy arrays that match original data
    for k, v in nds.data_vars.items():
        assert isinstance(v.data, np.ndarray)
        assert_array_equal(v.data, ds.data_vars[k].data)
        assert_array_equal(v.values, ds.data_vars[k].data)
Example #8
def test_dataset_assign(ms):
    """ Test dataset assignment """
    datasets = read_datasets(ms, [], [], [])

    assert len(datasets) == 1
    ds = datasets[0]

    # Assign on an existing column is easier because we can
    # infer the dimension schema from it
    nds = ds.assign(TIME=(ds.TIME.dims, ds.TIME.data + 1))
    assert ds.DATA.data is nds.DATA.data
    assert ds.TIME.data is not nds.TIME.data
    assert_array_equal(nds.TIME.data, ds.TIME.data + 1)

    # We have to explicitly supply a dimension schema
    nds = ds.assign(ANTENNA3=(("row", ), ds.ANTENNA1.data + 3))
    assert_array_equal(ds.ANTENNA1.data + 3, nds.ANTENNA3.data)

    dims = ds.dims
    chunks = ds.chunks

    if have_xarray:
        match = "'row': length 9 on 'ANTENNA4'"
    else:
        match = ("Existing dimension size 9 for dimension 'row' "
                 "is inconsistent with same dimension 10 of array ANTENNA4")

    with pytest.raises(ValueError, match=match):
        array = da.zeros(dims['row'] - 1, chunks=dims['row'] - 1)
        nds = ds.assign(ANTENNA4=(("row", ), array))
        nds.dims  # accessing dims forces the dimension consistency check

    assert chunks['row'] == (10, )

    if have_xarray:
        match = "Object has inconsistent chunks along dimension row."
    else:
        match = r"chunking \(4, 4, 2\) for dim"

    with pytest.raises(ValueError, match=match):
        array = da.zeros(dims['row'], chunks=4)
        nds = ds.assign(ANTENNA4=(("row", ), array))
        nds.chunks  # accessing chunks forces the chunking consistency check

    del datasets, ds, nds
    assert_liveness(0, 0)
Example #9
def test_dataset_dask(ms):
    datasets = read_datasets(ms, [], [], [])
    assert len(datasets) == 1
    ds = datasets[0]

    # All dask arrays
    for k, v in ds.data_vars.items():
        assert isinstance(v.data, da.Array)

        # Test variable compute
        v2 = dask.compute(v)[0]
        assert isinstance(v2, xr.DataArray if have_xarray else Variable)
        assert isinstance(v2.data, np.ndarray)

        # Test variable persists
        v3 = dask.persist(v)[0]
        assert isinstance(v3, xr.DataArray if have_xarray else Variable)

        # Now have numpy array in the graph
        assert len(v3.data.__dask_keys__()) == 1
        data = next(iter(v3.__dask_graph__().values()))
        assert isinstance(data, np.ndarray)
        assert_array_equal(v2.data, v3.data)

    # Test compute
    nds = dask.compute(ds)[0]

    for k, v in nds.data_vars.items():
        assert isinstance(v.data, np.ndarray)
        cdata = getattr(ds, k).data
        assert_array_equal(cdata, v.data)

    # Test persist
    nds = dask.persist(ds)[0]

    for k, v in nds.data_vars.items():
        assert isinstance(v.data, da.Array)

        # Now have numpy arrays in the graph
        assert len(v.data.__dask_keys__()) == 1
        data = next(iter(v.data.__dask_graph__().values()))
        assert isinstance(data, np.ndarray)

        cdata = getattr(ds, k).data
        assert_array_equal(cdata, v.data)
Example #10
def test_dataset(ms, select_cols, group_cols, index_cols, shapes, chunks):
    """ Test dataset creation """
    datasets = read_datasets(ms,
                             select_cols,
                             group_cols,
                             index_cols,
                             chunks=chunks)
    # (1) Read-only TableProxy
    # (2) Read-only TAQL TableProxy
    assert_liveness(2, 1)

    chans = shapes['chan']
    corrs = shapes['corr']

    # Expected output chunks
    echunks = {
        'chan': normalize_chunks(chunks.get('chan', chans),
                                 shape=(chans, ))[0],
        'corr': normalize_chunks(chunks.get('corr', corrs), shape=(corrs, ))[0]
    }

    for ds in datasets:
        compute_dict = {}

        for k, v in ds.data_vars.items():
            compute_dict[k] = v.data
            assert v.dtype == v.data.dtype

        res = dask.compute(compute_dict)[0]

        assert res['DATA'].shape[1:] == (chans, corrs)
        assert 'STATE_ID' in res
        assert 'TIME' in res

        ds_chunks = ds.chunks  # avoid shadowing the chunks argument
        assert ds_chunks["chan"] == echunks['chan']
        assert ds_chunks["corr"] == echunks['corr']

        dims = ds.dims
        dims.pop('row')  # row size varies per group
        assert dims == {"chan": shapes['chan'], "corr": shapes['corr']}

    del ds, datasets, compute_dict, v
    assert_liveness(0, 0)
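For reference, normalize_chunks (from dask.array.core) expands a per-dimension chunk specification into a tuple of explicit chunk tuples, one entry per dimension, which is why the expected chunks above are indexed with [0]:

from dask.array.core import normalize_chunks

normalize_chunks(4, shape=(10,))             # ((4, 4, 2),)
normalize_chunks((4, 4, 2), shape=(10,))[0]  # (4, 4, 2)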
Example #11
def test_antenna_table_string_names(ant_table, wsrt_antenna_positions):
    ds = read_datasets(ant_table, [], [], None)
    assert len(ds) == 1
    ds = ds[0]

    names = ["ANTENNA-%d" % i for i in range(wsrt_antenna_positions.shape[0])]

    assert_array_equal(ds.POSITION.data, wsrt_antenna_positions)
    assert_array_equal(ds.NAME.data, names)

    names = ds.NAME.data.compute()

    # Test that writing back string ndarrays works, as they
    # must be converted from ndarrays to lists of strings
    # internally
    write_cols = set(ds.data_vars.keys()) - {"ROWID"}
    writes = write_datasets(ant_table, ds, write_cols)

    dask.compute(writes)
Example #12
def test_dataset_add_string_column(ms):
    datasets = read_datasets(ms, [], [], [])
    assert len(datasets) == 1
    ds = datasets[0]
    dims = ds.dims

    name_list = ["BOB"] * dims['row']
    names = np.asarray(name_list, dtype=object)  # np.object was removed in NumPy 1.24
    names = da.from_array(names, chunks=ds.TIME.chunks)

    nds = ds.assign(NAMES=(("row", ), names))

    writes = write_datasets(ms, nds, ["NAMES"])
    dask.compute(writes)

    del datasets, ds, writes, nds
    assert_liveness(0, 0)

    with pt.table(ms, readonly=False, ack=False, lockoptions='auto') as T:
        assert name_list == T.getcol("NAMES")
Example #13
def test_dataset_multidim_string_column(tmp_path, chunks):
    row = sum(chunks['row'])

    name_list = [["X-%d" % i, "Y-%d" % i, "Z-%d" % i] for i in range(row)]
    np_names = np.array(name_list, dtype=object)  # np.object was removed in NumPy 1.24
    names = da.from_array(np_names, chunks=(chunks['row'], np_names.shape[1]))

    ds = Dataset({"POLARIZATION_TYPE": (("row", "xyz"), names)})
    table_name = str(tmp_path / "test.table")
    writes = write_datasets(table_name, ds, ["POLARIZATION_TYPE"])
    dask.compute(writes)

    del writes
    assert_liveness(0, 0)

    datasets = read_datasets(table_name, [], [], [],
                             chunks={'row': chunks['row']})
    assert len(datasets) == 1
    assert_array_equal(datasets[0].POLARIZATION_TYPE.data, np_names)
    del datasets
    assert_liveness(0, 0)
Example #14
def test_dataset_assign(ms):
    """ Test dataset assignment """
    datasets = read_datasets(ms, [], [], [])

    assert len(datasets) == 1
    ds = datasets[0]

    # Assign on an existing column is easier because we can
    # infer the dimension schema from it
    nds = ds.assign(TIME=ds.TIME.data + 1)
    assert ds.DATA.data is nds.DATA.data
    assert ds.TIME.data is not nds.TIME.data
    assert_array_equal(nds.TIME.data, ds.TIME.data + 1)

    # This doesn't work for new columns
    with pytest.raises(ValueError, match="Couldn't find existing dimension"):
        ds.assign(ANTENNA3=ds.ANTENNA1.data + 3)

    # We have to explicitly supply a dimension schema
    nds = ds.assign(ANTENNA3=(("row", ), ds.ANTENNA1.data + 3))
    assert_array_equal(ds.ANTENNA1.data + 3, nds.ANTENNA3.data)

    dims = ds.dims
    chunks = ds.chunks

    with pytest.raises(ValueError, match="size 9 for dimension 'row'"):
        array = da.zeros(dims['row'] - 1, chunks=dims['row'] - 1)
        nds = ds.assign(ANTENNA4=(("row", ), array))
        nds.dims  # accessing dims forces the dimension consistency check

    assert chunks['row'] == (10, )

    with pytest.raises(ValueError, match=r"chunking \(4, 4, 2\) for dim"):
        array = da.zeros(dims['row'], chunks=4)
        nds = ds.assign(ANTENNA4=(("row", ), array))
        nds.chunks  # accessing chunks forces the chunking consistency check

    del datasets, ds, nds
    assert_liveness(0, 0)
Example #15
def test_dataset_dask(ms):
    datasets = read_datasets(ms, [], [], [])
    assert len(datasets) == 1
    ds = datasets[0]

    # All dask arrays
    for k, v in ds.data_vars.items():
        assert isinstance(v.data, da.Array)

        # Test variable compute
        v2 = dask.compute(v)[0]
        assert isinstance(v2, Variable)
        assert isinstance(v2.data, np.ndarray)

        # Test variable persists
        v3 = dask.persist(v)[0]
        assert isinstance(v3, Variable)

        # Now have numpy array in the graph
        assert len(v3.data.__dask_keys__()) == 1
        data = next(iter(v3.data.__dask_graph__().values()))
        assert isinstance(data, np.ndarray)

    # Test compute
    nds = dask.compute(ds)[0]

    for k, v in nds.data_vars.items():
        assert isinstance(v.data, np.ndarray)

    # Test persist
    nds = dask.persist(ds)[0]

    for k, v in nds.data_vars.items():
        assert isinstance(v.data, da.Array)

        # Now have numpy arrays in the graph
        assert len(v.data.__dask_keys__()) == 1
        data = next(iter(v.data.__dask_graph__().values()))
        assert isinstance(data, np.ndarray)
Example #16
def test_dataset_table_schemas(ms):
    """ Test that we can pass table schemas """
    data_dims = ("mychan", "mycorr")
    table_schema = ["MS", {"DATA": {"dims": data_dims}}]
    datasets = read_datasets(ms, [], [], [], table_schema=table_schema)
    assert datasets[0].data_vars["DATA"].dims == ("row", ) + data_dims
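For contrast, reading without a table schema falls back to the default Measurement Set schema, which (as the earlier examples suggest) labels DATA with ("row", "chan", "corr"); treat this exact default as an assumption:

datasets = read_datasets(ms, [], [], [])
assert datasets[0].data_vars["DATA"].dims == ("row", "chan", "corr")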