def test_dataset_create_table(tmp_path, dataset_chunks, dtype):
    datasets = []
    names = []
    datas = []
    row_sum = 0

    for chunks in dataset_chunks:
        shapes = {k: sum(c) for k, c in chunks.items()}
        row_sum += shapes['row']

        # Make some visibilities
        dims = ("row", "chan", "corr")
        shape = tuple(shapes[d] for d in dims)
        data_chunks = tuple(chunks[d] for d in dims)
        data = da.random.random(shape, chunks=data_chunks).astype(dtype)
        data_var = Variable(dims, data, {})

        # Make some string names
        dims = ("row",)
        shape = tuple(shapes[d] for d in dims)
        str_chunks = tuple(chunks[d] for d in dims)
        # NOTE: use the builtin object dtype (np.object is deprecated)
        np_str_array = np.asarray(["BOB"] * shape[0], dtype=object)
        da_str_array = da.from_array(np_str_array, chunks=str_chunks)
        str_array_var = Variable(dims, da_str_array, {})

        datasets.append(Dataset({"DATA": data_var, "NAMES": str_array_var}))
        datas.append(data)
        names.extend(np_str_array.tolist())

    freq = da.linspace(.856e9, 2 * .856e9, 64, chunks=16)
    sub_datasets = [Dataset({"FREQ": (("row", "chan"), freq[None, :])})]

    # Write the data to new tables
    table_name = os.path.join(str(tmp_path), 'test.table')
    writes = write_datasets(table_name, datasets, ["DATA", "NAMES"])
    subt_writes = write_datasets(table_name + "::SPW", sub_datasets, ["FREQ"])
    dask.compute(writes, subt_writes)

    # Check written data
    with pt.table(table_name, readonly=True,
                  lockoptions='auto', ack=False) as T:
        assert row_sum == T.nrows()
        assert_array_equal(T.getcol("DATA"), np.concatenate(datas))
        assert_array_equal(T.getcol("NAMES"), names)

    # Sub-table correctly linked and populated
    with pt.table(table_name + "::SPW", readonly=True,
                  lockoptions='auto', ack=False) as T:
        assert T.nrows() == 1
        assert_array_equal(T.getcol("FREQ")[0], freq)
def test_variable_column_descriptor(chunks, dtype, tmp_path):
    column_meta = []
    shapes = {k: sum(c) for k, c in chunks.items()}

    # Make some visibilities
    dims = ("row", "chan", "corr")
    shape = tuple(shapes[d] for d in dims)
    data_chunks = tuple(chunks[d] for d in dims)
    data = da.random.random(shape, chunks=data_chunks).astype(dtype)
    data_var = Variable(dims, data, {})
    meta = variable_column_descriptor("DATA", data_var)
    column_meta.append({"name": "DATA", "desc": meta})

    # Make some string names
    dims = ("row",)
    shape = tuple(shapes[d] for d in dims)
    str_chunks = tuple(chunks[d] for d in dims)
    # NOTE: use the builtin object dtype (np.object is deprecated)
    np_str_array = np.asarray(["BOB"] * shape[0], dtype=object)
    da_str_array = da.from_array(np_str_array, chunks=str_chunks)
    str_array_var = Variable(dims, da_str_array, {})
    meta = variable_column_descriptor("NAMES", str_array_var)
    column_meta.append({"name": "NAMES", "desc": meta})

    # Create a new table with the column metadata
    fn = os.path.join(str(tmp_path), "test.ms")
    tabdesc = pt.maketabdesc(column_meta)

    with pt.table(fn, tabdesc, readonly=False, ack=False) as T:
        # Add rows
        T.addrows(shapes['row'])

        str_list = np_str_array.tolist()

        # Put data
        T.putcol("DATA", data.compute())
        T.putcol("NAMES", str_list)

        # We get out what we put in
        assert_array_equal(T.getcol("NAMES"), str_list)
        assert_array_equal(T.getcol("DATA"), data)
def test_ms_subtable_builder(tmp_path, table):
    A = da.zeros((10, 20, 30), chunks=(2, 20, 30), dtype=np.int32)
    variables = {"FOO": Variable(("row", "chan", "corr"), A, {})}
    var_names = set(variables.keys())

    builder = MSSubTableDescriptorBuilder(table)
    default_desc = builder.default_descriptor()
    tab_desc = builder.descriptor(variables, default_desc)
    dminfo = builder.dminfo(tab_desc)

    # These columns must always be present on an MS
    required_cols = {k for k in pt.required_ms_desc(table).keys()
                     if not k.startswith('_')}

    filename = str(tmp_path / f"{table}.table")

    with pt.table(filename, tab_desc, dminfo=dminfo, ack=False) as T:
        T.addrows(10)

        # We got required + the extra columns we asked for
        assert set(T.colnames()) == set.union(var_names, required_cols)
def _variable_factory(dims, dtype):
    # `chunks` is taken from the enclosing scope
    shape = tuple(sum(chunks[d]) for d in dims)
    achunks = tuple(chunks[d] for d in dims)
    dask_array = da.random.random(shape, chunks=achunks).astype(dtype)
    return Variable(dims, dask_array, {})
def xds_from_zarr(store, columns=None, chunks=None, **kwargs):
    """
    Reads the zarr data store in `store` and returns a list of
    Datasets containing the data.

    Parameters
    ----------
    store : str or Path
        Path containing the data
    columns : list of str or str or None
        Columns to read. `None` or `"ALL"` reads all columns on each dataset.
        Otherwise, a list of columns should be supplied.
    chunks : dict or list of dicts
        chunking schema for each dataset
    **kwargs : optional
        Currently only ``storage_options``, a dictionary passed through
        to the underlying DaskMSStore, is supported.

    Returns
    -------
    datasets : list of Datasets
        Dataset(s) containing the data in `store`
    """
    if isinstance(store, DaskMSStore):
        pass
    elif isinstance(store, (Path, str)):
        store = DaskMSStore(f"{store}", **kwargs.pop("storage_options", {}))
    else:
        raise TypeError(f"store '{store}' must be "
                        f"Path, str or DaskMSStore")

    # If any kwargs are added, they should be popped prior to this check.
    if len(kwargs) > 0:
        warnings.warn(f"The following unsupported kwargs were ignored in "
                      f"xds_from_zarr: {kwargs}",
                      UserWarning)

    columns = promote_columns(columns)

    if chunks is None:
        pass
    elif isinstance(chunks, (tuple, list)):
        if not all(isinstance(v, dict) for v in chunks):
            raise TypeError("chunks must be None, a dict or a list of dicts")
    elif isinstance(chunks, dict):
        chunks = [chunks]
    else:
        raise TypeError("chunks must be None, a dict or a list of dicts")

    datasets = []
    numpy_vars = []

    # NOTE(JSKenyon): Iterating over all the zarr groups/arrays is VERY
    # expensive if the metadata has not been consolidated.
    zc.consolidate_metadata(store.map)

    table_path = store.table if store.table else "MAIN"
    table_group = zarr.open_consolidated(store.map)[table_path]

    for g, (group_name, group) in enumerate(sorted(table_group.groups(),
                                                   key=group_sortkey)):
        group_attrs = decode_attr(dict(group.attrs))
        dask_ms_attrs = group_attrs.pop(DASKMS_ATTR_KEY)
        natural_chunks = dask_ms_attrs["chunks"]
        group_chunks = {d: tuple(dc) for d, dc in natural_chunks.items()}

        if chunks:
            # Defer to user-supplied chunking strategy
            try:
                group_chunks.update(chunks[g])
            except IndexError:
                group_chunks.update(chunks[-1])  # Reuse last chunking.

        data_vars = {}
        coords = {}

        for name, zarray in column_iterator(group, columns):
            attrs = decode_attr(dict(zarray.attrs[DASKMS_ATTR_KEY]))
            dims = attrs["dims"]
            coordinate = attrs.get("coordinate", False)
            array_chunks = tuple(group_chunks.get(d, s)
                                 for d, s in zip(dims, zarray.shape))

            array_chunks = da.core.normalize_chunks(array_chunks,
                                                    zarray.shape)
            ext_args = extent_args(dims, array_chunks)
            token_name = f"read~{name}-{tokenize(zarray, *ext_args)}"

            read = da.blockwise(zarr_getter, dims,
                                zarray, None,
                                *ext_args,
                                concatenate=False,
                                name=token_name,
                                meta=np.empty((0,) * zarray.ndim,
                                              zarray.dtype))

            read = inlined_array(read, ext_args[::2])
            var = Variable(dims, read, attrs)
            (coords if coordinate else data_vars)[name] = var

            # Save numpy arrays for reification
            typ = decode_type(attrs["array_type"])

            if typ is np.ndarray:
                numpy_vars.append(var)
            elif typ is da.Array:
                pass
            else:
                raise TypeError(f"Unknown array_type '{attrs['array_type']}'")

        datasets.append(Dataset(data_vars, coords=coords, attrs=group_attrs))

    # Reify any numpy arrays directly into their variables
    for v, a in zip(numpy_vars, dask.compute(v.data for v in numpy_vars)[0]):
        v.data = a

    return datasets
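# Illustrative usage sketch for the xds_from_zarr defined above (not part of
# the original module). The "observation.zarr" path and the DATA/FLAG column
# names are assumptions for the example; substitute whatever your store
# actually contains.
def example_xds_from_zarr_usage():
    datasets = xds_from_zarr(
        "observation.zarr",           # str, Path or DaskMSStore
        columns=["DATA", "FLAG"],     # None or "ALL" reads every column
        chunks=[{"row": 10000}])      # per-dataset chunks; the last dict is
                                      # reused for any remaining datasets

    for ds in datasets:
        print(ds.attrs)               # group-level attributes
        print(ds.DATA.data)           # dask array backing the DATA column

    # Reads only execute when the dask graphs are computed
    return dask.compute([ds.DATA.data.sum() for ds in datasets])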
def xds_from_zarr(store, columns=None, chunks=None):
    """
    Reads the zarr data store in `store` and returns a list of
    Datasets containing the data.

    Parameters
    ----------
    store : str or Path
        Path containing the data
    columns : list of str or str or None
        Columns to read. `None` or `"ALL"` reads all columns on each dataset.
        Otherwise, a list of columns should be supplied.
    chunks : dict or list of dicts
        chunking schema for each dataset

    Returns
    -------
    datasets : list of Datasets
        Dataset(s) containing the data in `store`
    """
    store, table = store_path_split(store)

    if isinstance(store, Path):
        store = str(store)

    if not isinstance(store, str):
        raise TypeError("store must be a Path or str")

    columns = promote_columns(columns)

    if chunks is None:
        pass
    elif isinstance(chunks, (tuple, list)):
        if not all(isinstance(v, dict) for v in chunks):
            raise TypeError("chunks must be None, a dict or a list of dicts")
    elif isinstance(chunks, dict):
        chunks = [chunks]
    else:
        raise TypeError("chunks must be None, a dict or a list of dicts")

    datasets = []
    numpy_vars = []

    table_group = zarr.open(store)[table]

    for g, (group_name, group) in enumerate(sorted(table_group.groups())):
        group_attrs = decode_attr(dict(group.attrs))
        dask_ms_attrs = group_attrs.pop(DASKMS_ATTR_KEY)
        natural_chunks = dask_ms_attrs["chunks"]
        group_chunks = {d: tuple(dc) for d, dc in natural_chunks.items()}

        if chunks:
            # Defer to user-supplied chunking strategy
            try:
                group_chunks.update(chunks[g])
            except IndexError:
                pass

        data_vars = {}
        coords = {}

        for name, zarray in column_iterator(group, columns):
            attrs = decode_attr(dict(zarray.attrs[DASKMS_ATTR_KEY]))
            dims = attrs["dims"]
            coordinate = attrs.get("coordinate", False)
            array_chunks = tuple(group_chunks.get(d, s)
                                 for d, s in zip(dims, zarray.shape))

            array_chunks = da.core.normalize_chunks(array_chunks,
                                                    zarray.shape)
            ext_args = extent_args(dims, array_chunks)
            token_name = f"read~{name}-{tokenize(zarray, *ext_args)}"

            read = da.blockwise(zarr_getter, dims,
                                zarray, None,
                                *ext_args,
                                concatenate=False,
                                name=token_name,
                                meta=np.empty((0,) * zarray.ndim,
                                              zarray.dtype))

            read = inlined_array(read, ext_args[::2])
            var = Variable(dims, read, attrs)
            (coords if coordinate else data_vars)[name] = var

            # Save numpy arrays for reification
            typ = decode_type(attrs["array_type"])

            if typ is np.ndarray:
                numpy_vars.append(var)
            elif typ is da.Array:
                pass
            else:
                raise TypeError(f"Unknown array_type '{attrs['array_type']}'")

        datasets.append(Dataset(data_vars, coords=coords, attrs=group_attrs))

    # Reify any numpy arrays directly into their variables
    for v, a in zip(numpy_vars, dask.compute(v.data for v in numpy_vars)[0]):
        v.data = a

    return datasets