def test_dataset_create_table(tmp_path, dataset_chunks, dtype):
    datasets = []
    names = []
    datas = []
    row_sum = 0

    for chunks in dataset_chunks:
        shapes = {k: sum(c) for k, c in chunks.items()}
        row_sum += shapes['row']

        # Make some visibilities
        dims = ("row", "chan", "corr")
        shape = tuple(shapes[d] for d in dims)
        data_chunks = tuple(chunks[d] for d in dims)
        data = da.random.random(shape, chunks=data_chunks).astype(dtype)
        data_var = Variable(dims, data, {})

        # Make some string names
        dims = ("row",)
        shape = tuple(shapes[d] for d in dims)
        str_chunks = tuple(chunks[d] for d in dims)
        np_str_array = np.asarray(["BOB"] * shape[0], dtype=object)
        da_str_array = da.from_array(np_str_array, chunks=str_chunks)
        str_array_var = Variable(dims, da_str_array, {})

        datasets.append(Dataset({"DATA": data_var,
                                 "NAMES": str_array_var}))
        datas.append(data)
        names.extend(np_str_array.tolist())

    freq = da.linspace(.856e9, 2 * .856e9, 64, chunks=16)
    sub_datasets = [Dataset({"FREQ": (("row", "chan"), freq[None, :])})]

    # Write the data to new tables
    table_name = os.path.join(str(tmp_path), 'test.table')
    writes = write_datasets(table_name, datasets, ["DATA", "NAMES"])
    subt_writes = write_datasets(table_name + "::SPW",
                                 sub_datasets, ["FREQ"])
    dask.compute(writes, subt_writes)

    # Check written data
    with pt.table(table_name, readonly=True,
                  lockoptions='auto', ack=False) as T:
        assert row_sum == T.nrows()
        assert_array_equal(T.getcol("DATA"), np.concatenate(datas))
        assert_array_equal(T.getcol("NAMES"), names)

    # Sub-table correctly linked and populated
    with pt.table(table_name + "::SPW", readonly=True,
                  lockoptions='auto', ack=False) as T:
        assert T.nrows() == 1
        assert_array_equal(T.getcol("FREQ")[0], freq)

def test_write_dict_data(tmp_path, chunks, dtype):
    rs = np.random.RandomState(42)
    row_sum = 0

    def _vis_factory(chan, corr):
        # Variably sized-channels per row, as in BDA data
        nchan = rs.randint(chan)
        return (rs.normal(size=(1, nchan, corr)) +
                rs.normal(size=(1, nchan, corr))*1j)

    shapes = {k: sum(c) for k, c in chunks.items()}
    row_sum += shapes['row']

    # assert len(chunks['chan']) == 1
    assert len(chunks['corr']) == 1

    # Make some visibilities
    dims = ("row", "chan", "corr")
    row, chan, corr = (shapes[d] for d in dims)
    name = "vis-data-" + uuid.uuid4().hex
    nchunks = (len(chunks[d]) for d in dims)
    keys = product((name,), *(range(c) for c in nchunks))
    chunk_sizes = product(*(chunks[d] for d in dims))

    layer = {k: {'r%d' % (i + 1): _vis_factory(chan, corr)
                 for i in range(r)}
             for k, (r, _, _) in zip(keys, chunk_sizes)}

    hlg = HighLevelGraph.from_collections(name, layer, [])
    chunks = tuple(chunks[d] for d in dims)
    meta = np.empty((0,)*len(chunks), dtype=np.complex128)
    vis = da.Array(hlg, name, chunks, meta=meta)
    ds = Dataset({"DATA": (dims, vis)})

    table_name = os.path.join(str(tmp_path), 'test.table')
    writes, table_proxy = write_datasets(table_name, ds, ["DATA"],
                                         table_proxy=True,
                                         # No fixed shape columns
                                         descriptor="ms(False)")
    dask.compute(writes)

    data = table_proxy.getvarcol("DATA").result()

    # First row chunk
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r1'], data['r1'])
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r2'], data['r2'])
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r3'], data['r3'])
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r4'], data['r4'])
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r5'], data['r5'])

    # Second row chunk
    assert_array_almost_equal(layer[(name, 1, 0, 0)]['r1'], data['r6'])
    assert_array_almost_equal(layer[(name, 1, 0, 0)]['r2'], data['r7'])
    assert_array_almost_equal(layer[(name, 1, 0, 0)]['r3'], data['r8'])

    # Third row chunk
    assert_array_almost_equal(layer[(name, 2, 0, 0)]['r1'], data['r9'])
    assert_array_almost_equal(layer[(name, 2, 0, 0)]['r2'], data['r10'])

def test_xds_to_zarr_coords(tmp_path_factory):
    zarr_store = tmp_path_factory.mktemp("zarr_coords") / "test.zarr"

    data = da.ones((100, 16, 4), chunks=(10, 4, 1), dtype=np.complex64)
    rowid = da.arange(100, chunks=10)

    data_vars = {"DATA": (("row", "chan", "corr"), data)}
    coords = {
        "ROWID": (("row",), rowid),
        "chan": (("chan",), np.arange(16)),
        "foo": (("foo",), np.arange(4)),
    }

    ds = [Dataset(data_vars, coords=coords)]

    writes = xds_to_zarr(ds, zarr_store)
    dask.compute(writes)

    rds = xds_from_zarr(zarr_store)
    assert len(ds) == len(rds)

    for ods, nds in zip(ds, rds):
        for c, v in ods.data_vars.items():
            assert_array_equal(v.data, getattr(nds, c).data)

        for c, v in ods.coords.items():
            assert_array_equal(v.data, getattr(nds, c).data)

def test_basic_roundtrip(tmp_path):
    path = tmp_path / "test.zarr"

    # We need >10 datasets to be sure roundtripping is consistent.
    xdsl = [Dataset({'x': (('y',), da.ones(i))}) for i in range(1, 12)]
    dask.compute(xds_to_zarr(xdsl, path))

    xdsl = xds_from_zarr(path)
    dask.compute(xds_to_zarr(xdsl, path))

def _group_datasets(self, table_proxy, groups, exemplar_rows, orders):
    _, t, s = table_path_split(self.canonical_name)
    short_table_name = '/'.join((t, s)) if s else t
    table_schema = self._table_schema()

    datasets = []
    group_ids = list(zip(*groups))

    assert len(group_ids) == len(orders)

    # Select columns, excluding grouping columns
    select_cols = set(self.select_cols or table_proxy.colnames().result())
    select_cols -= set(self.group_cols)

    # Create a dataset for each group
    it = enumerate(zip(group_ids, exemplar_rows, orders))

    for g, (group_id, exemplar_row, order) in it:
        # Extract group chunks
        try:
            group_chunks = self.chunks[g]   # Get group chunking strategy
        except IndexError:
            group_chunks = self.chunks[-1]  # Re-use last group's chunks

        # Suffix for dataset array names
        gid_str = ",".join(str(gid) for gid in group_id)
        array_suffix = f"[{gid_str}]-{short_table_name}"

        # Create dataset variables
        group_var_dims = _dataset_variable_factory(
            table_proxy, table_schema, select_cols, exemplar_row,
            order, group_chunks, array_suffix)

        # Extract ROWID
        try:
            rowid = group_var_dims.pop("ROWID")
        except KeyError:
            coords = None
        else:
            coords = {"ROWID": rowid}

        # Assign values for the dataset's grouping columns
        # as attributes
        partitions = tuple(
            (c, g.dtype.name) for c, g in zip(self.group_cols, group_id))
        attrs = {DASKMS_PARTITION_KEY: partitions}

        # Use python types which are json serializable
        group_id = [gid.item() for gid in group_id]
        attrs.update(zip(self.group_cols, group_id))

        datasets.append(Dataset(group_var_dims, attrs=attrs, coords=coords))

    return datasets

def _group_datasets(self, groups, exemplar_rows, orders):
    _, t, s = table_path_split(self.canonical_name)
    short_table_name = '/'.join((t, s)) if s else t
    table_proxy = self._table_proxy()
    table_schema = self._table_schema()

    datasets = []
    group_ids = list(zip(*groups))

    assert len(group_ids) == len(orders)

    # Select columns, excluding grouping columns
    select_cols = set(self.select_cols or table_proxy.colnames().result())
    select_cols -= set(self.group_cols)

    # Create a dataset for each group
    it = enumerate(zip(group_ids, exemplar_rows, orders))

    for g, (group_id, exemplar_row, order) in it:
        # Extract group chunks
        try:
            group_chunks = self.chunks[g]   # Get group chunking strategy
        except IndexError:
            group_chunks = self.chunks[-1]  # Re-use last group's chunks

        # Prefix for dataset array names
        gid_str = ",".join(str(gid) for gid in group_id)
        array_prefix = "%s-[%s]" % (short_table_name, gid_str)

        # Create dataset variables
        group_var_dims = _dataset_variable_factory(
            table_proxy, table_schema, select_cols, exemplar_row,
            order, group_chunks, array_prefix)

        # Extract ROWID
        try:
            rowid = group_var_dims.pop("ROWID")
        except KeyError:
            coords = None
        else:
            coords = {"ROWID": rowid}

        # Assign values for the dataset's grouping columns
        # as attributes
        attrs = dict(zip(self.group_cols, group_id))

        datasets.append(Dataset(group_var_dims, attrs=attrs, coords=coords))

    return datasets

def test_zarr_string_array(tmp_path_factory):
    zarr_store = tmp_path_factory.mktemp("string-arrays") / "test.zarr"

    data = ["hello", "this", "strange new world",
            "full of", "interesting", "stuff"]
    data = np.array(data, dtype=object).reshape(3, 2)
    data = da.from_array(data, chunks=((2, 1), (1, 1)))

    datasets = [Dataset({"DATA": (("x", "y"), data)})]
    writes = xds_to_zarr(datasets, zarr_store)
    dask.compute(writes)

    new_datasets = xds_from_zarr(zarr_store)

    assert len(new_datasets) == len(datasets)

    for nds, ds in zip(new_datasets, datasets):
        assert_array_equal(nds.DATA.data, ds.DATA.data)

def _single_dataset(self, table_proxy, orders, exemplar_row=0):
    _, t, s = table_path_split(self.canonical_name)
    short_table_name = "/".join((t, s)) if s else t

    table_schema = self._table_schema()
    select_cols = set(self.select_cols or table_proxy.colnames().result())
    variables = _dataset_variable_factory(table_proxy, table_schema,
                                          select_cols, exemplar_row,
                                          orders, self.chunks[0],
                                          short_table_name)

    try:
        rowid = variables.pop("ROWID")
    except KeyError:
        coords = None
    else:
        coords = {"ROWID": rowid}

    return Dataset(variables, coords=coords)

def test_xds_to_parquet_string(tmp_path_factory):
    store = tmp_path_factory.mktemp("parquet_store") / "string-dataset.parquet"

    datasets = []

    for i in range(3):
        names = random.choices([f"foo-{i}", f"bar-{i}", f"qux-{i}"], k=10)
        names = np.asarray(names, dtype=object)
        # Shuffle the chunk sizes
        chunks = sorted([1, 2, 3, 4], key=lambda *a: random.random())
        names = da.from_array(names, chunks=chunks)
        datasets.append(Dataset({"NAME": (("row",), names)}))

    writes = xds_to_parquet(datasets, store)
    dask.compute(writes)

    parquet_datasets = xds_from_parquet(store)
    assert len(datasets) == len(parquet_datasets)

    for ds, pq_ds in zip(datasets, parquet_datasets):
        assert_array_equal(ds.NAME.data, pq_ds.NAME.data)

def test_dataset_multidim_string_column(tmp_path, chunks):
    row = sum(chunks['row'])
    name_list = [["X-%d" % i, "Y-%d" % i, "Z-%d" % i] for i in range(row)]
    np_names = np.array(name_list, dtype=object)
    names = da.from_array(np_names,
                          chunks=(chunks['row'], np_names.shape[1]))

    ds = Dataset({"POLARIZATION_TYPE": (("row", "xyz"), names)})
    table_name = str(tmp_path / "test.table")
    writes = write_datasets(table_name, ds, ["POLARIZATION_TYPE"])
    dask.compute(writes)
    del writes
    assert_liveness(0, 0)

    datasets = read_datasets(table_name, [], [], [],
                             chunks={'row': chunks['row']})
    assert len(datasets) == 1
    assert_array_equal(datasets[0].POLARIZATION_TYPE.data, np_names)
    del datasets
    assert_liveness(0, 0)

def xds_to_parquet(xds, store, columns=None, **kwargs):
    if isinstance(store, DaskMSStore):
        pass
    elif isinstance(store, (str, Path)):
        store = DaskMSStore(f"{store}", **kwargs.pop("storage_options", {}))
    else:
        raise TypeError(f"store '{store}' must be "
                        f"Path, str or DaskMSStore")

    # If any kwargs are added, they should be popped prior to this check.
    if len(kwargs) > 0:
        warnings.warn(
            f"The following unsupported kwargs were ignored in "
            f"xds_to_parquet: {kwargs}",
            UserWarning
        )

    columns = promote_columns(columns)

    if isinstance(xds, Dataset):
        xds = [xds]
    elif isinstance(xds, (tuple, list)):
        if not all(isinstance(ds, Dataset) for ds in xds):
            raise TypeError("xds must be a Dataset or list of Datasets")
    else:
        raise TypeError("xds must be a Dataset or list of Datasets")

    datasets = []
    base_schema = ArrowSchema.from_datasets(xds)

    for ds_id, ds in enumerate(xds):
        arrow_schema = base_schema.with_attributes(ds)
        fragment = ParquetFragment(store, store.table, arrow_schema, ds_id)
        chunk_ids = da.arange(len(ds.chunks["row"]), chunks=1)
        args = [chunk_ids, ("row",)]

        data_var_it = column_iterator(ds.data_vars, columns)
        coord_it = column_iterator(ds.coords, columns)

        for column, variable in itertools.chain(data_var_it, coord_it):
            if not isinstance(variable.data, da.Array):
                raise ValueError(f"Column {column} does not "
                                 f"contain a dask Array")

            if len(variable.dims) == 0 or variable.dims[0] != "row":
                raise ValueError(f"Column {column} dimensions "
                                 f"{variable.dims} don't start with 'row'")

            args.extend((column, None, variable.data, variable.dims))

            for dim, chunk in zip(variable.dims[1:],
                                  variable.data.chunks[1:]):
                if len(chunk) != 1:
                    raise ValueError(f"Chunking in {dim} is not yet "
                                     f"supported.")

        writes = da.blockwise(fragment.write, ("row",),
                              *args,
                              align_arrays=False,
                              adjust_chunks={"row": 1},
                              meta=np.empty((0,), bool))

        writes = inlined_array(writes, chunk_ids)

        # Transfer any partition information over to the write dataset
        partition = ds.attrs.get(DASKMS_PARTITION_KEY, False)

        if not partition:
            attrs = None
        else:
            attrs = {
                DASKMS_PARTITION_KEY: partition,
                **{k: getattr(ds, k) for k, _ in partition}
            }

        datasets.append(Dataset({"WRITE": (("row",), writes)}, attrs=attrs))

    return datasets

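# Usage sketch (illustrative, not library source): the xds_to_parquet call
# pattern, mirroring test_xds_to_parquet_string above. The import path for
# the experimental arrow module and the dataset contents are assumptions.
def example_xds_to_parquet_usage(tmp_path):
    import dask
    import dask.array as da
    import numpy as np
    from daskms import Dataset
    from daskms.experimental.arrow import xds_to_parquet

    # A single hypothetical string column chunked along "row"
    names = da.from_array(np.array(["foo", "bar"], dtype=object), chunks=1)
    ds = [Dataset({"NAME": (("row",), names)})]

    # Writes are lazy until computed
    writes = xds_to_parquet(ds, str(tmp_path / "example.parquet"))
    dask.compute(writes)
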
def xds_to_table(xds, table_name, columns, descriptor=None,
                 table_keywords=None, column_keywords=None,
                 table_proxy=False):
    """
    Generates a list of Datasets representing write operations from the
    specified arrays in :class:`xarray.Dataset`'s into the CASA table
    columns specified by ``table_name`` and ``columns``.

    This is a lazy operation -- it is only executed when a
    :meth:`dask.compute` or :meth:`xarray.Dataset.compute` method
    is called.

    Parameters
    ----------
    xds : :class:`xarray.Dataset` or list of :class:`xarray.Dataset`
        dataset(s) containing the specified columns. If a list of datasets
        is provided, the concatenation of the columns in
        sequential datasets will be written.
    table_name : str
        CASA table path
    columns : tuple or list or "ALL"
        list of column names to write to the table.
        "ALL" is a special marker which specifies that all columns
        should be written. If you wish to write an "ALL" array to
        a column, use :code:`columns=['ALL']`
    descriptor : None or \
        :class:`~daskms.descriptors.builder.AbstractBuilderFactory` or \
        str

        A class describing how CASA table descriptors and data managers
        are constructed. Some defaults are available such
        as `ms` and `ms_subtable`.

        If None, defaults are used.
    table_keywords : dict, optional
        Dictionary of table keywords to add to existing keywords.
        The operation is performed immediately, not lazily.
    column_keywords : dict, optional
        Dictionary of :code:`{column: keywords}` to add to existing
        column keywords. The operation is performed immediately,
        not lazily.
    table_proxy : {False, True}
        If True, also returns the Table Proxy associated with the
        write datasets.

    Returns
    -------
    write_datasets : list of :class:`xarray.Dataset`
        Datasets containing arrays representing write operations
        into a CASA Table
    table_proxy : :class:`daskms.TableProxy`, optional
        The Table Proxy associated with the datasets
    """
    # Promote dataset to a list
    if not isinstance(xds, (tuple, list)):
        xds = [xds]

    if not isinstance(columns, (tuple, list)):
        if columns != "ALL":
            columns = [columns]

    datasets = []

    # No xarray available, assume dask datasets
    if xr is None:
        datasets = xds
    else:
        for ds in xds:
            if isinstance(ds, Dataset):
                # Already a dask dataset
                datasets.append(ds)
            elif isinstance(ds, xr.Dataset):
                # Produce a list of internal variable and dataset types
                # from the xarray Dataset
                variables = {k: (v.dims, v.data, v.attrs)
                             for k, v in ds.data_vars.items()}

                coords = {k: (v.dims, v.data, v.attrs)
                          for k, v in ds.coords.items()}

                dds = Dataset(variables, attrs=ds.attrs, coords=coords)
                datasets.append(dds)
            else:
                raise TypeError("Invalid Dataset type '%s'" % type(ds))

    # Write the datasets
    out_ds = write_datasets(table_name, datasets, columns,
                            descriptor=descriptor,
                            table_keywords=table_keywords,
                            column_keywords=column_keywords,
                            table_proxy=table_proxy)

    # No xarray available, assume dask datasets
    if xr is None:
        return out_ds

    # Unpack table proxy if it was requested
    if table_proxy is True:
        assert isinstance(out_ds, tuple)
        out_ds, tp = out_ds
        assert isinstance(tp, TableProxy)
    else:
        tp = None

    if isinstance(out_ds, Dataset):
        out_ds = [out_ds]
    elif isinstance(out_ds, (tuple, list)):
        pass
    else:
        raise TypeError("Invalid Dataset type '%s'" % type(out_ds))

    xformed_out_ds = []

    for ds in out_ds:
        assert isinstance(ds, Dataset)

        variables = {k: (v.dims, v.data, v.attrs)
                     for k, v in ds.data_vars.items()}

        coords = {k: (v.dims, v.data, v.attrs)
                  for k, v in ds.coords.items()}

        xformed_out_ds.append(xr.Dataset(variables,
                                         coords=coords,
                                         attrs=ds.attrs))

    if len(xformed_out_ds) == 0:
        return xr.Dataset()
    elif len(xformed_out_ds) == 1:
        xformed_out_ds = xformed_out_ds[0]

    # Repack the Table Proxy
    if table_proxy is True:
        return xformed_out_ds, tp

    return xformed_out_ds

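# Usage sketch (illustrative, not library source): driving xds_to_table.
# The dataset contents below are hypothetical; the lazy write + compute
# pattern follows the docstring and the tests above.
def example_xds_to_table_usage(tmp_path):
    import dask
    import dask.array as da
    import numpy as np
    from daskms import Dataset, xds_to_table

    # Hypothetical visibility column chunked along "row"
    vis = da.zeros((10, 16, 4), chunks=(5, 16, 4), dtype=np.complex64)
    ds = Dataset({"DATA": (("row", "chan", "corr"), vis)})

    # Lazy: nothing is written until dask.compute is called
    writes = xds_to_table(ds, str(tmp_path / "example.table"), ["DATA"])
    dask.compute(writes)
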
def _write_datasets(table, table_proxy, datasets, columns, descriptor,
                    table_keywords, column_keywords):
    _, table_name, subtable = table_path_split(table)
    table_name = '::'.join((table_name, subtable)) if subtable else table_name
    row_orders = []

    # Put table and column keywords
    table_proxy.submit(_put_keywords, WRITELOCK,
                       table_keywords, column_keywords).result()

    # Sort datasets on (not has "ROWID", index) such that
    # datasets with ROWID's are handled first, while
    # those without (which imply appends to the MS)
    # are handled last
    sorted_datasets = sorted(enumerate(datasets),
                             key=lambda t: ("ROWID" not in t[1].data_vars,
                                            t[0]))

    # Establish row orders for each dataset
    for di, ds in sorted_datasets:
        try:
            rowid = ds.ROWID.data
        except AttributeError:
            # Add operation
            # No ROWID's, assume they're missing from the table
            # and remaining datasets. Generate addrows
            # NOTE(sjperkins)
            # This could be somewhat brittle, but exists to
            # update MS empty subtables once they've been
            # created along with the main MS by a call to default_ms.
            # Users could also use it to append rows to an existing table.
            # An xds_append_to_table may be a better solution...
            last_datasets = datasets[di:]
            last_row_orders = add_row_order_factory(table_proxy,
                                                    last_datasets)

            # We don't inline the row ordering if it is derived
            # from the row sizes of provided arrays.
            # The range of possible dependencies is far too large to inline
            row_orders.extend([(False, lro) for lro in last_row_orders])

            # We have established row orders for all datasets
            # at this point, quit the loop
            break
        else:
            # Update operation
            # Generate row orderings from existing row IDs
            row_order = rowid.map_blocks(row_run_factory,
                                         sort_dir="write",
                                         dtype=object)

            # TODO(sjperkins)
            # There's an assumption here that rowid is an
            # operation with minimal dependencies
            # (i.e. derived from xds_from_{ms, table})
            # Caching flattens the graph into a single layer
            if len(row_order.__dask_graph__().layers) > 1:
                log.warning("Caching an update row ordering "
                            "with more than one layer")

            row_order = cached_array(row_order)
            # Inline the row ordering in the graph
            row_orders.append((True, row_order))

    assert len(row_orders) == len(datasets)

    datasets = []

    for (di, ds), (inline, row_order) in zip(sorted_datasets, row_orders):
        # Hold the variables representing array writes
        write_vars = {}

        # Generate a dask array for each column
        for column in columns:
            try:
                variable = ds.data_vars[column]
            except KeyError:
                log.warning("Ignoring '%s' not present "
                            "on dataset %d" % (column, di))
                continue
            else:
                full_dims = variable.dims
                array = variable.data

            if not isinstance(array, da.Array):
                raise TypeError("%s on dataset %d is not a dask Array "
                                "but a %s" % (column, di, type(array)))

            args = [row_order, ("row",)]

            # If all non-row dimensions consist of a single chunk,
            # we can putcol. Otherwise putcolslice is required and
            # we pass in dimension extent arrays.
            if not all(len(c) == 1 for c in array.chunks[1:]):
                # Add extent arrays
                for d, c in zip(full_dims[1:], array.chunks[1:]):
                    args.append(dim_extents_array(d, c))
                    args.append((d,))

            # Add other variables
            args.extend([table_proxy, None,
                         column, None,
                         array, full_dims])

            # Name of the dask array representing this column
            token = dask.base.tokenize(di, args)
            name = "-".join((table_name, 'write', column, token))

            write_col = da.blockwise(
                putter_wrapper, full_dims,
                *args,
                # All dims shrink to 1,
                # a single bool is returned
                adjust_chunks={d: 1 for d in full_dims},
                name=name,
                align_arrays=False,
                dtype=bool)

            if inline:
                write_col = inlined_array(write_col, [row_order])

            write_vars[column] = (full_dims, write_col)

        # Append a dataset with the write operations
        datasets.append(Dataset(write_vars))

    # Return an empty dataset
    if len(datasets) == 0:
        return Dataset({})
    # Return singleton
    elif len(datasets) == 1:
        return datasets[0]

    return datasets

def xds_from_zarr(store, columns=None, chunks=None):
    """
    Reads the zarr data store in `store` and returns a list of
    Datasets containing the data.

    Parameters
    ----------
    store : str or Path
        Path containing the data
    columns : list of str or str or None
        Columns to read. `None` or `"ALL"` reads all columns on each
        dataset. Otherwise, a list of columns should be supplied.
    chunks : dict or list of dicts
        Chunking schema for each dataset

    Returns
    -------
    datasets : Dataset or list of Datasets
        Dataset(s) containing the data in the zarr store
    """
    store, table = store_path_split(store)

    if isinstance(store, Path):
        store = str(store)

    if not isinstance(store, str):
        raise TypeError("store must be a Path or str")

    columns = promote_columns(columns)

    if chunks is None:
        pass
    elif isinstance(chunks, (tuple, list)):
        if not all(isinstance(v, dict) for v in chunks):
            raise TypeError("chunks must be None, a dict or a list of dicts")
    elif isinstance(chunks, dict):
        chunks = [chunks]
    else:
        raise TypeError("chunks must be None, a dict or a list of dicts")

    datasets = []
    numpy_vars = []

    table_group = zarr.open(store)[table]

    for g, (group_name, group) in enumerate(sorted(table_group.groups())):
        group_attrs = decode_attr(dict(group.attrs))
        dask_ms_attrs = group_attrs.pop(DASKMS_ATTR_KEY)
        natural_chunks = dask_ms_attrs["chunks"]
        group_chunks = {d: tuple(dc) for d, dc in natural_chunks.items()}

        if chunks:
            # Defer to user-supplied chunking strategy
            try:
                group_chunks.update(chunks[g])
            except IndexError:
                pass

        data_vars = {}
        coords = {}

        for name, zarray in column_iterator(group, columns):
            attrs = decode_attr(dict(zarray.attrs[DASKMS_ATTR_KEY]))
            dims = attrs["dims"]
            coordinate = attrs.get("coordinate", False)
            array_chunks = tuple(group_chunks.get(d, s)
                                 for d, s in zip(dims, zarray.shape))

            array_chunks = da.core.normalize_chunks(array_chunks,
                                                    zarray.shape)
            ext_args = extent_args(dims, array_chunks)
            token_name = f"read~{name}-{tokenize(zarray, *ext_args)}"

            read = da.blockwise(zarr_getter, dims,
                                zarray, None,
                                *ext_args,
                                concatenate=False,
                                name=token_name,
                                meta=np.empty((0,)*zarray.ndim,
                                              zarray.dtype))

            read = inlined_array(read, ext_args[::2])
            var = Variable(dims, read, attrs)
            (coords if coordinate else data_vars)[name] = var

            # Save numpy arrays for reification
            typ = decode_type(attrs["array_type"])

            if typ is np.ndarray:
                numpy_vars.append(var)
            elif typ is da.Array:
                pass
            else:
                raise TypeError(f"Unknown array_type '{attrs['array_type']}'")

        datasets.append(Dataset(data_vars, coords=coords, attrs=group_attrs))

    # Reify any numpy arrays directly into their variables
    for v, a in zip(numpy_vars,
                    dask.compute(v.data for v in numpy_vars)[0]):
        v.data = a

    return datasets

def xds_from_parquet(store, columns=None, chunks=None):
    store, table = store_path_split(store)
    store = store / table

    if not isinstance(store, Path):
        store = Path(store)

    columns = promote_columns(columns)

    if chunks is None:
        pass
    elif isinstance(chunks, (tuple, list)):
        if len(chunks) == 0 or any(not isinstance(c, dict) for c in chunks):
            raise TypeError("chunks must be None or dict or list of dict")
    elif isinstance(chunks, dict):
        chunks = [chunks]
    else:
        raise TypeError("chunks must be None or dict or list of dict")

    fragments = store.rglob("*.parquet")
    ds_cfg = defaultdict(list)

    # Iterate over all parquet files in the directory tree
    # and group them by partition
    partition_schemas = set()

    for fragment in fragments:
        *partitions, parquet_file = fragment.relative_to(store).parts
        fragment = ParquetFileProxy(fragment)
        fragment_meta = fragment.metadata
        metadata = json.loads(fragment_meta.metadata[DASKMS_METADATA.encode()])
        partition_meta = metadata[DASKMS_PARTITION_KEY]
        partition_meta = tuple(tuple((f, v)) for f, v in partition_meta)
        partitions = _partition_values(partitions, partition_meta)
        partition_schemas.add(partition_meta)
        ds_cfg[partitions].append(fragment)

    # Sanity check partition schemas of all parquet files
    if len(partition_schemas) == 0:
        raise ValueError(f"No parquet files found in {store}")
    elif len(partition_schemas) != 1:
        raise ValueError(f"Multiple partitions discovered {partition_schemas}")

    partition_schemas = partition_schemas.pop()
    datasets = []

    # Now create a dataset per partition
    for p, (partition, fragments) in enumerate(sorted(ds_cfg.items())):
        fragments = list(sorted(fragments))
        column_arrays = defaultdict(list)
        fragment_rows = [f.metadata.num_rows for f in fragments]

        for (f, (start, end)) in partition_chunking(p, fragment_rows, chunks):
            fragment = fragments[f]
            fragment_meta = fragment.metadata
            rows = fragment_meta.num_rows
            schema = fragment_meta.schema.to_arrow_schema()
            fields = {n: schema.field(n) for n in schema.names}

            for column, field in column_iterator(fields, columns):
                field_metadata = field.metadata[DASKMS_METADATA.encode()]
                field_metadata = json.loads(field_metadata)
                dims = tuple(field_metadata["dims"])

                if isinstance(field.type, TensorType):
                    shape = (rows,) + field.type.shape
                else:
                    shape = (rows,)

                assert len(shape) == len(dims)
                meta = np.empty((0,)*len(dims), field.type.to_pandas_dtype())
                new_axes = {d: s for d, s in zip(dims, shape)}

                read = da.blockwise(fragment.read_column, dims,
                                    column, None,
                                    start, None,
                                    end, None,
                                    new_axes=new_axes,
                                    meta=meta)

                column_arrays[column].append((read, dims))

        data_vars = {}

        for column, values in column_arrays.items():
            arrays, array_dims = zip(*values)
            array_dims = set(array_dims)

            if not len(array_dims) == 1:
                raise ValueError(f"Inconsistent array dimensions "
                                 f"{array_dims} for {column}")

            data_vars[column] = (array_dims.pop(), da.concatenate(arrays))

        attrs = dict(partition)
        attrs[DASKMS_PARTITION_KEY] = partition_schemas
        datasets.append(Dataset(data_vars, attrs=attrs))

    return datasets

def xds_from_parquet(store, columns=None, chunks=None, **kwargs):
    if isinstance(store, DaskMSStore):
        pass
    elif isinstance(store, (str, Path)):
        store = DaskMSStore(f"{store}", **kwargs.pop("storage_options", {}))
    else:
        raise TypeError(f"store '{store}' must be "
                        f"Path, str or DaskMSStore")

    # If any kwargs are added, they should be popped prior to this check.
    if len(kwargs) > 0:
        warnings.warn(
            f"The following unsupported kwargs were ignored in "
            f"xds_from_parquet: {kwargs}",
            UserWarning
        )

    columns = promote_columns(columns)

    if chunks is None:
        pass
    elif isinstance(chunks, (tuple, list)):
        if len(chunks) == 0 or any(not isinstance(c, dict) for c in chunks):
            raise TypeError("chunks must be None or dict or list of dict")
    elif isinstance(chunks, dict):
        chunks = [chunks]
    else:
        raise TypeError("chunks must be None or dict or list of dict")

    table_path = "" if store.table else "MAIN"

    fragments = list(map(Path, store.rglob("*.parquet")))
    ds_cfg = defaultdict(list)

    # Iterate over all parquet files in the directory tree
    # and group them by partition
    partition_schemas = set()

    for fragment in fragments:
        *partitions, _ = fragment.relative_to(Path(table_path)).parts
        fragment = ParquetFileProxy(store, str(fragment))
        fragment_meta = fragment.metadata
        metadata = json.loads(fragment_meta.metadata[DASKMS_METADATA.encode()])
        partition_meta = metadata[DASKMS_PARTITION_KEY]
        partition_meta = tuple(tuple((f, v)) for f, v in partition_meta)
        partitions = _partition_values(partitions, partition_meta)
        partition_schemas.add(partition_meta)
        ds_cfg[partitions].append(fragment)

    # Sanity check partition schemas of all parquet files
    if len(partition_schemas) == 0:
        raise ValueError(f"No parquet files found in {store.path}")
    elif len(partition_schemas) != 1:
        raise ValueError(f"Multiple partitions discovered {partition_schemas}")

    partition_schemas = partition_schemas.pop()
    datasets = []

    # Now create a dataset per partition
    for p, (partition, fragments) in enumerate(sorted(ds_cfg.items())):
        fragments = list(sorted(fragments))
        column_arrays = defaultdict(list)
        fragment_rows = [f.metadata.num_rows for f in fragments]

        # Returns a dictionary of lists mapping fragments to partitions.
        partition_chunks = partition_chunking(p, fragment_rows, chunks)

        for pieces in partition_chunks.values():
            chunk_fragments = [fragments[i] for i, _ in pieces]
            chunk_ranges = [r for _, r in pieces]
            chunk_metas = [f.metadata for f in chunk_fragments]

            rows = sum(end - start for start, end in chunk_ranges)

            # NOTE(JSKenyon): This assumes that the schema/fields are
            # consistent between fragments. This should be ok.
            exemplar_schema = chunk_metas[0].schema.to_arrow_schema()
            exemplar_fields = {
                n: exemplar_schema.field(n) for n in exemplar_schema.names
            }

            for column, field in column_iterator(exemplar_fields, columns):
                field_metadata = field.metadata[DASKMS_METADATA.encode()]
                field_metadata = json.loads(field_metadata)
                dims = tuple(field_metadata["dims"])

                if isinstance(field.type, TensorType):
                    shape = (rows,) + field.type.shape
                else:
                    shape = (rows,)

                assert len(shape) == len(dims)

                dtype = field.type.to_pandas_dtype()
                meta = np.empty((0,) * len(dims), dtype)
                new_axes = {d: s for d, s in zip(dims, shape)}

                read = da.blockwise(fragment_reader, dims,
                                    chunk_fragments, None,
                                    chunk_ranges, None,
                                    column, None,
                                    shape, None,
                                    dtype, None,
                                    adjust_chunks={"row": rows},
                                    new_axes=new_axes,
                                    meta=meta)

                column_arrays[column].append((read, dims))

        data_vars = {}

        for column, values in column_arrays.items():
            arrays, array_dims = zip(*values)
            array_dims = set(array_dims)

            if not len(array_dims) == 1:
                raise ValueError(f"Inconsistent array dimensions "
                                 f"{array_dims} for {column}")

            data_vars[column] = (array_dims.pop(), da.concatenate(arrays))

        attrs = dict(partition)
        attrs[DASKMS_PARTITION_KEY] = partition_schemas
        datasets.append(Dataset(data_vars, attrs=attrs))

    return datasets

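# Usage sketch (illustrative, not library source): reading a parquet store
# back into datasets. The import path, store location and the NAME column
# are assumptions carried over from the write sketch above.
def example_xds_from_parquet_usage(tmp_path):
    import dask
    from daskms.experimental.arrow import xds_from_parquet

    datasets = xds_from_parquet(str(tmp_path / "example.parquet"))

    # Reads are lazy dask arrays; realise one column per dataset
    names = [ds.NAME.data for ds in datasets]
    return dask.compute(names)[0]
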
def test_bda_channelisation():
    rs = np.random.RandomState(42)
    spw_id = (0, 1, 2)
    pol_id = (0, 1)
    nchan = np.array([4, 8, 16])
    bandwidth = np.array([20.0, 40.0, 60.0])
    ref_freq = np.array([100.0, 200.0, 500.0])

    chan_freq = [
        np.linspace(rf - bw / 2, rf + bw / 2, nc)[None, :]
        for rf, bw, nc in zip(ref_freq, bandwidth, nchan)
    ]

    chan_width = [
        np.full(nc, bw / nc)[None, :]
        for nc, bw in zip(nchan, bandwidth)
    ]

    spw_id, pol_id = (np.array(a) for a in zip(*product(spw_id, pol_id)))

    ddid = np.array([5, 1, 3, 2, 0, 4, 1, 2, 5, 3])
    time = np.linspace(5.03373334e+09, 5.03373362e+09, ddid.shape[0])
    row_chunks = (4, 3, 1, 2)

    row_spws = spw_id[ddid]
    num_chan = nchan[row_spws]
    row_num_chans = np.array([rs.randint(1, num_chan[s]) for s in row_spws])

    da_ddid = da.from_array(ddid, chunks=(row_chunks,))
    da_num_chan = da.from_array(row_num_chans, chunks=(row_chunks,))
    da_time = da.from_array(time, chunks=(row_chunks,))

    out_ds = [
        Dataset({
            "DATA_DESC_ID": (("row",), da_ddid),
            "TIME": (("row",), da_time),
            "NUM_CHAN": (("row",), da_num_chan),
            "DECORR_CHAN_WIDTH": (("row",), da.zeros_like(time)),
        })
    ]

    ddid_ds = Dataset({
        "SPECTRAL_WINDOW_ID": (("row",), spw_id),
        "POLARIZATION_ID": (("row",), pol_id),
    })

    # The per-spw channel arrays are 2D (1, nchan), hence
    # ("row", "chan") dimensions
    spw_ds = [
        Dataset({
            "REF_FREQUENCY": (("row",), da.from_array([rf], chunks=1)),
            "NUM_CHAN": (("row",), da.from_array([nc], chunks=1)),
            "CHAN_FREQ": (("row", "chan"),
                          da.from_array(chan_freq[spw], chunks=(1, nc))),
            "CHAN_WIDTH": (("row", "chan"),
                           da.from_array(chan_width[spw], chunks=(1, nc))),
            "RESOLUTION": (("row", "chan"),
                           da.from_array(chan_width[spw], chunks=(1, nc))),
            "EFFECTIVE_BW": (("row", "chan"),
                             da.from_array(chan_width[spw], chunks=(1, nc))),
            "TOTAL_BANDWIDTH": (("row",), da.from_array([bw], chunks=1)),
        })
        for spw, (rf, bw, nc) in enumerate(zip(ref_freq, bandwidth, nchan))
    ]

    out_ds, spw_ds, ddid_ds = bda_average_spw(out_ds, ddid_ds, spw_ds)
    out_ds, spw_ds, ddid_ds = da.compute(out_ds, spw_ds, ddid_ds)

def xds_to_zarr(xds, store, columns=None, rechunk=False, **kwargs):
    """
    Stores a dataset or list of datasets defined by `xds` in the
    file location `store`.

    Parameters
    ----------
    xds : Dataset or list of Datasets
        Data
    store : str or Path
        Path to store the data
    columns : list of str or str or None
        Columns to store. `None` or `"ALL"` stores all columns on each
        dataset. Otherwise, a list of columns should be supplied. All
        coordinates associated with a specified column will be written
        automatically.
    rechunk : bool
        Controls whether dask arrays should be automatically rechunked to
        be consistent with existing on-disk zarr arrays while writing to
        disk.
    **kwargs : optional

    Returns
    -------
    writes : Dataset
        A Dataset representing the write operations
    """
    if isinstance(store, DaskMSStore):
        pass
    elif isinstance(store, (Path, str)):
        store = DaskMSStore(f"{store}", **kwargs.pop("storage_options", {}))
    else:
        raise TypeError(f"store '{store}' must be "
                        f"Path, str or DaskMSStore")

    # If any kwargs are added, they should be popped prior to this check.
    if len(kwargs) > 0:
        warnings.warn(
            f"The following unsupported kwargs were ignored in "
            f"xds_to_zarr: {kwargs}",
            UserWarning
        )

    columns = promote_columns(columns)

    if isinstance(xds, Dataset):
        xds = [xds]
    elif isinstance(xds, (tuple, list)):
        if not all(isinstance(ds, Dataset) for ds in xds):
            raise TypeError("xds must be a Dataset or list of Datasets")
    else:
        raise TypeError("xds must be a Dataset or list of Datasets")

    write_datasets = []

    for di, ds in enumerate(xds):
        data_vars, coords = select_vars_and_coords(ds, columns)

        # Create a new ds which is consistent with what we want to write.
        ds = Dataset(data_vars, coords=coords, attrs=ds.attrs)

        ds, group = prepare_zarr_group(di, ds, store, rechunk=rechunk)

        data_vars = dict(_gen_writes(ds.data_vars, ds.chunks, group))
        # Include coords in the write dataset so they're reified
        data_vars.update(
            dict(_gen_writes(ds.coords, ds.chunks, group,
                             indirect_dims=True)))

        # Transfer any partition information over to the write dataset
        partition = ds.attrs.get(DASKMS_PARTITION_KEY, False)

        if not partition:
            attrs = None
        else:
            attrs = {
                DASKMS_PARTITION_KEY: partition,
                **{k: getattr(ds, k) for k, _ in partition}
            }

        write_datasets.append(Dataset(data_vars, attrs=attrs))

    return write_datasets

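# Usage sketch (illustrative, not library source): the xds_to_zarr call
# pattern from test_basic_roundtrip above. The experimental zarr import
# path and the dataset contents are assumptions.
def example_xds_to_zarr_usage(tmp_path):
    import dask
    import dask.array as da
    from daskms import Dataset
    from daskms.experimental.zarr import xds_to_zarr

    # A few hypothetical single-variable datasets
    xdsl = [Dataset({"x": (("y",), da.ones(i))}) for i in range(1, 4)]

    # Writes are lazy until computed
    dask.compute(xds_to_zarr(xdsl, tmp_path / "example.zarr"))
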
def xds_to_parquet(xds, path, columns=None):
    path, table = store_path_split(path)

    if not isinstance(path, Path):
        path = Path(path)

    columns = promote_columns(columns)

    if isinstance(xds, Dataset):
        xds = [xds]
    elif isinstance(xds, (tuple, list)):
        if not all(isinstance(ds, Dataset) for ds in xds):
            raise TypeError("xds must be a Dataset or list of Datasets")
    else:
        raise TypeError("xds must be a Dataset or list of Datasets")

    datasets = []
    base_schema = ArrowSchema.from_datasets(xds)

    for ds_id, ds in enumerate(xds):
        arrow_schema = base_schema.with_attributes(ds)
        fragment = ParquetFragment(path / table, arrow_schema, ds_id)
        chunk_ids = da.arange(len(ds.chunks["row"]), chunks=1)
        args = [chunk_ids, ("row",)]

        data_var_it = column_iterator(ds.data_vars, columns)
        coord_it = column_iterator(ds.coords, columns)

        for column, variable in itertools.chain(data_var_it, coord_it):
            if not isinstance(variable.data, da.Array):
                raise ValueError(f"Column {column} does not "
                                 f"contain a dask Array")

            if len(variable.dims) == 0 or variable.dims[0] != "row":
                raise ValueError(f"Column {column} dimensions "
                                 f"{variable.dims} don't start with 'row'")

            args.extend((column, None, variable.data, variable.dims))

            for dim, chunk in zip(variable.dims[1:],
                                  variable.data.chunks[1:]):
                if len(chunk) != 1:
                    raise ValueError(f"Chunking in {dim} is not yet "
                                     f"supported.")

        writes = da.blockwise(fragment.write, ("row",),
                              *args,
                              align_arrays=False,
                              adjust_chunks={"row": 1},
                              meta=np.empty((0,), bool))

        writes = inlined_array(writes, chunk_ids)

        # Transfer any partition information over to the write dataset
        partition = ds.attrs.get(DASKMS_PARTITION_KEY, False)

        if not partition:
            attrs = None
        else:
            attrs = {
                DASKMS_PARTITION_KEY: partition,
                **{k: getattr(ds, k) for k, _ in partition}
            }

        datasets.append(Dataset({"WRITE": (("row",), writes)}, attrs=attrs))

    return datasets

def xds_to_zarr(xds, store, columns=None):
    """
    Stores a dataset or list of datasets defined by `xds` in the
    file location `store`.

    Parameters
    ----------
    xds : Dataset or list of Datasets
        Data
    store : str or Path
        Path to store the data
    columns : list of str or str or None
        Columns to store. `None` or `"ALL"` stores all columns on each
        dataset. Otherwise, a list of columns should be supplied.

    Returns
    -------
    writes : Dataset
        A Dataset representing the write operations
    """
    store, table = store_path_split(store)

    if isinstance(store, Path):
        store = str(store)

    if not isinstance(store, str):
        raise TypeError(f"store '{store}' must be Path or str")

    columns = promote_columns(columns)

    if isinstance(xds, Dataset):
        xds = [xds]
    elif isinstance(xds, (tuple, list)):
        if not all(isinstance(ds, Dataset) for ds in xds):
            raise TypeError("xds must be a Dataset or list of Datasets")
    else:
        raise TypeError("xds must be a Dataset or list of Datasets")

    write_datasets = []

    for di, ds in enumerate(xds):
        group = prepare_zarr_group(di, ds, store, table)
        write_args = (ds.chunks, columns, group)

        data_vars = dict(_gen_writes(ds.data_vars, *write_args))
        # Include coords in the write dataset so they're reified
        data_vars.update(dict(_gen_writes(ds.coords, *write_args)))

        # Transfer any partition information over to the write dataset
        partition = ds.attrs.get(DASKMS_PARTITION_KEY, False)

        if not partition:
            attrs = None
        else:
            attrs = {
                DASKMS_PARTITION_KEY: partition,
                **{k: getattr(ds, k) for k, _ in partition}
            }

        write_datasets.append(Dataset(data_vars, attrs=attrs))

    return write_datasets

def xds_from_zarr(store, columns=None, chunks=None, **kwargs):
    """
    Reads the zarr data store in `store` and returns a list of
    Datasets containing the data.

    Parameters
    ----------
    store : str or Path
        Path containing the data
    columns : list of str or str or None
        Columns to read. `None` or `"ALL"` reads all columns on each
        dataset. Otherwise, a list of columns should be supplied.
    chunks : dict or list of dicts
        Chunking schema for each dataset
    **kwargs : optional

    Returns
    -------
    datasets : Dataset or list of Datasets
        Dataset(s) containing the data in the zarr store
    """
    if isinstance(store, DaskMSStore):
        pass
    elif isinstance(store, (Path, str)):
        store = DaskMSStore(f"{store}", **kwargs.pop("storage_options", {}))
    else:
        raise TypeError(f"store '{store}' must be "
                        f"Path, str or DaskMSStore")

    # If any kwargs are added, they should be popped prior to this check.
    if len(kwargs) > 0:
        warnings.warn(
            f"The following unsupported kwargs were ignored in "
            f"xds_from_zarr: {kwargs}",
            UserWarning
        )

    columns = promote_columns(columns)

    if chunks is None:
        pass
    elif isinstance(chunks, (tuple, list)):
        if not all(isinstance(v, dict) for v in chunks):
            raise TypeError("chunks must be None, a dict or a list of dicts")
    elif isinstance(chunks, dict):
        chunks = [chunks]
    else:
        raise TypeError("chunks must be None, a dict or a list of dicts")

    datasets = []
    numpy_vars = []

    # NOTE(JSKenyon): Iterating over all the zarr groups/arrays is VERY
    # expensive if the metadata has not been consolidated.
    zc.consolidate_metadata(store.map)
    table_path = store.table if store.table else "MAIN"
    table_group = zarr.open_consolidated(store.map)[table_path]

    for g, (group_name, group) in enumerate(sorted(table_group.groups(),
                                                   key=group_sortkey)):
        group_attrs = decode_attr(dict(group.attrs))
        dask_ms_attrs = group_attrs.pop(DASKMS_ATTR_KEY)
        natural_chunks = dask_ms_attrs["chunks"]
        group_chunks = {d: tuple(dc) for d, dc in natural_chunks.items()}

        if chunks:
            # Defer to user-supplied chunking strategy
            try:
                group_chunks.update(chunks[g])
            except IndexError:
                # Re-use the last chunking strategy
                group_chunks.update(chunks[-1])

        data_vars = {}
        coords = {}

        for name, zarray in column_iterator(group, columns):
            attrs = decode_attr(dict(zarray.attrs[DASKMS_ATTR_KEY]))
            dims = attrs["dims"]
            coordinate = attrs.get("coordinate", False)
            array_chunks = tuple(group_chunks.get(d, s)
                                 for d, s in zip(dims, zarray.shape))

            array_chunks = da.core.normalize_chunks(array_chunks,
                                                    zarray.shape)
            ext_args = extent_args(dims, array_chunks)
            token_name = f"read~{name}-{tokenize(zarray, *ext_args)}"

            read = da.blockwise(zarr_getter, dims,
                                zarray, None,
                                *ext_args,
                                concatenate=False,
                                name=token_name,
                                meta=np.empty((0,) * zarray.ndim,
                                              zarray.dtype))

            read = inlined_array(read, ext_args[::2])
            var = Variable(dims, read, attrs)
            (coords if coordinate else data_vars)[name] = var

            # Save numpy arrays for reification
            typ = decode_type(attrs["array_type"])

            if typ is np.ndarray:
                numpy_vars.append(var)
            elif typ is da.Array:
                pass
            else:
                raise TypeError(f"Unknown array_type '{attrs['array_type']}'")

        datasets.append(Dataset(data_vars, coords=coords, attrs=group_attrs))

    # Reify any numpy arrays directly into their variables
    for v, a in zip(numpy_vars,
                    dask.compute(v.data for v in numpy_vars)[0]):
        v.data = a

    return datasets

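# Usage sketch (illustrative, not library source): reading a zarr store
# with a per-dataset chunking override. The import path and store location
# are assumptions consistent with the write sketch above.
def example_xds_from_zarr_usage(tmp_path):
    import dask
    from daskms.experimental.zarr import xds_from_zarr

    # chunks accepts a dict (promoted to a one-element list) or a list of
    # dicts; this version re-uses the last dict when there are more
    # datasets than dicts
    datasets = xds_from_zarr(tmp_path / "example.zarr", chunks={"y": 2})

    # Reads are lazy; realise the hypothetical "x" variable per dataset
    return dask.compute([ds.x.data for ds in datasets])[0]
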