def test_inlined_array():
    A = da.ones((10, 10), chunks=(2, 2), dtype=np.float64)
    B = da.full((10, 10), np.float64(2), chunks=(2, 2))
    C = A + B
    E = C + 1

    # Inline all of C's dependencies by default
    D = inlined_array(C)
    assert len(C.__dask_graph__().layers) == 3
    assert D.name == C.name
    assert D.name in D.__dask_graph__().layers
    assert A.name not in D.__dask_graph__().layers
    assert B.name not in D.__dask_graph__().layers
    graph_keys = set(flatten(D.__dask_graph__().keys()))
    assert graph_keys == set(flatten(D.__dask_keys__()))
    assert_array_equal(D, C)

    # Explicitly inline both A and B, collapsing the graph to one layer
    D = inlined_array(C, [A, B])
    assert len(D.__dask_graph__().layers) == 1
    assert D.name == C.name
    assert D.name in D.__dask_graph__().layers
    assert A.name not in D.__dask_graph__().layers
    assert B.name not in D.__dask_graph__().layers
    graph_keys = set(flatten(D.__dask_graph__().keys()))
    assert graph_keys == set(flatten(D.__dask_keys__()))
    assert_array_equal(D, C)

    # Inline only A; B remains a separate layer
    D = inlined_array(C, [A])
    assert len(D.__dask_graph__().layers) == 2
    assert D.name == C.name
    assert D.name in D.__dask_graph__().layers
    assert A.name not in D.__dask_graph__().layers
    assert B.name in D.__dask_graph__().layers
    graph_keys = set(flatten(D.__dask_graph__().keys()))
    assert graph_keys == set(flatten([a.__dask_keys__() for a in [D, B]]))
    assert_array_equal(D, C)

    # Inline only B; A remains a separate layer
    D = inlined_array(C, [B])
    assert len(D.__dask_graph__().layers) == 2
    assert D.name == C.name
    assert D.name in D.__dask_graph__().layers
    assert A.name in D.__dask_graph__().layers
    assert B.name not in D.__dask_graph__().layers
    graph_keys = set(flatten(D.__dask_graph__().keys()))
    assert graph_keys == set(flatten([a.__dask_keys__() for a in [D, A]]))
    assert_array_equal(D, C)

    # Inline A into E; B and C remain as layers
    D = inlined_array(E, [A])
    assert len(D.__dask_graph__().layers) == 3
    assert D.name == E.name
    assert D.name in D.__dask_graph__().layers
    assert B.name in D.__dask_graph__().layers
    assert A.name not in D.__dask_graph__().layers
    assert C.name in D.__dask_graph__().layers
    graph_keys = set(flatten(D.__dask_graph__().keys()))
    assert graph_keys == set(flatten([a.__dask_keys__() for a in [D, B, C]]))
    assert_array_equal(D, E)
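# Illustrative sketch (not part of the test suite above): the layer
# bookkeeping relies on dask's HighLevelGraph, in which each array
# operation contributes one named layer. Inlining a dependency removes
# its layer and splices its tasks into the dependent's layer, which is
# what the layer-count assertions verify. Only public dask behaviour
# is assumed here.
def _layer_count_sketch():
    import dask.array as da

    x = da.ones((4, 4), chunks=(2, 2))   # one layer for ``ones``
    y = x + 1                            # a second layer for ``add``
    assert len(y.__dask_graph__().layers) == 2
    # x's layer is carried along as a dependency in y's graph
    assert x.name in y.__dask_graph__().layers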
def _gen_writes(variables, chunks, factory, indirect_dims=False):
    for name, var in variables.items():
        if isinstance(var.data, da.Array):
            ext_args = extent_args(var.dims, var.chunks)
            var_data = var.data
        elif isinstance(var.data, np.ndarray):
            try:
                var_chunks = tuple(chunks[d] for d in var.dims)
            except KeyError:
                var_chunks = tuple((s, ) for s in var.shape)

            ext_args = extent_args(var.dims, var_chunks)
            var_data = da.from_array(var.data, chunks=var_chunks,
                                     inline_array=True, name=False)
        else:
            raise NotImplementedError(f"Writing {type(var.data)} "
                                      f"unsupported")

        if var_data.nbytes == 0:
            continue

        token_name = (f"write~{name}-"
                      f"{tokenize(var_data, name, factory, *ext_args)}")

        write = da.blockwise(zarr_setter, var.dims,
                             var_data, var.dims,
                             name, None,
                             factory, None,
                             *ext_args,
                             adjust_chunks={d: 1 for d in var.dims},
                             concatenate=False,
                             name=token_name,
                             meta=np.empty((1, ) * len(var.dims), bool))
        write = inlined_array(write, ext_args[::2])

        # Alter the dimension names to preserve laziness on coordinates.
        dims = [f"_{d}_" for d in var.dims] if indirect_dims else var.dims

        yield name, (dims, write, var.attrs)
def _gen_writes(variables, chunks, columns, factory):
    for name, var in column_iterator(variables, columns):
        if isinstance(var.data, da.Array):
            ext_args = extent_args(var.dims, var.chunks)
            var_data = var.data
        elif isinstance(var.data, np.ndarray):
            var_chunks = tuple(chunks[d] for d in var.dims)
            ext_args = extent_args(var.dims, var_chunks)
            var_data = da.from_array(var.data,
                                     chunks=var_chunks,
                                     inline_array=True,
                                     name=False)
        else:
            raise NotImplementedError(f"Writing {type(var.data)} "
                                      f"unsupported")

        if var.data.nbytes == 0:
            continue

        token_name = (f"write~{name}-"
                      f"{tokenize(var_data, name, factory, *ext_args)}")

        write = da.blockwise(zarr_setter, var.dims,
                             var_data, var.dims,
                             name, None,
                             factory, None,
                             *ext_args,
                             adjust_chunks={d: 1 for d in var.dims},
                             concatenate=False,
                             name=token_name,
                             # np.bool is a deprecated alias, removed in
                             # NumPy 1.24; use the builtin bool
                             meta=np.empty((1, ) * len(var.dims), bool))
        write = inlined_array(write, ext_args[::2])

        yield name, (var.dims, write, var.attrs)
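# Minimal sketch of the ``da.blockwise`` write pattern used by both
# ``_gen_writes`` variants above: the setter runs once per block and the
# result shrinks to a single success flag per block via ``adjust_chunks``.
# ``fake_setter`` is a hypothetical stand-in for ``zarr_setter``; only
# public dask API is used.
def _blockwise_setter_sketch():
    import dask.array as da
    import numpy as np

    def fake_setter(block):
        # A real setter would write ``block`` to storage here
        return np.full((1, ) * block.ndim, True)

    data = da.ones((4, 6), chunks=(2, 3))
    writes = da.blockwise(fake_setter, ("row", "chan"),
                          data, ("row", "chan"),
                          adjust_chunks={"row": 1, "chan": 1},
                          meta=np.empty((1, 1), bool))

    # One flag per (row, chan) block: 2 x 2 blocks in total
    assert writes.compute().shape == (2, 2)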
def _write_datasets(table, table_proxy, datasets, columns, descriptor,
                    table_keywords, column_keywords):
    _, table_name, subtable = table_path_split(table)
    table_name = '::'.join((table_name, subtable)) if subtable else table_name
    row_orders = []

    # Put table and column keywords
    table_proxy.submit(_put_keywords, WRITELOCK,
                       table_keywords, column_keywords).result()

    # Sort datasets on (not has "ROWID", index) such that
    # datasets with ROWID's are handled first, while
    # those without (which imply appends to the MS)
    # are handled last
    sorted_datasets = sorted(enumerate(datasets),
                             key=lambda t: ("ROWID" not in t[1].data_vars,
                                            t[0]))

    # Establish row orders for each dataset
    for di, ds in sorted_datasets:
        try:
            rowid = ds.ROWID.data
        except AttributeError:
            # Add operation
            # No ROWID's, assume they're missing from the table
            # and remaining datasets. Generate addrows
            # NOTE(sjperkins)
            # This could be somewhat brittle, but exists to
            # update MS empty subtables once they've been
            # created along with the main MS by a call to default_ms.
            # Users could also use it to append rows to an existing table.
            # An xds_append_to_table may be a better solution...
            last_datasets = datasets[di:]
            last_row_orders = add_row_order_factory(table_proxy,
                                                    last_datasets)

            # We don't inline the row ordering if it is derived
            # from the row sizes of provided arrays.
            # The range of possible dependencies is far too large to inline
            row_orders.extend([(False, lro) for lro in last_row_orders])

            # We have established row orders for all datasets
            # at this point, quit the loop
            break
        else:
            # Update operation
            # Generate row orderings from existing row IDs
            row_order = rowid.map_blocks(row_run_factory,
                                         sort_dir="write",
                                         # np.object is a deprecated alias;
                                         # use the builtin object
                                         dtype=object)

            # TODO(sjperkins)
            # There's an assumption here that rowid is an
            # operation with minimal dependencies
            # (i.e. derived from xds_from_{ms, table})
            # Caching flattens the graph into a single layer
            if len(row_order.__dask_graph__().layers) > 1:
                log.warning("Caching an update row ordering "
                            "with more than one layer")
                row_order = cached_array(row_order)

            # Inline the row ordering in the graph
            row_orders.append((True, row_order))

    assert len(row_orders) == len(datasets)

    datasets = []

    for (di, ds), (inline, row_order) in zip(sorted_datasets, row_orders):
        # Hold the variables representing array writes
        write_vars = {}

        # Generate a dask array for each column
        for column in columns:
            try:
                variable = ds.data_vars[column]
            except KeyError:
                log.warning("Ignoring '%s' not present "
                            "on dataset %d" % (column, di))
                continue
            else:
                full_dims = variable.dims
                array = variable.data

            if not isinstance(array, da.Array):
                raise TypeError("%s on dataset %d is not a dask Array "
                                "but a %s" % (column, di, type(array)))

            args = [row_order, ("row", )]

            # Dimension extent arrays are only needed when a non-row
            # dimension has multiple chunks: in that case putcolslice
            # is required, otherwise putcol can be used
            if not all(len(c) == 1 for c in array.chunks[1:]):
                # Add extent arrays
                for d, c in zip(full_dims[1:], array.chunks[1:]):
                    args.append(dim_extents_array(d, c))
                    args.append((d, ))

            # Add other variables
            args.extend([table_proxy, None,
                         column, None,
                         array, full_dims])

            # Name of the dask array representing this column
            token = dask.base.tokenize(di, args)
            name = "-".join((table_name, 'write', column, token))

            write_col = da.blockwise(putter_wrapper, full_dims,
                                     *args,
                                     # All dims shrink to 1,
                                     # a single bool is returned
                                     adjust_chunks={d: 1 for d in full_dims},
                                     name=name,
                                     align_arrays=False,
                                     dtype=bool)

            if inline:
                write_col = inlined_array(write_col, [row_order])

            write_vars[column] = (full_dims, write_col)

        # Append a dataset with the write operations
        datasets.append(Dataset(write_vars))

    # Return an empty dataset
    if len(datasets) == 0:
        return Dataset({})
    # Return singleton
    elif len(datasets) == 1:
        return datasets[0]

    return datasets
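# Worked example (hypothetical data) of the sort key used in
# ``_write_datasets`` above: datasets with a ROWID column (updates) sort
# before those without (appends), and original order is preserved within
# each class by the index tiebreaker.
def _sort_key_sketch():
    fake = [{"A": 1}, {"ROWID": 2, "A": 3}, {"B": 4}]
    ordered = sorted(enumerate(fake),
                     key=lambda t: ("ROWID" not in t[1], t[0]))
    # Dataset 1 (has ROWID) first, then 0 and 2 in original order
    assert [i for i, _ in ordered] == [1, 0, 2]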
def _dataset_variable_factory(table_proxy, table_schema, select_cols,
                              exemplar_row, orders, chunks, array_prefix):
    """
    Returns a dictionary of dask arrays representing a series of
    getcols on the appropriate table.

    Produces variables for inclusion in a Dataset.

    Parameters
    ----------
    table_proxy : :class:`daskms.table_proxy.TableProxy`
        Table proxy object
    table_schema : dict
        Table schema
    select_cols : list of strings
        List of columns to return
    exemplar_row : int
        row id used to possibly extract an exemplar array in
        order to determine the column shape and dtype attributes
    orders : tuple of :class:`dask.array.Array`
        A (sorted_rows, row_runs) tuple, specifying the
        appropriate rows to extract from the table.
    chunks : dict
        Chunking strategy for the dataset.
    array_prefix : str
        dask array string prefix

    Returns
    -------
    dict
        A dictionary looking like :code:`{column: (dims, array)}`.
    """
    sorted_rows, row_runs = orders
    dataset_vars = {"ROWID": (("row", ), sorted_rows)}

    for column in select_cols:
        try:
            meta = column_metadata(column, table_proxy, table_schema,
                                   chunks, exemplar_row)
        except ColumnMetadataError as e:
            exc_info = logging.DEBUG >= log.getEffectiveLevel()
            log.warning("Ignoring '%s': %s", column, e, exc_info=exc_info)
            continue

        full_dims = ("row", ) + meta.dims
        args = [row_runs, ("row", )]

        # Dimension extent arrays are only needed when a non-row
        # dimension has multiple chunks: in that case getcolslice
        # is required, otherwise getcol can be used
        if not all(len(c) == 1 for c in meta.chunks):
            for d, c in zip(meta.dims, meta.chunks):
                # Create an array describing the dimension chunk extents
                args.append(dim_extents_array(d, c))
                args.append((d, ))

            new_axes = {}
        else:
            # We need to inform blockwise about the size of our
            # new dimensions as no arrays with them are supplied
            new_axes = {d: s for d, s in zip(meta.dims, meta.shape)}

        # Add other variables
        args.extend([table_proxy, None,
                     column, None,
                     meta.shape, None,
                     meta.dtype, None])

        # Name of the dask array representing this column
        token = dask.base.tokenize(args)
        name = "-".join((array_prefix, column, token))

        # Construct the array
        dask_array = da.blockwise(getter_wrapper, full_dims,
                                  *args,
                                  name=name,
                                  new_axes=new_axes,
                                  dtype=meta.dtype)

        dask_array = inlined_array(dask_array)

        # Assign into variable and dimension dataset
        dataset_vars[column] = (full_dims, dask_array)

    return dataset_vars
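# Minimal sketch of the ``new_axes`` mechanism used in
# ``_dataset_variable_factory`` above: when no input array carries a
# dimension, ``da.blockwise`` must be told its size explicitly.
# ``fake_getter`` is a hypothetical stand-in for ``getter_wrapper``.
def _new_axes_sketch():
    import dask.array as da
    import numpy as np

    def fake_getter(rows):
        # A real getter would read a (row, chan) slab from the table
        return np.zeros((rows.shape[0], 4))

    rows = da.arange(10, chunks=5)
    out = da.blockwise(fake_getter, ("row", "chan"),
                       rows, ("row", ),
                       new_axes={"chan": 4},
                       dtype=np.float64)

    assert out.shape == (10, 4)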
def xds_to_parquet(xds, path, columns=None):
    path, table = store_path_split(path)

    if not isinstance(path, Path):
        path = Path(path)

    columns = promote_columns(columns)

    if isinstance(xds, Dataset):
        xds = [xds]
    elif isinstance(xds, (tuple, list)):
        if not all(isinstance(ds, Dataset) for ds in xds):
            raise TypeError("xds must be a Dataset or list of Datasets")
    else:
        raise TypeError("xds must be a Dataset or list of Datasets")

    datasets = []
    base_schema = ArrowSchema.from_datasets(xds)

    for ds_id, ds in enumerate(xds):
        arrow_schema = base_schema.with_attributes(ds)
        fragment = ParquetFragment(path / table, arrow_schema, ds_id)
        chunk_ids = da.arange(len(ds.chunks["row"]), chunks=1)
        args = [chunk_ids, ("row", )]

        data_var_it = column_iterator(ds.data_vars, columns)
        coord_it = column_iterator(ds.coords, columns)

        for column, variable in itertools.chain(data_var_it, coord_it):
            if not isinstance(variable.data, da.Array):
                raise ValueError(f"Column {column} does not "
                                 f"contain a dask Array")

            if len(variable.dims) == 0 or variable.dims[0] != "row":
                raise ValueError(f"Column {column} dimensions "
                                 f"{variable.dims} don't start with 'row'")

            args.extend((column, None, variable.data, variable.dims))

            for dim, chunk in zip(variable.dims[1:],
                                  variable.data.chunks[1:]):
                if len(chunk) != 1:
                    raise ValueError(f"Chunking in {dim} is not yet "
                                     f"supported.")

        writes = da.blockwise(fragment.write, ("row", ),
                              *args,
                              align_arrays=False,
                              adjust_chunks={"row": 1},
                              # np.bool is a deprecated alias; use the
                              # builtin bool
                              meta=np.empty((0, ), bool))

        writes = inlined_array(writes, chunk_ids)

        # Transfer any partition information over to the write dataset
        partition = ds.attrs.get(DASKMS_PARTITION_KEY, False)

        if not partition:
            attrs = None
        else:
            attrs = {
                DASKMS_PARTITION_KEY: partition,
                **{k: getattr(ds, k) for k, _ in partition}
            }

        datasets.append(Dataset({"WRITE": (("row", ), writes)},
                                attrs=attrs))

    return datasets
def xds_from_zarr(store, columns=None, chunks=None, **kwargs):
    """
    Reads the zarr data store in `store` and returns a list of
    Datasets containing the data.

    Parameters
    ----------
    store : str or Path
        Path containing the data
    columns : list of str or str or None
        Columns to read. `None` or `"ALL"` stores all columns on each dataset.
        Otherwise, a list of columns should be supplied.
    chunks : dict or list of dicts
        chunking schema for each dataset
    **kwargs : optional
        Only `storage_options` is supported and is forwarded to the
        underlying store. Any other keyword arguments are ignored
        with a warning.

    Returns
    -------
    datasets : list of Datasets
        Datasets representing the contents of the zarr store
    """
    if isinstance(store, DaskMSStore):
        pass
    elif isinstance(store, (Path, str)):
        store = DaskMSStore(f"{store}", **kwargs.pop("storage_options", {}))
    else:
        raise TypeError(f"store '{store}' must be "
                        f"Path, str or DaskMSStore")

    # If any kwargs are added, they should be popped prior to this check.
    if len(kwargs) > 0:
        warnings.warn(
            f"The following unsupported kwargs were ignored in "
            f"xds_from_zarr: {kwargs}",
            UserWarning
        )

    columns = promote_columns(columns)

    if chunks is None:
        pass
    elif isinstance(chunks, (tuple, list)):
        if not all(isinstance(v, dict) for v in chunks):
            raise TypeError("chunks must be None, a dict or a list of dicts")
    elif isinstance(chunks, dict):
        chunks = [chunks]
    else:
        raise TypeError("chunks must be None, a dict or a list of dicts")

    datasets = []
    numpy_vars = []

    # NOTE(JSKenyon): Iterating over all the zarr groups/arrays is VERY
    # expensive if the metadata has not been consolidated.
    zc.consolidate_metadata(store.map)
    table_path = store.table if store.table else "MAIN"
    table_group = zarr.open_consolidated(store.map)[table_path]

    for g, (group_name, group) in enumerate(sorted(table_group.groups(),
                                                   key=group_sortkey)):
        group_attrs = decode_attr(dict(group.attrs))
        dask_ms_attrs = group_attrs.pop(DASKMS_ATTR_KEY)
        natural_chunks = dask_ms_attrs["chunks"]
        group_chunks = {d: tuple(dc) for d, dc in natural_chunks.items()}

        if chunks:
            # Defer to user-supplied chunking strategy
            try:
                group_chunks.update(chunks[g])
            except IndexError:
                group_chunks.update(chunks[-1])  # Reuse last chunking.

        data_vars = {}
        coords = {}

        for name, zarray in column_iterator(group, columns):
            attrs = decode_attr(dict(zarray.attrs[DASKMS_ATTR_KEY]))
            dims = attrs["dims"]
            coordinate = attrs.get("coordinate", False)
            array_chunks = tuple(group_chunks.get(d, s)
                                 for d, s in zip(dims, zarray.shape))

            array_chunks = da.core.normalize_chunks(array_chunks,
                                                    zarray.shape)
            ext_args = extent_args(dims, array_chunks)
            token_name = f"read~{name}-{tokenize(zarray, *ext_args)}"

            read = da.blockwise(zarr_getter, dims,
                                zarray, None,
                                *ext_args,
                                concatenate=False,
                                name=token_name,
                                meta=np.empty((0, ) * zarray.ndim,
                                              zarray.dtype))

            read = inlined_array(read, ext_args[::2])
            var = Variable(dims, read, attrs)
            (coords if coordinate else data_vars)[name] = var

            # Save numpy arrays for reification
            typ = decode_type(attrs["array_type"])

            if typ is np.ndarray:
                numpy_vars.append(var)
            elif typ is da.Array:
                pass
            else:
                raise TypeError(f"Unknown array_type '{attrs['array_type']}'")

        datasets.append(Dataset(data_vars, coords=coords, attrs=group_attrs))

    # Reify any numpy arrays directly into their variables
    for v, a in zip(numpy_vars, dask.compute(v.data for v in numpy_vars)[0]):
        v.data = a

    return datasets
def xds_from_zarr(store, columns=None, chunks=None):
    """
    Reads the zarr data store in `store` and returns a list of
    Datasets containing the data.

    Parameters
    ----------
    store : str or Path
        Path containing the data
    columns : list of str or str or None
        Columns to read. `None` or `"ALL"` stores all columns on each dataset.
        Otherwise, a list of columns should be supplied.
    chunks : dict or list of dicts
        chunking schema for each dataset

    Returns
    -------
    datasets : list of Datasets
        Datasets representing the contents of the zarr store
    """
    store, table = store_path_split(store)

    if isinstance(store, Path):
        store = str(store)

    if not isinstance(store, str):
        raise TypeError("store must be a Path or str")

    columns = promote_columns(columns)

    if chunks is None:
        pass
    elif isinstance(chunks, (tuple, list)):
        if not all(isinstance(v, dict) for v in chunks):
            raise TypeError("chunks must be None, a dict or a list of dicts")
    elif isinstance(chunks, dict):
        chunks = [chunks]
    else:
        raise TypeError("chunks must be None, a dict or a list of dicts")

    datasets = []
    numpy_vars = []

    table_group = zarr.open(store)[table]

    for g, (group_name, group) in enumerate(sorted(table_group.groups())):
        group_attrs = decode_attr(dict(group.attrs))
        dask_ms_attrs = group_attrs.pop(DASKMS_ATTR_KEY)
        natural_chunks = dask_ms_attrs["chunks"]
        group_chunks = {d: tuple(dc) for d, dc in natural_chunks.items()}

        if chunks:
            # Defer to user-supplied chunking strategy
            try:
                group_chunks.update(chunks[g])
            except IndexError:
                pass

        data_vars = {}
        coords = {}

        for name, zarray in column_iterator(group, columns):
            attrs = decode_attr(dict(zarray.attrs[DASKMS_ATTR_KEY]))
            dims = attrs["dims"]
            coordinate = attrs.get("coordinate", False)
            array_chunks = tuple(group_chunks.get(d, s)
                                 for d, s in zip(dims, zarray.shape))

            array_chunks = da.core.normalize_chunks(array_chunks,
                                                    zarray.shape)
            ext_args = extent_args(dims, array_chunks)
            token_name = f"read~{name}-{tokenize(zarray, *ext_args)}"

            read = da.blockwise(zarr_getter, dims,
                                zarray, None,
                                *ext_args,
                                concatenate=False,
                                name=token_name,
                                meta=np.empty((0, ) * zarray.ndim,
                                              zarray.dtype))

            read = inlined_array(read, ext_args[::2])
            var = Variable(dims, read, attrs)
            (coords if coordinate else data_vars)[name] = var

            # Save numpy arrays for reification
            typ = decode_type(attrs["array_type"])

            if typ is np.ndarray:
                numpy_vars.append(var)
            elif typ is da.Array:
                pass
            else:
                raise TypeError(f"Unknown array_type '{attrs['array_type']}'")

        datasets.append(Dataset(data_vars, coords=coords, attrs=group_attrs))

    # Reify any numpy arrays directly into their variables
    for v, a in zip(numpy_vars, dask.compute(v.data for v in numpy_vars)[0]):
        v.data = a

    return datasets
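# Hypothetical usage of ``xds_from_zarr`` above; the store path, column
# names and chunking are illustrative and assume data previously written
# with ``xds_to_zarr``.
def _xds_from_zarr_usage():
    datasets = xds_from_zarr("output.zarr",
                             columns=["TIME", "DATA"],
                             chunks=[{"row": 10000}])

    for ds in datasets:
        # Variables remain lazy dask arrays until computed
        print(ds.data_vars["DATA"].data.chunks)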
def xds_to_parquet(xds, store, columns=None, **kwargs):
    if isinstance(store, DaskMSStore):
        pass
    elif isinstance(store, (str, Path)):
        store = DaskMSStore(f"{store}", **kwargs.pop("storage_options", {}))
    else:
        raise TypeError(f"store '{store}' must be "
                        f"Path, str or DaskMSStore")

    # If any kwargs are added, they should be popped prior to this check.
    if len(kwargs) > 0:
        warnings.warn(
            f"The following unsupported kwargs were ignored in "
            f"xds_to_parquet: {kwargs}",
            UserWarning
        )

    columns = promote_columns(columns)

    if isinstance(xds, Dataset):
        xds = [xds]
    elif isinstance(xds, (tuple, list)):
        if not all(isinstance(ds, Dataset) for ds in xds):
            raise TypeError("xds must be a Dataset or list of Datasets")
    else:
        raise TypeError("xds must be a Dataset or list of Datasets")

    datasets = []
    base_schema = ArrowSchema.from_datasets(xds)

    for ds_id, ds in enumerate(xds):
        arrow_schema = base_schema.with_attributes(ds)
        fragment = ParquetFragment(store, store.table, arrow_schema, ds_id)
        chunk_ids = da.arange(len(ds.chunks["row"]), chunks=1)
        args = [chunk_ids, ("row", )]

        data_var_it = column_iterator(ds.data_vars, columns)
        coord_it = column_iterator(ds.coords, columns)

        for column, variable in itertools.chain(data_var_it, coord_it):
            if not isinstance(variable.data, da.Array):
                raise ValueError(f"Column {column} does not "
                                 f"contain a dask Array")

            if len(variable.dims) == 0 or variable.dims[0] != "row":
                raise ValueError(f"Column {column} dimensions "
                                 f"{variable.dims} don't start with 'row'")

            args.extend((column, None, variable.data, variable.dims))

            for dim, chunk in zip(variable.dims[1:],
                                  variable.data.chunks[1:]):
                if len(chunk) != 1:
                    raise ValueError(f"Chunking in {dim} is not yet "
                                     f"supported.")

        writes = da.blockwise(fragment.write, ("row", ),
                              *args,
                              align_arrays=False,
                              adjust_chunks={"row": 1},
                              meta=np.empty((0, ), bool))

        writes = inlined_array(writes, chunk_ids)

        # Transfer any partition information over to the write dataset
        partition = ds.attrs.get(DASKMS_PARTITION_KEY, False)

        if not partition:
            attrs = None
        else:
            attrs = {
                DASKMS_PARTITION_KEY: partition,
                **{k: getattr(ds, k) for k, _ in partition}
            }

        datasets.append(Dataset({"WRITE": (("row", ), writes)},
                                attrs=attrs))

    return datasets
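# Hypothetical usage of ``xds_to_parquet`` above; the path is illustrative.
# Each returned dataset holds a lazy "WRITE" array, so nothing is written
# until the result is computed.
def _xds_to_parquet_usage(datasets):
    import dask

    writes = xds_to_parquet(datasets, "output.parquet")
    # Trigger the actual parquet writes
    dask.compute([w.data_vars["WRITE"].data for w in writes])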