Example #1
def test_inlined_array():
    A = da.ones((10, 10), chunks=(2, 2), dtype=np.float64)
    B = da.full((10, 10), np.float64(2), chunks=(2, 2))
    C = A + B
    E = C + 1

    D = inlined_array(C)
    assert len(C.__dask_graph__().layers) == 3
    assert D.name == C.name
    assert D.name in D.__dask_graph__().layers
    assert A.name not in D.__dask_graph__().layers
    assert B.name not in D.__dask_graph__().layers
    graph_keys = set(flatten(D.__dask_graph__().keys()))
    assert graph_keys == set(flatten(D.__dask_keys__()))
    assert_array_equal(D, C)

    D = inlined_array(C, [A, B])
    assert len(D.__dask_graph__().layers) == 1
    assert D.name == C.name
    assert D.name in D.__dask_graph__().layers
    assert A.name not in D.__dask_graph__().layers
    assert B.name not in D.__dask_graph__().layers
    graph_keys = set(flatten(D.__dask_graph__().keys()))
    assert graph_keys == set(flatten(D.__dask_keys__()))
    assert_array_equal(D, C)

    D = inlined_array(C, [A])
    assert len(D.__dask_graph__().layers) == 2
    assert D.name == C.name
    assert D.name in D.__dask_graph__().layers
    assert A.name not in D.__dask_graph__().layers
    assert B.name in D.__dask_graph__().layers
    graph_keys = set(flatten(D.__dask_graph__().keys()))
    assert graph_keys == set(flatten([a.__dask_keys__() for a in [D, B]]))
    assert_array_equal(D, C)

    D = inlined_array(C, [B])
    assert len(D.__dask_graph__().layers) == 2
    assert D.name == C.name
    assert D.name in D.__dask_graph__().layers
    assert A.name in D.__dask_graph__().layers
    assert B.name not in D.__dask_graph__().layers
    graph_keys = set(flatten(D.__dask_graph__().keys()))
    assert graph_keys == set(flatten([a.__dask_keys__() for a in [D, A]]))
    assert_array_equal(D, C)

    D = inlined_array(E, [A])
    assert len(D.__dask_graph__().layers) == 3
    assert D.name == E.name
    assert D.name in D.__dask_graph__().layers
    assert B.name in D.__dask_graph__().layers
    assert A.name not in D.__dask_graph__().layers
    assert C.name in D.__dask_graph__().layers
    graph_keys = set(flatten(D.__dask_graph__().keys()))
    assert graph_keys == set(flatten([a.__dask_keys__() for a in [D, B, C]]))
    assert_array_equal(D, E)
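
A minimal standalone sketch of the behaviour the test above exercises. The import path for inlined_array is an assumption (taken here to be daskms.optimisation) and may differ in your version of dask-ms:

import dask.array as da
import numpy as np
from daskms.optimisation import inlined_array  # assumed import path

A = da.ones((10, 10), chunks=(2, 2), dtype=np.float64)
B = da.full((10, 10), np.float64(2), chunks=(2, 2))
C = A + B

# Inlining A and B folds their graph layers into C's single layer,
# while the result keeps C's name and values.
D = inlined_array(C, [A, B])
assert len(D.__dask_graph__().layers) == 1
assert D.name == C.name
np.testing.assert_array_equal(D.compute(), C.compute())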
Example #2
def _gen_writes(variables, chunks, factory, indirect_dims=False):
    for name, var in variables.items():
        if isinstance(var.data, da.Array):
            ext_args = extent_args(var.dims, var.chunks)
            var_data = var.data
        elif isinstance(var.data, np.ndarray):
            try:
                var_chunks = tuple(chunks[d] for d in var.dims)
            except KeyError:
                var_chunks = tuple((s, ) for s in var.shape)
            ext_args = extent_args(var.dims, var_chunks)
            var_data = da.from_array(var.data,
                                     chunks=var_chunks,
                                     inline_array=True,
                                     name=False)
        else:
            raise NotImplementedError(f"Writing {type(var.data)} "
                                      f"unsupported")

        if var_data.nbytes == 0:
            continue

        token_name = (f"write~{name}-"
                      f"{tokenize(var_data, name, factory, *ext_args)}")

        write = da.blockwise(zarr_setter,
                             var.dims,
                             var_data,
                             var.dims,
                             name,
                             None,
                             factory,
                             None,
                             *ext_args,
                             adjust_chunks={d: 1
                                            for d in var.dims},
                             concatenate=False,
                             name=token_name,
                             meta=np.empty((1, ) * len(var.dims), bool))
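        # ext_args interleaves extent arrays with their dimension tuples,
        # so ext_args[::2] selects just the extent arrays for inlining.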
        write = inlined_array(write, ext_args[::2])

        # Alter the dimension names to preserve laziness on coordinates.
        dims = [f"_{d}_" for d in var.dims] if indirect_dims else var.dims

        yield name, (dims, write, var.attrs)
Example #3
def _gen_writes(variables, chunks, columns, factory):
    for name, var in column_iterator(variables, columns):
        if isinstance(var.data, da.Array):
            ext_args = extent_args(var.dims, var.chunks)
            var_data = var.data
        elif isinstance(var.data, np.ndarray):
            var_chunks = tuple(chunks[d] for d in var.dims)
            ext_args = extent_args(var.dims, var_chunks)
            var_data = da.from_array(
                var.data,
                chunks=var_chunks,
                inline_array=True,
                name=False,
            )
        else:
            raise NotImplementedError(f"Writing {type(var.data)} "
                                      f"unsupported")

        if var.data.nbytes == 0:
            continue

        token_name = (f"write~{name}-"
                      f"{tokenize(var_data, name, factory, *ext_args)}")

        write = da.blockwise(zarr_setter,
                             var.dims,
                             var_data,
                             var.dims,
                             name,
                             None,
                             factory,
                             None,
                             *ext_args,
                             adjust_chunks={d: 1
                                            for d in var.dims},
                             concatenate=False,
                             name=token_name,
                             meta=np.empty((1, ) * len(var.dims), bool))
        write = inlined_array(write, ext_args[::2])

        yield name, (var.dims, write, var.attrs)
Example #4
def _write_datasets(table, table_proxy, datasets, columns, descriptor,
                    table_keywords, column_keywords):
    _, table_name, subtable = table_path_split(table)
    table_name = '::'.join((table_name, subtable)) if subtable else table_name
    row_orders = []

    # Put table and column keywords
    table_proxy.submit(_put_keywords, WRITELOCK, table_keywords,
                       column_keywords).result()

    # Sort datasets by ("ROWID" not present, index) so that
    # datasets with ROWIDs are handled first, while
    # those without (which imply appends to the MS)
    # are handled last
    sorted_datasets = sorted(enumerate(datasets),
                             key=lambda t:
                             ("ROWID" not in t[1].data_vars, t[0]))

    # Establish row orders for each dataset
    for di, ds in sorted_datasets:
        try:
            rowid = ds.ROWID.data
        except AttributeError:
            # Add operation
            # No ROWIDs, assume they're missing from the table
            # and remaining datasets. Generate addrows
            # NOTE(sjperkins)
            # This could be somewhat brittle, but exists to
            # update empty MS subtables once they've been
            # created along with the main MS by a call to default_ms.
            # Users could also use it to append rows to an existing table.
            # An xds_append_to_table may be a better solution...
            last_datasets = datasets[di:]
            last_row_orders = add_row_order_factory(table_proxy, last_datasets)

            # We don't inline the row ordering if it is derived
            # from the row sizes of provided arrays.
            # The range of possible dependencies is far too large to inline
            row_orders.extend([(False, lro) for lro in last_row_orders])
            # We have established row orders for all datasets
            # at this point, quit the loop
            break
        else:
            # Update operation
            # Generate row orderings from existing row IDs
            row_order = rowid.map_blocks(row_run_factory,
                                         sort_dir="write",
                                         dtype=object)

            # TODO(sjperkins)
            # There's an assumption here that rowid is an
            # operation with minimal dependencies
            # (i.e. derived from xds_from_{ms, table})
            # Caching flattens the graph into a single layer
            if len(row_order.__dask_graph__().layers) > 1:
                log.warning("Caching an update row ordering "
                            "with more than one layer")

            row_order = cached_array(row_order)
            # Inline the row ordering in the graph
            row_orders.append((True, row_order))

    assert len(row_orders) == len(datasets)

    datasets = []

    for (di, ds), (inline, row_order) in zip(sorted_datasets, row_orders):
        # Hold the variables representing array writes
        write_vars = {}

        # Generate a dask array for each column
        for column in columns:
            try:
                variable = ds.data_vars[column]
            except KeyError:
                log.warning("Ignoring '%s' not present "
                            "on dataset %d" % (column, di))
                continue
            else:
                full_dims = variable.dims
                array = variable.data

            if not isinstance(array, da.Array):
                raise TypeError("%s on dataset %d is not a dask Array "
                                "but a %s" % (column, di, type(array)))

            args = [row_order, ("row", )]

            # Dimension extent arrays only need to be passed in if there
            # is more than one chunk in any of the non-row dimensions.
            # With a single chunk, putcol can write the full extent;
            # otherwise putcolslice is required
            if not all(len(c) == 1 for c in array.chunks[1:]):
                # Add extent arrays
                for d, c in zip(full_dims[1:], array.chunks[1:]):
                    args.append(dim_extents_array(d, c))
                    args.append((d, ))

            # Add other variables
            args.extend([table_proxy, None, column, None, array, full_dims])
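            # blockwise consumes args as alternating (value, index) pairs;
            # a None index marks an argument passed through as-is.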

            # Name of the dask array representing this column
            token = dask.base.tokenize(di, args)
            name = "-".join((table_name, 'write', column, token))

            write_col = da.blockwise(
                putter_wrapper,
                full_dims,
                *args,
                # All dims shrink to 1,
                # a single bool is returned
                adjust_chunks={d: 1
                               for d in full_dims},
                name=name,
                align_arrays=False,
                dtype=bool)

            if inline:
                write_col = inlined_array(write_col, [row_order])

            write_vars[column] = (full_dims, write_col)

        # Append a dataset with the write operations
        datasets.append(Dataset(write_vars))

    # Return an empty dataset
    if len(datasets) == 0:
        return Dataset({})
    # Return singleton
    elif len(datasets) == 1:
        return datasets[0]

    return datasets
Example #5
def _dataset_variable_factory(table_proxy, table_schema, select_cols,
                              exemplar_row, orders, chunks, array_prefix):
    """
    Returns a dictionary of dask arrays representing
    a series of getcols on the appropriate table.

    Produces variables for inclusion in a Dataset.

    Parameters
    ----------
    table_proxy : :class:`daskms.table_proxy.TableProxy`
        Table proxy object
    table_schema : dict
        Table schema
    select_cols : list of strings
        List of columns to return
    exemplar_row : int
        Row ID optionally used to extract an exemplar array from which
        the column shape and dtype attributes are determined
    orders : tuple of :class:`dask.array.Array`
        A (sorted_rows, row_runs) tuple, specifying the
        appropriate rows to extract from the table.
    chunks : dict
        Chunking strategy for the dataset.
    array_prefix : str
        dask array string prefix

    Returns
    -------
    dict
        A dictionary looking like :code:`{column: (dims, array)}`.
    """

    sorted_rows, row_runs = orders
    dataset_vars = {"ROWID": (("row", ), sorted_rows)}

    for column in select_cols:
        try:
            meta = column_metadata(column, table_proxy, table_schema, chunks,
                                   exemplar_row)
        except ColumnMetadataError as e:
            exc_info = logging.DEBUG >= log.getEffectiveLevel()
            log.warning("Ignoring '%s': %s", column, e, exc_info=exc_info)
            continue

        full_dims = ("row", ) + meta.dims
        args = [row_runs, ("row", )]

        # Dimension extent arrays only need to be passed in if there
        # is more than one chunk in any of the non-row dimensions.
        # With a single chunk, getcol can read the full extent;
        # otherwise getcolslice is required
        if not all(len(c) == 1 for c in meta.chunks):
            for d, c in zip(meta.dims, meta.chunks):
                # Create an array describing the dimension chunk extents
                args.append(dim_extents_array(d, c))
                args.append((d, ))

            new_axes = {}
        else:
            # We need to inform blockwise about the size of our
            # new dimensions as no arrays with them are supplied
            new_axes = {d: s for d, s in zip(meta.dims, meta.shape)}

        # Add other variables
        args.extend([
            table_proxy, None, column, None, meta.shape, None, meta.dtype, None
        ])

        # Name of the dask array representing this column
        token = dask.base.tokenize(args)
        name = "-".join((array_prefix, column, token))

        # Construct the array
        dask_array = da.blockwise(getter_wrapper,
                                  full_dims,
                                  *args,
                                  name=name,
                                  new_axes=new_axes,
                                  dtype=meta.dtype)

        dask_array = inlined_array(dask_array)

        # Assign into variable and dimension dataset
        dataset_vars[column] = (full_dims, dask_array)

    return dataset_vars
Example #6
def xds_to_parquet(xds, path, columns=None):
    path, table = store_path_split(path)

    if not isinstance(path, Path):
        path = Path(path)

    columns = promote_columns(columns)

    if isinstance(xds, Dataset):
        xds = [xds]
    elif isinstance(xds, (tuple, list)):
        if not all(isinstance(ds, Dataset) for ds in xds):
            raise TypeError("xds must be a Dataset or list of Datasets")
    else:
        raise TypeError("xds must be a Dataset or list of Datasets")

    datasets = []
    base_schema = ArrowSchema.from_datasets(xds)

    for ds_id, ds in enumerate(xds):
        arrow_schema = base_schema.with_attributes(ds)
        fragment = ParquetFragment(path / table, arrow_schema, ds_id)
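        # One chunk id per row chunk (chunks=1), so blockwise hands each
        # output block the id of the row chunk it writes.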
        chunk_ids = da.arange(len(ds.chunks["row"]), chunks=1)
        args = [chunk_ids, ("row", )]

        data_var_it = column_iterator(ds.data_vars, columns)
        coord_it = column_iterator(ds.coords, columns)

        for column, variable in itertools.chain(data_var_it, coord_it):
            if not isinstance(variable.data, da.Array):
                raise ValueError(f"Column {column} does not "
                                 f"contain a dask Array")

            if len(variable.dims) == 0 or variable.dims[0] != "row":
                raise ValueError(f"Column {column} dimensions "
                                 f"{variable.dims} don't start with 'row'")

            args.extend((column, None, variable.data, variable.dims))

            for dim, chunk in zip(variable.dims[1:], variable.data.chunks[1:]):
                if len(chunk) != 1:
                    raise ValueError(f"Chunking in {dim} is not yet "
                                     f"supported.")

        writes = da.blockwise(fragment.write, ("row", ),
                              *args,
                              align_arrays=False,
                              adjust_chunks={"row": 1},
                              meta=np.empty((0, ), bool))

        writes = inlined_array(writes, chunk_ids)

        # Transfer any partition information over to the write dataset
        partition = ds.attrs.get(DASKMS_PARTITION_KEY, False)

        if not partition:
            attrs = None
        else:
            attrs = {
                DASKMS_PARTITION_KEY: partition,
                **{k: getattr(ds, k)
                   for k, _ in partition}
            }

        datasets.append(Dataset({"WRITE": (("row", ), writes)}, attrs=attrs))

    return datasets
Example #7
def xds_from_zarr(store, columns=None, chunks=None, **kwargs):
    """
    Reads the zarr data store in `store` and returns a list of
    Datasets containing the data.

    Parameters
    ----------
    store : str or Path
        Path containing the data
    columns : list of str or str or None
        Columns to read. `None` or `"ALL"` reads all columns on each dataset.
        Otherwise, a list of columns should be supplied.
    chunks : dict or list of dicts
        Chunking schema for each dataset
    **kwargs : optional
        Extra keyword arguments. `storage_options` is passed to the
        DaskMSStore constructor; any remaining kwargs are ignored
        with a warning.

    Returns
    -------
    datasets : list of Datasets
        Datasets containing the data in the zarr store
    """

    if isinstance(store, DaskMSStore):
        pass
    elif isinstance(store, (Path, str)):
        store = DaskMSStore(f"{store}", **kwargs.pop("storage_options", {}))
    else:
        raise TypeError(f"store '{store}' must be "
                        f"Path, str or DaskMSStore")

    # If any kwargs are added, they should be popped prior to this check.
    if len(kwargs) > 0:
        warnings.warn(
            f"The following unsupported kwargs were ignored in "
            f"xds_from_zarr: {kwargs}", UserWarning)

    columns = promote_columns(columns)

    if chunks is None:
        pass
    elif isinstance(chunks, (tuple, list)):
        if not all(isinstance(v, dict) for v in chunks):
            raise TypeError("chunks must be None, a dict or a list of dicts")
    elif isinstance(chunks, dict):
        chunks = [chunks]
    else:
        raise TypeError("chunks must be None, a dict or a list of dicts")

    datasets = []
    numpy_vars = []

    # NOTE(JSKenyon): Iterating over all the zarr groups/arrays is VERY
    # expensive if the metadata has not been consolidated.
    zc.consolidate_metadata(store.map)
    table_path = store.table if store.table else "MAIN"
    table_group = zarr.open_consolidated(store.map)[table_path]

    for g, (group_name,
            group) in enumerate(sorted(table_group.groups(),
                                       key=group_sortkey)):
        group_attrs = decode_attr(dict(group.attrs))
        dask_ms_attrs = group_attrs.pop(DASKMS_ATTR_KEY)
        natural_chunks = dask_ms_attrs["chunks"]
        group_chunks = {d: tuple(dc) for d, dc in natural_chunks.items()}

        if chunks:
            # Defer to user-supplied chunking strategy
            try:
                group_chunks.update(chunks[g])
            except IndexError:
                group_chunks.update(chunks[-1])  # Reuse last chunking.

        data_vars = {}
        coords = {}

        for name, zarray in column_iterator(group, columns):
            attrs = decode_attr(dict(zarray.attrs[DASKMS_ATTR_KEY]))
            dims = attrs["dims"]
            coordinate = attrs.get("coordinate", False)
            array_chunks = tuple(
                group_chunks.get(d, s) for d, s in zip(dims, zarray.shape))

            array_chunks = da.core.normalize_chunks(array_chunks, zarray.shape)
            ext_args = extent_args(dims, array_chunks)
            token_name = f"read~{name}-{tokenize(zarray, *ext_args)}"

            read = da.blockwise(zarr_getter,
                                dims,
                                zarray,
                                None,
                                *ext_args,
                                concatenate=False,
                                name=token_name,
                                meta=np.empty((0, ) * zarray.ndim,
                                              zarray.dtype))

            read = inlined_array(read, ext_args[::2])
            var = Variable(dims, read, attrs)
            (coords if coordinate else data_vars)[name] = var

            # Save numpy arrays for reification
            typ = decode_type(attrs["array_type"])

            if typ is np.ndarray:
                numpy_vars.append(var)
            elif typ is da.Array:
                pass
            else:
                raise TypeError(f"Unknown array_type '{attrs['array_type']}'")

        datasets.append(Dataset(data_vars, coords=coords, attrs=group_attrs))

    # Reify any numpy arrays directly into their variables
    for v, a in zip(numpy_vars, dask.compute(v.data for v in numpy_vars)[0]):
        v.data = a

    return datasets
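
A hypothetical usage sketch for the function above; the store path, column names and import path are illustrative assumptions rather than part of the source:

from daskms.experimental.zarr import xds_from_zarr  # assumed import path

# Read two columns from an existing dask-ms zarr store, overriding the row
# chunking of the first dataset (later datasets reuse the last chunks dict).
datasets = xds_from_zarr("observation.zarr",
                         columns=["TIME", "DATA"],
                         chunks=[{"row": 10000}])

for ds in datasets:
    print(ds.chunks["row"], list(ds.data_vars))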
Example #8
def xds_from_zarr(store, columns=None, chunks=None):
    """
    Reads the zarr data store in `store` and returns a list of
    Datasets containing the data.

    Parameters
    ----------
    store : str or Path
        Path containing the data
    columns : list of str or str or None
        Columns to read. `None` or `"ALL"` reads all columns on each dataset.
        Otherwise, a list of columns should be supplied.
    chunks : dict or list of dicts
        Chunking schema for each dataset

    Returns
    -------
    datasets : list of Datasets
        Datasets containing the data in the zarr store
    """
    store, table = store_path_split(store)

    if isinstance(store, Path):
        store = str(store)

    if not isinstance(store, str):
        raise TypeError("store must be a Path, str")

    columns = promote_columns(columns)

    if chunks is None:
        pass
    elif isinstance(chunks, (tuple, list)):
        if not all(isinstance(v, dict) for v in chunks):
            raise TypeError("chunks must be None, a dict or a list of dicts")
    elif isinstance(chunks, dict):
        chunks = [chunks]
    else:
        raise TypeError("chunks must be None, a dict or a list of dicts")

    datasets = []
    numpy_vars = []

    table_group = zarr.open(store)[table]

    for g, (group_name, group) in enumerate(sorted(table_group.groups())):
        group_attrs = decode_attr(dict(group.attrs))
        dask_ms_attrs = group_attrs.pop(DASKMS_ATTR_KEY)
        natural_chunks = dask_ms_attrs["chunks"]
        group_chunks = {d: tuple(dc) for d, dc in natural_chunks.items()}

        if chunks:
            # Defer to user-supplied chunking strategy
            try:
                group_chunks.update(chunks[g])
            except IndexError:
                pass

        data_vars = {}
        coords = {}

        for name, zarray in column_iterator(group, columns):
            attrs = decode_attr(dict(zarray.attrs[DASKMS_ATTR_KEY]))
            dims = attrs["dims"]
            coordinate = attrs.get("coordinate", False)
            array_chunks = tuple(
                group_chunks.get(d, s) for d, s in zip(dims, zarray.shape))

            array_chunks = da.core.normalize_chunks(array_chunks, zarray.shape)
            ext_args = extent_args(dims, array_chunks)
            token_name = f"read~{name}-{tokenize(zarray, *ext_args)}"

            read = da.blockwise(zarr_getter,
                                dims,
                                zarray,
                                None,
                                *ext_args,
                                concatenate=False,
                                name=token_name,
                                meta=np.empty((0, ) * zarray.ndim,
                                              zarray.dtype))

            read = inlined_array(read, ext_args[::2])
            var = Variable(dims, read, attrs)
            (coords if coordinate else data_vars)[name] = var

            # Save numpy arrays for reification
            typ = decode_type(attrs["array_type"])

            if typ is np.ndarray:
                numpy_vars.append(var)
            elif typ is da.Array:
                pass
            else:
                raise TypeError(f"Unknown array_type '{attrs['array_type']}'")

        datasets.append(Dataset(data_vars, coords=coords, attrs=group_attrs))

    # Reify any numpy arrays directly into their variables
    for v, a in zip(numpy_vars, dask.compute(v.data for v in numpy_vars)[0]):
        v.data = a

    return datasets
Example #9
def xds_to_parquet(xds, store, columns=None, **kwargs):
    if isinstance(store, DaskMSStore):
        pass
    elif isinstance(store, (str, Path)):
        store = DaskMSStore(f"{store}", **kwargs.pop("storage_options", {}))
    else:
        raise TypeError(f"store '{store}' must be "
                        f"Path, str or DaskMSStore")

    # If any kwargs are added, they should be popped prior to this check.
    if len(kwargs) > 0:
        warnings.warn(
            f"The following unsupported kwargs were ignored in "
            f"xds_to_parquet: {kwargs}", UserWarning)

    columns = promote_columns(columns)

    if isinstance(xds, Dataset):
        xds = [xds]
    elif isinstance(xds, (tuple, list)):
        if not all(isinstance(ds, Dataset) for ds in xds):
            raise TypeError("xds must be a Dataset or list of Datasets")
    else:
        raise TypeError("xds must be a Dataset or list of Datasets")

    datasets = []
    base_schema = ArrowSchema.from_datasets(xds)

    for ds_id, ds in enumerate(xds):
        arrow_schema = base_schema.with_attributes(ds)
        fragment = ParquetFragment(store, store.table, arrow_schema, ds_id)
        chunk_ids = da.arange(len(ds.chunks["row"]), chunks=1)
        args = [chunk_ids, ("row", )]

        data_var_it = column_iterator(ds.data_vars, columns)
        coord_it = column_iterator(ds.coords, columns)

        for column, variable in itertools.chain(data_var_it, coord_it):
            if not isinstance(variable.data, da.Array):
                raise ValueError(f"Column {column} does not "
                                 f"contain a dask Array")

            if len(variable.dims) == 0 or variable.dims[0] != "row":
                raise ValueError(f"Column {column} dimensions "
                                 f"{variable.dims} don't start with 'row'")

            args.extend((column, None, variable.data, variable.dims))

            for dim, chunk in zip(variable.dims[1:], variable.data.chunks[1:]):
                if len(chunk) != 1:
                    raise ValueError(f"Chunking in {dim} is not yet "
                                     f"supported.")

        writes = da.blockwise(fragment.write, ("row", ),
                              *args,
                              align_arrays=False,
                              adjust_chunks={"row": 1},
                              meta=np.empty((0, ), bool))

        writes = inlined_array(writes, chunk_ids)

        # Transfer any partition information over to the write dataset
        partition = ds.attrs.get(DASKMS_PARTITION_KEY, False)

        if not partition:
            attrs = None
        else:
            attrs = {
                DASKMS_PARTITION_KEY: partition,
                **{k: getattr(ds, k)
                   for k, _ in partition}
            }

        datasets.append(Dataset({"WRITE": (("row", ), writes)}, attrs=attrs))

    return datasets
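
A hypothetical end-to-end sketch for the function above; the Dataset construction, output path and import paths are assumptions for illustration only:

import dask
import dask.array as da
from daskms.dataset import Dataset                     # assumed import path
from daskms.experimental.arrow import xds_to_parquet   # assumed import path

# Build a small dataset; non-row dimensions must consist of a single chunk.
data = da.zeros((100, 4), chunks=(50, 4))
ds = Dataset({"DATA": (("row", "chan"), data)})

# xds_to_parquet returns lazy write datasets; computing their WRITE arrays
# performs the actual parquet writes.
writes = xds_to_parquet([ds], "example_output.parquet")
dask.compute([w.WRITE.data for w in writes])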