Code Example #1
File: test_dataset.py Project: smasoka/dask-ms
def test_dataset_create_table(tmp_path, dataset_chunks, dtype):
    datasets = []
    names = []
    datas = []
    row_sum = 0

    for chunks in dataset_chunks:
        shapes = {k: sum(c) for k, c in chunks.items()}
        row_sum += shapes['row']

        # Make some visibilities
        dims = ("row", "chan", "corr")
        shape = tuple(shapes[d] for d in dims)
        data_chunks = tuple(chunks[d] for d in dims)
        data = da.random.random(shape, chunks=data_chunks).astype(dtype)
        data_var = Variable(dims, data, {})

        # Make some string names
        dims = ("row", )
        shape = tuple(shapes[d] for d in dims)
        str_chunks = tuple(chunks[d] for d in dims)
        np_str_array = np.asarray(["BOB"] * shape[0], dtype=object)
        da_str_array = da.from_array(np_str_array, chunks=str_chunks)
        str_array_var = Variable(dims, da_str_array, {})

        datasets.append(Dataset({"DATA": data_var, "NAMES": str_array_var}))
        datas.append(data)
        names.extend(np_str_array.tolist())

    freq = da.linspace(.856e9, 2 * .856e9, 64, chunks=16)
    sub_datasets = [Dataset({"FREQ": (("row", "chan"), freq[None, :])})]

    # Write the data to new tables
    table_name = os.path.join(str(tmp_path), 'test.table')
    writes = write_datasets(table_name, datasets, ["DATA", "NAMES"])
    subt_writes = write_datasets(table_name + "::SPW", sub_datasets, ["FREQ"])
    dask.compute(writes, subt_writes)

    # Check written data
    with pt.table(table_name, readonly=True, lockoptions='auto',
                  ack=False) as T:
        assert row_sum == T.nrows()
        assert_array_equal(T.getcol("DATA"), np.concatenate(datas))
        assert_array_equal(T.getcol("NAMES"), names)

    # Sub-table correctly linked and populated
    with pt.table(table_name + "::SPW",
                  readonly=True,
                  lockoptions='auto',
                  ack=False) as T:
        assert T.nrows() == 1
        assert_array_equal(T.getcol("FREQ")[0], freq)
Code Example #2
File: test_dataset.py Project: ska-sa/dask-ms
def test_write_dict_data(tmp_path, chunks, dtype):
    rs = np.random.RandomState(42)
    row_sum = 0

    def _vis_factory(chan, corr):
        # Variably sized-channels per row, as in BDA data
        nchan = rs.randint(chan)
        return (rs.normal(size=(1, nchan, corr)) +
                rs.normal(size=(1, nchan, corr))*1j)

    shapes = {k: sum(c) for k, c in chunks.items()}
    row_sum += shapes['row']

    # assert len(chunks['chan']) == 1
    assert len(chunks['corr']) == 1

    # Make some visibilities
    dims = ("row", "chan", "corr")
    row, chan, corr = (shapes[d] for d in dims)
    name = "vis-data-" + uuid.uuid4().hex

    nchunks = (len(chunks[d]) for d in dims)
    keys = product((name,), *(range(c) for c in nchunks))
    chunk_sizes = product(*(chunks[d] for d in dims))

    layer = {k: {'r%d' % (i + 1): _vis_factory(chan, corr)
                 for i in range(r)}
             for k, (r, _, _) in zip(keys, chunk_sizes)}

    hlg = HighLevelGraph.from_collections(name, layer, [])
    chunks = tuple(chunks[d] for d in dims)
    meta = np.empty((0,)*len(chunks), dtype=np.complex128)
    vis = da.Array(hlg, name, chunks, meta=meta)
    ds = Dataset({"DATA": (dims, vis)})

    table_name = os.path.join(str(tmp_path), 'test.table')
    writes, table_proxy = write_datasets(table_name, ds, ["DATA"],
                                         table_proxy=True,
                                         # No fixed shape columns
                                         descriptor="ms(False)")

    dask.compute(writes)

    data = table_proxy.getvarcol("DATA").result()

    # First row chunk
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r1'], data['r1'])
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r2'], data['r2'])
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r3'], data['r3'])
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r4'], data['r4'])
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r5'], data['r5'])

    # Second row chunk
    assert_array_almost_equal(layer[(name, 1, 0, 0)]['r1'], data['r6'])
    assert_array_almost_equal(layer[(name, 1, 0, 0)]['r2'], data['r7'])
    assert_array_almost_equal(layer[(name, 1, 0, 0)]['r3'], data['r8'])

    # Third row chunk
    assert_array_almost_equal(layer[(name, 2, 0, 0)]['r1'], data['r9'])
    assert_array_almost_equal(layer[(name, 2, 0, 0)]['r2'], data['r10'])
Code Example #3
def test_xds_to_zarr_coords(tmp_path_factory):
    zarr_store = tmp_path_factory.mktemp("zarr_coords") / "test.zarr"

    data = da.ones((100, 16, 4), chunks=(10, 4, 1), dtype=np.complex64)
    rowid = da.arange(100, chunks=10)

    data_vars = {"DATA": (("row", "chan", "corr"), data)}
    coords = {
        "ROWID": (("row",), rowid),
        "chan": (("chan",), np.arange(16)),
        "foo": (("foo",), np.arange(4)),
    }

    ds = [Dataset(data_vars, coords=coords)]

    writes = xds_to_zarr(ds, zarr_store)
    dask.compute(writes)

    rds = xds_from_zarr(zarr_store)
    assert len(ds) == len(rds)

    for ods, nds in zip(ds, rds):
        for c, v in ods.data_vars.items():
            assert_array_equal(v.data, getattr(nds, c).data)

        for c, v in ods.coords.items():
            assert_array_equal(v.data, getattr(nds, c).data)
Code Example #4
def test_basic_roundtrip(tmp_path):

    path = tmp_path / "test.zarr"

    # We need >10 datasets to be sure roundtripping is consistent.
    xdsl = [Dataset({'x': (('y',), da.ones(i))}) for i in range(1, 12)]
    dask.compute(xds_to_zarr(xdsl, path))

    xdsl = xds_from_zarr(path)
    dask.compute(xds_to_zarr(xdsl, path))
Code Example #5
File: reads.py Project: ska-sa/dask-ms
    def _group_datasets(self, table_proxy, groups, exemplar_rows, orders):
        _, t, s = table_path_split(self.canonical_name)
        short_table_name = '/'.join((t, s)) if s else t
        table_schema = self._table_schema()

        datasets = []
        group_ids = list(zip(*groups))

        assert len(group_ids) == len(orders)

        # Select columns, excluding grouping columns
        select_cols = set(self.select_cols or table_proxy.colnames().result())
        select_cols -= set(self.group_cols)

        # Create a dataset for each group
        it = enumerate(zip(group_ids, exemplar_rows, orders))

        for g, (group_id, exemplar_row, order) in it:
            # Extract group chunks
            try:
                group_chunks = self.chunks[g]  # Get group chunking strategy
            except IndexError:
                group_chunks = self.chunks[-1]  # Re-use last group's chunks

            # Suffix identifying this dataset's arrays
            gid_str = ",".join(str(gid) for gid in group_id)
            array_suffix = f"[{gid_str}]-{short_table_name}"

            # Create dataset variables
            group_var_dims = _dataset_variable_factory(
                table_proxy, table_schema, select_cols, exemplar_row, order,
                group_chunks, array_suffix)

            # Extract ROWID
            try:
                rowid = group_var_dims.pop("ROWID")
            except KeyError:
                coords = None
            else:
                coords = {"ROWID": rowid}

            # Assign values for the dataset's grouping columns
            # as attributes
            partitions = tuple(
                (c, g.dtype.name) for c, g in zip(self.group_cols, group_id))
            attrs = {DASKMS_PARTITION_KEY: partitions}

            # Use python types which are json serializable
            group_id = [gid.item() for gid in group_id]
            attrs.update(zip(self.group_cols, group_id))

            datasets.append(Dataset(group_var_dims, attrs=attrs,
                                    coords=coords))

        return datasets
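For reference, the partition attributes assembled above have a simple structure: DASKMS_PARTITION_KEY maps to a tuple of (column, dtype name) pairs for the grouping columns, and each grouping column's value is stored as a plain, JSON-serialisable Python scalar. A minimal, self-contained sketch of that structure, using a stand-in constant and hypothetical FIELD_ID/DATA_DESC_ID grouping columns:

import numpy as np

# Stand-in for the DASKMS_PARTITION_KEY constant imported in reads.py
PARTITION_KEY = "__partition_key__"

group_cols = ["FIELD_ID", "DATA_DESC_ID"]   # hypothetical grouping columns
group_id = (np.int32(0), np.int32(1))       # exemplar group values

partitions = tuple((c, g.dtype.name) for c, g in zip(group_cols, group_id))
attrs = {PARTITION_KEY: partitions}
attrs.update(zip(group_cols, (g.item() for g in group_id)))

assert attrs[PARTITION_KEY] == (("FIELD_ID", "int32"), ("DATA_DESC_ID", "int32"))
assert attrs["FIELD_ID"] == 0 and attrs["DATA_DESC_ID"] == 1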
Code Example #6
    def _group_datasets(self, groups, exemplar_rows, orders):
        _, t, s = table_path_split(self.canonical_name)
        short_table_name = '/'.join((t, s)) if s else t
        table_proxy = self._table_proxy()
        table_schema = self._table_schema()

        datasets = []
        group_ids = list(zip(*groups))

        assert len(group_ids) == len(orders)

        # Select columns, excluding grouping columns
        select_cols = set(self.select_cols or table_proxy.colnames().result())
        select_cols -= set(self.group_cols)

        # Create a dataset for each group
        it = enumerate(zip(group_ids, exemplar_rows, orders))

        for g, (group_id, exemplar_row, order) in it:
            # Extract group chunks
            try:
                group_chunks = self.chunks[g]  # Get group chunking strategy
            except IndexError:
                group_chunks = self.chunks[-1]  # Re-use last group's chunks

            # Prefix for this dataset's arrays
            gid_str = ",".join(str(gid) for gid in group_id)
            array_prefix = "%s-[%s]" % (short_table_name, gid_str)

            # Create dataset variables
            group_var_dims = _dataset_variable_factory(
                table_proxy, table_schema, select_cols, exemplar_row, order,
                group_chunks, array_prefix)

            # Extract ROWID
            try:
                rowid = group_var_dims.pop("ROWID")
            except KeyError:
                coords = None
            else:
                coords = {"ROWID": rowid}

            # Assign values for the dataset's grouping columns
            # as attributes
            attrs = dict(zip(self.group_cols, group_id))

            datasets.append(Dataset(group_var_dims, attrs=attrs,
                                    coords=coords))

        return datasets
Code Example #7
def test_zarr_string_array(tmp_path_factory):
    zarr_store = tmp_path_factory.mktemp("string-arrays") / "test.zarr"

    data = ["hello", "this", "strange new world",
            "full of", "interesting", "stuff"]
    data = np.array(data, dtype=object).reshape(3, 2)
    data = da.from_array(data, chunks=((2, 1), (1, 1)))

    datasets = [Dataset({"DATA": (("x", "y"), data)})]
    writes = xds_to_zarr(datasets, zarr_store)
    dask.compute(writes)

    new_datasets = xds_from_zarr(zarr_store)

    assert len(new_datasets) == len(datasets)

    for nds, ds in zip(new_datasets, datasets):
        assert_array_equal(nds.DATA.data, ds.DATA.data)
Code Example #8
File: reads.py Project: gitter-badger/dask-ms
    def _single_dataset(self, table_proxy, orders, exemplar_row=0):
        _, t, s = table_path_split(self.canonical_name)
        short_table_name = "/".join((t, s)) if s else t

        table_schema = self._table_schema()
        select_cols = set(self.select_cols or table_proxy.colnames().result())
        variables = _dataset_variable_factory(table_proxy, table_schema,
                                              select_cols, exemplar_row,
                                              orders, self.chunks[0],
                                              short_table_name)

        try:
            rowid = variables.pop("ROWID")
        except KeyError:
            coords = None
        else:
            coords = {"ROWID": rowid}

        return Dataset(variables, coords=coords)
Code Example #9
File: test_parquet.py Project: ska-sa/dask-ms
def test_xds_to_parquet_string(tmp_path_factory):
    store = tmp_path_factory.mktemp("parquet_store") / "string-dataset.parquet"

    datasets = []

    for i in range(3):
        names = random.choices([f"foo-{i}", f"bar-{i}", f"qux-{i}"], k=10)
        names = np.asarray(names, dtype=object)
        chunks = sorted([1, 2, 3, 4], key=lambda *a: random.random())
        names = da.from_array(names, chunks=chunks)
        datasets.append(Dataset({"NAME": (("row", ), names)}))

    writes = xds_to_parquet(datasets, store)
    dask.compute(writes)

    parquet_datasets = xds_from_parquet(store)
    assert len(datasets) == len(parquet_datasets)

    for ds, pq_ds in zip(datasets, parquet_datasets):
        assert_array_equal(ds.NAME.data, pq_ds.NAME.data)
Code Example #10
File: test_dataset.py Project: smasoka/dask-ms
def test_dataset_multidim_string_column(tmp_path, chunks):
    row = sum(chunks['row'])

    name_list = [["X-%d" % i, "Y-%d" % i, "Z-%d" % i] for i in range(row)]
    np_names = np.array(name_list, dtype=object)
    names = da.from_array(np_names, chunks=(chunks['row'], np_names.shape[1]))

    ds = Dataset({"POLARIZATION_TYPE": (("row", "xyz"), names)})
    table_name = str(tmp_path / "test.table")
    writes = write_datasets(table_name, ds, ["POLARIZATION_TYPE"])
    dask.compute(writes)

    del writes
    assert_liveness(0, 0)

    datasets = read_datasets(table_name, [], [], [],
                             chunks={'row': chunks['row']})
    assert len(datasets) == 1
    assert_array_equal(datasets[0].POLARIZATION_TYPE.data, np_names)
    del datasets
    assert_liveness(0, 0)
Code Example #11
def xds_to_parquet(xds, store, columns=None, **kwargs):
    if isinstance(store, DaskMSStore):
        pass
    elif isinstance(store, (str, Path)):
        store = DaskMSStore(f"{store}", **kwargs.pop("storage_options", {}))
    else:
        raise TypeError(f"store '{store}' must be "
                        f"Path, str or DaskMSStore")

    # If any kwargs are added, they should be popped prior to this check.
    if len(kwargs) > 0:
        warnings.warn(
            f"The following unsupported kwargs were ignored in "
            f"xds_to_parquet: {kwargs}", UserWarning)

    columns = promote_columns(columns)

    if isinstance(xds, Dataset):
        xds = [xds]
    elif isinstance(xds, (tuple, list)):
        if not all(isinstance(ds, Dataset) for ds in xds):
            raise TypeError("xds must be a Dataset or list of Datasets")
    else:
        raise TypeError("xds must be a Dataset or list of Datasets")

    datasets = []
    base_schema = ArrowSchema.from_datasets(xds)

    for ds_id, ds in enumerate(xds):
        arrow_schema = base_schema.with_attributes(ds)
        fragment = ParquetFragment(store, store.table, arrow_schema, ds_id)
        chunk_ids = da.arange(len(ds.chunks["row"]), chunks=1)
        args = [chunk_ids, ("row", )]

        data_var_it = column_iterator(ds.data_vars, columns)
        coord_it = column_iterator(ds.coords, columns)

        for column, variable in itertools.chain(data_var_it, coord_it):
            if not isinstance(variable.data, da.Array):
                raise ValueError(f"Column {column} does not "
                                 f"contain a dask Array")

            if len(variable.dims) == 0 or variable.dims[0] != "row":
                raise ValueError(f"Column {column} dimensions "
                                 f"{variable.dims} don't start with 'row'")

            args.extend((column, None, variable.data, variable.dims))

            for dim, chunk in zip(variable.dims[1:], variable.data.chunks[1:]):
                if len(chunk) != 1:
                    raise ValueError(f"Chunking in {dim} is not yet "
                                     f"supported.")

        writes = da.blockwise(fragment.write, ("row", ),
                              *args,
                              align_arrays=False,
                              adjust_chunks={"row": 1},
                              meta=np.empty((0, ), bool))

        writes = inlined_array(writes, chunk_ids)

        # Transfer any partition information over to the write dataset
        partition = ds.attrs.get(DASKMS_PARTITION_KEY, False)

        if not partition:
            attrs = None
        else:
            attrs = {
                DASKMS_PARTITION_KEY: partition,
                **{k: getattr(ds, k)
                   for k, _ in partition}
            }

        datasets.append(Dataset({"WRITE": (("row", ), writes)}, attrs=attrs))

    return datasets
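As the checks above imply, every column passed to xds_to_parquet must be a dask array whose leading dimension is "row", and any non-row dimensions may only contain a single chunk. A hedged usage sketch under those constraints (the store path is hypothetical and the import location is assumed to match dask-ms's experimental arrow module):

import dask
import dask.array as da
import numpy as np
from daskms import Dataset
from daskms.experimental.arrow import xds_to_parquet  # assumed entry point

# "row" leads the dimensions and "chan"/"corr" each have a single chunk
data = da.zeros((100, 16, 4), chunks=(25, 16, 4), dtype=np.complex64)
ds = Dataset({"DATA": (("row", "chan", "corr"), data)})

writes = xds_to_parquet([ds], "output.parquet")
dask.compute(writes)  # writes only happen at compute time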
Code Example #12
File: dask_ms.py Project: smasoka/dask-ms
def xds_to_table(xds,
                 table_name,
                 columns,
                 descriptor=None,
                 table_keywords=None,
                 column_keywords=None,
                 table_proxy=False):
    """
    Generates a list of Datasets representing write operations from the
    specified arrays in :class:`xarray.Dataset`'s into
    the CASA table columns specified by ``table_name`` and ``columns``.
    This is a lazy operation -- it is only executed when :meth:`dask.compute`
    or :meth:`xarray.Dataset.compute` is called.

    Parameters
    ----------
    xds : :class:`xarray.Dataset` or list of :class:`xarray.Dataset`
        dataset(s) containing the specified columns. If a list of datasets
        is provided, the concatenation of the columns in
        sequential datasets will be written.

    table_name : str
        CASA table path

    columns : tuple or list or "ALL"
        list of column names to write to the table.

        "ALL" is a special marker which specifies that all columns
        should be written. If you wish to write an "ALL" array to
        a column, use :code:`columns=['ALL']`

    descriptor : None or \
        :class:`~daskms.descriptors.builder.AbstractBuilderFactory` or \
        str

        A class describing how CASA table descriptors and data managers
        are constructed. Some defaults are available, such
        as `ms` and `ms_subtable`.

        If None, defaults are used.

    table_keywords : dict, optional
        Dictionary of table keywords to add to existing keywords.
        The operation is performed immediately, not lazily.

    column_keywords : dict, optional
        Dictionary of :code:`{column: keywords}` to add to existing
        column keywords.
        The operation is performed immediately, not lazily.

    table_proxy : {False, True}
        If True, also returns the table proxy

    Returns
    -------
    write_datasets : list of :class:`xarray.Dataset`
        Datasets containing arrays representing write operations
        into a CASA Table
    table_proxy : :class:`daskms.TableProxy`, optional
        The Table Proxy associated with the datasets
    """

    # Promote dataset to a list
    if not isinstance(xds, (tuple, list)):
        xds = [xds]

    if not isinstance(columns, (tuple, list)):
        if columns != "ALL":
            columns = [columns]

    datasets = []

    # No xarray available, assume dask datasets
    if xr is None:
        datasets = xds
    else:
        for ds in xds:
            if isinstance(ds, Dataset):
                # Already a dask dataset
                datasets.append(ds)
            elif isinstance(ds, xr.Dataset):
                # Produce a list of internal variable and dataset types
                # from the xarray Dataset
                variables = {
                    k: (v.dims, v.data, v.attrs)
                    for k, v in ds.data_vars.items()
                }

                coords = {
                    k: (v.dims, v.data, v.attrs)
                    for k, v in ds.coords.items()
                }

                dds = Dataset(variables, attrs=ds.attrs, coords=coords)
                datasets.append(dds)
            else:
                raise TypeError("Invalid Dataset type '%s'" % type(ds))

    # Write the datasets
    out_ds = write_datasets(table_name,
                            datasets,
                            columns,
                            descriptor=descriptor,
                            table_keywords=table_keywords,
                            column_keywords=column_keywords,
                            table_proxy=table_proxy)

    # No xarray available, assume dask datasets
    if xr is None:
        return out_ds

    # Unpack table proxy if it was requested
    if table_proxy is True:
        assert isinstance(out_ds, tuple)
        out_ds, tp = out_ds
        assert isinstance(tp, TableProxy)
    else:
        tp = None

    if isinstance(out_ds, Dataset):
        out_ds = [out_ds]
    elif isinstance(out_ds, (tuple, list)):
        pass
    else:
        raise TypeError("Invalid Dataset type '%s'" % type(out_ds))

    xformed_out_ds = []

    for ds in out_ds:
        assert isinstance(ds, Dataset)

        variables = {
            k: (v.dims, v.data, v.attrs)
            for k, v in ds.data_vars.items()
        }

        coords = {k: (v.dims, v.data, v.attrs) for k, v in ds.coords.items()}

        xformed_out_ds.append(
            xr.Dataset(variables, coords=coords, attrs=ds.attrs))

    if len(xformed_out_ds) == 0:
        return xr.Dataset()
    elif len(xformed_out_ds) == 1:
        xformed_out_ds = xformed_out_ds[0]

    # Repack the Table Proxy
    if table_proxy is True:
        return xformed_out_ds, tp

    return xformed_out_ds
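A minimal usage sketch of the API documented above (the table path and column layout are hypothetical; Dataset and xds_to_table are the standard dask-ms exports):

import dask
import dask.array as da
import numpy as np
from daskms import Dataset, xds_to_table

data = da.zeros((10, 16, 4), chunks=(5, 16, 4), dtype=np.complex64)
ds = Dataset({"DATA": (("row", "chan", "corr"), data)})

# "ALL" writes every column present on the dataset(s)
writes = xds_to_table([ds], "example.table", "ALL")
dask.compute(writes)  # nothing is written until compute is called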
Code Example #13
def _write_datasets(table, table_proxy, datasets, columns, descriptor,
                    table_keywords, column_keywords):
    _, table_name, subtable = table_path_split(table)
    table_name = '::'.join((table_name, subtable)) if subtable else table_name
    row_orders = []

    # Put table and column keywords
    table_proxy.submit(_put_keywords, WRITELOCK, table_keywords,
                       column_keywords).result()

    # Sort datasets on (does not have "ROWID", index) so that
    # datasets with ROWIDs are handled first, while
    # those without (which imply appends to the MS)
    # are handled last
    sorted_datasets = sorted(enumerate(datasets),
                             key=lambda t:
                             ("ROWID" not in t[1].data_vars, t[0]))

    # Establish row orders for each dataset
    for di, ds in sorted_datasets:
        try:
            rowid = ds.ROWID.data
        except AttributeError:
            # Add operation
            # No ROWID's, assume they're missing from the table
            # and remaining datasets. Generate addrows
            # NOTE(sjperkins)
            # This could be somewhat brittle, but exists to
            # update MS empty subtables once they've been
            # created along with the main MS by a call to default_ms.
            # Users could also use it to append rows to an existing table.
            # An xds_append_to_table may be a better solution...
            last_datasets = datasets[di:]
            last_row_orders = add_row_order_factory(table_proxy, last_datasets)

            # We don't inline the row ordering if it is derived
            # from the row sizes of provided arrays.
            # The range of possible dependencies are far too large to inline
            row_orders.extend([(False, lro) for lro in last_row_orders])
            # We have established row orders for all datasets
            # at this point, quit the loop
            break
        else:
            # Update operation
            # Generate row orderings from existing row IDs
            row_order = rowid.map_blocks(row_run_factory,
                                         sort_dir="write",
                                         dtype=object)

            # TODO(sjperkins)
            # There's an assumption here that rowid is an
            # operation with minimal dependencies
            # (i.e. derived from xds_from_{ms, table})
            # Caching flattens the graph into a single layer
            if len(row_order.__dask_graph__().layers) > 1:
                log.warning("Caching an update row ordering "
                            "with more than one layer")

            row_order = cached_array(row_order)
            # Inline the row ordering in the graph
            row_orders.append((True, row_order))

    assert len(row_orders) == len(datasets)

    datasets = []

    for (di, ds), (inline, row_order) in zip(sorted_datasets, row_orders):
        # Hold the variables representing array writes
        write_vars = {}

        # Generate a dask array for each column
        for column in columns:
            try:
                variable = ds.data_vars[column]
            except KeyError:
                log.warning("Ignoring '%s' not present "
                            "on dataset %d" % (column, di))
                continue
            else:
                full_dims = variable.dims
                array = variable.data

            if not isinstance(array, da.Array):
                raise TypeError("%s on dataset %d is not a dask Array "
                                "but a %s" % (column, di, type(array)))

            args = [row_order, ("row", )]

            # We only need to pass in dimension extent arrays if
            # there is more than one chunk in any of the non-row columns.
            # In that case, we can putcol, otherwise putcolslice is required
            if not all(len(c) == 1 for c in array.chunks[1:]):
                # Add extent arrays
                for d, c in zip(full_dims[1:], array.chunks[1:]):
                    args.append(dim_extents_array(d, c))
                    args.append((d, ))

            # Add other variables
            args.extend([table_proxy, None, column, None, array, full_dims])

            # Name of the dask array representing this column
            token = dask.base.tokenize(di, args)
            name = "-".join((table_name, 'write', column, token))

            write_col = da.blockwise(
                putter_wrapper,
                full_dims,
                *args,
                # All dims shrink to 1,
                # a single bool is returned
                adjust_chunks={d: 1
                               for d in full_dims},
                name=name,
                align_arrays=False,
                dtype=bool)

            if inline:
                write_col = inlined_array(write_col, [row_order])

            write_vars[column] = (full_dims, write_col)

        # Append a dataset with the write operations
        datasets.append(Dataset(write_vars))

    # Return an empty dataset
    if len(datasets) == 0:
        return Dataset({})
    # Return singleton
    elif len(datasets) == 1:
        return datasets[0]

    return datasets
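The sort key used above, ("ROWID" not in t[1].data_vars, t[0]), simply places update-style datasets (those carrying a ROWID) ahead of append-style datasets while preserving the original order within each class. A small illustration with stand-in objects rather than real Datasets:

fake_datasets = [
    {"data_vars": {"DATA"}},           # index 0: no ROWID -> append
    {"data_vars": {"ROWID", "DATA"}},  # index 1: has ROWID -> update
    {"data_vars": {"ROWID"}},          # index 2: has ROWID -> update
]

sorted_ds = sorted(enumerate(fake_datasets),
                   key=lambda t: ("ROWID" not in t[1]["data_vars"], t[0]))

# Updates (1, 2) are ordered before the append (0)
assert [di for di, _ in sorted_ds] == [1, 2, 0]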
Code Example #14
def xds_from_zarr(store, columns=None, chunks=None):
    """
    Reads the zarr data store in `store` and returns a list of
    Datasets containing the data.

    Parameters
    ----------
    store : str or Path
        Path containing the data
    columns : list of str or str or None
        Columns to read. `None` or `"ALL"` reads all columns on each dataset.
        Otherwise, a list of columns should be supplied.
    chunks : dict or list of dicts
        Chunking schema for each dataset

    Returns
    -------
    datasets : list of Datasets
        Datasets containing the data read from the zarr store
    """
    store, table = store_path_split(store)

    if isinstance(store, Path):
        store = str(store)

    if not isinstance(store, str):
        raise TypeError("store must be a Path, str")

    columns = promote_columns(columns)

    if chunks is None:
        pass
    elif isinstance(chunks, (tuple, list)):
        if not all(isinstance(v, dict) for v in chunks):
            raise TypeError("chunks must be None, a dict or a list of dicts")
    elif isinstance(chunks, dict):
        chunks = [chunks]
    else:
        raise TypeError("chunks must be None, a dict or a list of dicts")

    datasets = []
    numpy_vars = []

    table_group = zarr.open(store)[table]

    for g, (group_name, group) in enumerate(sorted(table_group.groups())):
        group_attrs = decode_attr(dict(group.attrs))
        dask_ms_attrs = group_attrs.pop(DASKMS_ATTR_KEY)
        natural_chunks = dask_ms_attrs["chunks"]
        group_chunks = {d: tuple(dc) for d, dc in natural_chunks.items()}

        if chunks:
            # Defer to user-supplied chunking strategy
            try:
                group_chunks.update(chunks[g])
            except IndexError:
                pass

        data_vars = {}
        coords = {}

        for name, zarray in column_iterator(group, columns):
            attrs = decode_attr(dict(zarray.attrs[DASKMS_ATTR_KEY]))
            dims = attrs["dims"]
            coordinate = attrs.get("coordinate", False)
            array_chunks = tuple(
                group_chunks.get(d, s) for d, s in zip(dims, zarray.shape))

            array_chunks = da.core.normalize_chunks(array_chunks, zarray.shape)
            ext_args = extent_args(dims, array_chunks)
            token_name = f"read~{name}-{tokenize(zarray, *ext_args)}"

            read = da.blockwise(zarr_getter,
                                dims,
                                zarray,
                                None,
                                *ext_args,
                                concatenate=False,
                                name=token_name,
                                meta=np.empty((0, ) * zarray.ndim,
                                              zarray.dtype))

            read = inlined_array(read, ext_args[::2])
            var = Variable(dims, read, attrs)
            (coords if coordinate else data_vars)[name] = var

            # Save numpy arrays for reification
            typ = decode_type(attrs["array_type"])

            if typ is np.ndarray:
                numpy_vars.append(var)
            elif typ is da.Array:
                pass
            else:
                raise TypeError(f"Unknown array_type '{attrs['array_type']}'")

        datasets.append(Dataset(data_vars, coords=coords, attrs=group_attrs))

    # Reify any numpy arrays directly into their variables
    for v, a in zip(numpy_vars, dask.compute(v.data for v in numpy_vars)[0]):
        v.data = a

    return datasets
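A short usage sketch of the reader defined above (the store path and chunking are hypothetical; in this version, datasets beyond the supplied chunks list simply keep their natural on-disk chunking):

import dask

# One chunking dict per dataset; later datasets fall back to their
# on-disk chunks via the IndexError handler above.
datasets = xds_from_zarr("observation.zarr", chunks=[{"row": 1000}])

first = datasets[0]
dask.compute({name: var.data for name, var in first.data_vars.items()})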
Code Example #15
def xds_from_parquet(store, columns=None, chunks=None):
    store, table = store_path_split(store)
    store = store / table

    if not isinstance(store, Path):
        store = Path(store)

    columns = promote_columns(columns)

    if chunks is None:
        pass
    elif isinstance(chunks, (tuple, list)):
        if len(chunks) == 0 or any(not isinstance(c, dict) for c in chunks):
            raise TypeError("chunks must be None or dict or list of dict")
    elif isinstance(chunks, dict):
        chunks = [chunks]
    else:
        raise TypeError("chunks must be None or dict or list of dict")

    fragments = store.rglob("*.parquet")
    ds_cfg = defaultdict(list)

    # Iterate over all parquet files in the directory tree
    # and group them by partition
    partition_schemas = set()

    for fragment in fragments:
        *partitions, parquet_file = fragment.relative_to(store).parts
        fragment = ParquetFileProxy(fragment)
        fragment_meta = fragment.metadata
        metadata = json.loads(fragment_meta.metadata[DASKMS_METADATA.encode()])
        partition_meta = metadata[DASKMS_PARTITION_KEY]
        partition_meta = tuple(tuple((f, v)) for f, v in partition_meta)
        partitions = _partition_values(partitions, partition_meta)
        partition_schemas.add(partition_meta)
        ds_cfg[partitions].append(fragment)

    # Sanity check partition schemas of all parquet files
    if len(partition_schemas) == 0:
        raise ValueError(f"No parquet files found in {store}")
    elif len(partition_schemas) != 1:
        raise ValueError(f"Multiple partitions discovered {partition_schemas}")

    partition_schemas = partition_schemas.pop()
    datasets = []

    # Now create a dataset per partition
    for p, (partition, fragments) in enumerate(sorted(ds_cfg.items())):
        fragments = list(sorted(fragments))
        column_arrays = defaultdict(list)
        fragment_rows = [f.metadata.num_rows for f in fragments]

        for (f, (start, end)) in partition_chunking(p, fragment_rows, chunks):
            fragment = fragments[f]
            fragment_meta = fragment.metadata
            rows = fragment_meta.num_rows
            schema = fragment_meta.schema.to_arrow_schema()
            fields = {n: schema.field(n) for n in schema.names}

            for column, field in column_iterator(fields, columns):
                field_metadata = field.metadata[DASKMS_METADATA.encode()]
                field_metadata = json.loads(field_metadata)
                dims = tuple(field_metadata["dims"])

                if isinstance(field.type, TensorType):
                    shape = (rows,) + field.type.shape
                else:
                    shape = (rows,)

                assert len(shape) == len(dims)

                meta = np.empty((0,)*len(dims), field.type.to_pandas_dtype())
                new_axes = {d: s for d, s in zip(dims, shape)}

                read = da.blockwise(fragment.read_column, dims,
                                    column, None,
                                    start, None,
                                    end, None,
                                    new_axes=new_axes,
                                    meta=meta)

                column_arrays[column].append((read, dims))

        data_vars = {}

        for column, values in column_arrays.items():
            arrays, array_dims = zip(*values)
            array_dims = set(array_dims)

            if not len(array_dims) == 1:
                raise ValueError(f"Inconsistent array dimensions "
                                 f"{array_dims} for {column}")

            data_vars[column] = (array_dims.pop(), da.concatenate(arrays))

        attrs = dict(partition)
        attrs[DASKMS_PARTITION_KEY] = partition_schemas
        datasets.append(Dataset(data_vars, attrs=attrs))

    return datasets
Code Example #16
def xds_from_parquet(store, columns=None, chunks=None, **kwargs):
    if isinstance(store, DaskMSStore):
        pass
    elif isinstance(store, (str, Path)):
        store = DaskMSStore(f"{store}", **kwargs.pop("storage_options", {}))
    else:
        raise TypeError(f"store '{store}' must be "
                        f"Path, str or DaskMSStore")

    # If any kwargs are added, they should be popped prior to this check.
    if len(kwargs) > 0:
        warnings.warn(
            f"The following unsupported kwargs were ignored in "
            f"xds_from_parquet: {kwargs}", UserWarning)

    columns = promote_columns(columns)

    if chunks is None:
        pass
    elif isinstance(chunks, (tuple, list)):
        if len(chunks) == 0 or any(not isinstance(c, dict) for c in chunks):
            raise TypeError("chunks must be None or dict or list of dict")
    elif isinstance(chunks, dict):
        chunks = [chunks]
    else:
        raise TypeError("chunks must be None or dict or list of dict")

    table_path = "" if store.table else "MAIN"

    fragments = list(map(Path, store.rglob("*.parquet")))
    ds_cfg = defaultdict(list)

    # Iterate over all parquet files in the directory tree
    # and group them by partition
    partition_schemas = set()

    for fragment in fragments:
        *partitions, _ = fragment.relative_to(Path(table_path)).parts
        fragment = ParquetFileProxy(store, str(fragment))
        fragment_meta = fragment.metadata
        metadata = json.loads(fragment_meta.metadata[DASKMS_METADATA.encode()])
        partition_meta = metadata[DASKMS_PARTITION_KEY]
        partition_meta = tuple(tuple((f, v)) for f, v in partition_meta)
        partitions = _partition_values(partitions, partition_meta)
        partition_schemas.add(partition_meta)
        ds_cfg[partitions].append(fragment)

    # Sanity check partition schemas of all parquet files
    if len(partition_schemas) == 0:
        raise ValueError(f"No parquet files found in {store.path}")
    elif len(partition_schemas) != 1:
        raise ValueError(f"Multiple partitions discovered {partition_schemas}")

    partition_schemas = partition_schemas.pop()
    datasets = []

    # Now create a dataset per partition
    for p, (partition, fragments) in enumerate(sorted(ds_cfg.items())):
        fragments = list(sorted(fragments))
        column_arrays = defaultdict(list)
        fragment_rows = [f.metadata.num_rows for f in fragments]

        # Returns a dictionary of lists mapping fragments to partitions.
        partition_chunks = partition_chunking(p, fragment_rows, chunks)

        for pieces in partition_chunks.values():

            chunk_fragments = [fragments[i] for i, _ in pieces]
            chunk_ranges = [r for _, r in pieces]
            chunk_metas = [f.metadata for f in chunk_fragments]

            rows = sum(end - start for start, end in chunk_ranges)

            # NOTE(JSKenyon): This assumes that the schema/fields are
            # consistent between fragments. This should be ok.
            exemplar_schema = chunk_metas[0].schema.to_arrow_schema()
            exemplar_fields = {
                n: exemplar_schema.field(n)
                for n in exemplar_schema.names
            }

            for column, field in column_iterator(exemplar_fields, columns):
                field_metadata = field.metadata[DASKMS_METADATA.encode()]
                field_metadata = json.loads(field_metadata)
                dims = tuple(field_metadata["dims"])

                if isinstance(field.type, TensorType):
                    shape = (rows, ) + field.type.shape
                else:
                    shape = (rows, )

                assert len(shape) == len(dims)

                dtype = field.type.to_pandas_dtype()
                meta = np.empty((0, ) * len(dims), dtype)
                new_axes = {d: s for d, s in zip(dims, shape)}

                read = da.blockwise(fragment_reader,
                                    dims,
                                    chunk_fragments,
                                    None,
                                    chunk_ranges,
                                    None,
                                    column,
                                    None,
                                    shape,
                                    None,
                                    dtype,
                                    None,
                                    adjust_chunks={"row": rows},
                                    new_axes=new_axes,
                                    meta=meta)

                column_arrays[column].append((read, dims))

        data_vars = {}

        for column, values in column_arrays.items():
            arrays, array_dims = zip(*values)
            array_dims = set(array_dims)

            if not len(array_dims) == 1:
                raise ValueError(f"Inconsistent array dimensions "
                                 f"{array_dims} for {column}")

            data_vars[column] = (array_dims.pop(), da.concatenate(arrays))

        attrs = dict(partition)
        attrs[DASKMS_PARTITION_KEY] = partition_schemas
        datasets.append(Dataset(data_vars, attrs=attrs))

    return datasets
Code Example #17
def test_bda_channelisation():
    rs = np.random.RandomState(42)

    spw_id = (0, 1, 2)
    pol_id = (0, 1)

    nchan = np.array([4, 8, 16])
    bandwidth = np.array([20.0, 40.0, 60.0])
    ref_freq = np.array([100.0, 200.0, 500.0])
    chan_freq = [
        np.linspace(rf - bw / 2, rf + bw / 2, nc)[None, :]
        for rf, bw, nc in zip(ref_freq, bandwidth, nchan)
    ]
    chan_width = [
        np.full(nc, bw / nc)[None, :] for nc, bw in zip(nchan, bandwidth)
    ]

    spw_id, pol_id = (np.array(a) for a in zip(*product(spw_id, pol_id)))

    ddid = np.array([5, 1, 3, 2, 0, 4, 1, 2, 5, 3])
    time = np.linspace(5.03373334e+09, 5.03373362e+09, ddid.shape[0])
    row_chunks = (4, 3, 1, 2)
    row_spws = spw_id[ddid]
    num_chan = nchan[row_spws]

    row_num_chans = np.array([rs.randint(1, num_chan[s]) for s in row_spws])

    da_ddid = da.from_array(ddid, chunks=(row_chunks, ))
    da_num_chan = da.from_array(
        row_num_chans,
        chunks=(row_chunks),
    )
    da_time = da.from_array(time, chunks=(row_chunks, ))

    out_ds = [
        Dataset({
            "DATA_DESC_ID": (("row", ), da_ddid),
            "TIME": (("row", ), da_time),
            "NUM_CHAN": (("row", ), da_num_chan),
            "DECORR_CHAN_WIDTH": (("row", ), da.zeros_like(time))
        })
    ]

    ddid_ds = Dataset({
        "SPECTRAL_WINDOW_ID": (("row", ), spw_id),
        "POLARIZATION_ID": (("row", ), pol_id),
    })

    spw_ds = [
        Dataset({
            "REF_FREQUENCY": (("row", ), da.from_array([rf], chunks=1)),
            "NUM_CHAN": (("row", ), da.from_array([nc], chunks=1)),
            "CHAN_FREQ":
            (("row", ), da.from_array(chan_freq[spw], chunks=(1, nc))),
            "CHAN_WIDTH":
            (("row", ), da.from_array(chan_width[spw], chunks=(1, nc))),
            "RESOLUTION": (("row", ),
                           da.from_array(chan_width[spw], chunks=(1, nc))),
            "EFFECTIVE_BW": (("row", ),
                             da.from_array(chan_width[spw], chunks=(1, nc))),
            "TOTAL_BANDWIDTH": (("row"), da.from_array([bw], chunks=1))
        }) for spw, (rf, bw, nc) in enumerate(zip(ref_freq, bandwidth, nchan))
    ]

    out_ds, spw_ds, ddid_ds = bda_average_spw(out_ds, ddid_ds, spw_ds)
    out_ds, spw_ds, ddid_ds = da.compute(out_ds, spw_ds, ddid_ds)
Code Example #18
def xds_to_zarr(xds, store, columns=None, rechunk=False, **kwargs):
    """
    Stores a dataset or list of datasets defined by `xds` in
    the file location `store`.

    Parameters
    ----------
    xds : Dataset or list of Datasets
        Data
    store : str or Path
        Path to store the data
    columns : list of str or str or None
        Columns to store. `None` or `"ALL"` stores all columns on each dataset.
        Otherwise, a list of columns should be supplied. All coordinates
        associated with a specified column will be written automatically.
    rechunk : bool
        Controls whether dask arrays should be automatically rechunked to be
        consistent with existing on-disk zarr arrays while writing to disk.
    **kwargs : optional

    Returns
    -------
    writes : Dataset
        A Dataset representing the write operations
    """
    if isinstance(store, DaskMSStore):
        pass
    elif isinstance(store, (Path, str)):
        store = DaskMSStore(f"{store}", **kwargs.pop("storage_options", {}))
    else:
        raise TypeError(f"store '{store}' must be "
                        f"Path, str or DaskMSStore")

    # If any kwargs are added, they should be popped prior to this check.
    if len(kwargs) > 0:
        warnings.warn(
            f"The following unsupported kwargs were ignored in "
            f"xds_to_zarr: {kwargs}", UserWarning)

    columns = promote_columns(columns)

    if isinstance(xds, Dataset):
        xds = [xds]
    elif isinstance(xds, (tuple, list)):
        if not all(isinstance(ds, Dataset) for ds in xds):
            raise TypeError("xds must be a Dataset or list of Datasets")
    else:
        raise TypeError("xds must be a Dataset or list of Datasets")

    write_datasets = []

    for di, ds in enumerate(xds):

        data_vars, coords = select_vars_and_coords(ds, columns)

        # Create a new ds which is consistent with what we want to write.
        ds = Dataset(data_vars, coords=coords, attrs=ds.attrs)

        ds, group = prepare_zarr_group(di, ds, store, rechunk=rechunk)

        data_vars = dict(_gen_writes(ds.data_vars, ds.chunks, group))
        # Include coords in the write dataset so they're reified
        data_vars.update(
            dict(_gen_writes(ds.coords, ds.chunks, group, indirect_dims=True)))

        # Transfer any partition information over to the write dataset
        partition = ds.attrs.get(DASKMS_PARTITION_KEY, False)

        if not partition:
            attrs = None
        else:
            attrs = {
                DASKMS_PARTITION_KEY: partition,
                **{k: getattr(ds, k)
                   for k, _ in partition}
            }

        write_datasets.append(Dataset(data_vars, attrs=attrs))

    return write_datasets
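A hedged usage sketch of the writer defined above (the output path is hypothetical; rechunk and storage_options behave as described in the signature and docstring):

import dask
import dask.array as da
import numpy as np
from daskms import Dataset

data = da.ones((100, 16, 4), chunks=(10, 16, 4), dtype=np.complex64)
datasets = [Dataset({"DATA": (("row", "chan", "corr"), data)})]

# rechunk=True adapts chunks to any existing on-disk zarr arrays;
# storage_options would be forwarded to DaskMSStore for remote stores.
writes = xds_to_zarr(datasets, "output.zarr", rechunk=True)
dask.compute(writes)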
Code Example #19
def xds_to_parquet(xds, path, columns=None):
    path, table = store_path_split(path)

    if not isinstance(path, Path):
        path = Path(path)

    columns = promote_columns(columns)

    if isinstance(xds, Dataset):
        xds = [xds]
    elif isinstance(xds, (tuple, list)):
        if not all(isinstance(ds, Dataset) for ds in xds):
            raise TypeError("xds must be a Dataset or list of Datasets")
    else:
        raise TypeError("xds must be a Dataset or list of Datasets")

    datasets = []
    base_schema = ArrowSchema.from_datasets(xds)

    for ds_id, ds in enumerate(xds):
        arrow_schema = base_schema.with_attributes(ds)
        fragment = ParquetFragment(path / table, arrow_schema, ds_id)
        chunk_ids = da.arange(len(ds.chunks["row"]), chunks=1)
        args = [chunk_ids, ("row", )]

        data_var_it = column_iterator(ds.data_vars, columns)
        coord_it = column_iterator(ds.coords, columns)

        for column, variable in itertools.chain(data_var_it, coord_it):
            if not isinstance(variable.data, da.Array):
                raise ValueError(f"Column {column} does not "
                                 f"contain a dask Array")

            if len(variable.dims) == 0 or variable.dims[0] != "row":
                raise ValueError(f"Column {column} dimensions "
                                 f"{variable.dims} don't start with 'row'")

            args.extend((column, None, variable.data, variable.dims))

            for dim, chunk in zip(variable.dims[1:], variable.data.chunks[1:]):
                if len(chunk) != 1:
                    raise ValueError(f"Chunking in {dim} is not yet "
                                     f"supported.")

        writes = da.blockwise(fragment.write, ("row", ),
                              *args,
                              align_arrays=False,
                              adjust_chunks={"row": 1},
                              meta=np.empty((0, ), bool))

        writes = inlined_array(writes, chunk_ids)

        # Transfer any partition information over to the write dataset
        partition = ds.attrs.get(DASKMS_PARTITION_KEY, False)

        if not partition:
            attrs = None
        else:
            attrs = {
                DASKMS_PARTITION_KEY: partition,
                **{k: getattr(ds, k)
                   for k, _ in partition}
            }

        datasets.append(Dataset({"WRITE": (("row", ), writes)}, attrs=attrs))

    return datasets
Code Example #20
def xds_to_zarr(xds, store, columns=None):
    """
    Stores a dataset or list of datasets defined by `xds` in
    the file location `store`.

    Parameters
    ----------
    xds : Dataset or list of Datasets
        Data
    store : str or Path
        Path to store the data
    columns : list of str or str or None
        Columns to store. `None` or `"ALL"` stores all columns on each dataset.
        Otherwise, a list of columns should be supplied.

    Returns
    -------
    writes : Dataset
        A Dataset representing the write operations
    """
    store, table = store_path_split(store)

    if isinstance(store, Path):
        store = str(store)

    if not isinstance(store, str):
        raise TypeError(f"store '{store}' must be Path or str")

    columns = promote_columns(columns)

    if isinstance(xds, Dataset):
        xds = [xds]
    elif isinstance(xds, (tuple, list)):
        if not all(isinstance(ds, Dataset) for ds in xds):
            raise TypeError("xds must be a Dataset or list of Datasets")
    else:
        raise TypeError("xds must be a Dataset or list of Datasets")

    write_datasets = []

    for di, ds in enumerate(xds):
        group = prepare_zarr_group(di, ds, store, table)
        write_args = (ds.chunks, columns, group)

        data_vars = dict(_gen_writes(ds.data_vars, *write_args))
        # Include coords in the write dataset so they're reified
        data_vars.update(dict(_gen_writes(ds.coords, *write_args)))

        # Transfer any partition information over to the write dataset
        partition = ds.attrs.get(DASKMS_PARTITION_KEY, False)

        if not partition:
            attrs = None
        else:
            attrs = {
                DASKMS_PARTITION_KEY: partition,
                **{k: getattr(ds, k)
                   for k, _ in partition}
            }

        write_datasets.append(Dataset(data_vars, attrs=attrs))

    return write_datasets
Code Example #21
def xds_from_zarr(store, columns=None, chunks=None, **kwargs):
    """
    Reads the zarr data store in `store` and returns a list of
    Datasets containing the data.

    Parameters
    ----------
    store : str or Path
        Path containing the data
    columns : list of str or str or None
        Columns to read. `None` or `"ALL"` reads all columns on each dataset.
        Otherwise, a list of columns should be supplied.
    chunks : dict or list of dicts
        Chunking schema for each dataset
    **kwargs: optional

    Returns
    -------
    datasets : list of Datasets
        Datasets containing the data read from the zarr store
    """

    if isinstance(store, DaskMSStore):
        pass
    elif isinstance(store, (Path, str)):
        store = DaskMSStore(f"{store}", **kwargs.pop("storage_options", {}))
    else:
        raise TypeError(f"store '{store}' must be "
                        f"Path, str or DaskMSStore")

    # If any kwargs are added, they should be popped prior to this check.
    if len(kwargs) > 0:
        warnings.warn(
            f"The following unsupported kwargs were ignored in "
            f"xds_from_zarr: {kwargs}", UserWarning)

    columns = promote_columns(columns)

    if chunks is None:
        pass
    elif isinstance(chunks, (tuple, list)):
        if not all(isinstance(v, dict) for v in chunks):
            raise TypeError("chunks must be None, a dict or a list of dicts")
    elif isinstance(chunks, dict):
        chunks = [chunks]
    else:
        raise TypeError("chunks must be None, a dict or a list of dicts")

    datasets = []
    numpy_vars = []

    # NOTE(JSKenyon): Iterating over all the zarr groups/arrays is VERY
    # expensive if the metadata has not been consolidated.
    zc.consolidate_metadata(store.map)
    table_path = store.table if store.table else "MAIN"
    table_group = zarr.open_consolidated(store.map)[table_path]

    for g, (group_name,
            group) in enumerate(sorted(table_group.groups(),
                                       key=group_sortkey)):
        group_attrs = decode_attr(dict(group.attrs))
        dask_ms_attrs = group_attrs.pop(DASKMS_ATTR_KEY)
        natural_chunks = dask_ms_attrs["chunks"]
        group_chunks = {d: tuple(dc) for d, dc in natural_chunks.items()}

        if chunks:
            # Defer to user-supplied chunking strategy
            try:
                group_chunks.update(chunks[g])
            except IndexError:
                group_chunks.update(chunks[-1])  # Re-use the last chunking

        data_vars = {}
        coords = {}

        for name, zarray in column_iterator(group, columns):
            attrs = decode_attr(dict(zarray.attrs[DASKMS_ATTR_KEY]))
            dims = attrs["dims"]
            coordinate = attrs.get("coordinate", False)
            array_chunks = tuple(
                group_chunks.get(d, s) for d, s in zip(dims, zarray.shape))

            array_chunks = da.core.normalize_chunks(array_chunks, zarray.shape)
            ext_args = extent_args(dims, array_chunks)
            token_name = f"read~{name}-{tokenize(zarray, *ext_args)}"

            read = da.blockwise(zarr_getter,
                                dims,
                                zarray,
                                None,
                                *ext_args,
                                concatenate=False,
                                name=token_name,
                                meta=np.empty((0, ) * zarray.ndim,
                                              zarray.dtype))

            read = inlined_array(read, ext_args[::2])
            var = Variable(dims, read, attrs)
            (coords if coordinate else data_vars)[name] = var

            # Save numpy arrays for reification
            typ = decode_type(attrs["array_type"])

            if typ is np.ndarray:
                numpy_vars.append(var)
            elif typ is da.Array:
                pass
            else:
                raise TypeError(f"Unknown array_type '{attrs['array_type']}'")

        datasets.append(Dataset(data_vars, coords=coords, attrs=group_attrs))

    # Reify any numpy arrays directly into their variables
    for v, a in zip(numpy_vars, dask.compute(v.data for v in numpy_vars)[0]):
        v.data = a

    return datasets
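Note that, unlike Code Example #14, this version re-uses the last supplied chunking dict for any groups beyond the chunks list, so a single-element list applies the same chunking to every group. A brief sketch using the function defined above (store path hypothetical):

import dask

# {"row": 500} is applied to the first group and, via the IndexError
# fallback above, re-used for every subsequent group.
datasets = xds_from_zarr("observation.zarr", chunks=[{"row": 500}])

dask.compute([ds.ROWID.data for ds in datasets if "ROWID" in ds.coords])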