Example #1
    def __init__(self, table, select_cols, group_cols, index_cols, **kwargs):
        if not table_exists(table):
            raise ValueError("'%s' does not appear to be a CASA Table" % table)

        chunks = kwargs.pop('chunks', [{'row': _DEFAULT_ROW_CHUNKS}])

        # Create or promote chunks to a list of dicts
        if isinstance(chunks, dict):
            chunks = [chunks]
        elif not isinstance(chunks, (tuple, list)):
            raise TypeError("'chunks' must be a dict or sequence of dicts")

        self.canonical_name = table
        self.table_path = str(Path(*table_path_split(table)))
        self.select_cols = select_cols
        self.group_cols = [] if group_cols is None else group_cols
        self.index_cols = [] if index_cols is None else index_cols
        self.chunks = chunks
        self.table_schema = kwargs.pop('table_schema', None)
        self.taql_where = kwargs.pop('taql_where', '')
        self.table_keywords = kwargs.pop('table_keywords', False)
        self.column_keywords = kwargs.pop('column_keywords', False)
        self.table_proxy = kwargs.pop('table_proxy', False)

        if len(kwargs) > 0:
            raise ValueError("Unhandled kwargs: %s" % kwargs)
Example #2
def executor_key(table_name):
    """
    Produce an executor key from table_name
    """

    # Strip any subtable component so a table and its subtables share a key
    root, table_name, subtable = table_path_split(table_name)
    return str(Path(root, table_name))
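Because the subtable component is discarded, a main table and its subtables map to the same executor key. A minimal illustration, with mock_table_path_split standing in for the real table_path_split (assumed to split a name::SUBTABLE path into (root, table, subtable)):

from pathlib import Path

def mock_table_path_split(name):
    # Simplified stand-in for daskms' table_path_split
    root_and_table, _, subtable = name.partition("::")
    path = Path(root_and_table)
    return path.parent, path.name, subtable

def mock_executor_key(table_name):
    root, table_name, _ = mock_table_path_split(table_name)
    return str(Path(root, table_name))

# The main table and its SOURCE subtable share an executor key (POSIX paths)
assert mock_executor_key("/data/WSRT.MS") == "/data/WSRT.MS"
assert mock_executor_key("/data/WSRT.MS::SOURCE") == "/data/WSRT.MS"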
def test_read_array_names(ms):
    _, short_name, _ = table_path_split(ms)
    datasets = xds_from_ms(ms)

    for ds in datasets:
        for k, v in ds.data_vars.items():
            product = ("~[" + str(ds.FIELD_ID) +
                       "," + str(ds.DATA_DESC_ID) + "]")
            prefix = "".join(("read~", k, product))
            assert key_split(v.data.name) == prefix
Example #4
    def _group_datasets(self, table_proxy, groups, exemplar_rows, orders):
        _, t, s = table_path_split(self.canonical_name)
        short_table_name = '/'.join((t, s)) if s else t
        table_schema = self._table_schema()

        datasets = []
        group_ids = list(zip(*groups))

        assert len(group_ids) == len(orders)

        # Select columns, excluding grouping columns
        select_cols = set(self.select_cols or table_proxy.colnames().result())
        select_cols -= set(self.group_cols)

        # Create a dataset for each group
        it = enumerate(zip(group_ids, exemplar_rows, orders))

        for g, (group_id, exemplar_row, order) in it:
            # Extract group chunks
            try:
                group_chunks = self.chunks[g]  # Get group chunking strategy
            except IndexError:
                group_chunks = self.chunks[-1]  # Re-use last group's chunks

            # Suffix identifying this dataset's arrays
            gid_str = ",".join(str(gid) for gid in group_id)
            array_suffix = f"[{gid_str}]-{short_table_name}"

            # Create dataset variables
            group_var_dims = _dataset_variable_factory(
                table_proxy, table_schema, select_cols, exemplar_row, order,
                group_chunks, array_suffix)

            # Extract ROWID
            try:
                rowid = group_var_dims.pop("ROWID")
            except KeyError:
                coords = None
            else:
                coords = {"ROWID": rowid}

            # Assign values for the dataset's grouping columns
            # as attributes
            partitions = tuple(
                (c, g.dtype.name) for c, g in zip(self.group_cols, group_id))
            attrs = {DASKMS_PARTITION_KEY: partitions}

            # Use python types which are json serializable
            group_id = [gid.item() for gid in group_id]
            attrs.update(zip(self.group_cols, group_id))

            datasets.append(Dataset(group_var_dims, attrs=attrs,
                                    coords=coords))

        return datasets
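The partition attributes assembled above are worth seeing in isolation. A standalone sketch with illustrative values; the DASKMS_PARTITION_KEY string used here is an assumption, not necessarily the library's constant.

import numpy as np

DASKMS_PARTITION_KEY = "__daskms_partition_schema__"  # assumed value
group_cols = ["FIELD_ID", "DATA_DESC_ID"]
group_id = [np.int32(0), np.int32(1)]   # exemplar grouping values

# (column, dtype name) pairs describing the partitioning schema
partitions = tuple((c, g.dtype.name) for c, g in zip(group_cols, group_id))
attrs = {DASKMS_PARTITION_KEY: partitions}

# Grouping values are converted to plain Python ints (JSON serialisable)
attrs.update(zip(group_cols, [g.item() for g in group_id]))

assert attrs == {
    DASKMS_PARTITION_KEY: (("FIELD_ID", "int32"), ("DATA_DESC_ID", "int32")),
    "FIELD_ID": 0,
    "DATA_DESC_ID": 1,
}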
Example #5
def infer_table_type(table_name):
    """ Guess the schema from the table name """
    _, table, subtable = table_path_split(table_name)

    if not subtable and table[-3:].upper().endswith(".MS"):
        return "MS"

    if subtable in _SUBTABLE_SCHEMAS:
        return subtable

    return "TABLE"
def test_write_array_names(ms, tmp_path):
    _, short_name, _ = table_path_split(ms)
    datasets = xds_from_ms(ms)

    out_table = str(tmp_path / short_name)

    writes = xds_to_table(datasets, out_table, "ALL")

    for ds in writes:
        for k, v in ds.data_vars.items():
            prefix = "".join(("write~", k))
            assert key_split(v.data.name) == prefix
Example #7
def _create_table(table_name, datasets, columns, descriptor):
    builder = descriptor_builder(table_name, descriptor)
    schemas = [DatasetSchema.from_dataset(ds, columns) for ds in datasets]
    table_desc, dminfo = builder.execute(schemas)

    root, table, subtable = table_path_split(table_name)
    table_path = root / table

    from daskms.descriptors.ms import MSDescriptorBuilder
    from daskms.descriptors.ms_subtable import MSSubTableDescriptorBuilder

    if not subtable and isinstance(builder, MSDescriptorBuilder):
        table_path = str(table_path)

        # Create the MS
        with pt.default_ms(table_path, tabdesc=table_desc, dminfo=dminfo):
            pass

        return _writable_table_proxy(table_path)
    elif subtable:
        # NOTE(sjperkins)
        # Recreate the subtable path with OS separator components
        # This avoids accessing the subtable via the main table
        # (e.g. WSRT.MS::SOURCE)
        # which can cause lock issues as the subtables seemingly
        # inherit the parent table lock
        subtable_path = str(table_path / subtable)

        # Create the subtable
        if isinstance(builder, MSSubTableDescriptorBuilder):
            with pt.default_ms_subtable(subtable,
                                        subtable_path,
                                        tabdesc=table_desc,
                                        dminfo=dminfo):
                pass
        else:
            with pt.table(subtable_path, table_desc, dminfo=dminfo, ack=False):
                pass

        # Add subtable to the main table
        table_proxy = _writable_table_proxy(str(table_path))
        table_proxy.putkeywords({subtable: "Table: " + subtable_path}).result()
        del table_proxy

        # Return TableProxy
        return _writable_table_proxy(subtable_path)
    else:
        # Create the table
        with pt.table(str(table_path), table_desc, dminfo=dminfo, ack=False):
            pass

        return _writable_table_proxy(str(table_path))
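The putkeywords call above is what makes the subtable reachable from the main table: the keyword value is a "Table: <path>" reference rather than an embedded table. A hedged sketch of inspecting that link with python-casacore, assuming /data/WSRT.MS and its SOURCE subtable were created by this function:

import casacore.tables as pt

main_table = pt.table("/data/WSRT.MS", ack=False)
keywords = main_table.getkeywords()
# For a subtable created as above, the keyword is a path reference,
# e.g. "Table: /data/WSRT.MS/SOURCE", not an embedded table object
print(keywords.get("SOURCE"))
main_table.close()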
Example #8
    def _group_datasets(self, groups, exemplar_rows, orders):
        _, t, s = table_path_split(self.canonical_name)
        short_table_name = '/'.join((t, s)) if s else t
        table_proxy = self._table_proxy()
        table_schema = self._table_schema()

        datasets = []
        group_ids = list(zip(*groups))

        assert len(group_ids) == len(orders)

        # Select columns, excluding grouping columns
        select_cols = set(self.select_cols or table_proxy.colnames().result())
        select_cols -= set(self.group_cols)

        # Create a dataset for each group
        it = enumerate(zip(group_ids, exemplar_rows, orders))

        for g, (group_id, exemplar_row, order) in it:
            # Extract group chunks
            try:
                group_chunks = self.chunks[g]  # Get group chunking strategy
            except IndexError:
                group_chunks = self.chunks[-1]  # Re-use last group's chunks

            # Prefix identifying this dataset's arrays
            gid_str = ",".join(str(gid) for gid in group_id)
            array_prefix = "%s-[%s]" % (short_table_name, gid_str)

            # Create dataset variables
            group_var_dims = _dataset_variable_factory(
                table_proxy, table_schema, select_cols, exemplar_row, order,
                group_chunks, array_prefix)

            # Extract ROWID
            try:
                rowid = group_var_dims.pop("ROWID")
            except KeyError:
                coords = None
            else:
                coords = {"ROWID": rowid}

            # Assign values for the dataset's grouping columns
            # as attributes
            attrs = dict(zip(self.group_cols, group_id))

            datasets.append(Dataset(group_var_dims, attrs=attrs,
                                    coords=coords))

        return datasets
Example #9
    def _single_dataset(self, table_proxy, orders, exemplar_row=0):
        _, t, s = table_path_split(self.canonical_name)
        short_table_name = "/".join((t, s)) if s else t

        table_schema = self._table_schema()
        select_cols = set(self.select_cols or table_proxy.colnames().result())
        variables = _dataset_variable_factory(table_proxy, table_schema,
                                              select_cols, exemplar_row,
                                              orders, self.chunks[0],
                                              short_table_name)

        try:
            rowid = variables.pop("ROWID")
        except KeyError:
            coords = None
        else:
            coords = {"ROWID": rowid}

        return Dataset(variables, coords=coords)
Example #10
def filename_builder_factory(filename):
    """
    Returns a Table Descriptor Builder based on the filename.

    1. If ending with '.ms' (case insensitive), it is assumed
       a Measurement Set is being created.
    2. If ending in '::SUBTABLE', where SUBTABLE is a
       Measurement Set sub-table such as ANTENNA or SPECTRAL_WINDOW,
       it is assumed that sub-table is being created.
    3. Otherwise it is assumed a default CASA table is being created.


    Parameters
    ----------
    filename : str
        Table filename

    Returns
    -------
    builder : :class:`daskms.descriptors.builder.AbstractDescriptorBuilder`
        Table Descriptor builder based on the filename
    """
    _, table, subtable = table_path_split(filename)

    # Does this look like an MS
    if not subtable and table[-3:].upper().endswith('.MS'):
        from daskms.descriptors.ms import MSDescriptorBuilder
        return MSDescriptorBuilder()

    # Perhaps it's an MS subtable?
    if subtable in SUBTABLES:
        from daskms.descriptors.ms_subtable import MSSubTableDescriptorBuilder
        return MSSubTableDescriptorBuilder(subtable)

    # Just a standard CASA Table I guess
    from daskms.descriptors.builder import DefaultDescriptorBuilder
    return DefaultDescriptorBuilder()
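A usage sketch of the dispatch above; it assumes "ANTENNA" appears in SUBTABLES and reuses the builder classes already imported inside the function.

from daskms.descriptors.builder import DefaultDescriptorBuilder
from daskms.descriptors.ms import MSDescriptorBuilder
from daskms.descriptors.ms_subtable import MSSubTableDescriptorBuilder

assert isinstance(filename_builder_factory("obs.ms"), MSDescriptorBuilder)
assert isinstance(filename_builder_factory("obs.ms::ANTENNA"),
                  MSSubTableDescriptorBuilder)
assert isinstance(filename_builder_factory("gains.tab"),
                  DefaultDescriptorBuilder)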
Example #11
def test_table_path_split(path, root, table, subtable):
    assert (root, table, subtable) == table_path_split(path)
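This test is presumably driven by pytest.mark.parametrize. A plausible parametrization sketch; the expected tuples are assumptions about table_path_split's return values (a path-like root plus table and subtable strings), inferred from its use elsewhere in this section, and table_path_split is assumed importable as in the original test module.

from pathlib import Path

import pytest

@pytest.mark.parametrize("path, root, table, subtable", [
    # Assumed outputs, for illustration only
    ("/data/WSRT.MS", Path("/data"), "WSRT.MS", ""),
    ("/data/WSRT.MS::SOURCE", Path("/data"), "WSRT.MS", "SOURCE"),
])
def test_table_path_split_sketch(path, root, table, subtable):
    assert (root, table, subtable) == table_path_split(path)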
Example #12
def _write_datasets(table, table_proxy, datasets, columns, descriptor,
                    table_keywords, column_keywords):
    _, table_name, subtable = table_path_split(table)
    table_name = '::'.join((table_name, subtable)) if subtable else table_name
    row_orders = []

    # Put table and column keywords
    table_proxy.submit(_put_keywords, WRITELOCK, table_keywords,
                       column_keywords).result()

    # Sort datasets on (not has "ROWID", index) such that
    # datasets with ROWIDs are handled first, while
    # those without (which imply appends to the MS)
    # are handled last
    sorted_datasets = sorted(enumerate(datasets),
                             key=lambda t:
                             ("ROWID" not in t[1].data_vars, t[0]))

    # Establish row orders for each dataset
    for di, ds in sorted_datasets:
        try:
            rowid = ds.ROWID.data
        except AttributeError:
            # Add operation
            # No ROWIDs, so assume rows are missing from the table
            # for this and the remaining datasets. Generate addrows
            # NOTE(sjperkins)
            # This could be somewhat brittle, but exists to
            # update empty MS subtables once they've been
            # created along with the main MS by a call to default_ms.
            # Users could also use it to append rows to an existing table.
            # An xds_append_to_table may be a better solution...
            last_datasets = datasets[di:]
            last_row_orders = add_row_order_factory(table_proxy, last_datasets)

            # We don't inline the row ordering if it is derived
            # from the row sizes of provided arrays.
            # The range of possible dependencies are far too large to inline
            row_orders.extend([(False, lro) for lro in last_row_orders])
            # We have established row orders for all datasets
            # at this point, quit the loop
            break
        else:
            # Update operation
            # Generate row orderings from existing row IDs
            row_order = rowid.map_blocks(row_run_factory,
                                         sort_dir="write",
                                         dtype=object)

            # TODO(sjperkins)
            # There's an assumption here that rowid is an
            # operation with minimal dependencies
            # (i.e. derived from xds_from_{ms, table})
            # Caching flattens the graph into a single layer
            if len(row_order.__dask_graph__().layers) > 1:
                log.warning("Caching an update row ordering "
                            "with more than one layer")

            row_order = cached_array(row_order)
            # Inline the row ordering in the graph
            row_orders.append((True, row_order))

    assert len(row_orders) == len(datasets)

    datasets = []

    for (di, ds), (inline, row_order) in zip(sorted_datasets, row_orders):
        # Hold the variables representing array writes
        write_vars = {}

        # Generate a dask array for each column
        for column in columns:
            try:
                variable = ds.data_vars[column]
            except KeyError:
                log.warning("Ignoring '%s' not present "
                            "on dataset %d" % (column, di))
                continue
            else:
                full_dims = variable.dims
                array = variable.data

            if not isinstance(array, da.Array):
                raise TypeError("%s on dataset %d is not a dask Array "
                                "but a %s" % (column, di, type(array)))

            args = [row_order, ("row", )]

            # We only need to pass in dimension extent arrays if
            # there is more than one chunk in any of the non-row dimensions.
            # In that case putcolslice is required for each sub-region,
            # otherwise putcol can write the full secondary dimensions
            if not all(len(c) == 1 for c in array.chunks[1:]):
                # Add extent arrays
                for d, c in zip(full_dims[1:], array.chunks[1:]):
                    args.append(dim_extents_array(d, c))
                    args.append((d, ))

            # Add other variables
            args.extend([table_proxy, None, column, None, array, full_dims])

            # Name of the dask array representing this column
            token = dask.base.tokenize(di, args)
            name = "-".join((table_name, 'write', column, token))

            write_col = da.blockwise(
                putter_wrapper,
                full_dims,
                *args,
                # All dims shrink to 1,
                # a single bool is returned
                adjust_chunks={d: 1
                               for d in full_dims},
                name=name,
                align_arrays=False,
                dtype=bool)

            if inline:
                write_col = inlined_array(write_col, [row_order])

            write_vars[column] = (full_dims, write_col)

        # Append a dataset with the write operations
        datasets.append(Dataset(write_vars))

    # Return an empty dataset
    if len(datasets) == 0:
        return Dataset({})
    # Return singleton
    elif len(datasets) == 1:
        return datasets[0]

    return datasets
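The ROWID-first ordering used above can be seen in isolation. A standalone sketch with plain dicts standing in for datasets:

# Datasets with a ROWID (updates) sort before those without one (appends);
# the original index breaks ties so relative order is otherwise preserved.
mock_datasets = [
    {"DATA": None},                 # 0: no ROWID -> append, handled last
    {"ROWID": None, "DATA": None},  # 1: has ROWID -> update
    {"ROWID": None},                # 2: has ROWID -> update
]

ordering = sorted(enumerate(mock_datasets),
                  key=lambda t: ("ROWID" not in t[1], t[0]))

assert [di for di, _ in ordering] == [1, 2, 0]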
Example #13
def _write_datasets(table, table_proxy, datasets, columns, descriptor,
                    table_keywords, column_keywords):
    _, table_name, subtable = table_path_split(table)
    table_name = '::'.join((table_name, subtable)) if subtable else table_name
    writes = []
    row_orders = []

    # Put table and column keywords
    table_proxy.submit(_put_keywords, WRITELOCK, table_keywords,
                       column_keywords).result()

    # Sort datasets on (not has "ROWID", index) such that
    # datasets with ROWIDs are handled first, while
    # those without (which imply appends to the MS)
    # are handled last
    sorted_datasets = sorted(enumerate(datasets),
                             key=lambda t:
                             ("ROWID" not in t[1].data_vars, t[0]))

    # Establish row orders for each dataset
    for di, ds in sorted_datasets:
        try:
            rowid = ds.ROWID.data
        except AttributeError:
            # No ROWIDs, so assume rows are missing from the table
            # for this and the remaining datasets. Generate addrows
            # NOTE(sjperkins)
            # This could be somewhat brittle, but exists to
            # update MS subtables once they've been
            # created (empty) along with the main MS by a call to default_ms.
            # Users could also use it to append rows to an existing table.
            # An xds_append_to_table is probably the correct solution...
            last_datasets = datasets[di:]
            last_row_orders = add_row_order_factory(table_proxy, last_datasets)
            row_orders.extend(last_row_orders)
            # We have established row orders for all datasets
            # at this point, quit the loop
            break
        else:
            # Generate row orderings from existing row IDs
            row_order = rowid.map_blocks(row_run_factory,
                                         sort_dir="write",
                                         dtype=object)
            row_orders.append(row_order)

    assert len(row_orders) == len(datasets)

    for (di, ds), row_order in zip(sorted_datasets, row_orders):
        data_vars = ds.data_vars

        # Generate a dask array for each column
        for column in columns:
            try:
                variable = data_vars[column]
            except KeyError:
                log.warning("Ignoring '%s' not present "
                            "on dataset %d" % (column, di))
                continue
            else:
                full_dims = variable.dims
                array = variable.data

            args = [row_order, ("row", )]

            # We only need to pass in dimension extent arrays if
            # there is more than one chunk in any of the non-row dimensions.
            # In that case putcolslice is required for each sub-region,
            # otherwise putcol can write the full secondary dimensions
            if not all(len(c) == 1 for c in array.chunks[1:]):
                # Add extent arrays
                for d, c in zip(full_dims[1:], array.chunks[1:]):
                    args.append(dim_extents_array(d, c))
                    args.append((d, ))

            # Add other variables
            args.extend([table_proxy, None, column, None, array, full_dims])

            # Name of the dask array representing this column
            token = dask.base.tokenize(di, args)
            name = "-".join((table_name, 'write', column, token))

            write_col = da.blockwise(
                putter_wrapper,
                full_dims,
                *args,
                # All dims shrink to 1,
                # a single bool is returned
                adjust_chunks={d: 1
                               for d in full_dims},
                name=name,
                align_arrays=False,
                dtype=bool)

            writes.append(write_col.ravel())

    if len(writes) == 0:
        return da.full(1, True, dtype=bool)

    return da.concatenate(writes)