def test_cached_array(ms):
    """
    Check that ``cached_array`` reproduces the values of the wrapped
    array, survives a pickle round trip without creating extra caches,
    and that the module-level caches are emptied once every reference
    has been dropped.
    """
    dataset = xds_from_ms(ms, group_cols=[],
                          chunks={'row': 1, 'chan': 4})[0]
    data = dataset.DATA.data
    cached_data = cached_array(data)
    assert_array_almost_equal(cached_data, data)

    # Two key entries per row block, plus one per
    # (row x chan x corr) block of the data array
    expected_keys = data.numblocks[0] * 2 + data.npartitions
    # One array cache each for rows, row runs and the data array
    expected_caches = 3

    assert len(_key_cache) == expected_keys
    assert len(_array_cache_cache) == expected_caches

    # A pickle round trip must yield an equivalent array
    pickled_data = pickle.loads(pickle.dumps(cached_data))
    assert_array_almost_equal(pickled_data, data)

    # ... and must re-use the existing caches rather than add new ones
    assert len(_key_cache) == expected_keys
    assert len(_array_cache_cache) == expected_caches

    # Dropping every reference should release all cache entries
    del pickled_data, cached_data, data, dataset
    gc.collect()

    assert len(_key_cache) == 0
    assert len(_array_cache_cache) == 0
def _group_ordering_arrays(taql_proxy, index_cols, group,
                           group_nrows, group_row_chunks):
    """
    Build the sorted row and row run arrays for a single group.

    A single-chunk array is created whose only task calls
    ``_sorted_group_rows(taql_proxy, group, index_cols)``. It is cached,
    rechunked to ``group_row_chunks`` and then mapped through
    ``row_run_factory`` (with ``sort_dir="read"``) to produce the
    row runs, which are cached as well.

    Parameters
    ----------
    taql_proxy : object
        Proxy forwarded to ``_sorted_group_rows``.
    index_cols : object
        Index columns forwarded to ``_sorted_group_rows``.
    group : int
        Group identifier. Forwarded to ``_sorted_group_rows``
        and reported in the chunking error message.
    group_nrows : int
        Number of rows in the group. Defines the shape of the arrays.
    group_row_chunks : object
        Chunking specification for the group's rows, in any form
        accepted by ``normalize_chunks``.

    Returns
    -------
    sorted_rows : :class:`dask.array.Array`
        Sorted table rows chunked on ``group_row_chunks``.
    row_runs : :class:`dask.array.Array`.
        Array containing (row_run, resort) tuples.
        Should not be directly computed.
        Chunked on ``group_row_chunks``.

    Raises
    ------
    GroupChunkingError
        If ``group_row_chunks`` cannot be normalised
        against the shape ``(group_nrows,)``.
    """
    token = dask.base.tokenize(taql_proxy, group, group_nrows)
    name = 'group-rows-' + token
    chunks = ((group_nrows, ), )
    layers = {(name, 0): (_sorted_group_rows, taql_proxy, group, index_cols)}

    graph = HighLevelGraph.from_collections(name, layers, [])
    group_rows = da.Array(graph, name, chunks, dtype=np.int32)
    group_rows = cached_array(group_rows)

    shape = (group_nrows, )

    try:
        normalized_chunks = normalize_chunks(group_row_chunks, shape=shape)
    except ValueError as e:
        raise GroupChunkingError("%s\n"
                                 "Unable to match chunks '%s' "
                                 "with shape '%s' for group '%d'. "
                                 "This can occur if too few chunk "
                                 "dictionaries have been supplied for "
                                 "the number of groups "
                                 "and an earlier group's chunking strategy "
                                 "is applied to a later one." %
                                 (str(e), group_row_chunks,
                                  shape, group)) from e

    group_rows = group_rows.rechunk(normalized_chunks)

    # Use the builtin ``object``: the ``np.object`` alias
    # no longer exists in current NumPy releases.
    row_runs = group_rows.map_blocks(row_run_factory,
                                     sort_dir="read",
                                     dtype=object)
    row_runs = cached_array(row_runs)

    return group_rows, row_runs
def row_ordering(taql_proxy, index_cols, chunks):
    """
    Build the row and row run arrays for a table read without grouping.

    Parameters
    ----------
    taql_proxy : object
        Proxy queried for the number of rows and
        forwarded to ``_sorted_rows``.
    index_cols : object
        Index columns. Only used to derive the graph token here.
    chunks : dict
        Chunking dictionary. Only the ``'row'`` entry is used.

    Returns
    -------
    rows : :class:`dask.array.Array`
        Cached array with one ``_sorted_rows`` task per row chunk.
    row_runs : :class:`dask.array.Array`
        Cached array produced by mapping ``row_run_factory``
        (with ``sort_dir="read"``) over ``rows``.
    """
    nrows = taql_proxy.nrows().result()
    row_chunks = normalize_chunks(chunks['row'], shape=(nrows, ))
    token = dask.base.tokenize(taql_proxy, index_cols, row_chunks, nrows)
    name = 'rows-' + token

    # Starting row of each chunk: the running sum of preceding extents
    extents = row_chunks[0]
    starts = [0]

    for extent in extents[:-1]:
        starts.append(starts[-1] + extent)

    # One task per row chunk, given its starting row and extent
    layers = {
        (name, block): (_sorted_rows, taql_proxy, first_row, extent)
        for block, (first_row, extent) in enumerate(zip(starts, extents))
    }

    graph = HighLevelGraph.from_collections(name, layers, [])
    rows = cached_array(da.Array(graph, name, chunks=row_chunks,
                                 dtype=np.int64))

    row_runs = cached_array(rows.map_blocks(row_run_factory,
                                            sort_dir="read",
                                            dtype=object))

    return rows, row_runs
def test_cached_data_token(token):
    """
    Check the token held by the cache object embedded in the tasks
    of a cached array: an explicit token must be used as given,
    while ``None`` must result in some non-``None`` token.
    """
    source = da.zeros(1000, chunks=100)
    cached = cached_array(source, token)

    # Take an arbitrary task from the graph;
    # the cache object is the task's second element
    graph = dict(cached.__dask_graph__())
    _, task = graph.popitem()
    cache = task[1]

    if token is not None:
        assert cache.token == token
    else:
        assert cache.token is not None
def _write_datasets(table, table_proxy, datasets, columns, descriptor,
                    table_keywords, column_keywords):
    """
    Create datasets of dask arrays representing column writes to a table.

    Table and column keywords are written first. Row orderings are then
    established for every dataset: datasets that have a ``ROWID`` variable
    update existing rows, while those without one are treated as appends.
    Finally, one write array is created per requested column that is
    present on each dataset.

    Parameters
    ----------
    table : str
        Table path. Split with ``table_path_split`` to
        derive the name used in the write array names.
    table_proxy : object
        Proxy used to submit the keyword write. Also passed to
        ``add_row_order_factory`` and to every write operation.
    datasets : list
        Datasets to write.
    columns : iterable of str
        Names of the columns to write. Columns missing
        from a dataset are skipped with a warning.
    descriptor : object
        Not used by this function.
    table_keywords : object
        Table keywords, forwarded to ``_put_keywords``.
    column_keywords : object
        Column keywords, forwarded to ``_put_keywords``.

    Returns
    -------
    Dataset or list of Dataset
        An empty ``Dataset`` if no datasets were supplied, a single
        ``Dataset`` if exactly one was supplied, otherwise a list of
        them. Datasets with a ``ROWID`` variable come first, followed by
        those without; within each group the input order is retained.

    Raises
    ------
    TypeError
        If the data of a requested column is not a dask array.
    """
    _, table_name, subtable = table_path_split(table)
    table_name = '::'.join((table_name, subtable)) if subtable else table_name
    row_orders = []

    # Put table and column keywords
    table_proxy.submit(_put_keywords, WRITELOCK,
                       table_keywords, column_keywords).result()

    # Sort datasets on (not has "ROWID", index) such that
    # datasets with ROWID's are handled first, while
    # those without (which imply appends to the MS)
    # are handled last
    sorted_datasets = sorted(enumerate(datasets),
                             key=lambda t: ("ROWID" not in t[1].data_vars,
                                            t[0]))

    # Establish row orders for each dataset
    for pos, (di, ds) in enumerate(sorted_datasets):
        try:
            rowid = ds.ROWID.data
        except AttributeError:
            # Add operation
            # No ROWID's, assume they're missing from the table
            # and remaining datasets. Generate addrows
            # NOTE(sjperkins)
            # This could be somewhat brittle, but exists to
            # update MS empty subtables once they've been
            # created along with the main MS by a call to default_ms.
            # Users could also use it to append rows to an existing table.
            # An xds_append_to_table may be a better solution...

            # The remaining datasets are those from the current position
            # of the *sorted* sequence onwards. Slicing the original
            # list by the original index is only equivalent if the
            # input was already ordered with ROWID datasets first.
            last_datasets = [d for _, d in sorted_datasets[pos:]]
            last_row_orders = add_row_order_factory(table_proxy, last_datasets)

            # We don't inline the row ordering if it is derived
            # from the row sizes of provided arrays.
            # The range of possible dependencies are far too large to inline
            row_orders.extend((False, lro) for lro in last_row_orders)

            # We have established row orders for all datasets
            # at this point, quit the loop
            break
        else:
            # Update operation
            # Generate row orderings from existing row IDs.
            # The builtin ``object`` is used as the ``np.object``
            # alias no longer exists in current NumPy releases.
            row_order = rowid.map_blocks(row_run_factory,
                                         sort_dir="write",
                                         dtype=object)

            # TODO(sjperkins)
            # There's an assumption here that rowid is an
            # operation with minimal dependencies
            # (i.e. derived from xds_from_{ms, table})
            # Caching flattens the graph into a single layer
            if len(row_order.__dask_graph__().layers) > 1:
                log.warning("Caching an update row ordering "
                            "with more than one layer")

            row_order = cached_array(row_order)

            # Inline the row ordering in the graph
            row_orders.append((True, row_order))

    # Internal invariant: exactly one row ordering per dataset
    assert len(row_orders) == len(datasets)

    write_datasets = []

    for (di, ds), (inline, row_order) in zip(sorted_datasets, row_orders):
        # Hold the variables representing array writes
        write_vars = {}

        # Generate a dask array for each column
        for column in columns:
            try:
                variable = ds.data_vars[column]
            except KeyError:
                log.warning("Ignoring '%s' not present "
                            "on dataset %d", column, di)
                continue

            full_dims = variable.dims
            array = variable.data

            if not isinstance(array, da.Array):
                raise TypeError("%s on dataset %d is not a dask Array "
                                "but a %s" % (column, di, type(array)))

            args = [row_order, ("row", )]

            # Dimension extent arrays are only passed in if any
            # non-row dimension is split over more than one chunk.
            # NOTE(review): the original comment ties this to the
            # putcol vs putcolslice choice made by the putter;
            # confirm the exact mapping in putter_wrapper.
            if not all(len(c) == 1 for c in array.chunks[1:]):
                # Add extent arrays
                for d, c in zip(full_dims[1:], array.chunks[1:]):
                    args.append(dim_extents_array(d, c))
                    args.append((d, ))

            # Add other variables
            args.extend([table_proxy, None,
                         column, None,
                         array, full_dims])

            # Name of the dask array representing this column
            token = dask.base.tokenize(di, args)
            name = "-".join((table_name, 'write', column, token))

            write_col = da.blockwise(putter_wrapper, full_dims,
                                     *args,
                                     # All dims shrink to 1,
                                     # a single bool is returned
                                     adjust_chunks={d: 1 for d in full_dims},
                                     name=name,
                                     align_arrays=False,
                                     # builtin ``bool``: the ``np.bool``
                                     # alias no longer exists in
                                     # current NumPy releases
                                     dtype=bool)

            if inline:
                write_col = inlined_array(write_col, [row_order])

            write_vars[column] = (full_dims, write_col)

        # Append a dataset with the write operations
        write_datasets.append(Dataset(write_vars))

    # Return an empty dataset
    if not write_datasets:
        return Dataset({})

    # Return singleton
    if len(write_datasets) == 1:
        return write_datasets[0]

    return write_datasets
def cached_row_order(rowid):
    """
    Produce a cached row_order array from the given rowid array.

    There's an assumption here that rowid is an
    operation with minimal dependencies
    (i.e. derived from xds_from_{ms, table}).
    Caching flattens the graph into one or two layers
    depending on whether standard or group ordering is requested.

    Therefore, this function warns if the rowid graph looks unusual,
    mostly because it'll be included in the cached row_order array,
    so we don't want its graph to be too big or unusual.

    Parameters
    ----------
    rowid : :class:`dask.array.Array`
        rowid array

    Returns
    -------
    row_order : :class:`dask.array.Array`
        An array of row order tuples
    """
    layer_names = sorted(rowid.__dask_graph__().layers.keys())

    # daskms.ordering.row_ordering case
    # or daskms.ordering.group_row_ordering case without rechunking
    # Check for standard layer
    if len(layer_names) == 1:
        layer_name = layer_names[0]

        # row_ordering names its layer "rows-<token>", which does not
        # start with "row-", so that prefix is accepted explicitly.
        # "row-" is retained for backwards compatibility.
        if not layer_name.startswith(("row-", "rows-", "group-rows-")):
            log.warning(
                "Unusual ROWID layer %s. "
                "This is probably OK but "
                "could foreshadow incorrect "
                "behaviour.", layer_name)
    # daskms.ordering.group_row_ordering case with rechunking
    # Check for standard layers
    elif len(layer_names) == 2:
        if not (layer_names[0].startswith('group-rows-') and
                layer_names[1].startswith('rechunk-merge-')):
            log.warning(
                "Unusual ROWID layers %s for "
                "the group ordering case. "
                "This is probably OK but "
                "could foreshadow incorrect "
                "behaviour.", layer_names)
    # ROWID has been extended or modified somehow, warn
    else:
        log.warning(
            "Unusual number of ROWID layers > 2 "
            "%s. This is probably OK but "
            "could foreshadow incorrect "
            "behaviour or sub-par performance if "
            "the ROWID graph is large.", layer_names)

    # Use the builtin ``object``: the ``np.object`` alias
    # no longer exists in current NumPy releases.
    row_order = rowid.map_blocks(row_run_factory,
                                 sort_dir="write",
                                 dtype=object)

    return cached_array(row_order)