def _tensor_to_array(obj, pa_dtype):
    # Shape bookkeeping: the first axis is the batch axis; everything after
    # it describes the shape of a single tensor element.
    batch_size = obj.shape[0]
    element_shape = obj.shape[1:]
    total_elements = obj.size
    num_elements = 1 if len(obj.shape) == 1 else np.prod(element_shape)

    if isinstance(pa_dtype, ComplexType):
        # Complex values are stored as fixed-size lists of two reals
        # (real, imag) wrapped in the ComplexType extension type.
        flat_array = obj.view(obj.real.dtype).ravel()
        storage = pa.FixedSizeListArray.from_arrays(flat_array, 2)
        child_array = pa.ExtensionArray.from_storage(pa_dtype, storage)
    elif pa_dtype == pa.string():
        # Strings cannot be viewed as a flat buffer; build the child array
        # from the flattened Python values instead.
        child_array = pa.array(list(flatten(obj.tolist())))
    else:
        # Zero-copy path: wrap the numpy data buffer directly.
        child_buf = pa.py_buffer(obj)
        child_array = pa.Array.from_buffers(
            pa_dtype, total_elements, [None, child_buf]
        )

    # Build the list offsets (batch_size + 1 entries, each row holding
    # num_elements values) and assemble the ListArray storage.
    offsets = np.int32([i * num_elements for i in range(batch_size + 1)])
    offset_buf = pa.py_buffer(offsets)
    storage = pa.Array.from_buffers(
        pa.list_(pa_dtype),
        batch_size,
        [None, offset_buf],
        children=[child_array],
    )
    tensor_type = TensorType(element_shape, pa_dtype)
    return pa.ExtensionArray.from_storage(tensor_type, storage)
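
# Below is a minimal, self-contained sketch of the zero-copy ListArray
# construction that _tensor_to_array relies on. It assumes only numpy and
# pyarrow; the TensorType/ComplexType extension types and the `flatten`
# helper from the surrounding module are deliberately not used, so this
# illustrates the buffer-level technique, not the full extension-type path.
def _example_list_from_buffers():
    import numpy as np
    import pyarrow as pa

    batch = np.arange(12, dtype=np.float64).reshape(3, 4)  # 3 tensors of shape (4,)
    # Wrap the numpy data buffer without copying it.
    child = pa.Array.from_buffers(
        pa.float64(), batch.size, [None, pa.py_buffer(batch)]
    )
    # One int32 offset per row boundary: [0, 4, 8, 12].
    offsets = np.int32([i * 4 for i in range(3 + 1)])
    storage = pa.Array.from_buffers(
        pa.list_(pa.float64()),
        3,
        [None, pa.py_buffer(offsets)],
        children=[child],
    )
    assert storage.to_pylist() == [
        [0.0, 1.0, 2.0, 3.0],
        [4.0, 5.0, 6.0, 7.0],
        [8.0, 9.0, 10.0, 11.0],
    ]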
def cull(self, keys: Iterable) -> HighLevelGraph:
    """Return new HighLevelGraph with only the tasks required to calculate keys.

    In other words, remove unnecessary tasks from dask.

    Parameters
    ----------
    keys
        iterable of keys or nested list of keys such as the output of
        ``__dask_keys__()``

    Returns
    -------
    hlg: HighLevelGraph
        Culled high level graph
    """
    keys_set = set(flatten(keys))

    all_ext_keys = self.get_all_external_keys()
    ret_layers: dict = {}
    ret_key_deps: dict = {}
    for layer_name in reversed(self._toposort_layers()):
        layer = self.layers[layer_name]
        # Let's cull the layer to produce its part of `keys`.
        # Note: use .intersection rather than & because the RHS is
        # a collections.abc.Set rather than a real set, and using &
        # would take time proportional to the size of the LHS, which
        # if there is no culling can be much bigger than the RHS.
        output_keys = keys_set.intersection(layer.get_output_keys())
        if output_keys:
            culled_layer, culled_deps = layer.cull(output_keys, all_ext_keys)
            # Update `keys` with all layer's external key dependencies,
            # which are all the layer's dependencies (`culled_deps`)
            # excluding the layer's output keys.
            external_deps = set()
            for d in culled_deps.values():
                external_deps |= d
            external_deps -= culled_layer.get_output_keys()
            keys_set |= external_deps

            # Save the culled layer and its key dependencies
            ret_layers[layer_name] = culled_layer
            ret_key_deps.update(culled_deps)

    # Converting dict_keys to a real set lets Python optimise the set
    # intersection to iterate over the smaller of the two sets.
    ret_layers_keys = set(ret_layers.keys())
    ret_dependencies = {
        layer_name: self.dependencies[layer_name] & ret_layers_keys
        for layer_name in ret_layers
    }

    return HighLevelGraph(ret_layers, ret_dependencies, ret_key_deps)
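
# A minimal usage sketch, assuming a working dask installation: build a small
# collection, fetch its HighLevelGraph, and cull it down to one output's keys.
# The variable names here are illustrative, not part of the dask API.
def _example_cull():
    import dask.array as da

    x = da.ones((10, 10), chunks=(5, 5))
    y = (x + 1).sum()
    hlg = y.__dask_graph__()  # a HighLevelGraph with one layer per operation
    culled = hlg.cull(y.__dask_keys__())  # keep only tasks needed for `y`
    # Every surviving layer must be a (direct or indirect) dependency of `y`.
    assert set(culled.layers) <= set(hlg.layers)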
def _npg_combine(
    x_chunk,
    agg: Aggregation,
    axis: Sequence,
    keepdims,
    group_ndim: int,
) -> IntermediateDict:
    """Combine intermediates step of tree reduction."""
    from dask.array.core import _concatenate2
    from dask.base import flatten
    from dask.utils import deepmap

    if not isinstance(x_chunk, list):
        x_chunk = [x_chunk]

    unique_groups = np.unique(
        tuple(
            flatten(
                deepmap(
                    lambda x: np.atleast_1d(x["groups"].squeeze()).tolist(),
                    x_chunk,
                )
            )
        )
    )

    def reindex_intermediates(x):
        new_shape = x["groups"].shape[:-1] + (len(unique_groups),)
        newx = {"groups": np.broadcast_to(unique_groups, new_shape)}
        newx["intermediates"] = tuple(
            reindex_(v, from_=x["groups"].squeeze(), to=unique_groups, fill_value=f)
            for v, f in zip(x["intermediates"], agg.fill_value.values())
        )
        return newx

    def _conc2(key1, key2=None, axis=None) -> np.ndarray:
        """copied from dask.array.reductions.mean_combine"""
        if key2 is not None:
            mapped = deepmap(lambda x: x[key1][key2], x_chunk)
        else:
            mapped = deepmap(lambda x: x[key1], x_chunk)
        return _concatenate2(mapped, axes=axis)

    x_chunk = deepmap(reindex_intermediates, x_chunk)

    group_conc_axis: Iterable[int]
    if group_ndim == 1:
        group_conc_axis = (0,)
    else:
        group_conc_axis = sorted(group_ndim - ax - 1 for ax in axis)
    groups = _conc2("groups", axis=group_conc_axis)

    if agg.reduction_type == "argreduce":
        # We need to send the intermediate array values & indexes at the same time
        # intermediates are (value e.g. max, index e.g. argmax, counts)
        array_idx = tuple(
            _conc2(key1="intermediates", key2=idx, axis=axis) for idx in (0, 1)
        )
        counts = _conc2(key1="intermediates", key2=2, axis=axis)

        results = chunk_argreduce(
            array_idx,
            groups,
            func=agg.combine[:-1],  # count gets treated specially next
            axis=axis,
            expected_groups=None,
            fill_value=agg.fill_value,
        )

        # sum the counts
        results["intermediates"].append(
            chunk_reduce(
                counts,
                groups,
                func="sum",
                axis=axis,
                expected_groups=None,
                fill_value={"sum": 0},
            )["intermediates"][0]
        )

    elif agg.reduction_type == "reduce":
        # Here we reduce the intermediates individually
        results = {"groups": None, "intermediates": []}
        for idx, combine in enumerate(agg.combine):
            array = _conc2(key1="intermediates", key2=idx, axis=axis)
            if array.shape[-1] == 0:
                # all empty when combined
                results["intermediates"].append(
                    np.empty(
                        shape=(1,) * (len(axis) - 1) + (0,), dtype=array.dtype
                    )
                )
                results["groups"] = np.empty(
                    shape=(1,) * (len(group_conc_axis) - 1) + (0,),
                    dtype=groups.dtype,
                )
            else:
                _results = chunk_reduce(
                    array,
                    groups,
                    func=combine,
                    axis=axis,
                    expected_groups=None,
                    fill_value=agg.fill_value,
                )
                results["intermediates"].append(*_results["intermediates"])
                results["groups"] = _results["groups"]
    return results
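
# A toy sketch of the alignment idea behind reindex_intermediates above: each
# chunk may have seen a different subset of group labels, so its intermediates
# are expanded onto the union of labels (padded with the aggregation's fill
# value) before they can be concatenated and combined. reindex_ and
# chunk_reduce are not used here; `expand` below is a hypothetical stand-in.
def _example_align_and_combine():
    import numpy as np

    chunk_a = {"groups": np.array([1, 3]), "sum": np.array([10.0, 5.0])}
    chunk_b = {"groups": np.array([2, 3]), "sum": np.array([7.0, 2.0])}
    union = np.unique(np.concatenate([chunk_a["groups"], chunk_b["groups"]]))

    def expand(chunk, fill_value=0.0):
        # Scatter this chunk's per-group sums into the union-label layout.
        out = np.full(len(union), fill_value)
        out[np.searchsorted(union, chunk["groups"])] = chunk["sum"]
        return out

    # Once aligned, the "sum" combine is a plain elementwise addition.
    combined = expand(chunk_a) + expand(chunk_b)
    assert (union == np.array([1, 2, 3])).all()
    assert (combined == np.array([10.0, 7.0, 7.0])).all()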