Example #1
import numpy as np
import pyarrow as pa

# ComplexType, TensorType, and flatten are assumed to be defined in the
# surrounding module (an Arrow tensor-extension helper module).

def _tensor_to_array(obj, pa_dtype):
    batch_size = obj.shape[0]
    element_shape = obj.shape[1:]
    total_elements = obj.size
    num_elements = 1 if len(obj.shape) == 1 else np.prod(element_shape)

    if isinstance(pa_dtype, ComplexType):
        # Complex values are stored as fixed-size lists of two reals
        # (real, imag) wrapped in the complex extension type.
        flat_array = obj.view(obj.real.dtype).ravel()
        storage = pa.FixedSizeListArray.from_arrays(flat_array, 2)
        child_array = pa.ExtensionArray.from_storage(pa_dtype, storage)
    elif pa_dtype == pa.string():
        # Strings cannot be wrapped zero-copy; build the child array from
        # a flattened Python list instead.
        child_array = pa.array(list(flatten(obj.tolist())))
    else:
        # Zero-copy path: wrap the ndarray's buffer directly.
        child_buf = pa.py_buffer(obj)
        child_array = pa.Array.from_buffers(pa_dtype, total_elements,
                                            [None, child_buf])

    # One list entry per tensor in the batch: offsets index into the flat
    # child array, with a final entry marking the end.
    offsets = np.int32([i * num_elements for i in range(batch_size + 1)])
    offset_buf = pa.py_buffer(offsets)

    storage = pa.Array.from_buffers(pa.list_(pa_dtype),
                                    batch_size, [None, offset_buf],
                                    children=[child_array])

    # Wrap the list storage in the tensor extension type so the per-element
    # shape is preserved alongside the flat data.
    tensor_type = TensorType(element_shape, pa_dtype)
    return pa.ExtensionArray.from_storage(tensor_type, storage)
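A hedged usage sketch, reusing the imports above: the call assumes TensorType and the other helpers referenced in the function are importable from the same module, and the array contents are purely illustrative.

batch = np.arange(24, dtype=np.float64).reshape(4, 2, 3)  # 4 tensors of shape (2, 3)
arr = _tensor_to_array(batch, pa.float64())

print(arr.type)   # the TensorType extension wrapping list<double>
print(len(arr))   # 4 -- one entry per tensor in the batch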
Example #2
    def cull(self, keys: Iterable) -> HighLevelGraph:
        """Return new HighLevelGraph with only the tasks required to calculate keys.

        In other words, remove unnecessary tasks from the graph.

        Parameters
        ----------
        keys
            iterable of keys or nested list of keys such as the output of
            ``__dask_keys__()``

        Returns
        -------
        hlg: HighLevelGraph
            Culled high level graph
        """
        keys_set = set(flatten(keys))

        all_ext_keys = self.get_all_external_keys()
        ret_layers: dict = {}
        ret_key_deps: dict = {}
        for layer_name in reversed(self._toposort_layers()):
            layer = self.layers[layer_name]
            # Let's cull the layer to produce its part of `keys`.
            # Note: use .intersection rather than & because the RHS is
            # a collections.abc.Set rather than a real set, and using &
            # would take time proportional to the size of the LHS, which
            # if there is no culling can be much bigger than the RHS.
            output_keys = keys_set.intersection(layer.get_output_keys())
            if output_keys:
                culled_layer, culled_deps = layer.cull(output_keys,
                                                       all_ext_keys)
                # Update `keys` with all layer's external key dependencies, which
                # are all the layer's dependencies (`culled_deps`) excluding
                # the layer's output keys.
                external_deps = set()
                for d in culled_deps.values():
                    external_deps |= d
                external_deps -= culled_layer.get_output_keys()
                keys_set |= external_deps

                # Save the culled layer and its key dependencies
                ret_layers[layer_name] = culled_layer
                ret_key_deps.update(culled_deps)

        # Converting dict_keys to a real set lets Python optimise the set
        # intersection to iterate over the smaller of the two sets.
        ret_layers_keys = set(ret_layers.keys())
        ret_dependencies = {
            layer_name: self.dependencies[layer_name] & ret_layers_keys
            for layer_name in ret_layers
        }

        return HighLevelGraph(ret_layers, ret_dependencies, ret_key_deps)
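A small usage sketch (hedged: the array shape, chunking, and surviving-task counts are illustrative, not taken from the dask sources), culling a collection's graph down to the tasks needed for a single output chunk.

import dask.array as da

x = da.ones((10, 10), chunks=(5, 5))    # 4 chunks
z = x + 1                               # adds one layer on top of the creation layer

hlg = z.__dask_graph__()                # a HighLevelGraph with two layers

# Keep only the tasks needed for the first output chunk of z.
first_key = z.__dask_keys__()[0][0]
culled = hlg.cull([first_key])

print(len(hlg.layers), len(hlg))        # all layers, all tasks
print(len(culled.layers), len(culled))  # both layers survive, but far fewer tasks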
Example #3
from typing import Iterable, Sequence

import numpy as np

# Aggregation, IntermediateDict, reindex_, chunk_argreduce and chunk_reduce
# are assumed to be defined in the surrounding groupby-reduction module.

def _npg_combine(
    x_chunk,
    agg: Aggregation,
    axis: Sequence,
    keepdims,
    group_ndim: int,
) -> IntermediateDict:
    """ Combine intermediates step of tree reduction. """
    from dask.array.core import _concatenate2
    from dask.base import flatten
    from dask.utils import deepmap

    if not isinstance(x_chunk, list):
        x_chunk = [x_chunk]

    # Union of all group labels present across the chunk intermediates.
    unique_groups = np.unique(
        tuple(
            flatten(
                deepmap(
                    lambda x: np.atleast_1d(x["groups"].squeeze()).tolist(),
                    x_chunk))))

    def reindex_intermediates(x):
        # Reindex one chunk's intermediates onto the union of groups so the
        # chunks can be concatenated and reduced elementwise.
        new_shape = x["groups"].shape[:-1] + (len(unique_groups), )
        newx = {"groups": np.broadcast_to(unique_groups, new_shape)}
        newx["intermediates"] = tuple(
            reindex_(
                v, from_=x["groups"].squeeze(), to=unique_groups, fill_value=f)
            for v, f in zip(x["intermediates"], agg.fill_value.values()))
        return newx

    def _conc2(key1, key2=None, axis=None) -> np.ndarray:
        """Concatenate ``x[key1]`` (or ``x[key1][key2]``) across chunks.

        Copied from dask.array.reductions.mean_combine.
        """
        if key2 is not None:
            mapped = deepmap(lambda x: x[key1][key2], x_chunk)
        else:
            mapped = deepmap(lambda x: x[key1], x_chunk)
        return _concatenate2(mapped, axes=axis)

    x_chunk = deepmap(reindex_intermediates, x_chunk)

    group_conc_axis: Iterable[int]
    if group_ndim == 1:
        group_conc_axis = (0, )
    else:
        group_conc_axis = sorted(group_ndim - ax - 1 for ax in axis)
    groups = _conc2("groups", axis=group_conc_axis)

    if agg.reduction_type == "argreduce":
        # We need to send the intermediate array values & indexes at the same time
        # intermediates are (value e.g. max, index e.g. argmax, counts)
        array_idx = tuple(
            _conc2(key1="intermediates", key2=idx, axis=axis)
            for idx in (0, 1))
        counts = _conc2(key1="intermediates", key2=2, axis=axis)

        results = chunk_argreduce(
            array_idx,
            groups,
            func=agg.combine[:-1],  # count gets treated specially next
            axis=axis,
            expected_groups=None,
            fill_value=agg.fill_value,
        )

        # sum the counts
        results["intermediates"].append(
            chunk_reduce(
                counts,
                groups,
                func="sum",
                axis=axis,
                expected_groups=None,
                fill_value={"sum": 0},
            )["intermediates"][0])

    elif agg.reduction_type == "reduce":
        # Here we reduce the intermediates individually
        results = {"groups": None, "intermediates": []}
        for idx, combine in enumerate(agg.combine):
            array = _conc2(key1="intermediates", key2=idx, axis=axis)
            if array.shape[-1] == 0:
                # all empty when combined
                results["intermediates"].append(
                    np.empty(shape=(1, ) * (len(axis) - 1) + (0, ),
                             dtype=array.dtype))
                results["groups"] = np.empty(
                    shape=(1, ) * (len(group_conc_axis) - 1) + (0, ),
                    dtype=groups.dtype)
            else:
                _results = chunk_reduce(
                    array,
                    groups,
                    func=combine,
                    axis=axis,
                    expected_groups=None,
                    fill_value=agg.fill_value,
                )
                results["intermediates"].append(*_results["intermediates"])
                results["groups"] = _results["groups"]
    return results
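To make the reindex-then-combine idea concrete, here is a simplified, self-contained numpy sketch for a plain "sum" aggregation: each chunk's per-group partial sums are reindexed onto the union of all group labels, after which combining is just a reduction along the concatenation axis. The reindex_sum helper below is invented for this illustration and is not flox's reindex_.

import numpy as np

# Two per-chunk intermediates, shaped like what _npg_combine receives:
# the group labels seen in that chunk plus one partial sum per group.
chunk_a = {"groups": np.array([1, 2]), "intermediates": (np.array([10.0, 3.0]),)}
chunk_b = {"groups": np.array([2, 4]), "intermediates": (np.array([5.0, 7.0]),)}

unique_groups = np.unique(np.concatenate([chunk_a["groups"], chunk_b["groups"]]))

def reindex_sum(values, from_groups, to_groups, fill_value=0.0):
    # Place each group's partial result into the union-of-groups layout,
    # padding groups that the chunk never saw with the fill value.
    out = np.full(values.shape[:-1] + (len(to_groups),), fill_value)
    out[..., np.searchsorted(to_groups, from_groups)] = values
    return out

aligned = [
    reindex_sum(c["intermediates"][0], c["groups"], unique_groups)
    for c in (chunk_a, chunk_b)
]
# Once aligned, the combine step reduces across chunks.
combined = np.stack(aligned).sum(axis=0)

print(unique_groups)  # [1 2 4]
print(combined)       # per-group sums across both chunks: 10, 3 + 5, 7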