Example 1
def optimize(dsk, keys, **kwargs):
    if not isinstance(keys, (list, set)):
        keys = [keys]
    keys = list(core.flatten(keys))

    if not isinstance(dsk, HighLevelGraph):
        dsk = HighLevelGraph.from_collections(id(dsk), dsk, dependencies=())
    else:
        # Perform Blockwise optimizations for HLG input
        dsk = optimize_dataframe_getitem(dsk, keys=keys)
        dsk = optimize_blockwise(dsk, keys=keys)
        dsk = fuse_roots(dsk, keys=keys)
    dsk = dsk.cull(set(keys))

    # Do not perform low-level fusion unless the user has
    # specified True explicitly. The configuration will
    # be None by default.
    if not config.get("optimization.fuse.active"):
        return dsk

    dependencies = dsk.get_all_dependencies()
    dsk = ensure_dict(dsk)

    fuse_subgraphs = config.get("optimization.fuse.subgraphs")
    if fuse_subgraphs is None:
        fuse_subgraphs = True
    dsk, _ = fuse(
        dsk,
        keys,
        dependencies=dependencies,
        fuse_subgraphs=fuse_subgraphs,
    )
    dsk, _ = cull(dsk, keys)
    return dsk
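This optimizer only applies low-level fusion when the optimization.fuse.active key is explicitly truthy (it is None by default). A minimal sketch of flipping that key through dask.config.set, which is how the setting is normally controlled:

import dask

# Low-level fusion in the optimizer above runs only when this key is truthy;
# the shipped default is None, so by default only the HLG passes run.
with dask.config.set({"optimization.fuse.active": True}):
    print(dask.config.get("optimization.fuse.active"))  # True

print(dask.config.get("optimization.fuse.active", None))  # default: None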
Example 2
def test_env_var_normalization(monkeypatch):
    value = 3
    monkeypatch.setenv('DASK_A_B', str(value))
    d = {}
    dask.config.refresh(config=d)
    assert get('a_b', config=d) == value
    assert get('a-b', config=d) == value
Example 3
def test_env_var_canonical_name(monkeypatch):
    value = 3
    monkeypatch.setenv("DASK_A_B", str(value))
    d = {}
    dask.config.refresh(config=d)
    assert get("a_b", config=d) == value
    assert get("a-b", config=d) == value
Example 4
    def __init__(
        self,
        annotations: Mapping[str, Any] = None,
        collection_annotations: Mapping[str, Any] = None,
    ):
        """Initialize Layer object.

        Parameters
        ----------
        annotations : Mapping[str, Any], optional
            By default, None.
            Annotations are metadata or soft constraints associated with tasks
            that dask schedulers may choose to respect:
            They signal intent without enforcing hard constraints.
            As such, they are primarily designed for use with the distributed
            scheduler. See the dask.annotate function for more information.
        collection_annotations : Mapping[str, Any], optional. By default, None.
            Experimental, intended to assist with visualizing the performance
            characteristics of Dask computations.
            These annotations are *not* passed to the distributed scheduler.
        """
        self.annotations = annotations or copy.copy(
            config.get("annotations", None))
        self.collection_annotations = collection_annotations or copy.copy(
            config.get("collection_annotations", None))
Example 5
def test_get():
    d = {"x": 1, "y": {"a": 2}}

    assert get("x", config=d) == 1
    assert get("y.a", config=d) == 2
    assert get("y.b", 123, config=d) == 123
    with pytest.raises(KeyError):
        get("y.b", config=d)
Example 6
def test_get():
    d = {'x': 1, 'y': {'a': 2}}

    assert get('x', config=d) == 1
    assert get('y.a', config=d) == 2
    assert get('y.b', 123, config=d) == 123
    with pytest.raises(KeyError):
        get('y.b', config=d)
Example 7
def test_custom_yaml(tmpdir):
    custom_config = {}
    custom_config["sql"] = dask_config.get("sql")
    custom_config["sql"]["groupby"]["split_out"] = 16
    custom_config["sql"]["foo"] = {"bar": [1, 2, 3], "baz": None}

    with open(os.path.join(tmpdir, "custom-sql.yaml"), mode="w") as f:
        yaml.dump(custom_config, f)

    dask_config.refresh(
        paths=[tmpdir])  # Refresh config to pick up the custom YAML file
    assert custom_config["sql"] == dask_config.get("sql")
    dask_config.refresh()
Example 8
def rearrange_by_column(
    df,
    col,
    npartitions=None,
    max_branch=None,
    shuffle=None,
    compute=None,
    ignore_index=False,
):
    shuffle = shuffle or config.get("shuffle", None) or "disk"

    # if the requested output partitions < input partitions
    # we repartition first as shuffling overhead is
    # proportionate to the number of input partitions

    if npartitions is not None and npartitions < df.npartitions:
        df = df.repartition(npartitions=npartitions)

    if shuffle == "disk":
        return rearrange_by_column_disk(df, col, npartitions, compute=compute)
    elif shuffle == "tasks":
        df2 = rearrange_by_column_tasks(df,
                                        col,
                                        max_branch,
                                        npartitions,
                                        ignore_index=ignore_index)
        if ignore_index:
            df2._meta = df2._meta.reset_index(drop=True)
        return df2
    elif shuffle == "p2p":
        from distributed.shuffle import rearrange_by_column_p2p

        return rearrange_by_column_p2p(df, col, npartitions)
    else:
        raise NotImplementedError("Unknown shuffle method %s" % shuffle)
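When no shuffle= argument is given, the code above resolves the method as config value, then the "disk" fallback. A minimal sketch of steering that choice globally via the same shuffle key:

import dask

# Makes task-based shuffling the default for code that consults this key.
with dask.config.set({"shuffle": "tasks"}):
    print(dask.config.get("shuffle"))  # "tasks"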
Example 9
def collections_to_dsk(collections, optimize_graph=True, optimizations=(), **kwargs):
    """
    Convert many collections into a single dask graph, after optimization
    """
    from dask.highlevelgraph import HighLevelGraph

    optimizations = tuple(optimizations) + tuple(config.get("optimizations", ()))

    if optimize_graph:
        groups = groupby(optimization_function, collections)

        graphs = []
        for opt, val in groups.items():
            dsk, keys = _extract_graph_and_keys(val)
            dsk = opt(dsk, keys, **kwargs)

            for opt_inner in optimizations:
                dsk = opt_inner(dsk, keys, **kwargs)

            graphs.append(dsk)

        # Merge all graphs
        if any(isinstance(graph, HighLevelGraph) for graph in graphs):
            dsk = HighLevelGraph.merge(*graphs)
        else:
            dsk = merge(*map(ensure_dict, graphs))
    else:
        dsk, _ = _extract_graph_and_keys(collections)

    return dsk
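collections_to_dsk appends callables found under the optimizations config key to those passed explicitly, so extra graph passes can be registered through configuration alone. A hedged sketch with a no-op pass (my_pass is a made-up name):

import dask

def my_pass(dsk, keys, **kwargs):
    # A registered pass receives the graph and the keys being computed and
    # must return a (possibly rewritten) graph; this one changes nothing.
    return dsk

# Passes listed here run after each collection's own optimization function.
with dask.config.set(optimizations=[my_pass]):
    print(dask.config.get("optimizations"))  # [<function my_pass ...>]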
Example 10
def test_default_config():
    config_fn = os.path.join(os.path.dirname(__file__), "../../dask_sql",
                             "sql.yaml")
    with open(config_fn) as f:
        default_config = yaml.safe_load(f)
    assert "sql" in default_config
    assert default_config["sql"] == dask_config.get("sql")
Example 11
def aggregate_chunks(existing_chunks: Iterable[int],
                     item_size: int,
                     subdivision: int = 1):
    target_size_bytes = int(
        Quantity(config.get("array.chunk-size")).m_as("bytes"))

    # The optimal number of data per Dask chunk.
    target_size = target_size_bytes // item_size

    # Try to aggregate the input data into the fewest possible Dask chunks.
    new_chunks = []
    for chunk in existing_chunks:
        # If this input data set will fit into the current chunk, add it.
        if new_chunks and new_chunks[-1] + chunk <= target_size:
            new_chunks[-1] += chunk
        # If the current chunk is full (or the chunks list is empty), add this
        # data set to the next chunk.
        elif chunk <= target_size:
            new_chunks.append(chunk)
        # If this data set is larger than the max Dask chunk size, split it
        # along the HDF5 data set chunk boundaries and put the pieces in
        # separate Dask chunks.
        else:
            # Round the Dask chunk size down to a whole number of subdivisions,
            # then split so that the pieces sum back to the original size.
            dask_chunk_size = target_size // subdivision * subdivision
            n_whole_chunks, remainder = divmod(chunk, dask_chunk_size)
            new_chunks += [dask_chunk_size] * n_whole_chunks
            if remainder:
                new_chunks.append(remainder)

    return new_chunks
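The array.chunk-size key queried above is a human-readable size string; dask itself converts it with dask.utils.parse_bytes, which avoids the pint dependency used in this example. A small sketch of the same computation:

import dask
from dask.utils import parse_bytes

# "array.chunk-size" defaults to a string such as "128MiB"; parse_bytes turns
# it into a byte count, and dividing by the item size gives elements per chunk.
target_size_bytes = parse_bytes(dask.config.get("array.chunk-size"))
item_size = 8  # e.g. float64
print(target_size_bytes // item_size)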
Example 12
def get_features_kwarg(
    name: str,
    scheduler: Optional[str] = None,
    queue_type: Optional[str] = None,
    default: Optional[Any] = None,
) -> Optional[Any]:
    """Searches in the jobqueue_features config for a value for kw_name.

    Args:
        name: The key to search for in config.
        scheduler: The name of scheduler's config for which search is taken.
        queue_type: The queue type to search for in config.
        default: A default value to give if nothing in config files.

    Returns:
        Found value or the default value.
    """
    value = None
    # search for kw_name from bottom up queue_type -> scheduler -> jobqueue_features

    # Error checking
    if not isinstance(name, str):
        raise ValueError('"name" must be a string')
    if scheduler is None and queue_type is not None:
        raise ValueError(
            "Cannot search in queue_type without providing a scheduler")

    # Now do the config search
    # use default=None in calls since we set defaults ourselves
    if scheduler is not None and queue_type is not None:
        value = config.get(
            "jobqueue-features.{}.queue-type.{}.{}".format(
                scheduler, queue_type, name),
            default=None,
        )
    if value is None and scheduler is not None:
        value = config.get("jobqueue-features.{}.{}".format(scheduler, name),
                           default=None)
    if value is None:
        value = config.get("jobqueue-features.{}".format(name), default=None)
    if value is None and default is not None:
        value = default
    return value
Example 13
def test_getitem_avoids_large_chunks():
    a = np.arange(4 * 500 * 500).reshape(4, 500, 500)
    arr = da.from_array(a, chunks=(1, 500, 500))
    indexer = [0, 1] + [2] * 100 + [3]
    result = arr[indexer]
    chunk_size = utils.parse_bytes(config.get("array.chunk-size"))
    assert all(x.nbytes < chunk_size for x in result.blocks)
    expected = a[indexer]

    assert_eq(result, expected)
Example 14
def optimize(
    dsk,
    keys,
    fuse_keys=None,
    fast_functions=None,
    inline_functions_fast_functions=(getter_inline,),
    rename_fused_keys=True,
    **kwargs,
):
    """Optimize dask for array computation

    1.  Cull tasks not necessary to evaluate keys
    2.  Remove full slicing, e.g. x[:]
    3.  Inline fast functions like getitem and np.transpose
    """
    if not isinstance(keys, (list, set)):
        keys = [keys]
    keys = list(flatten(keys))

    if not isinstance(dsk, HighLevelGraph):
        dsk = HighLevelGraph.from_collections(id(dsk), dsk, dependencies=())

    dsk = optimize_blockwise(dsk, keys=keys)
    dsk = fuse_roots(dsk, keys=keys)
    dsk = dsk.cull(set(keys))

    # Perform low-level fusion unless the user has
    # specified False explicitly.
    if config.get("optimization.fuse.active") is False:
        return dsk

    dependencies = dsk.get_all_dependencies()
    dsk = ensure_dict(dsk)

    # Low level task optimizations
    if fast_functions is not None:
        inline_functions_fast_functions = fast_functions

    hold = hold_keys(dsk, dependencies)

    dsk, dependencies = fuse(
        dsk,
        hold + keys + (fuse_keys or []),
        dependencies,
        rename_keys=rename_fused_keys,
    )
    if inline_functions_fast_functions:
        dsk = inline_functions(
            dsk,
            keys,
            dependencies=dependencies,
            fast_functions=inline_functions_fast_functions,
        )

    return optimize_slices(dsk)
Example 15
def safe_file_url(url, start=None):
    """Formats an URL so that it meets the following safety conditions:

    - the URL starts with file:// (else: raises NotImplementedError)
    - the path is absolute (relative paths are taken relative to
      geomodeling.root)
    - if geomodeling.strict_paths: the path has to be contained inside
      `start` (else: raises IOError)

    For backwards compatibility, geomodeling.root can be overriden using the
    'start' argument.
    """
    try:
        protocol, path = url.split("://")
    except ValueError:
        protocol = "file"
        path = url
    else:
        if protocol != "file":
            raise NotImplementedError(
                'Unknown protocol: "{}"'.format(protocol))
    if start is not None:
        warnings.warn(
            "Using the start argument in safe_file_url is deprecated. Use the "
            "'geomodeling.root' in the dask config",
            DeprecationWarning,
        )
    else:
        start = config.get("geomodeling.root")

    if not os.path.isabs(path):
        if start is None:
            raise IOError(
                "Relative path '{}' provided but start was not given.".format(
                    path))
        abspath = os.path.abspath(os.path.join(start, path))
    else:
        abspath = os.path.abspath(path)
    strict = config.get("geomodeling.strict-file-paths")
    if strict and not abspath.startswith(start):
        raise IOError("'{}' is not contained in '{}'".format(path, start))
    return "://".join([protocol, abspath])
Example 16
def _set_metadata_task_size(metadata_task_size, fs):
    # Set metadata_task_size using the config file
    # if the kwarg value was not specified
    if metadata_task_size is None:
        # Fall back to the config file; if the key is not set there either,
        # use 0.
        config_str = "dataframe.parquet.metadata-task-size-" + (
            "local" if _is_local_fs(fs) else "remote")
        return config.get(config_str, 0)

    return metadata_task_size
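The key consulted above depends on the filesystem type (-local or -remote). A minimal sketch of overriding the local variant; the key name comes from the snippet, while 512 is just an illustrative value:

import dask

# Applies when no metadata_task_size kwarg is passed and the data is on a
# local filesystem; 0 would fall through to the hard-coded default above.
with dask.config.set({"dataframe.parquet.metadata-task-size-local": 512}):
    print(dask.config.get("dataframe.parquet.metadata-task-size-local"))  # 512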
Example 17
def normalize_array(x):
    if not x.shape:
        return (x.item(), x.dtype)
    if hasattr(x, "mode") and getattr(x, "filename", None):
        if hasattr(x.base, "ctypes"):
            offset = (
                x.ctypes._as_parameter_.value - x.base.ctypes._as_parameter_.value
            )
        else:
            offset = 0  # root memmaps have the mmap object as base
        if hasattr(
            x, "offset"
        ):  # offset numpy used while opening, and not the offset to the beginning of the file
            offset += getattr(x, "offset")
        return (
            x.filename,
            os.path.getmtime(x.filename),
            x.dtype,
            x.shape,
            x.strides,
            offset,
        )
    if x.dtype.hasobject:
        try:
            try:
                # string fast-path
                data = hash_buffer_hex(
                    "-".join(x.flat).encode(
                        encoding="utf-8", errors="surrogatepass"
                    )
                )
            except UnicodeDecodeError:
                # bytes fast-path
                data = hash_buffer_hex(b"-".join(x.flat))
        except (TypeError, UnicodeDecodeError):
            try:
                data = hash_buffer_hex(pickle.dumps(x, pickle.HIGHEST_PROTOCOL))
            except Exception:
                # pickling not supported, use UUID4-based fallback
                if not config.get("tokenize.ensure-deterministic"):
                    data = uuid.uuid4().hex
                else:
                    raise RuntimeError(
                        f"``np.ndarray`` with object ``dtype`` {str(x)} cannot "
                        "be deterministically hashed. Please, see "
                        "https://docs.dask.org/en/latest/custom-collections.html#implementing-deterministic-hashing "  # noqa: E501
                        "for more information"
                    )
    else:
        try:
            data = hash_buffer_hex(x.ravel(order="K").view("i1"))
        except (BufferError, AttributeError, ValueError):
            data = hash_buffer_hex(x.copy().ravel(order="K").view("i1"))
    return (data, x.dtype, x.shape, x.strides)
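When hashing falls back to a random UUID, the tokenize.ensure-deterministic key turns the silent fallback into an error. A minimal sketch using a plain object, assuming the normalize_object fallback shown in Example 26 below:

import dask
from dask.base import tokenize

class Opaque:
    # No __dask_tokenize__, not callable, not a dataclass: tokenizing an
    # instance normally falls back to a fresh uuid4 hex string.
    pass

print(tokenize(Opaque()) == tokenize(Opaque()))  # False: random fallback

with dask.config.set({"tokenize.ensure-deterministic": True}):
    try:
        tokenize(Opaque())
    except RuntimeError as exc:
        print("refused:", exc)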
Example 18
def get_context():
    """Return the current multiprocessing context."""
    # fork context does fork()-without-exec(), which can lead to deadlocks,
    # so default to "spawn".
    context_name = config.get("multiprocessing.context", "spawn")
    if sys.platform == "win32":
        if context_name != "spawn":
            # Only spawn is supported on Win32, can't change it:
            warn(_CONTEXT_UNSUPPORTED, UserWarning)
        return multiprocessing
    else:
        return multiprocessing.get_context(context_name)
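A minimal sketch of what the non-Windows branch above amounts to: read the multiprocessing.context key and hand it to the standard library (here with "forkserver", which is only available on POSIX):

import multiprocessing
import dask

# "spawn" (the dask default), "fork" and "forkserver" are the stdlib start methods.
with dask.config.set({"multiprocessing.context": "forkserver"}):
    name = dask.config.get("multiprocessing.context")
    print(multiprocessing.get_context(name))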
Example 19
def get_cluster(scheduler: Optional[str] = None, **kwargs) -> "ClusterType":
    if scheduler is None:
        scheduler = config.get("jobqueue-features.scheduler", default=None)
    if scheduler is None:
        raise ValueError("You must configure a scheduler either via a kwarg"
                         " or in your configuration file")
    if scheduler == SLURM:
        return CustomSLURMCluster(**kwargs)
    else:
        raise NotImplementedError(
            "Scheduler {} is not in list of supported schedulers: {}".format(
                scheduler, SUPPORTED_SCHEDULERS))
Example 20
    def _update_kwargs_job_extra(self, **kwargs) -> Dict[str, Any]:
        job_extra = kwargs.get("job_extra", self.get_kwarg("job-extra"))
        if job_extra is None:
            job_extra = config.get(
                "jobqueue.{}.job_extra".format(self.scheduler_name), default=[]
            )

        # Order matters: user-provided settings come last so they take
        # precedence over the GPU defaults.
        final_job_extra = self.gpu_job_extra
        final_job_extra.extend(job_extra)
        kwargs.update({"job_extra": final_job_extra})
        return kwargs
Example 21
def _normalize_seq_func(seq):
    # Defined outside normalize_seq to avoid unnecessary redefinitions and
    # thereby improve computation times.
    try:
        return list(map(normalize_token, seq))
    except RecursionError:
        if not config.get("tokenize.ensure-deterministic"):
            return uuid.uuid4().hex

        raise RuntimeError(
            f"Sequence {str(seq)} cannot be deterministically hashed. Please, see "
            "https://docs.dask.org/en/latest/custom-collections.html#implementing-deterministic-hashing "
            "for more information"
        )
Example 22
    def _update_kwargs_env_extra(self, **kwargs) -> Dict[str, Any]:
        if self.openmp_env_extra is None:
            return kwargs
        env_extra = kwargs.get("env_extra", self.get_kwarg("env-extra"))
        if not env_extra:
            env_extra = config.get(
                "jobqueue.{}.env_extra".format(self.scheduler_name), default=[]
            )
        # Order matters: user-provided settings come last so they take
        # precedence over the OpenMP defaults.
        final_env_extra = self.openmp_env_extra
        final_env_extra.extend(env_extra)
        kwargs.update({"env_extra": final_env_extra})
        return kwargs
Example 23
def get_features_kwarg(name, scheduler=None, queue_type=None, default=None):
    """
    Search in the jobqueue_features config for a value for kw_name
    :param scheduler: scheduler name to search for in configuration
    :param name: string to search for in configuration
    :param queue_type: queue type to search for in config
    :param default: default value to give if nothing in config files
    :return: value or None
    """
    value = None
    # search for kw_name from bottom up queue_type -> scheduler -> jobqueue_features

    # Error checking
    if not isinstance(name, str):
        raise ValueError('"name" must be a string')
    if scheduler is None and queue_type is not None:
        raise ValueError(
            "Cannot search in queue_type without providing a scheduler")

    # Now do the config search
    # use default=None in calls since we set defaults ourselves
    if scheduler is not None and queue_type is not None:
        value = config.get(
            "jobqueue-features.{}.queue-type.{}.{}".format(
                scheduler, queue_type, name),
            default=None,
        )
    if value is None and scheduler is not None:
        value = config.get("jobqueue-features.{}.{}".format(scheduler, name),
                           default=None)
    if value is None:
        value = config.get("jobqueue-features.{}".format(name), default=None)
    if value is None and default is not None:
        value = default
    return value
Example 24
    def get_sources_and_requests(self, **request):
        # first handle the 'time' and 'meta' requests
        mode = request["mode"]
        if mode == "time":
            return [(self.period[-1], None), ({"mode": "time"}, None)]
        elif mode == "meta":
            return [(None, None), ({"mode": "meta"}, None)]
        elif mode != "vals":
            raise ValueError("Unknown mode '{}'".format(mode))

        # build the request to be sent to the geometry source
        x1, y1, x2, y2 = request["bbox"]
        width, height = request["width"], request["height"]

        # be strict about the bbox, otherwise it may lead to segfaults
        if x2 == x1 and y2 == y1:  # point
            min_size = None
        elif x1 < x2 and y1 < y2:
            min_size = min((x2 - x1) / width, (y2 - y1) / height)
        else:
            raise ValueError("Invalid bbox ({})".format(request["bbox"]))

        limit = self.limit
        if self.limit is None:
            limit = config.get("geomodeling.geometry-limit")

        geom_request = {
            "mode": "intersects",
            "geometry": box(*request["bbox"]),
            "projection": request["projection"],
            "min_size": min_size,
            "limit": limit,
            "start": request.get("start"),
            "stop": request.get("stop"),
        }
        # keep some variables for use in process()
        process_kwargs = {
            "mode": "vals",
            "column_name": self.column_name,
            "dtype": self.dtype,
            "no_data_value": self.fillvalue,
            "width": width,
            "height": height,
            "bbox": request["bbox"],
        }
        return [(self.source, geom_request), (process_kwargs, None)]
Example 25
def tokenize(*args, pure=None, **kwargs):
    """Mapping function from task -> consistent name.

    Parameters
    ----------
    args : object
        Python objects that summarize the task.
    pure : boolean, optional
        If True, a consistent hash function is tried on the input. If this
        fails, then a unique identifier is used. If False (default), then a
        unique identifier is always used.
    """
    if pure is None:
        pure = config.get("delayed_pure", False)

    if pure:
        return _tokenize(*args, **kwargs)
    else:
        return str(uuid.uuid4())
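Because this tokenize consults the delayed_pure key when pure is not given, the setting controls whether identical delayed calls share a key. A minimal sketch:

import dask

@dask.delayed
def inc(x):
    return x + 1

# With delayed_pure=True the hash is deterministic, so equal calls share a key;
# with the default (False) every call gets a fresh uuid-based key.
with dask.config.set(delayed_pure=True):
    a, b = inc(1), inc(1)
    print(a.key == b.key)  # True

a, b = inc(1), inc(1)
print(a.key == b.key)  # False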
Example 26
def normalize_object(o):
    method = getattr(o, "__dask_tokenize__", None)
    if method is not None:
        return method()

    if callable(o):
        return normalize_function(o)

    if dataclasses.is_dataclass(o):
        return normalize_dataclass(o)

    if not config.get("tokenize.ensure-deterministic"):
        return uuid.uuid4().hex

    raise RuntimeError(
        f"Object {str(o)} cannot be deterministically hashed. Please, see "
        "https://docs.dask.org/en/latest/custom-collections.html#implementing-deterministic-hashing "
        "for more information"
    )
Example 27
def filter_or_scalar(df: dd.DataFrame, filter_condition: Union[np.bool_,
                                                               dd.Series]):
    """
    Some (complex) SQL queries can lead to a strange condition which is always true or false.
    We do not need to filter in this case.
    See https://github.com/dask-contrib/dask-sql/issues/87.
    """
    if np.isscalar(filter_condition):
        if not filter_condition:  # pragma: no cover
            # empty dataset
            logger.warning(
                "Join condition is always false - returning empty dataset")
            return df.head(0, compute=False)
        else:
            return df

    # In SQL, a NULL in a boolean is False on filtering
    filter_condition = filter_condition.fillna(False)
    out = df[filter_condition]
    if dask_config.get("sql.predicate_pushdown"):
        return attempt_predicate_pushdown(out)
    else:
        return out
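filter_or_scalar only attempts predicate pushdown when dask-sql's sql.predicate_pushdown flag is enabled. A minimal sketch of toggling it (using plain dask.config, which is what dask_config refers to in these snippets; dask-sql registers the sql.* defaults on import):

from dask import config as dask_config

# Disables the attempt_predicate_pushdown branch in the function above.
with dask_config.set({"sql.predicate_pushdown": False}):
    print(dask_config.get("sql.predicate_pushdown"))  # False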
Example 28
def get_async(
    submit,
    num_workers,
    dsk,
    result,
    cache=None,
    get_id=default_get_id,
    rerun_exceptions_locally=None,
    pack_exception=default_pack_exception,
    raise_exception=reraise,
    callbacks=None,
    dumps=identity,
    loads=identity,
    chunksize=None,
    **kwargs,
):
    """Asynchronous get function

    This is a general version of various asynchronous schedulers for dask.  It
    takes a ``concurrent.futures.Executor.submit`` function to form a more
    specific ``get`` method that walks through the dask array with parallel
    workers, avoiding repeat computation and minimizing memory use.

    Parameters
    ----------
    submit : function
        A ``concurrent.futures.Executor.submit`` function
    num_workers : int
        The number of workers that task submissions can be spread over
    dsk : dict
        A dask dictionary specifying a workflow
    result : key or list of keys
        Keys corresponding to desired data
    cache : dict-like, optional
        Temporary storage of results
    get_id : callable, optional
        Function to return the worker id, takes no arguments. Examples are
        `threading.current_thread` and `multiprocessing.current_process`.
    rerun_exceptions_locally : bool, optional
        Whether to rerun failing tasks in local process to enable debugging
        (False by default)
    pack_exception : callable, optional
        Function to take an exception and ``dumps`` method, and return a
        serialized tuple of ``(exception, traceback)`` to send back to the
        scheduler. Default is to just raise the exception.
    raise_exception : callable, optional
        Function that takes an exception and a traceback, and raises an error.
    callbacks : tuple or list of tuples, optional
        Callbacks are passed in as tuples of length 5. Multiple sets of
        callbacks may be passed in as a list of tuples. For more information,
        see the dask.diagnostics documentation.
    dumps: callable, optional
        Function to serialize task data and results to communicate between
        worker and parent.  Defaults to identity.
    loads: callable, optional
        Inverse function of `dumps`.  Defaults to identity.
    chunksize: int, optional
        Size of chunks to use when dispatching work. Defaults to 1.
        If -1, will be computed to evenly divide ready work across workers.

    See Also
    --------
    threaded.get
    """
    chunksize = chunksize or config.get("chunksize", 1)

    queue = Queue()

    if isinstance(result, list):
        result_flat = set(flatten(result))
    else:
        result_flat = {result}
    results = set(result_flat)

    dsk = dict(dsk)
    with local_callbacks(callbacks) as callbacks:
        _, _, pretask_cbs, posttask_cbs, _ = unpack_callbacks(callbacks)
        started_cbs = []
        succeeded = False
        # if start_state_from_dask fails, we will have something
        # to pass to the final block.
        state = {}
        try:
            for cb in callbacks:
                if cb[0]:
                    cb[0](dsk)
                started_cbs.append(cb)

            keyorder = order(dsk)

            state = start_state_from_dask(dsk,
                                          cache=cache,
                                          sortkey=keyorder.get)

            for _, start_state, _, _, _ in callbacks:
                if start_state:
                    start_state(dsk, state)

            if rerun_exceptions_locally is None:
                rerun_exceptions_locally = config.get(
                    "rerun_exceptions_locally", False)

            if state["waiting"] and not state["ready"]:
                raise ValueError("Found no accessible jobs in dask")

            def fire_tasks(chunksize):
                """Fire off a task to the thread pool"""
                # Determine chunksize and/or number of tasks to submit
                nready = len(state["ready"])
                if chunksize == -1:
                    ntasks = nready
                    chunksize = -(ntasks // -num_workers)
                else:
                    used_workers = -(len(state["running"]) // -chunksize)
                    avail_workers = max(num_workers - used_workers, 0)
                    ntasks = min(nready, chunksize * avail_workers)

                # Prep all ready tasks for submission
                args = []
                for _ in range(ntasks):
                    # Get the next task to compute (most recently added)
                    key = state["ready"].pop()
                    # Notify task is running
                    state["running"].add(key)
                    for f in pretask_cbs:
                        f(key, dsk, state)

                    # Prep args to send
                    data = {
                        dep: state["cache"][dep]
                        for dep in get_dependencies(dsk, key)
                    }
                    args.append((
                        key,
                        dumps((dsk[key], data)),
                        dumps,
                        loads,
                        get_id,
                        pack_exception,
                    ))

                # Batch submit
                for i in range(-(len(args) // -chunksize)):
                    each_args = args[i * chunksize:(i + 1) * chunksize]
                    if not each_args:
                        break
                    fut = submit(batch_execute_tasks, each_args)
                    fut.add_done_callback(queue.put)

            # Main loop, wait on tasks to finish, insert new ones
            while state["waiting"] or state["ready"] or state["running"]:
                fire_tasks(chunksize)
                for key, res_info, failed in queue_get(queue).result():
                    if failed:
                        exc, tb = loads(res_info)
                        if rerun_exceptions_locally:
                            data = {
                                dep: state["cache"][dep]
                                for dep in get_dependencies(dsk, key)
                            }
                            task = dsk[key]
                            _execute_task(task, data)  # Re-execute locally
                        else:
                            raise_exception(exc, tb)
                    res, worker_id = loads(res_info)
                    state["cache"][key] = res
                    finish_task(dsk, key, state, results, keyorder.get)
                    for f in posttask_cbs:
                        f(key, res, dsk, state, worker_id)

            succeeded = True

        finally:
            for _, _, _, _, finish in started_cbs:
                if finish:
                    finish(dsk, state, not succeeded)

    return nested_get(result, state["cache"])
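get_async reads the chunksize key when no argument is supplied (with -1 meaning "split ready tasks evenly across workers", as in the code above). A minimal sketch of setting it for the threaded scheduler, which is built on this function:

import dask

# chunksize controls how many ready tasks are sent to a worker per submission.
with dask.config.set({"chunksize": 4}):
    total = dask.delayed(sum)([1, 2, 3])
    print(total.compute(scheduler="threads"))  # 6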
Example 29
def start_state_from_dask(dsk, cache=None, sortkey=None):
    """Start state from a dask

    Examples
    --------
    >>> inc = lambda x: x + 1
    >>> add = lambda x, y: x + y
    >>> dsk = {'x': 1, 'y': 2, 'z': (inc, 'x'), 'w': (add, 'z', 'y')}  # doctest: +SKIP
    >>> from pprint import pprint  # doctest: +SKIP
    >>> pprint(start_state_from_dask(dsk))  # doctest: +SKIP
    {'cache': {'x': 1, 'y': 2},
     'dependencies': {'w': {'z', 'y'}, 'x': set(), 'y': set(), 'z': {'x'}},
     'dependents': defaultdict(None, {'w': set(), 'x': {'z'}, 'y': {'w'}, 'z': {'w'}}),
     'finished': set(),
     'ready': ['z'],
     'released': set(),
     'running': set(),
     'waiting': {'w': {'z'}},
     'waiting_data': {'x': {'z'}, 'y': {'w'}, 'z': {'w'}}}
    """
    if sortkey is None:
        sortkey = order(dsk).get
    if cache is None:
        cache = config.get("cache", None)
    if cache is None:
        cache = dict()
    data_keys = set()
    for k, v in dsk.items():
        if not has_tasks(dsk, v):
            cache[k] = v
            data_keys.add(k)

    dsk2 = dsk.copy()
    dsk2.update(cache)

    dependencies = {k: get_dependencies(dsk2, k) for k in dsk}
    waiting = {
        k: v.copy()
        for k, v in dependencies.items() if k not in data_keys
    }

    dependents = reverse_dict(dependencies)
    for a in cache:
        for b in dependents.get(a, ()):
            waiting[b].remove(a)
    waiting_data = {k: v.copy() for k, v in dependents.items() if v}

    ready_set = {k for k, v in waiting.items() if not v}
    ready = sorted(ready_set, key=sortkey, reverse=True)
    waiting = {k: v for k, v in waiting.items() if v}

    state = {
        "dependencies": dependencies,
        "dependents": dependents,
        "waiting": waiting,
        "waiting_data": waiting_data,
        "cache": cache,
        "ready": ready,
        "running": set(),
        "finished": set(),
        "released": set(),
    }

    return state
Example 30
def fuse(
    dsk,
    keys=None,
    dependencies=None,
    ave_width=_default,
    max_width=_default,
    max_height=_default,
    max_depth_new_edges=_default,
    rename_keys=_default,
    fuse_subgraphs=_default,
):
    """Fuse tasks that form reductions; more advanced than ``fuse_linear``

    This trades parallelism opportunities for faster scheduling by making tasks
    less granular.  It can replace ``fuse_linear`` in optimization passes.

    This optimization applies to all reductions--tasks that have at most one
    dependent--so it may be viewed as fusing "multiple input, single output"
    groups of tasks into a single task.  There are many parameters to fine
    tune the behavior, which are described below.  ``ave_width`` is the
    natural parameter with which to compare parallelism to granularity, so
    it should always be specified.  Reasonable values for other parameters
    will be determined using ``ave_width`` if necessary.

    Parameters
    ----------
    dsk: dict
        dask graph
    keys: list or set, optional
        Keys that must remain in the returned dask graph
    dependencies: dict, optional
        {key: [list-of-keys]}.  Must be a list to provide count of each key
        This optional input often comes from ``cull``
    ave_width: float (default 1)
        Upper limit for ``width = num_nodes / height``, a good measure of
        parallelizability.
        dask.config key: ``optimization.fuse.ave-width``
    max_width: int (default infinite)
        Don't fuse if total width is greater than this.
        dask.config key: ``optimization.fuse.max-width``
    max_height: int or None (default None)
        Don't fuse more than this many levels. Set to None to dynamically
        adjust to ``1.5 + ave_width * log(ave_width + 1)``.
        dask.config key: ``optimization.fuse.max-height``
    max_depth_new_edges: int or None (default None)
        Don't fuse if new dependencies are added after this many levels.
        Set to None to dynamically adjust to ave_width * 1.5.
        dask.config key: ``optimization.fuse.max-depth-new-edges``
    rename_keys: bool or func, optional (default True)
        Whether to rename the fused keys with ``default_fused_keys_renamer``
        or not.  Renaming fused keys can keep the graph more understandable
        and comprehensive, but it comes at the cost of additional processing.
        If False, then the top-most key will be used.  For advanced usage, a
        function to create the new name is also accepted.
        dask.config key: ``optimization.fuse.rename-keys``
    fuse_subgraphs : bool or None, optional (default None)
        Whether to fuse multiple tasks into ``SubgraphCallable`` objects.
        Set to None to let the default optimizer of individual dask collections decide.
        If no collection-specific default exists, None defaults to False.
        dask.config key: ``optimization.fuse.subgraphs``

    Returns
    -------
    dsk
        output graph with keys fused
    dependencies
        dict mapping dependencies after fusion.  Useful side effect to accelerate other
        downstream optimizations.
    """

    # Perform low-level fusion unless the user has
    # specified False explicitly.
    if config.get("optimization.fuse.active") is False:
        return dsk, dependencies

    if keys is not None and not isinstance(keys, set):
        if not isinstance(keys, list):
            keys = [keys]
        keys = set(flatten(keys))

    # Read defaults from dask.yaml and/or user-defined config file
    if ave_width is _default:
        ave_width = config.get("optimization.fuse.ave-width")
        assert ave_width is not _default
    if max_height is _default:
        max_height = config.get("optimization.fuse.max-height")
        assert max_height is not _default
    if max_depth_new_edges is _default:
        max_depth_new_edges = config.get("optimization.fuse.max-depth-new-edges")
        assert max_depth_new_edges is not _default
    if max_depth_new_edges is None:
        max_depth_new_edges = ave_width * 1.5
    if max_width is _default:
        max_width = config.get("optimization.fuse.max-width")
        assert max_width is not _default
    if max_width is None:
        max_width = 1.5 + ave_width * math.log(ave_width + 1)
    if fuse_subgraphs is _default:
        fuse_subgraphs = config.get("optimization.fuse.subgraphs")
        assert fuse_subgraphs is not _default
    if fuse_subgraphs is None:
        fuse_subgraphs = False

    if not ave_width or not max_height:
        return dsk, dependencies

    if rename_keys is _default:
        rename_keys = config.get("optimization.fuse.rename-keys")
        assert rename_keys is not _default
    if rename_keys is True:
        key_renamer = default_fused_keys_renamer
    elif rename_keys is False:
        key_renamer = None
    elif not callable(rename_keys):
        raise TypeError("rename_keys must be a boolean or callable")
    else:
        key_renamer = rename_keys
    rename_keys = key_renamer is not None

    if dependencies is None:
        deps = {k: get_dependencies(dsk, k, as_list=True) for k in dsk}
    else:
        deps = dict(dependencies)

    rdeps = {}
    for k, vals in deps.items():
        for v in vals:
            if v not in rdeps:
                rdeps[v] = [k]
            else:
                rdeps[v].append(k)
        deps[k] = set(vals)

    reducible = {k for k, vals in rdeps.items() if len(vals) == 1}
    if keys:
        reducible -= keys

    for k, v in dsk.items():
        if type(v) is not tuple and not isinstance(v, (numbers.Number, str)):
            reducible.discard(k)

    if not reducible and (
        not fuse_subgraphs or all(len(set(v)) != 1 for v in rdeps.values())
    ):
        # Quick return if there's nothing to do. Only progress if there's tasks
        # fusible by the main `fuse`, or by `fuse_subgraphs` if enabled.
        return dsk, deps

    rv = dsk.copy()
    fused_trees = {}
    # These are the stacks we use to store data as we traverse the graph
    info_stack = []
    children_stack = []
    # For speed
    deps_pop = deps.pop
    reducible_add = reducible.add
    reducible_pop = reducible.pop
    reducible_remove = reducible.remove
    fused_trees_pop = fused_trees.pop
    info_stack_append = info_stack.append
    info_stack_pop = info_stack.pop
    children_stack_append = children_stack.append
    children_stack_extend = children_stack.extend
    children_stack_pop = children_stack.pop
    while reducible:
        parent = reducible_pop()
        reducible_add(parent)
        while parent in reducible:
            # Go to the top
            parent = rdeps[parent][0]
        children_stack_append(parent)
        children_stack_extend(reducible & deps[parent])
        while True:
            child = children_stack[-1]
            if child != parent:
                children = reducible & deps[child]
                while children:
                    # Depth-first search
                    children_stack_extend(children)
                    parent = child
                    child = children_stack[-1]
                    children = reducible & deps[child]
                children_stack_pop()
                # This is a leaf node in the reduction region
                # key, task, fused_keys, height, width, number of nodes, fudge, set of edges
                info_stack_append(
                    (
                        child,
                        rv[child],
                        [child] if rename_keys else None,
                        1,
                        1,
                        1,
                        0,
                        deps[child] - reducible,
                    )
                )
            else:
                children_stack_pop()
                # Calculate metrics and fuse as appropriate
                deps_parent = deps[parent]
                edges = deps_parent - reducible
                children = deps_parent - edges
                num_children = len(children)

                if num_children == 1:
                    (
                        child_key,
                        child_task,
                        child_keys,
                        height,
                        width,
                        num_nodes,
                        fudge,
                        children_edges,
                    ) = info_stack_pop()
                    num_children_edges = len(children_edges)

                    if fudge > num_children_edges - 1 >= 0:
                        fudge = num_children_edges - 1
                    edges |= children_edges
                    no_new_edges = len(edges) == num_children_edges
                    if not no_new_edges:
                        fudge += 1
                    if (
                        (num_nodes + fudge) / height <= ave_width
                        and
                        # Sanity check; don't go too deep if new levels introduce new edge dependencies
                        (no_new_edges or height < max_depth_new_edges)
                    ):
                        # Perform substitutions as we go
                        val = subs(dsk[parent], child_key, child_task)
                        deps_parent.remove(child_key)
                        deps_parent |= deps_pop(child_key)
                        del rv[child_key]
                        reducible_remove(child_key)
                        if rename_keys:
                            child_keys.append(parent)
                            fused_trees[parent] = child_keys
                            fused_trees_pop(child_key, None)

                        if children_stack:
                            if no_new_edges:
                                # Linear fuse
                                info_stack_append(
                                    (
                                        parent,
                                        val,
                                        child_keys,
                                        height,
                                        width,
                                        num_nodes,
                                        fudge,
                                        edges,
                                    )
                                )
                            else:
                                info_stack_append(
                                    (
                                        parent,
                                        val,
                                        child_keys,
                                        height + 1,
                                        width,
                                        num_nodes + 1,
                                        fudge,
                                        edges,
                                    )
                                )
                        else:
                            rv[parent] = val
                            break
                    else:
                        rv[child_key] = child_task
                        reducible_remove(child_key)
                        if children_stack:
                            # Allow the parent to be fused, but only under strict circumstances.
                            # Ensure that linear chains may still be fused.
                            if fudge > int(ave_width - 1):
                                fudge = int(ave_width - 1)
                            # This task *implicitly* depends on `edges`
                            info_stack_append(
                                (
                                    parent,
                                    rv[parent],
                                    [parent] if rename_keys else None,
                                    1,
                                    width,
                                    1,
                                    fudge,
                                    edges,
                                )
                            )
                        else:
                            break
                else:
                    child_keys = []
                    height = 1
                    width = 0
                    num_single_nodes = 0
                    num_nodes = 0
                    fudge = 0
                    children_edges = set()
                    max_num_edges = 0
                    children_info = info_stack[-num_children:]
                    del info_stack[-num_children:]
                    for (
                        cur_key,
                        cur_task,
                        cur_keys,
                        cur_height,
                        cur_width,
                        cur_num_nodes,
                        cur_fudge,
                        cur_edges,
                    ) in children_info:
                        if cur_height == 1:
                            num_single_nodes += 1
                        elif cur_height > height:
                            height = cur_height
                        width += cur_width
                        num_nodes += cur_num_nodes
                        fudge += cur_fudge
                        if len(cur_edges) > max_num_edges:
                            max_num_edges = len(cur_edges)
                        children_edges |= cur_edges
                    # Fudge factor to account for possible parallelism with the boundaries
                    num_children_edges = len(children_edges)
                    fudge += min(
                        num_children - 1, max(0, num_children_edges - max_num_edges)
                    )

                    if fudge > num_children_edges - 1 >= 0:
                        fudge = num_children_edges - 1
                    edges |= children_edges
                    no_new_edges = len(edges) == num_children_edges
                    if not no_new_edges:
                        fudge += 1
                    if (
                        (num_nodes + fudge) / height <= ave_width
                        and num_single_nodes <= ave_width
                        and width <= max_width
                        and height <= max_height
                        and
                        # Sanity check; don't go too deep if new levels introduce new edge dependencies
                        (no_new_edges or height < max_depth_new_edges)
                    ):
                        # Perform substitutions as we go
                        val = dsk[parent]
                        children_deps = set()
                        for child_info in children_info:
                            cur_child = child_info[0]
                            val = subs(val, cur_child, child_info[1])
                            del rv[cur_child]
                            children_deps |= deps_pop(cur_child)
                            reducible_remove(cur_child)
                            if rename_keys:
                                fused_trees_pop(cur_child, None)
                                child_keys.extend(child_info[2])
                        deps_parent -= children
                        deps_parent |= children_deps

                        if rename_keys:
                            child_keys.append(parent)
                            fused_trees[parent] = child_keys

                        if children_stack:
                            info_stack_append(
                                (
                                    parent,
                                    val,
                                    child_keys,
                                    height + 1,
                                    width,
                                    num_nodes + 1,
                                    fudge,
                                    edges,
                                )
                            )
                        else:
                            rv[parent] = val
                            break
                    else:
                        for child_info in children_info:
                            rv[child_info[0]] = child_info[1]
                            reducible_remove(child_info[0])
                        if children_stack:
                            # Allow the parent to be fused, but only under strict circumstances.
                            # Ensure that linear chains may still be fused.
                            if width > max_width:
                                width = max_width
                            if fudge > int(ave_width - 1):
                                fudge = int(ave_width - 1)
                            # key, task, height, width, number of nodes, fudge, set of edges
                            # This task *implicitly* depends on `edges`
                            info_stack_append(
                                (
                                    parent,
                                    rv[parent],
                                    [parent] if rename_keys else None,
                                    1,
                                    width,
                                    1,
                                    fudge,
                                    edges,
                                )
                            )
                        else:
                            break
                # Traverse upwards
                parent = rdeps[parent][0]

    if fuse_subgraphs:
        _inplace_fuse_subgraphs(rv, keys, deps, fused_trees, rename_keys)

    if key_renamer:
        for root_key, fused_keys in fused_trees.items():
            alias = key_renamer(fused_keys)
            if alias is not None and alias not in rv:
                rv[alias] = rv[root_key]
                rv[root_key] = alias
                deps[alias] = deps[root_key]
                deps[root_key] = {alias}

    return rv, deps
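All of the _default parameters above resolve to optimization.fuse.* keys, so fusion behaviour can be tuned entirely through configuration. A minimal sketch of setting and reading a few of them:

import dask

# These are the keys fuse() reads when its arguments are left at _default.
with dask.config.set({
    "optimization.fuse.ave-width": 2,
    "optimization.fuse.subgraphs": True,
    "optimization.fuse.rename-keys": False,
}):
    for key in ("ave-width", "subgraphs", "rename-keys"):
        print(key, dask.config.get(f"optimization.fuse.{key}"))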