def optimize(dsk, keys, **kwargs):
    if not isinstance(keys, (list, set)):
        keys = [keys]
    keys = list(core.flatten(keys))

    if not isinstance(dsk, HighLevelGraph):
        dsk = HighLevelGraph.from_collections(id(dsk), dsk, dependencies=())
    else:
        # Perform Blockwise optimizations for HLG input
        dsk = optimize_dataframe_getitem(dsk, keys=keys)
        dsk = optimize_blockwise(dsk, keys=keys)
        dsk = fuse_roots(dsk, keys=keys)
    dsk = dsk.cull(set(keys))

    # Do not perform low-level fusion unless the user has
    # specified True explicitly. The configuration will
    # be None by default.
    if not config.get("optimization.fuse.active"):
        return dsk

    dependencies = dsk.get_all_dependencies()
    dsk = ensure_dict(dsk)

    fuse_subgraphs = config.get("optimization.fuse.subgraphs")
    if fuse_subgraphs is None:
        fuse_subgraphs = True
    dsk, _ = fuse(
        dsk,
        keys,
        dependencies=dependencies,
        fuse_subgraphs=fuse_subgraphs,
    )
    dsk, _ = cull(dsk, keys)
    return dsk

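
# Hedged usage sketch, separate from the function above: low-level fusion in
# this optimizer is opt-in via the config key it checks, which defaults to None.
import dask

with dask.config.set({"optimization.fuse.active": True}):
    pass  # computations inside this block also run ``fuse`` on the low-level graph
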
def test_env_var_normalization(monkeypatch):
    value = 3
    # environment variables must be strings; the config refresh parses them back
    monkeypatch.setenv('DASK_A_B', str(value))
    d = {}
    dask.config.refresh(config=d)
    assert get('a_b', config=d) == value
    assert get('a-b', config=d) == value

def test_env_var_canonical_name(monkeypatch):
    value = 3
    monkeypatch.setenv("DASK_A_B", str(value))
    d = {}
    dask.config.refresh(config=d)
    assert get("a_b", config=d) == value
    assert get("a-b", config=d) == value

def __init__(
    self,
    annotations: Mapping[str, Any] = None,
    collection_annotations: Mapping[str, Any] = None,
):
    """Initialize Layer object.

    Parameters
    ----------
    annotations : Mapping[str, Any], optional
        By default, None.
        Annotations are metadata or soft constraints associated with tasks
        that dask schedulers may choose to respect: They signal intent
        without enforcing hard constraints. As such, they are primarily
        designed for use with the distributed scheduler. See the
        dask.annotate function for more information.
    collection_annotations : Mapping[str, Any], optional
        By default, None.
        Experimental, intended to assist with visualizing the performance
        characteristics of Dask computations. These annotations are *not*
        passed to the distributed scheduler.
    """
    self.annotations = annotations or copy.copy(config.get("annotations", None))
    self.collection_annotations = collection_annotations or copy.copy(
        config.get("collection_annotations", None)
    )

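
# Hedged usage sketch, separate from the constructor above: ``dask.annotate``
# stores its entries under the "annotations" config key, which is the default
# the constructor falls back to when no annotations are passed explicitly.
import dask

with dask.annotate(retries=3, priority=10):
    pass  # layers created here inherit {"retries": 3, "priority": 10}
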
def test_get():
    d = {"x": 1, "y": {"a": 2}}
    assert get("x", config=d) == 1
    assert get("y.a", config=d) == 2
    assert get("y.b", 123, config=d) == 123
    with pytest.raises(KeyError):
        get("y.b", config=d)

def test_custom_yaml(tmpdir):
    custom_config = {}
    custom_config["sql"] = dask_config.get("sql")
    custom_config["sql"]["groupby"]["split_out"] = 16
    custom_config["sql"]["foo"] = {"bar": [1, 2, 3], "baz": None}

    with open(os.path.join(tmpdir, "custom-sql.yaml"), mode="w") as f:
        yaml.dump(custom_config, f)

    dask_config.refresh(
        paths=[tmpdir]
    )  # Refresh config to read from updated environment
    assert custom_config["sql"] == dask_config.get("sql")
    dask_config.refresh()

def rearrange_by_column(
    df,
    col,
    npartitions=None,
    max_branch=None,
    shuffle=None,
    compute=None,
    ignore_index=False,
):
    shuffle = shuffle or config.get("shuffle", None) or "disk"

    # if the requested output partitions < input partitions
    # we repartition first as shuffling overhead is
    # proportionate to the number of input partitions
    if npartitions is not None and npartitions < df.npartitions:
        df = df.repartition(npartitions=npartitions)

    if shuffle == "disk":
        return rearrange_by_column_disk(df, col, npartitions, compute=compute)
    elif shuffle == "tasks":
        df2 = rearrange_by_column_tasks(
            df, col, max_branch, npartitions, ignore_index=ignore_index
        )
        if ignore_index:
            df2._meta = df2._meta.reset_index(drop=True)
        return df2
    elif shuffle == "p2p":
        from distributed.shuffle import rearrange_by_column_p2p

        return rearrange_by_column_p2p(df, col, npartitions)
    else:
        raise NotImplementedError("Unknown shuffle method %s" % shuffle)

def collections_to_dsk(collections, optimize_graph=True, optimizations=(), **kwargs):
    """
    Convert many collections into a single dask graph, after optimization
    """
    from dask.highlevelgraph import HighLevelGraph

    optimizations = tuple(optimizations) + tuple(config.get("optimizations", ()))

    if optimize_graph:
        groups = groupby(optimization_function, collections)

        graphs = []
        for opt, val in groups.items():
            dsk, keys = _extract_graph_and_keys(val)
            dsk = opt(dsk, keys, **kwargs)

            for opt_inner in optimizations:
                dsk = opt_inner(dsk, keys, **kwargs)

            graphs.append(dsk)

        # Merge all graphs
        if any(isinstance(graph, HighLevelGraph) for graph in graphs):
            dsk = HighLevelGraph.merge(*graphs)
        else:
            dsk = merge(*map(ensure_dict, graphs))
    else:
        dsk, _ = _extract_graph_and_keys(collections)

    return dsk

def test_default_config():
    config_fn = os.path.join(os.path.dirname(__file__), "../../dask_sql", "sql.yaml")
    with open(config_fn) as f:
        default_config = yaml.safe_load(f)
    assert "sql" in default_config
    assert default_config["sql"] == dask_config.get("sql")

def aggregate_chunks(
    existing_chunks: Iterable[int], item_size: int, subdivision: int = 1
):
    target_size_bytes = int(Quantity(config.get("array.chunk-size")).m_as("bytes"))
    # The optimal number of data per Dask chunk.
    target_size = target_size_bytes // item_size

    # Try to aggregate the input data into the fewest possible Dask chunks.
    new_chunks = []
    for chunk in existing_chunks:
        # If this input data set will fit into the current chunk, add it.
        if new_chunks and new_chunks[-1] + chunk <= target_size:
            new_chunks[-1] += chunk
        # If the current chunk is full (or the chunks list is empty), add this
        # data set to the next chunk.
        elif chunk <= target_size:
            new_chunks.append(chunk)
        # If this data set is larger than the max Dask chunk size, split it
        # along the HDF5 data set chunk boundaries and put the pieces in
        # separate Dask chunks.
        else:
            n_whole_chunks, remainder = divmod(chunk, target_size)
            dask_chunk_size = target_size // subdivision * subdivision
            new_chunks += [dask_chunk_size] * n_whole_chunks + [remainder]
    return new_chunks

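
# Hedged worked example for the aggregation above, assuming an 8-byte item
# (float64), that pint parses the chunk-size string, and a deliberately tiny
# chunk-size setting; the values are illustrative only.
import dask

with dask.config.set({"array.chunk-size": "64 B"}):
    # target_size = 64 // 8 = 8 items; 3 and 4 merge, 2 starts a new chunk,
    # and 20 is split -> expected result: [7, 2, 8, 8, 4]
    print(aggregate_chunks([3, 4, 2, 20], item_size=8, subdivision=2))
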
def get_features_kwarg(
    name: str,
    scheduler: Optional[str] = None,
    queue_type: Optional[str] = None,
    default: Optional[Any] = None,
) -> Optional[Any]:
    """Searches in the jobqueue_features config for a value for kw_name.

    Args:
        name: The key to search for in config.
        scheduler: The name of scheduler's config for which search is taken.
        queue_type: The queue type to search for in config.
        default: A default value to give if nothing in config files.

    Returns:
        Found value or the default value.
    """
    value = None
    # search for kw_name from bottom up: queue_type -> scheduler -> jobqueue_features

    # Error checking
    if not isinstance(name, str):
        raise ValueError('"name" must be a string')
    if scheduler is None and queue_type is not None:
        raise ValueError("Cannot search in queue_type without providing a scheduler")

    # Now do the config search
    # use default=None in calls since we set defaults ourselves
    if scheduler is not None and queue_type is not None:
        value = config.get(
            "jobqueue-features.{}.queue-type.{}.{}".format(scheduler, queue_type, name),
            default=None,
        )
    if value is None and scheduler is not None:
        value = config.get(
            "jobqueue-features.{}.{}".format(scheduler, name), default=None
        )
    if value is None:
        value = config.get("jobqueue-features.{}".format(name), default=None)
    if value is None and default is not None:
        value = default
    return value

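
# Hedged usage sketch of the bottom-up lookup order (queue-type -> scheduler ->
# top level). The config keys and values below are illustrative, not shipped
# defaults of jobqueue_features.
import dask

with dask.config.set(
    {
        "jobqueue-features.cores-per-node": 16,
        "jobqueue-features.slurm.cores-per-node": 32,
    }
):
    assert get_features_kwarg("cores-per-node") == 16
    assert get_features_kwarg("cores-per-node", scheduler="slurm") == 32
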
def test_getitem_avoids_large_chunks():
    a = np.arange(4 * 500 * 500).reshape(4, 500, 500)
    arr = da.from_array(a, chunks=(1, 500, 500))
    indexer = [0, 1] + [2] * 100 + [3]
    result = arr[indexer]

    chunk_size = utils.parse_bytes(config.get("array.chunk-size"))
    assert all(x.nbytes < chunk_size for x in result.blocks)

    expected = a[indexer]
    assert_eq(result, expected)

def optimize(
    dsk,
    keys,
    fuse_keys=None,
    fast_functions=None,
    inline_functions_fast_functions=(getter_inline,),
    rename_fused_keys=True,
    **kwargs,
):
    """Optimize dask for array computation

    1. Cull tasks not necessary to evaluate keys
    2. Remove full slicing, e.g. x[:]
    3. Inline fast functions like getitem and np.transpose
    """
    if not isinstance(keys, (list, set)):
        keys = [keys]
    keys = list(flatten(keys))

    if not isinstance(dsk, HighLevelGraph):
        dsk = HighLevelGraph.from_collections(id(dsk), dsk, dependencies=())

    dsk = optimize_blockwise(dsk, keys=keys)
    dsk = fuse_roots(dsk, keys=keys)
    dsk = dsk.cull(set(keys))

    # Perform low-level fusion unless the user has
    # specified False explicitly.
    if config.get("optimization.fuse.active") is False:
        return dsk

    dependencies = dsk.get_all_dependencies()
    dsk = ensure_dict(dsk)

    # Low level task optimizations
    if fast_functions is not None:
        inline_functions_fast_functions = fast_functions

    hold = hold_keys(dsk, dependencies)

    dsk, dependencies = fuse(
        dsk,
        hold + keys + (fuse_keys or []),
        dependencies,
        rename_keys=rename_fused_keys,
    )
    if inline_functions_fast_functions:
        dsk = inline_functions(
            dsk,
            keys,
            dependencies=dependencies,
            fast_functions=inline_functions_fast_functions,
        )
    return optimize_slices(dsk)

def safe_file_url(url, start=None):
    """Formats an URL so that it meets the following safety conditions:

    - the URL starts with file:// (else: raises NotImplementedError)
    - the path is absolute (relative paths are taken relative to
      geomodeling.root)
    - if geomodeling.strict_paths: the path has to be contained inside
      `start` (else: raises IOError)

    For backwards compatibility, geomodeling.root can be overridden using the
    'start' argument.
    """
    try:
        protocol, path = url.split("://")
    except ValueError:
        protocol = "file"
        path = url
    else:
        if protocol != "file":
            raise NotImplementedError('Unknown protocol: "{}"'.format(protocol))
    if start is not None:
        warnings.warn(
            "Using the start argument in safe_file_url is deprecated. Use the "
            "'geomodeling.root' in the dask config",
            DeprecationWarning,
        )
    else:
        start = config.get("geomodeling.root")

    if not os.path.isabs(path):
        if start is None:
            raise IOError(
                "Relative path '{}' provided but start was not given.".format(path)
            )
        abspath = os.path.abspath(os.path.join(start, path))
    else:
        abspath = os.path.abspath(path)
    strict = config.get("geomodeling.strict-file-paths")
    if strict and not abspath.startswith(start):
        raise IOError("'{}' is not contained in '{}'".format(path, start))
    return "://".join([protocol, abspath])

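
# Hedged usage sketch on a POSIX system, assuming dask-geomodeling's config
# defaults (in particular ``geomodeling.strict-file-paths``) are registered;
# the paths below are illustrative only.
from dask import config

with config.set({"geomodeling.root": "/data"}):
    assert safe_file_url("rasters/dem.tif") == "file:///data/rasters/dem.tif"
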
def _set_metadata_task_size(metadata_task_size, fs):
    # Set metadata_task_size using the config file
    # if the kwarg value was not specified
    if metadata_task_size is None:
        # If a default value is not specified in the config file, use 0
        config_str = "dataframe.parquet.metadata-task-size-" + (
            "local" if _is_local_fs(fs) else "remote"
        )
        return config.get(config_str, 0)

    return metadata_task_size

def normalize_array(x):
    if not x.shape:
        return (x.item(), x.dtype)
    if hasattr(x, "mode") and getattr(x, "filename", None):
        if hasattr(x.base, "ctypes"):
            offset = (
                x.ctypes._as_parameter_.value - x.base.ctypes._as_parameter_.value
            )
        else:
            offset = 0  # root memmaps have the mmap object as base
        if hasattr(
            x, "offset"
        ):  # offset numpy used while opening, and not the offset to the beginning of the file
            offset += getattr(x, "offset")
        return (
            x.filename,
            os.path.getmtime(x.filename),
            x.dtype,
            x.shape,
            x.strides,
            offset,
        )
    if x.dtype.hasobject:
        try:
            try:
                # string fast-path
                data = hash_buffer_hex(
                    "-".join(x.flat).encode(
                        encoding="utf-8", errors="surrogatepass"
                    )
                )
            except UnicodeDecodeError:
                # bytes fast-path
                data = hash_buffer_hex(b"-".join(x.flat))
        except (TypeError, UnicodeDecodeError):
            try:
                data = hash_buffer_hex(pickle.dumps(x, pickle.HIGHEST_PROTOCOL))
            except Exception:
                # pickling not supported, use UUID4-based fallback
                if not config.get("tokenize.ensure-deterministic"):
                    data = uuid.uuid4().hex
                else:
                    raise RuntimeError(
                        f"``np.ndarray`` with object ``dtype`` {str(x)} cannot "
                        "be deterministically hashed. Please, see "
                        "https://docs.dask.org/en/latest/custom-collections.html#implementing-deterministic-hashing "  # noqa: E501
                        "for more information"
                    )
    else:
        try:
            data = hash_buffer_hex(x.ravel(order="K").view("i1"))
        except (BufferError, AttributeError, ValueError):
            data = hash_buffer_hex(x.copy().ravel(order="K").view("i1"))
    return (data, x.dtype, x.shape, x.strides)

def get_context():
    """Return the current multiprocessing context."""
    # fork context does fork()-without-exec(), which can lead to deadlocks,
    # so default to "spawn".
    context_name = config.get("multiprocessing.context", "spawn")
    if sys.platform == "win32":
        if context_name != "spawn":
            # Only spawn is supported on Win32, can't change it:
            warn(_CONTEXT_UNSUPPORTED, UserWarning)
        return multiprocessing
    else:
        return multiprocessing.get_context(context_name)

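
# Hedged usage sketch, separate from the function above: the start method is
# driven by the "multiprocessing.context" config key on non-Windows platforms.
import dask

with dask.config.set({"multiprocessing.context": "forkserver"}):
    ctx = get_context()  # a forkserver context on POSIX; the bare module on win32
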
def get_cluster(scheduler: Optional[str] = None, **kwargs) -> "ClusterType":
    if scheduler is None:
        scheduler = config.get("jobqueue-features.scheduler", default=None)
    if scheduler is None:
        raise ValueError(
            "You must configure a scheduler either via a kwarg"
            " or in your configuration file"
        )
    if scheduler == SLURM:
        return CustomSLURMCluster(**kwargs)
    else:
        raise NotImplementedError(
            "Scheduler {} is not in list of supported schedulers: {}".format(
                scheduler, SUPPORTED_SCHEDULERS
            )
        )

def _update_kwargs_job_extra(self, **kwargs) -> Dict[str, Any]:
    job_extra = kwargs.get("job_extra", self.get_kwarg("job-extra"))
    if job_extra is None:
        job_extra = config.get(
            "jobqueue.{}.job_extra".format(self.scheduler_name), default=[]
        )
    # order matters: to keep the user in control, their settings come last
    final_job_extra = self.gpu_job_extra
    final_job_extra.extend(job_extra)
    kwargs.update({"job_extra": final_job_extra})
    return kwargs

def _normalize_seq_func(seq):
    # Defined outside normalize_seq to avoid unnecessary redefinitions and
    # therefore improve computation times.
    try:
        return list(map(normalize_token, seq))
    except RecursionError:
        if not config.get("tokenize.ensure-deterministic"):
            return uuid.uuid4().hex

        raise RuntimeError(
            f"Sequence {str(seq)} cannot be deterministically hashed. Please, see "
            "https://docs.dask.org/en/latest/custom-collections.html#implementing-deterministic-hashing "
            "for more information"
        )

def _update_kwargs_env_extra(self, **kwargs) -> Dict[str, Any]:
    if self.openmp_env_extra is None:
        return kwargs
    env_extra = kwargs.get("env_extra", self.get_kwarg("env-extra"))
    if not env_extra:
        env_extra = config.get(
            "jobqueue.{}.env_extra".format(self.scheduler_name), default=[]
        )
    # order matters: to keep the user in control, explicit user settings come last
    final_env_extra = self.openmp_env_extra
    final_env_extra.extend(env_extra)
    kwargs.update({"env_extra": final_env_extra})
    return kwargs

def get_features_kwarg(name, scheduler=None, queue_type=None, default=None):
    """
    Search in the jobqueue_features config for a value for kw_name

    :param name: string to search for in configuration
    :param scheduler: scheduler name to search for in configuration
    :param queue_type: queue type to search for in config
    :param default: default value to give if nothing in config files
    :return: value or None
    """
    value = None
    # search for kw_name from bottom up: queue_type -> scheduler -> jobqueue_features

    # Error checking
    if not isinstance(name, str):
        raise ValueError('"name" must be a string')
    if scheduler is None and queue_type is not None:
        raise ValueError("Cannot search in queue_type without providing a scheduler")

    # Now do the config search
    # use default=None in calls since we set defaults ourselves
    if scheduler is not None and queue_type is not None:
        value = config.get(
            "jobqueue-features.{}.queue-type.{}.{}".format(scheduler, queue_type, name),
            default=None,
        )
    if value is None and scheduler is not None:
        value = config.get(
            "jobqueue-features.{}.{}".format(scheduler, name), default=None
        )
    if value is None:
        value = config.get("jobqueue-features.{}".format(name), default=None)
    if value is None and default is not None:
        value = default
    return value

def get_sources_and_requests(self, **request):
    # first handle the 'time' and 'meta' requests
    mode = request["mode"]
    if mode == "time":
        return [(self.period[-1], None), ({"mode": "time"}, None)]
    elif mode == "meta":
        return [(None, None), ({"mode": "meta"}, None)]
    elif mode != "vals":
        raise ValueError("Unknown mode '{}'".format(mode))

    # build the request to be sent to the geometry source
    x1, y1, x2, y2 = request["bbox"]
    width, height = request["width"], request["height"]

    # be strict about the bbox, else it may lead to segfaults
    if x2 == x1 and y2 == y1:  # point
        min_size = None
    elif x1 < x2 and y1 < y2:
        min_size = min((x2 - x1) / width, (y2 - y1) / height)
    else:
        raise ValueError("Invalid bbox ({})".format(request["bbox"]))

    limit = self.limit
    if self.limit is None:
        limit = config.get("geomodeling.geometry-limit")

    geom_request = {
        "mode": "intersects",
        "geometry": box(*request["bbox"]),
        "projection": request["projection"],
        "min_size": min_size,
        "limit": limit,
        "start": request.get("start"),
        "stop": request.get("stop"),
    }
    # keep some variables for use in process()
    process_kwargs = {
        "mode": "vals",
        "column_name": self.column_name,
        "dtype": self.dtype,
        "no_data_value": self.fillvalue,
        "width": width,
        "height": height,
        "bbox": request["bbox"],
    }
    return [(self.source, geom_request), (process_kwargs, None)]

def tokenize(*args, pure=None, **kwargs):
    """Mapping function from task -> consistent name.

    Parameters
    ----------
    args : object
        Python objects that summarize the task.
    pure : boolean, optional
        If True, a consistent hash function is tried on the input. If this
        fails, then a unique identifier is used. If False (default), then a
        unique identifier is always used.
    """
    if pure is None:
        pure = config.get("delayed_pure", False)

    if pure:
        return _tokenize(*args, **kwargs)
    else:
        return str(uuid.uuid4())

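
# Hedged usage sketch: with the ``delayed_pure`` config key set (or pure=True),
# equal inputs produce equal tokens; otherwise each call yields a fresh UUID.
import dask

with dask.config.set(delayed_pure=True):
    assert tokenize([1, 2, 3]) == tokenize([1, 2, 3])
assert tokenize([1, 2, 3], pure=False) != tokenize([1, 2, 3], pure=False)
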
def normalize_object(o):
    method = getattr(o, "__dask_tokenize__", None)
    if method is not None:
        return method()

    if callable(o):
        return normalize_function(o)

    if dataclasses.is_dataclass(o):
        return normalize_dataclass(o)

    if not config.get("tokenize.ensure-deterministic"):
        return uuid.uuid4().hex

    raise RuntimeError(
        f"Object {str(o)} cannot be deterministically hashed. Please, see "
        "https://docs.dask.org/en/latest/custom-collections.html#implementing-deterministic-hashing "
        "for more information"
    )

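
# Hedged sketch of the ``__dask_tokenize__`` protocol checked first above: a
# user class can opt in to deterministic hashing by returning a stable token.
# ``Point`` is an illustrative class, not part of the source above.
class Point:
    def __init__(self, x, y):
        self.x, self.y = x, y

    def __dask_tokenize__(self):
        # any hashable, deterministic summary of the object's state
        return ("Point", self.x, self.y)
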
def filter_or_scalar(df: dd.DataFrame, filter_condition: Union[np.bool_, dd.Series]):
    """
    Some (complex) SQL queries can lead to a strange condition which is always
    true or false. We do not need to filter in this case.
    See https://github.com/dask-contrib/dask-sql/issues/87.
    """
    if np.isscalar(filter_condition):
        if not filter_condition:  # pragma: no cover
            # empty dataset
            logger.warning("Join condition is always false - returning empty dataset")
            return df.head(0, compute=False)
        else:
            return df

    # In SQL, a NULL in a boolean is False on filtering
    filter_condition = filter_condition.fillna(False)
    out = df[filter_condition]
    if dask_config.get("sql.predicate_pushdown"):
        return attempt_predicate_pushdown(out)
    else:
        return out

def get_async(
    submit,
    num_workers,
    dsk,
    result,
    cache=None,
    get_id=default_get_id,
    rerun_exceptions_locally=None,
    pack_exception=default_pack_exception,
    raise_exception=reraise,
    callbacks=None,
    dumps=identity,
    loads=identity,
    chunksize=None,
    **kwargs,
):
    """Asynchronous get function

    This is a general version of various asynchronous schedulers for dask.  It
    takes a ``concurrent.futures.Executor.submit`` function to form a more
    specific ``get`` method that walks through the dask array with parallel
    workers, avoiding repeat computation and minimizing memory use.

    Parameters
    ----------
    submit : function
        A ``concurrent.futures.Executor.submit`` function
    num_workers : int
        The number of workers that task submissions can be spread over
    dsk : dict
        A dask dictionary specifying a workflow
    result : key or list of keys
        Keys corresponding to desired data
    cache : dict-like, optional
        Temporary storage of results
    get_id : callable, optional
        Function to return the worker id, takes no arguments. Examples are
        `threading.current_thread` and `multiprocessing.current_process`.
    rerun_exceptions_locally : bool, optional
        Whether to rerun failing tasks in local process to enable debugging
        (False by default)
    pack_exception : callable, optional
        Function to take an exception and ``dumps`` method, and return a
        serialized tuple of ``(exception, traceback)`` to send back to the
        scheduler. Default is to just raise the exception.
    raise_exception : callable, optional
        Function that takes an exception and a traceback, and raises an error.
    callbacks : tuple or list of tuples, optional
        Callbacks are passed in as tuples of length 5. Multiple sets of
        callbacks may be passed in as a list of tuples. For more information,
        see the dask.diagnostics documentation.
    dumps: callable, optional
        Function to serialize task data and results to communicate between
        worker and parent.  Defaults to identity.
    loads: callable, optional
        Inverse function of `dumps`.  Defaults to identity.
    chunksize: int, optional
        Size of chunks to use when dispatching work.  Defaults to 1.
        If -1, will be computed to evenly divide ready work across workers.

    See Also
    --------
    threaded.get
    """
    chunksize = chunksize or config.get("chunksize", 1)

    queue = Queue()

    if isinstance(result, list):
        result_flat = set(flatten(result))
    else:
        result_flat = {result}
    results = set(result_flat)

    dsk = dict(dsk)
    with local_callbacks(callbacks) as callbacks:
        _, _, pretask_cbs, posttask_cbs, _ = unpack_callbacks(callbacks)
        started_cbs = []
        succeeded = False
        # if start_state_from_dask fails, we will have something
        # to pass to the final block.
        state = {}
        try:
            for cb in callbacks:
                if cb[0]:
                    cb[0](dsk)
                started_cbs.append(cb)

            keyorder = order(dsk)

            state = start_state_from_dask(dsk, cache=cache, sortkey=keyorder.get)

            for _, start_state, _, _, _ in callbacks:
                if start_state:
                    start_state(dsk, state)

            if rerun_exceptions_locally is None:
                rerun_exceptions_locally = config.get(
                    "rerun_exceptions_locally", False
                )

            if state["waiting"] and not state["ready"]:
                raise ValueError("Found no accessible jobs in dask")

            def fire_tasks(chunksize):
                """Fire off a task to the thread pool"""
                # Determine chunksize and/or number of tasks to submit
                nready = len(state["ready"])
                if chunksize == -1:
                    ntasks = nready
                    chunksize = -(ntasks // -num_workers)
                else:
                    used_workers = -(len(state["running"]) // -chunksize)
                    avail_workers = max(num_workers - used_workers, 0)
                    ntasks = min(nready, chunksize * avail_workers)

                # Prep all ready tasks for submission
                args = []
                for _ in range(ntasks):
                    # Get the next task to compute (most recently added)
                    key = state["ready"].pop()
                    # Notify task is running
                    state["running"].add(key)
                    for f in pretask_cbs:
                        f(key, dsk, state)

                    # Prep args to send
                    data = {
                        dep: state["cache"][dep]
                        for dep in get_dependencies(dsk, key)
                    }
                    args.append(
                        (
                            key,
                            dumps((dsk[key], data)),
                            dumps,
                            loads,
                            get_id,
                            pack_exception,
                        )
                    )

                # Batch submit
                for i in range(-(len(args) // -chunksize)):
                    each_args = args[i * chunksize : (i + 1) * chunksize]
                    if not each_args:
                        break
                    fut = submit(batch_execute_tasks, each_args)
                    fut.add_done_callback(queue.put)

            # Main loop, wait on tasks to finish, insert new ones
            while state["waiting"] or state["ready"] or state["running"]:
                fire_tasks(chunksize)
                for key, res_info, failed in queue_get(queue).result():
                    if failed:
                        exc, tb = loads(res_info)
                        if rerun_exceptions_locally:
                            data = {
                                dep: state["cache"][dep]
                                for dep in get_dependencies(dsk, key)
                            }
                            task = dsk[key]
                            _execute_task(task, data)  # Re-execute locally
                        else:
                            raise_exception(exc, tb)
                    res, worker_id = loads(res_info)
                    state["cache"][key] = res
                    finish_task(dsk, key, state, results, keyorder.get)
                    for f in posttask_cbs:
                        f(key, res, dsk, state, worker_id)

            succeeded = True

        finally:
            for _, _, _, _, finish in started_cbs:
                if finish:
                    finish(dsk, state, not succeeded)

    return nested_get(result, state["cache"])

def start_state_from_dask(dsk, cache=None, sortkey=None):
    """Start state from a dask

    Examples
    --------
    >>> inc = lambda x: x + 1
    >>> add = lambda x, y: x + y
    >>> dsk = {'x': 1, 'y': 2, 'z': (inc, 'x'), 'w': (add, 'z', 'y')}  # doctest: +SKIP
    >>> from pprint import pprint  # doctest: +SKIP
    >>> pprint(start_state_from_dask(dsk))  # doctest: +SKIP
    {'cache': {'x': 1, 'y': 2},
     'dependencies': {'w': {'z', 'y'}, 'x': set(), 'y': set(), 'z': {'x'}},
     'dependents': defaultdict(None, {'w': set(), 'x': {'z'}, 'y': {'w'}, 'z': {'w'}}),
     'finished': set(),
     'ready': ['z'],
     'released': set(),
     'running': set(),
     'waiting': {'w': {'z'}},
     'waiting_data': {'x': {'z'}, 'y': {'w'}, 'z': {'w'}}}
    """
    if sortkey is None:
        sortkey = order(dsk).get
    if cache is None:
        cache = config.get("cache", None)
    if cache is None:
        cache = dict()
    data_keys = set()
    for k, v in dsk.items():
        if not has_tasks(dsk, v):
            cache[k] = v
            data_keys.add(k)

    dsk2 = dsk.copy()
    dsk2.update(cache)

    dependencies = {k: get_dependencies(dsk2, k) for k in dsk}
    waiting = {k: v.copy() for k, v in dependencies.items() if k not in data_keys}

    dependents = reverse_dict(dependencies)
    for a in cache:
        for b in dependents.get(a, ()):
            waiting[b].remove(a)
    waiting_data = {k: v.copy() for k, v in dependents.items() if v}

    ready_set = {k for k, v in waiting.items() if not v}
    ready = sorted(ready_set, key=sortkey, reverse=True)
    waiting = {k: v for k, v in waiting.items() if v}

    state = {
        "dependencies": dependencies,
        "dependents": dependents,
        "waiting": waiting,
        "waiting_data": waiting_data,
        "cache": cache,
        "ready": ready,
        "running": set(),
        "finished": set(),
        "released": set(),
    }

    return state

def fuse(
    dsk,
    keys=None,
    dependencies=None,
    ave_width=_default,
    max_width=_default,
    max_height=_default,
    max_depth_new_edges=_default,
    rename_keys=_default,
    fuse_subgraphs=_default,
):
    """Fuse tasks that form reductions; more advanced than ``fuse_linear``

    This trades parallelism opportunities for faster scheduling by making
    tasks less granular.  It can replace ``fuse_linear`` in optimization
    passes.

    This optimization applies to all reductions--tasks that have at most one
    dependent--so it may be viewed as fusing "multiple input, single output"
    groups of tasks into a single task.  There are many parameters to fine
    tune the behavior, which are described below.  ``ave_width`` is the
    natural parameter with which to compare parallelism to granularity, so it
    should always be specified.  Reasonable values for other parameters will
    be determined using ``ave_width`` if necessary.

    Parameters
    ----------
    dsk: dict
        dask graph
    keys: list or set, optional
        Keys that must remain in the returned dask graph
    dependencies: dict, optional
        {key: [list-of-keys]}.  Must be a list to provide count of each key.
        This optional input often comes from ``cull``
    ave_width: float (default 1)
        Upper limit for ``width = num_nodes / height``, a good measure of
        parallelizability.
        dask.config key: ``optimization.fuse.ave-width``
    max_width: int (default infinite)
        Don't fuse if total width is greater than this.
        dask.config key: ``optimization.fuse.max-width``
    max_height: int or None (default None)
        Don't fuse more than this many levels. Set to None to dynamically
        adjust to ``1.5 + ave_width * log(ave_width + 1)``.
        dask.config key: ``optimization.fuse.max-height``
    max_depth_new_edges: int or None (default None)
        Don't fuse if new dependencies are added after this many levels.
        Set to None to dynamically adjust to ave_width * 1.5.
        dask.config key: ``optimization.fuse.max-depth-new-edges``
    rename_keys: bool or func, optional (default True)
        Whether to rename the fused keys with ``default_fused_keys_renamer``
        or not.  Renaming fused keys can keep the graph more understandable
        and comprehensive, but it comes at the cost of additional processing.
        If False, then the top-most key will be used.  For advanced usage, a
        function to create the new name is also accepted.
        dask.config key: ``optimization.fuse.rename-keys``
    fuse_subgraphs : bool or None, optional (default None)
        Whether to fuse multiple tasks into ``SubgraphCallable`` objects.
        Set to None to let the default optimizer of individual dask
        collections decide.  If no collection-specific default exists, None
        defaults to False.
        dask.config key: ``optimization.fuse.subgraphs``

    Returns
    -------
    dsk
        output graph with keys fused
    dependencies
        dict mapping dependencies after fusion.  Useful side effect to
        accelerate other downstream optimizations.
    """

    # Perform low-level fusion unless the user has
    # specified False explicitly.
    if config.get("optimization.fuse.active") is False:
        return dsk, dependencies

    if keys is not None and not isinstance(keys, set):
        if not isinstance(keys, list):
            keys = [keys]
        keys = set(flatten(keys))

    # Read defaults from dask.yaml and/or user-defined config file
    if ave_width is _default:
        ave_width = config.get("optimization.fuse.ave-width")
        assert ave_width is not _default
    if max_height is _default:
        max_height = config.get("optimization.fuse.max-height")
        assert max_height is not _default
    if max_depth_new_edges is _default:
        max_depth_new_edges = config.get("optimization.fuse.max-depth-new-edges")
        assert max_depth_new_edges is not _default
    if max_depth_new_edges is None:
        max_depth_new_edges = ave_width * 1.5
    if max_width is _default:
        max_width = config.get("optimization.fuse.max-width")
        assert max_width is not _default
    if max_width is None:
        max_width = 1.5 + ave_width * math.log(ave_width + 1)
    if fuse_subgraphs is _default:
        fuse_subgraphs = config.get("optimization.fuse.subgraphs")
        assert fuse_subgraphs is not _default
    if fuse_subgraphs is None:
        fuse_subgraphs = False

    if not ave_width or not max_height:
        return dsk, dependencies

    if rename_keys is _default:
        rename_keys = config.get("optimization.fuse.rename-keys")
        assert rename_keys is not _default
    if rename_keys is True:
        key_renamer = default_fused_keys_renamer
    elif rename_keys is False:
        key_renamer = None
    elif not callable(rename_keys):
        raise TypeError("rename_keys must be a boolean or callable")
    else:
        key_renamer = rename_keys
    rename_keys = key_renamer is not None

    if dependencies is None:
        deps = {k: get_dependencies(dsk, k, as_list=True) for k in dsk}
    else:
        deps = dict(dependencies)

    rdeps = {}
    for k, vals in deps.items():
        for v in vals:
            if v not in rdeps:
                rdeps[v] = [k]
            else:
                rdeps[v].append(k)
        deps[k] = set(vals)

    reducible = {k for k, vals in rdeps.items() if len(vals) == 1}
    if keys:
        reducible -= keys

    for k, v in dsk.items():
        if type(v) is not tuple and not isinstance(v, (numbers.Number, str)):
            reducible.discard(k)

    if not reducible and (
        not fuse_subgraphs or all(len(set(v)) != 1 for v in rdeps.values())
    ):
        # Quick return if there's nothing to do. Only progress if there's tasks
        # fusible by the main `fuse`, or by `fuse_subgraphs` if enabled.
        return dsk, deps

    rv = dsk.copy()
    fused_trees = {}
    # These are the stacks we use to store data as we traverse the graph
    info_stack = []
    children_stack = []
    # For speed
    deps_pop = deps.pop
    reducible_add = reducible.add
    reducible_pop = reducible.pop
    reducible_remove = reducible.remove
    fused_trees_pop = fused_trees.pop
    info_stack_append = info_stack.append
    info_stack_pop = info_stack.pop
    children_stack_append = children_stack.append
    children_stack_extend = children_stack.extend
    children_stack_pop = children_stack.pop
    while reducible:
        parent = reducible_pop()
        reducible_add(parent)
        while parent in reducible:
            # Go to the top
            parent = rdeps[parent][0]
        children_stack_append(parent)
        children_stack_extend(reducible & deps[parent])
        while True:
            child = children_stack[-1]
            if child != parent:
                children = reducible & deps[child]
                while children:
                    # Depth-first search
                    children_stack_extend(children)
                    parent = child
                    child = children_stack[-1]
                    children = reducible & deps[child]
                children_stack_pop()
                # This is a leaf node in the reduction region
                # key, task, fused_keys, height, width, number of nodes,
                # fudge, set of edges
                info_stack_append(
                    (
                        child,
                        rv[child],
                        [child] if rename_keys else None,
                        1,
                        1,
                        1,
                        0,
                        deps[child] - reducible,
                    )
                )
            else:
                children_stack_pop()
                # Calculate metrics and fuse as appropriate
                deps_parent = deps[parent]
                edges = deps_parent - reducible
                children = deps_parent - edges
                num_children = len(children)

                if num_children == 1:
                    (
                        child_key,
                        child_task,
                        child_keys,
                        height,
                        width,
                        num_nodes,
                        fudge,
                        children_edges,
                    ) = info_stack_pop()
                    num_children_edges = len(children_edges)

                    if fudge > num_children_edges - 1 >= 0:
                        fudge = num_children_edges - 1
                    edges |= children_edges
                    no_new_edges = len(edges) == num_children_edges
                    if not no_new_edges:
                        fudge += 1
                    if (
                        (num_nodes + fudge) / height <= ave_width
                        and
                        # Sanity check; don't go too deep if new levels
                        # introduce new edge dependencies
                        (no_new_edges or height < max_depth_new_edges)
                    ):
                        # Perform substitutions as we go
                        val = subs(dsk[parent], child_key, child_task)
                        deps_parent.remove(child_key)
                        deps_parent |= deps_pop(child_key)
                        del rv[child_key]
                        reducible_remove(child_key)
                        if rename_keys:
                            child_keys.append(parent)
                            fused_trees[parent] = child_keys
                            fused_trees_pop(child_key, None)

                        if children_stack:
                            if no_new_edges:
                                # Linear fuse
                                info_stack_append(
                                    (
                                        parent,
                                        val,
                                        child_keys,
                                        height,
                                        width,
                                        num_nodes,
                                        fudge,
                                        edges,
                                    )
                                )
                            else:
                                info_stack_append(
                                    (
                                        parent,
                                        val,
                                        child_keys,
                                        height + 1,
                                        width,
                                        num_nodes + 1,
                                        fudge,
                                        edges,
                                    )
                                )
                        else:
                            rv[parent] = val
                            break
                    else:
                        rv[child_key] = child_task
                        reducible_remove(child_key)
                        if children_stack:
                            # Allow the parent to be fused, but only under
                            # strict circumstances.  Ensure that linear
                            # chains may still be fused.
                            if fudge > int(ave_width - 1):
                                fudge = int(ave_width - 1)
                            # This task *implicitly* depends on `edges`
                            info_stack_append(
                                (
                                    parent,
                                    rv[parent],
                                    [parent] if rename_keys else None,
                                    1,
                                    width,
                                    1,
                                    fudge,
                                    edges,
                                )
                            )
                        else:
                            break
                else:
                    child_keys = []
                    height = 1
                    width = 0
                    num_single_nodes = 0
                    num_nodes = 0
                    fudge = 0
                    children_edges = set()
                    max_num_edges = 0
                    children_info = info_stack[-num_children:]
                    del info_stack[-num_children:]
                    for (
                        cur_key,
                        cur_task,
                        cur_keys,
                        cur_height,
                        cur_width,
                        cur_num_nodes,
                        cur_fudge,
                        cur_edges,
                    ) in children_info:
                        if cur_height == 1:
                            num_single_nodes += 1
                        elif cur_height > height:
                            height = cur_height
                        width += cur_width
                        num_nodes += cur_num_nodes
                        fudge += cur_fudge
                        if len(cur_edges) > max_num_edges:
                            max_num_edges = len(cur_edges)
                        children_edges |= cur_edges

                    # Fudge factor to account for possible parallelism with
                    # the boundaries
                    num_children_edges = len(children_edges)
                    fudge += min(
                        num_children - 1, max(0, num_children_edges - max_num_edges)
                    )

                    if fudge > num_children_edges - 1 >= 0:
                        fudge = num_children_edges - 1
                    edges |= children_edges
                    no_new_edges = len(edges) == num_children_edges
                    if not no_new_edges:
                        fudge += 1
                    if (
                        (num_nodes + fudge) / height <= ave_width
                        and num_single_nodes <= ave_width
                        and width <= max_width
                        and height <= max_height
                        and
                        # Sanity check; don't go too deep if new levels
                        # introduce new edge dependencies
                        (no_new_edges or height < max_depth_new_edges)
                    ):
                        # Perform substitutions as we go
                        val = dsk[parent]
                        children_deps = set()
                        for child_info in children_info:
                            cur_child = child_info[0]
                            val = subs(val, cur_child, child_info[1])
                            del rv[cur_child]
                            children_deps |= deps_pop(cur_child)
                            reducible_remove(cur_child)
                            if rename_keys:
                                fused_trees_pop(cur_child, None)
                                child_keys.extend(child_info[2])
                        deps_parent -= children
                        deps_parent |= children_deps

                        if rename_keys:
                            child_keys.append(parent)
                            fused_trees[parent] = child_keys

                        if children_stack:
                            info_stack_append(
                                (
                                    parent,
                                    val,
                                    child_keys,
                                    height + 1,
                                    width,
                                    num_nodes + 1,
                                    fudge,
                                    edges,
                                )
                            )
                        else:
                            rv[parent] = val
                            break
                    else:
                        for child_info in children_info:
                            rv[child_info[0]] = child_info[1]
                            reducible_remove(child_info[0])
                        if children_stack:
                            # Allow the parent to be fused, but only under
                            # strict circumstances.  Ensure that linear
                            # chains may still be fused.
                            if width > max_width:
                                width = max_width
                            if fudge > int(ave_width - 1):
                                fudge = int(ave_width - 1)
                            # key, task, height, width, number of nodes,
                            # fudge, set of edges
                            # This task *implicitly* depends on `edges`
                            info_stack_append(
                                (
                                    parent,
                                    rv[parent],
                                    [parent] if rename_keys else None,
                                    1,
                                    width,
                                    1,
                                    fudge,
                                    edges,
                                )
                            )
                        else:
                            break
                # Traverse upwards
                parent = rdeps[parent][0]

    if fuse_subgraphs:
        _inplace_fuse_subgraphs(rv, keys, deps, fused_trees, rename_keys)

    if key_renamer:
        for root_key, fused_keys in fused_trees.items():
            alias = key_renamer(fused_keys)
            if alias is not None and alias not in rv:
                rv[alias] = rv[root_key]
                rv[root_key] = alias
                deps[alias] = deps[root_key]
                deps[root_key] = {alias}

    return rv, deps

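
# Hedged example of the reduction fusing described in the docstring, on a tiny
# hand-written graph; ``inc`` and ``add`` are illustrative placeholders.
def inc(x):
    return x + 1


def add(x, y):
    return x + y


dsk = {"a": 1, "b": (inc, "a"), "c": (inc, "b"), "d": (add, "c", "c")}
fused, deps = fuse(dsk, keys=["d"], ave_width=1)
# The linear chain a -> b -> c has a single dependent at each step, so it is
# inlined into the task for "d" (which may be re-keyed to a renamed alias when
# rename_keys is enabled).
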