def get(dsk, result, cache=None, **kwargs):
    """ Threaded cached implementation of dask.get

    Parameters
    ----------
    dsk: dict
        A dask dictionary specifying a workflow
    result: key or list of keys
        Keys corresponding to desired data
    cache: dict-like (optional)
        Temporary storage of results

    Examples
    --------
    >>> dsk = {'x': 1, 'y': 2, 'z': (inc, 'x'), 'w': (add, 'z', 'y')}
    >>> get(dsk, 'w')
    4
    >>> get(dsk, ['w', 'y'])
    (4, 2)
    """
    pool = _globals['pool']
    if pool is None:
        pool = default_pool

    queue = Queue()
    results = get_async(pool.apply_async, len(pool._pool), dsk, result,
                        cache=cache, queue=queue, get_id=_thread_get_id,
                        **kwargs)
    return results
def get(dsk, keys, optimizations=[fuse], num_workers=None):
    """ Multiprocessed get function appropriate for Bags """
    pool = _globals['pool']
    if pool is None:
        num_workers = num_workers or psutil.cpu_count()
        pool = multiprocessing.Pool(num_workers)
        cleanup = True
    else:
        cleanup = False
        num_workers = len(pool._pool)

    manager = multiprocessing.Manager()
    queue = manager.Queue()

    # Wrap apply_async so tasks are serialized with dill, which handles
    # closures and lambdas that plain pickle rejects
    apply_async = dill_apply_async(pool.apply_async)

    # Optimize Dask: prune unneeded tasks, then apply each optimization pass
    dsk2 = pipe(dsk, partial(cull, keys=keys), *optimizations)

    try:
        # Run
        result = get_async(apply_async, num_workers, dsk2, keys, queue=queue)
    finally:
        if cleanup:
            pool.close()
    return result
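# Usage sketch (not part of the module): assuming this `get` and its helpers
# are importable, a graph is a dict mapping keys to values or task tuples.
# Because tasks go through dill, even a lambda works as a task function here.
if __name__ == '__main__':
    from operator import add

    dsk = {'x': 1, 'y': (lambda a: a + 1, 'x'), 'z': (add, 'y', 10)}
    assert get(dsk, 'z') == 12               # single key -> single value
    assert get(dsk, ['z', 'x']) == (12, 1)   # list of keys -> tuple of values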
def get(dsk, keys, num_workers=None, func_loads=None, func_dumps=None,
        optimize_graph=True, **kwargs):
    """ Multiprocessed get function appropriate for Bags

    Parameters
    ----------
    dsk : dict
        dask graph
    keys : object or list
        Desired results from graph
    num_workers : int
        Number of worker processes (defaults to number of cores)
    func_dumps : function
        Function to use for function serialization
        (defaults to cloudpickle.dumps)
    func_loads : function
        Function to use for function deserialization
        (defaults to cloudpickle.loads)
    optimize_graph : bool
        If True [default], `fuse` is applied to the graph before computation.
    """
    pool = _globals['pool']
    if pool is None:
        pool = multiprocessing.Pool(num_workers)
        cleanup = True
    else:
        cleanup = False

    manager = multiprocessing.Manager()
    queue = manager.Queue()

    apply_async = pickle_apply_async(pool.apply_async,
                                     func_dumps=func_dumps,
                                     func_loads=func_loads)

    # Optimize Dask
    dsk2, dependencies = cull(dsk, keys)
    if optimize_graph:
        dsk3, dependencies = fuse(dsk2, keys, dependencies)
    else:
        dsk3 = dsk2

    try:
        # Run
        result = get_async(apply_async, len(pool._pool), dsk3, keys,
                           queue=queue, get_id=_process_get_id, **kwargs)
    finally:
        if cleanup:
            pool.close()
    return result
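# Usage sketch (assumption: this module's `get` in scope). The serialization
# hooks default to cloudpickle but can be swapped, e.g. for a wrapper that
# logs payload sizes before delegating to cloudpickle.
if __name__ == '__main__':
    import cloudpickle

    def dumps_logged(obj):
        payload = cloudpickle.dumps(obj)
        print('serialized %d bytes' % len(payload))
        return payload

    def inc(x):
        return x + 1

    dsk = {'a': 1, 'b': (inc, 'a')}
    assert get(dsk, 'b', func_dumps=dumps_logged,
               func_loads=cloudpickle.loads) == 2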
def get(dsk, keys, optimizations=[], num_workers=None,
        func_loads=None, func_dumps=None, **kwargs):
    """ Multiprocessed get function appropriate for Bags

    Parameters
    ----------
    dsk: dict
        dask graph
    keys: object or list
        Desired results from graph
    optimizations: list of functions
        optimizations to perform on graph before execution
    num_workers: int
        Number of worker processes (defaults to number of cores)
    func_dumps: function
        Function to use for function serialization
        (defaults to cloudpickle.dumps)
    func_loads: function
        Function to use for function deserialization
        (defaults to cloudpickle.loads)
    """
    pool = _globals['pool']
    if pool is None:
        pool = multiprocessing.Pool(num_workers)
        cleanup = True
    else:
        cleanup = False

    manager = multiprocessing.Manager()
    queue = manager.Queue()

    apply_async = pickle_apply_async(pool.apply_async,
                                     func_dumps=func_dumps,
                                     func_loads=func_loads)

    # Optimize Dask
    dsk2 = fuse(dsk, keys)
    dsk3 = pipe(dsk2, partial(cull, keys=keys), *optimizations)

    try:
        # Run
        result = get_async(apply_async, len(pool._pool), dsk3, keys,
                           queue=queue, get_id=_process_get_id, **kwargs)
    finally:
        if cleanup:
            pool.close()
    return result
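# Usage sketch (assumption: this `get` in scope). Each entry in
# `optimizations` is a graph -> graph function piped in after fusing and
# culling, so custom rewrite passes can be chained:
if __name__ == '__main__':
    def noop_rewrite(dsk):
        # Identity pass standing in for a real graph rewrite (e.g. inlining)
        return dict(dsk)

    def inc(x):
        return x + 1

    dsk = {'a': 1, 'b': (inc, 'a')}
    assert get(dsk, 'b', optimizations=[noop_rewrite]) == 2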
def get(dsk, result, cache=None, num_workers=None, **kwargs):
    """ Threaded cached implementation of dask.get

    Parameters
    ----------
    dsk: dict
        A dask dictionary specifying a workflow
    result: key or list of keys
        Keys corresponding to desired data
    num_workers: integer (optional)
        The number of threads to use in the ThreadPool that will
        actually execute tasks
    cache: dict-like (optional)
        Temporary storage of results

    Examples
    --------
    >>> dsk = {'x': 1, 'y': 2, 'z': (inc, 'x'), 'w': (add, 'z', 'y')}
    >>> get(dsk, 'w')
    4
    >>> get(dsk, ['w', 'y'])
    (4, 2)
    """
    global default_pool
    pool = _globals['pool']
    thread = current_thread()

    with pools_lock:
        if pool is None:
            if num_workers is None and thread is main_thread:
                if default_pool is None:
                    default_pool = ThreadPool()
                pool = default_pool
            elif thread in pools and num_workers in pools[thread]:
                pool = pools[thread][num_workers]
            else:
                pool = ThreadPool(num_workers)
                pools[thread][num_workers] = pool

    results = get_async(pool.apply_async, len(pool._pool), dsk, result,
                        cache=cache, get_id=_thread_get_id, **kwargs)

    # Clean up any pools associated with dead threads
    with pools_lock:
        active_threads = set(threading.enumerate())
        if thread is not main_thread:
            for t in list(pools):
                if t not in active_threads:
                    for p in pools.pop(t).values():
                        p.close()

    return results
def get(dsk, result, nthreads=NUM_CPUS, cache=None, debug_counts=None,
        **kwargs):
    """ Threaded cached implementation of dask.get

    Parameters
    ----------
    dsk: dict
        A dask dictionary specifying a workflow
    result: key or list of keys
        Keys corresponding to desired data
    nthreads: integer (optional)
        The number of threads to use in the ThreadPool that will
        actually execute tasks (defaults to the number of CPUs)
    cache: dict-like (optional)
        Temporary storage of results
    debug_counts: integer or None
        How often the scheduler should dump debugging info

    Examples
    --------
    >>> dsk = {'x': 1, 'y': 2, 'z': (inc, 'x'), 'w': (add, 'z', 'y')}
    >>> get(dsk, 'w')
    4
    >>> get(dsk, ['w', 'y'])
    (4, 2)
    """
    pool = _globals['pool']
    if pool is None:
        pool = ThreadPool(nthreads)
        cleanup = True
    else:
        cleanup = False

    queue = Queue()
    try:
        results = get_async(pool.apply_async, nthreads, dsk, result,
                            cache=cache, debug_counts=debug_counts,
                            queue=queue, **kwargs)
    finally:
        if cleanup:
            pool.close()
            pool.join()
    return results
def get(dsk, keys, num_workers=None, func_loads=None, func_dumps=None,
        optimize_graph=True, **kwargs):
    """ Multiprocessed get function appropriate for Bags

    Parameters
    ----------
    dsk : dict
        dask graph
    keys : object or list
        Desired results from graph
    num_workers : int
        Number of worker processes (defaults to number of cores)
    func_dumps : function
        Function to use for function serialization
        (defaults to cloudpickle.dumps)
    func_loads : function
        Function to use for function deserialization
        (defaults to cloudpickle.loads)
    optimize_graph : bool
        If True [default], `fuse` is applied to the graph before computation.
    """
    pool = _globals['pool']
    if pool is None:
        pool = multiprocessing.Pool(num_workers,
                                    initializer=initialize_worker_process)
        cleanup = True
    else:
        cleanup = False

    # Optimize Dask
    dsk2, dependencies = cull(dsk, keys)
    if optimize_graph:
        dsk3, dependencies = fuse(dsk2, keys, dependencies)
    else:
        dsk3 = dsk2

    # We specify marshalling functions in order to catch serialization
    # errors and report them to the user.
    loads = func_loads or _globals.get('func_loads') or _loads
    dumps = func_dumps or _globals.get('func_dumps') or _dumps

    # Note former versions used a multiprocessing Manager to share
    # a Queue between parent and workers, but this is fragile on Windows
    # (issue #1652).
    try:
        # Run
        result = get_async(pool.apply_async, len(pool._pool), dsk3, keys,
                           get_id=_process_get_id, dumps=dumps, loads=loads,
                           **kwargs)
    finally:
        if cleanup:
            pool.close()
    return result
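# Usage sketch (assumption: access to this module's `_globals` mapping and
# `initialize_worker_process` seen above). Handing in a long-lived pool via
# _globals['pool'] skips per-call pool startup; cleanup is then on the caller.
if __name__ == '__main__':
    import multiprocessing

    def inc(x):
        return x + 1

    dsk = {'a': 1, 'b': (inc, 'a')}
    pool = multiprocessing.Pool(4, initializer=initialize_worker_process)
    _globals['pool'] = pool
    try:
        assert get(dsk, 'b') == 2
        assert get(dsk, ['a', 'b']) == (1, 2)  # pool reused across calls
    finally:
        _globals['pool'] = None
        pool.close()
        pool.join()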