def compute_delayed_functions(list_of_computations: List[Tuple[Delayed, Dict]],
                              client: Client,
                              nb_of_retries_if_erred: int,
                              error_logger_name: str,
                              error_logger_file_name: str) -> None:
    print("start compute")
    print(list_of_computations)

    list_of_delayed_function_calls = [computation[0] for computation in list_of_computations]
    list_of_futures: List[Future] = client.compute(list_of_delayed_function_calls,
                                                   retries=nb_of_retries_if_erred)
    distributed.wait(list_of_futures)
    print("end compute")

    error_logger: Logger = create_logger(logger_name=error_logger_name,
                                         log_file_name=error_logger_file_name)
    future: Future
    for future, (delayed, func_args) in zip(list_of_futures, list_of_computations):
        if future.status == 'error':
            exception = future.exception()
            error_logger.error(f"{exception.__class__}: {exception}\n"
                               f"\tfor arguments {func_args}")
    close_logger(error_logger)
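# Hypothetical usage sketch (not from the original source) for
# compute_delayed_functions above: it expects (Delayed, kwargs-dict) pairs,
# where the dict is only used for error reporting. `process_file` and the
# file names are made-up placeholders; a local Client is assumed.
from dask import delayed
from distributed import Client

def process_file(path, threshold=0.5):
    # placeholder for the real work
    return path, threshold

if __name__ == "__main__":
    client = Client()  # local cluster for illustration
    computations = [
        (delayed(process_file)(p, threshold=0.9), {"path": p, "threshold": 0.9})
        for p in ["a.csv", "b.csv"]
    ]
    compute_delayed_functions(
        computations,
        client,
        nb_of_retries_if_erred=2,
        error_logger_name="compute_errors",
        error_logger_file_name="compute_errors.log",
    )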
def test_publish_bag(s, a, b):
    db = pytest.importorskip('dask.bag')
    c = Client((s.ip, s.port), start=False)
    yield c._start()
    f = Client((s.ip, s.port), start=False)
    yield f._start()

    bag = db.from_sequence([0, 1, 2])
    bagp = c.persist(bag)

    assert len(futures_of(bagp)) == 3
    keys = {f.key for f in futures_of(bagp)}
    assert keys == set(bag.dask)

    yield c._publish_dataset(data=bagp)

    # check that serialization didn't affect original bag's dask
    assert len(futures_of(bagp)) == 3

    result = yield f._get_dataset('data')
    assert set(result.dask.keys()) == set(bagp.dask.keys())
    assert {f.key for f in result.dask.values()} == {f.key for f in bagp.dask.values()}

    out = yield f.compute(result)._result()
    assert out == [0, 1, 2]

    yield c._shutdown()
    yield f._shutdown()
def submit(
    self,
    client: Client = None,
    scheduler_address: str = None,
    priority: int = None,
    resources: Dict[str, Any] = None,
    show_progress=False,
    **kwargs,
) -> None:
    if not priority:
        priority = self.priority
    if not resources:
        resources = self.resources
    if not client:
        client = Client(scheduler_address)
    self.scheduler_address = client.scheduler.address

    computation = client.compute(self.graph,
                                 retries=3,
                                 priority=priority,
                                 resources=resources)
    if show_progress:
        progress(computation)
    fire_and_forget(computation)

    if scheduler_address:
        client.close()
    return None
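# Hypothetical usage note (not from the original source): the resources=
# argument that submit() forwards to client.compute only takes effect when
# workers are started with matching resource labels, e.g.:
#   dask-worker tcp://scheduler:8786 --resources "GPU=1"
# A caller could then route the graph onto those workers, assuming a
# hypothetical `flow` instance of the class defining submit():
#   flow.submit(scheduler_address="tcp://scheduler:8786", resources={"GPU": 1})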
def yxbt_sink_to_mem(bands: Tuple[da.Array, ...], client: Client) -> np.ndarray:
    assert client.scheduler.address.startswith("inproc://")
    b = bands[0]
    dtype = b.dtype
    nt, ny, nx = b.shape
    nb = len(bands)

    token = Cache.new((ny, nx, nb, nt), dtype)
    sinks = [_YXBTSink(str(token), idx) for idx in range(nb)]
    try:
        fut = da.store(bands, sinks, lock=False, compute=False)
        fut = client.compute(fut)
        fut.result()
        return Cache.get(token)
    finally:
        token.release()
async def test_basic_state(c: Client, s: Scheduler, *workers: Worker):
    df = dd.demo.make_timeseries(freq="15D", partition_freq="30D")
    shuffled = shuffle(df, "id")

    exts: list[ShuffleWorkerExtension] = [w.extensions["shuffle"] for w in workers]
    for ext in exts:
        assert not ext.shuffles

    f = c.compute(shuffled)
    # TODO this is a bad/pointless test. The `f.done()` is necessary in case the
    # shuffle is really fast. To test state more thoroughly, we'd need a way to
    # 'stop the world' at various stages. Like have the scheduler pause everything
    # when the barrier is reached. Not sure yet how to implement that.
    while not all(len(ext.shuffles) == 1 for ext in exts) and not f.done():
        await asyncio.sleep(0.1)

    await f
    assert all(not ext.shuffles for ext in exts)
def store_to_mem(xx: da.Array, client: Client, out: Optional[np.ndarray] = None) -> np.ndarray:
    assert client.scheduler.address.startswith('inproc://')

    if out is None:
        sink = DataSink.new(xx.shape, xx.dtype)
    else:
        assert out.shape == xx.shape
        sink = DataSink.wrap(out)

    try:
        fut = da.store(xx, sink, lock=False, compute=False)
        fut = client.compute(fut)
        fut.result()
        return sink.data
    finally:
        sink.unlink()
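# Hypothetical usage sketch (not from the original source): store_to_mem above
# requires an in-process scheduler, which Client(processes=False) provides (its
# address uses the "inproc://" transport). Assumes the DataSink class referenced
# above is available in this module.
import numpy as np
import dask.array as da
from distributed import Client

if __name__ == "__main__":
    client = Client(processes=False)  # threads in this process, inproc:// scheduler
    xx = da.arange(1_000_000, chunks=100_000).reshape(1000, 1000)
    out = np.empty(xx.shape, dtype=xx.dtype)
    result = store_to_mem(xx, client, out=out)  # fills `out` without copying through the network
    print(result.shape, result.dtype)
    client.close()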
def test_with_distributed_client(self):
    lc = LocalCluster(diagnostics_port=None)
    client = Client(lc)

    graph = create_graph(net1_ex_matrix,
                         net1_gene_names,
                         net1_tf_names,
                         "GBM",
                         SGBM_KWARGS,
                         target_genes=list(self.test_range),
                         client=client)

    network_df = client.compute(graph, sync=True)

    self.assertEqual(len(self.test_range), len(network_df['target'].unique()))

    client.close()
    lc.close()
def store_to_mem(xx: da.Array, client: Client, out: Optional[np.ndarray] = None) -> np.ndarray:
    assert client.scheduler.address.startswith("inproc://")
    token = None

    if out is None:
        sink = dask.delayed(CachedArray.new)(xx.shape, xx.dtype)
    else:
        assert out.shape == xx.shape
        token = Cache.put(out)
        sink = dask.delayed(CachedArray)(str(token))

    try:
        fut = da.store(xx, sink, lock=False, compute=False)
        fut, _sink = client.compute([fut, sink])
        fut.result()
        return _sink.result().data
    finally:
        if token is not None:
            token.release()
def sequential_rechunk(source: Any,
                       target: Any,
                       slab_size: Tuple[int],
                       intermediate_chunks: Tuple[int],
                       client: distributed.Client,
                       num_workers: int) -> List[None]:
    """
    Load slabs of an array into local memory, then create a dask array and
    rechunk that dask array, then store into chunked array storage.
    """
    results = []
    slices = da.core.slices_from_chunks(source.rechunk(slab_size).chunks)

    for sl in slices:
        arr_in = source[sl].compute(scheduler='threads')
        darr_in = da.from_array(arr_in, chunks=intermediate_chunks)
        store_op = da.store(darr_in, target, regions=sl, compute=False, lock=None)
        client.cluster.scale(num_workers)
        results.extend(client.compute(store_op).result())
        client.cluster.scale(0)

    return results
def submit(self,
           client: Client = None,
           client_address: str = None,
           priority: int = None) -> None:
    if not priority:
        priority = self.priority

    # if a client is provided, we assume the user will close it on their end
    own_client = client is None
    if own_client:
        client = Client(client_address)

    computation = client.compute(self.graph, retries=3, priority=priority)
    fire_and_forget(computation)
    self.status = "submitted"

    if own_client:
        client.close()

    schema = "administration"
    table = "workflow_queue"
    engine = os.getenv("QUEUE_ENGINE") or "mssql+pyodbc://redshift_acoe"
    self.submit_to_queue(engine, schema, table, priority)

    return None
skip = 1

# We pass in the scheduler from the invoking script
if len(sys.argv) > 1:
    scheduler = sys.argv[1]
    client = Client(scheduler)
else:
    client = Client()

sparse_graph_list = [delayed(init_sparse)(len_chunk) for i in range(nchunks)]
psf_graph_list = [delayed(grid_and_invert_data)(s, shape) for s in sparse_graph_list]
sum_psf_graph_rank1 = [delayed(numpy.sum)(psf_graph_list[i:i + nreduce])
                       for i in range(0, nchunks, nreduce)]
sum_psf_graph = delayed(numpy.sum)(sum_psf_graph_rank1)

future = client.compute(sum_psf_graph)
psf = future.result()
print(numpy.max(psf))
client.close()

print("*** Successfully reached end in %.1f seconds ***" % (time.time() - start))
exit(0)
from distributed import Client
from dask import delayed
import time

if __name__ == "__main__":
    c = Client()
    addr = c.scheduler_info()['address']
    services = c.scheduler_info()['services']
    if 'bokeh' in services.keys():
        bokeh_addr = 'http:%s:%s' % (addr.split(':')[1], services['bokeh'])
        print('Diagnostic pages available at %s' % bokeh_addr)

    work = [delayed(time.sleep)(5.0 + i / 100.0) for i in range(100)]
    c.compute(work, sync=True)

    c.close()
    print('Finished')