def compute_delayed_functions(list_of_computations: List[Tuple[Delayed, Dict]],
                              client: Client, nb_of_retries_if_erred: int,
                              error_logger_name: str,
                              error_logger_file_name: str) -> None:
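    """Compute a batch of delayed calls on ``client``, retrying failures, and
    log any computation that still errs together with its arguments."""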
    print("start compute")
    print(list_of_computations)

    list_of_delayed_function_calls = [
        computation[0] for computation in list_of_computations
    ]

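    # Submit all delayed calls at once; compute() returns one Future per call,
    # and wait() blocks until every future has finished or erred.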
    list_of_futures: List[Future] = client.compute(
        list_of_delayed_function_calls, retries=nb_of_retries_if_erred)
    distributed.wait(list_of_futures)
    print("end compute")

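    # Log every computation that still erred after the retries, together with
    # the arguments recorded for it.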
    error_logger: Logger = create_logger(logger_name=error_logger_name,
                                         log_file_name=error_logger_file_name)
    future: Future
    for future, (delayed, func_args) in zip(list_of_futures,
                                            list_of_computations):
        if future.status == 'error':
            exception = future.exception()
            error_logger.error(f"{exception.__class__}: {exception}\n"
                               f"\tfor arguments {func_args}")
    close_logger(error_logger)
Example #2
def test_publish_bag(s, a, b):
    db = pytest.importorskip('dask.bag')
    c = Client((s.ip, s.port), start=False)
    yield c._start()
    f = Client((s.ip, s.port), start=False)
    yield f._start()

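    # c publishes the persisted bag; f, a second client on the same scheduler,
    # later retrieves it by name.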
    bag = db.from_sequence([0, 1, 2])
    bagp = c.persist(bag)

    assert len(futures_of(bagp)) == 3
    keys = {f.key for f in futures_of(bagp)}
    assert keys == set(bag.dask)

    yield c._publish_dataset(data=bagp)

    # check that serialization didn't affect original bag's dask
    assert len(futures_of(bagp)) == 3

    result = yield f._get_dataset('data')
    assert set(result.dask.keys()) == set(bagp.dask.keys())
    assert ({f.key for f in result.dask.values()}
            == {f.key for f in bagp.dask.values()})

    out = yield f.compute(result)._result()
    assert out == [0, 1, 2]
    yield c._shutdown()
    yield f._shutdown()
Example #3
    def submit(
        self,
        client: Client = None,
        scheduler_address: str = None,
        priority: int = None,
        resources: Dict[str, Any] = None,
        show_progress=False,
        **kwargs,
    ) -> None:

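        # Fall back to the instance-level defaults when no overrides are given.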
        if not priority:
            priority = self.priority

        if not resources:
            resources = self.resources

        # Remember whether the client was created here so we know who closes it.
        own_client = not client
        if own_client:
            client = Client(scheduler_address)

        self.scheduler_address = client.scheduler.address

        computation = client.compute(self.graph,
                                     retries=3,
                                     priority=priority,
                                     resources=resources)
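        # fire_and_forget asks the scheduler to keep computing (and keep the
        # result) even if no client is tracking the future.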
        if show_progress:
            progress(computation)
        fire_and_forget(computation)
        if own_client:
            # Close only a client we created; a caller-provided client is the
            # caller's responsibility.
            client.close()
        return None
Example #4
def yxbt_sink_to_mem(bands: Tuple[da.Array, ...], client: Client) -> np.ndarray:
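    # Only valid on an in-process cluster, where workers share this process's memory.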
    assert client.scheduler.address.startswith("inproc://")

    b = bands[0]
    dtype = b.dtype
    nt, ny, nx = b.shape
    nb = len(bands)
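    # Allocate an in-memory (y, x, band, time) destination and one sink per band.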
    token = Cache.new((ny, nx, nb, nt), dtype)
    sinks = [_YXBTSink(str(token), idx) for idx in range(nb)]
    try:
        fut = da.store(bands, sinks, lock=False, compute=False)
        fut = client.compute(fut)
        fut.result()
        return Cache.get(token)
    finally:
        token.release()
Example #5
async def test_basic_state(c: Client, s: Scheduler, *workers: Worker):
    df = dd.demo.make_timeseries(freq="15D", partition_freq="30D")
    shuffled = shuffle(df, "id")

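    # No worker's shuffle extension should have an active shuffle yet.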
    exts: list[ShuffleWorkerExtension] = [w.extensions["shuffle"] for w in workers]
    for ext in exts:
        assert not ext.shuffles

    f = c.compute(shuffled)
    # TODO this is a bad/pointless test. the `f.done()` is necessary in case the shuffle is really fast.
    # To test state more thoroughly, we'd need a way to 'stop the world' at various stages. Like have the
    # scheduler pause everything when the barrier is reached. Not sure yet how to implement that.
    while not all(len(ext.shuffles) == 1 for ext in exts) and not f.done():
        await asyncio.sleep(0.1)

    await f
    assert all(not ext.shuffles for ext in exts)
Example #6
def store_to_mem(xx: da.Array,
                 client: Client,
                 out: Optional[np.ndarray] = None) -> np.ndarray:
    assert client.scheduler.address.startswith('inproc://')
    if out is None:
        sink = DataSink.new(xx.shape, xx.dtype)
    else:
        assert out.shape == xx.shape
        sink = DataSink.wrap(out)

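    # Build the store graph lazily, run it on the cluster, then hand back the
    # sink's in-memory buffer; unlink the sink even if the store fails.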
    try:
        fut = da.store(xx, sink, lock=False, compute=False)
        fut = client.compute(fut)
        fut.result()
        return sink.data
    finally:
        sink.unlink()
Example #7
    def test_with_distributed_client(self):
        lc = LocalCluster(diagnostics_port=None)
        client = Client(lc)

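        # create_graph builds a lazy dask graph; compute(sync=True) blocks and
        # returns the concrete result instead of a Future.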
        graph = create_graph(net1_ex_matrix,
                             net1_gene_names,
                             net1_tf_names,
                             "GBM",
                             SGBM_KWARGS,
                             target_genes=list(self.test_range),
                             client=client)

        network_df = client.compute(graph, sync=True)

        self.assertEqual(len(self.test_range),
                         len(network_df['target'].unique()))

        client.close()
        lc.close()
Example #8
def store_to_mem(xx: da.Array,
                 client: Client,
                 out: Optional[np.ndarray] = None) -> np.ndarray:
    assert client.scheduler.address.startswith("inproc://")
    token = None
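    # Either build the sink lazily on the cluster or wrap a caller-provided
    # buffer that has been registered with the cache.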
    if out is None:
        sink = dask.delayed(CachedArray.new)(xx.shape, xx.dtype)
    else:
        assert out.shape == xx.shape
        token = Cache.put(out)
        sink = dask.delayed(CachedArray)(str(token))

    try:
        fut = da.store(xx, sink, lock=False, compute=False)
        fut, _sink = client.compute([fut, sink])
        fut.result()
        return _sink.result().data
    finally:
        if token is not None:
            token.release()
Example #9
def sequential_rechunk(source: Any, target: Any, slab_size: Tuple[int, ...],
                       intermediate_chunks: Tuple[int, ...],
                       client: distributed.Client,
                       num_workers: int) -> List[None]:
    """
    Load slabs of an array into local memory one at a time, wrap each slab in
    a dask array with the intermediate chunking, and store it into chunked
    array storage.
    """
    results = []
    slices = da.core.slices_from_chunks(source.rechunk(slab_size).chunks)

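    # Process one slab at a time to bound local memory use.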
    for sl in slices:
        arr_in = source[sl].compute(scheduler='threads')
        darr_in = da.from_array(arr_in, chunks=intermediate_chunks)
        store_op = da.store(darr_in,
                            target,
                            regions=sl,
                            compute=False,
                            lock=None)
        # Scale workers up for the store, then back down to release them.
        client.cluster.scale(num_workers)
        # da.store(..., compute=False) returns a single Delayed whose computed
        # value is None, so append rather than extend with the result.
        results.append(client.compute(store_op).result())
        client.cluster.scale(0)
    return results
Example #10
    def submit(self,
               client: Client = None,
               client_address: str = None,
               priority: int = None) -> None:

        if not priority:
            priority = self.priority

        # Track whether the client was created here; if the caller provided
        # one, we assume they will close it on their end.
        own_client = not client
        if own_client:
            client = Client(client_address)
        computation = client.compute(self.graph, retries=3, priority=priority)
        fire_and_forget(computation)
        self.status = "submitted"
        if own_client:
            client.close()

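        # Record the submission in the workflow queue table.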
        schema = "administration"
        table = "workflow_queue"
        engine = os.getenv("QUEUE_ENGINE") or "mssql+pyodbc://redshift_acoe"
        self.submit_to_queue(engine, schema, table, priority)
        return None
Example #11
    skip = 1

    # We pass in the scheduler from the invoking script
    if len(sys.argv) > 1:
        scheduler = sys.argv[1]
        client = Client(scheduler)
    else:
        client = Client()

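    # Three-stage graph: simulate sparse data per chunk, grid and invert each
    # chunk into a PSF, then sum the PSFs in groups of nreduce before a final sum.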
    sparse_graph_list = [
        delayed(init_sparse)(len_chunk) for i in range(nchunks)
    ]
    psf_graph_list = [
        delayed(grid_and_invert_data)(s, shape) for s in sparse_graph_list
    ]
    sum_psf_graph_rank1 = [
        delayed(numpy.sum)(psf_graph_list[i:i + nreduce])
        for i in range(0, nchunks, nreduce)
    ]
    sum_psf_graph = delayed(numpy.sum)(sum_psf_graph_rank1)

    future = client.compute(sum_psf_graph)
    psf = future.result()
    print(numpy.max(psf))

    client.close()
    print("*** Successfully reached end in %.1f seconds ***" %
          (time.time() - start))

    sys.exit(0)
Example #12
from distributed import Client
from dask import delayed
import time

if __name__ == "__main__":

    c = Client()
    addr = c.scheduler_info()['address']
    services = c.scheduler_info()['services']
    if 'bokeh' in services:
        # addr looks like 'tcp://host:port'; splitting on ':' keeps '//host',
        # which is reused to build the dashboard URL.
        bokeh_addr = 'http:%s:%s' % (addr.split(':')[1], services['bokeh'])
        print('Diagnostic pages available at %s' % bokeh_addr)

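    # 100 independent sleep tasks; compute(sync=True) blocks until all finish.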
    work = [delayed(time.sleep)(5.0 + i / 100.0) for i in range(100)]

    c.compute(work, sync=True)
    c.close()

    print('Finished')