Example #1
 def _determine_worker(self):
     try:
         get_worker()
         self.worker = True
         self.fs = filesystem(self.protocol, **self.storage_options)
     except ValueError:
         self.worker = False
         self.client = _get_global_client()
         self.rfs = dask.delayed(self)
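A pattern worth noting across these examples: get_worker() raises ValueError when called outside a running task, so the try/except doubles as a probe for worker-side execution. A minimal sketch of that probe, assuming only that distributed is installed (the running_on_worker name is illustrative, not from the source):

    from distributed import get_worker

    def running_on_worker() -> bool:
        # get_worker() raises ValueError when no worker is attached to
        # the current thread, so the exception itself is the signal.
        try:
            get_worker()
            return True
        except ValueError:
            return False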
Example #2
 def _determine_worker(self):
     try:
         get_worker()
         self.worker = True
         if self.fs is None:
             self.fs = filesystem(self.target_protocol,
                                  **(self.target_options or {}))
     except ValueError:
         self.worker = False
         self.client = _get_client(self.client)
         self.rfs = dask.delayed(self)
Example #3
 def __init__(self, name=None, client=None, maxsize=0):
     try:
         self.client = client or Client.current()
     except ValueError:
         # Initialise new client
         self.client = get_worker().client
     self.name = name or "variable-" + uuid.uuid4().hex
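Example #3 (and the Lock and Queue constructors below) uses the complementary fallback: prefer an explicit or current Client, and reach through the worker only when no client exists in this thread. A hedged sketch of that resolution order (resolve_client is an illustrative name):

    from distributed import Client, get_worker

    def resolve_client(client=None):
        # Client.current() raises ValueError when no client has been
        # created in this process; inside a task, the worker knows its own.
        try:
            return client or Client.current()
        except ValueError:
            return get_worker().client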
Example #4
File: base.py Project: z7ye/dask-1
def get_scheduler(get=None, scheduler=None, collections=None, cls=None):
    """Get scheduler function

    There are various ways to specify the scheduler to use:

    1.  Passing in scheduler= parameters
    2.  Passing these into global configuration
    3.  Using defaults of a dask collection

    This function centralizes the logic to determine the right scheduler to use
    from those many options
    """
    if get:
        raise TypeError(get_err_msg)

    if scheduler is not None:
        if callable(scheduler):
            return scheduler
        elif "Client" in type(scheduler).__name__ and hasattr(
                scheduler, "get"):
            return scheduler.get
        elif scheduler.lower() in named_schedulers:
            return named_schedulers[scheduler.lower()]
        elif scheduler.lower() in ("dask.distributed", "distributed"):
            from distributed.worker import get_client

            return get_client().get
        else:
            raise ValueError("Expected one of [distributed, %s]" %
                             ", ".join(sorted(named_schedulers)))
        # else:  # try to connect to remote scheduler with this name
        #     return get_client(scheduler).get

    if config.get("scheduler", None):
        return get_scheduler(scheduler=config.get("scheduler", None))

    if config.get("get", None):
        raise ValueError(get_err_msg)

    if getattr(thread_state, "key", False):
        from distributed.worker import get_worker

        return get_worker().client.get

    if cls is not None:
        return cls.__dask_scheduler__

    if collections:
        collections = [c for c in collections if c is not None]
    if collections:
        get = collections[0].__dask_scheduler__
        if not all(c.__dask_scheduler__ == get for c in collections):
            raise ValueError("Compute called on multiple collections with "
                             "differing default schedulers. Please specify a "
                             "`scheduler=` parameter explicitly in compute or "
                             "globally with `dask.config.set`.")
        return get

    return None
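For context, this get_scheduler lives in dask/base.py, where named_schedulers, config, thread_state, and get_err_msg are module-level names. A hedged usage sketch (the exact signature and accepted names have shifted across dask versions):

    import dask
    from dask.base import get_scheduler

    # Resolve a scheduler function by name
    sync_get = get_scheduler(scheduler="synchronous")

    # Or let the global configuration decide
    with dask.config.set(scheduler="threads"):
        threaded_get = get_scheduler()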
Example #5
File: base.py Project: yliapis/dask
def get_scheduler(get=None, scheduler=None, collections=None, cls=None):
    """ Get scheduler function

    There are various ways to specify the scheduler to use:

    1.  Passing in scheduler= parameters
    2.  Passing these into global configuration
    3.  Using defaults of a dask collection

    This function centralizes the logic to determine the right scheduler to use
    from those many options
    """
    if get:
        raise TypeError(get_err_msg)

    if scheduler is not None:
        if callable(scheduler):
            return scheduler
        elif "Client" in type(scheduler).__name__ and hasattr(scheduler, 'get'):
            return scheduler.get
        elif scheduler.lower() in named_schedulers:
            return named_schedulers[scheduler.lower()]
        elif scheduler.lower() in ('dask.distributed', 'distributed'):
            from distributed.worker import get_client
            return get_client().get
        elif scheduler.lower() in ['processes', 'multiprocessing']:
            raise ValueError("Please install cloudpickle to use the '%s' scheduler." % scheduler)
        else:
            raise ValueError("Expected one of [distributed, %s]" % ', '.join(sorted(named_schedulers)))
        # else:  # try to connect to remote scheduler with this name
        #     return get_client(scheduler).get

    if config.get('scheduler', None):
        return get_scheduler(scheduler=config.get('scheduler', None))

    if config.get('get', None):
        raise ValueError(get_err_msg)

    if getattr(thread_state, 'key', False):
        from distributed.worker import get_worker
        return get_worker().client.get

    if cls is not None:
        return cls.__dask_scheduler__

    if collections:
        collections = [c for c in collections if c is not None]
    if collections:
        get = collections[0].__dask_scheduler__
        if not all(c.__dask_scheduler__ == get for c in collections):
            raise ValueError("Compute called on multiple collections with "
                             "differing default schedulers. Please specify a "
                             "`scheduler=` parameter explicitly in compute or "
                             "globally with `dask.config.set`.")
        return get

    return None
Example #6
def get_scheduler(get=None, scheduler=None, collections=None, cls=None):
    """ Get scheduler function

    There are various ways to specify the scheduler to use:

    1.  Passing in get= parameters (deprecated)
    2.  Passing in scheduler= parameters
    3.  Passing these into global configuration
    4.  Using defaults of a dask collection

    This function centralizes the logic to determine the right scheduler to use
    from those many options
    """
    if get is not None:
        if scheduler is not None:
            raise ValueError("Both get= and scheduler= provided.  Choose one")
        warn_on_get(get)
        return get

    if scheduler is not None:
        if scheduler.lower() in named_schedulers:
            return named_schedulers[scheduler.lower()]
        elif scheduler.lower() in ('dask.distributed', 'distributed'):
            from distributed.worker import get_client
            return get_client().get
        else:
            raise ValueError("Expected one of [distributed, %s]" %
                             ', '.join(sorted(named_schedulers)))
        # else:  # try to connect to remote scheduler with this name
        #     return get_client(scheduler).get

    if config.get('scheduler', None):
        return get_scheduler(scheduler=config.get('scheduler', None))

    if config.get('get', None):
        warn_on_get(config.get('get', None))
        return config.get('get', None)

    if getattr(thread_state, 'key', False):
        from distributed.worker import get_worker
        return get_worker().client.get

    if cls is not None:
        return cls.__dask_scheduler__

    if collections:
        collections = [c for c in collections if c is not None]
    if collections:
        get = collections[0].__dask_scheduler__
        if not all(c.__dask_scheduler__ == get for c in collections):
            raise ValueError("Compute called on multiple collections with "
                             "differing default schedulers. Please specify a "
                             "`scheduler=` parameter explicitly in compute or "
                             "globally with `set_options`.")
        return get

    return None
Example #7
 def __init__(self, name=None, client=None):
     try:
         self.client = client or Client.current()
     except ValueError:
         # Initialise new client
         self.client = get_worker().client
     self.name = name or "lock-" + uuid.uuid4().hex
     self.id = uuid.uuid4().hex
     self._locked = False
Example #8
def get_scheduler(get=None, scheduler=None, collections=None, cls=None):
    """ Get scheduler function

    There are various ways to specify the scheduler to use:

    1.  Passing in get= parameters (deprecated)
    2.  Passing in scheduler= parameters
    3.  Passing these into global configuration
    4.  Using defaults of a dask collection

    This function centralizes the logic to determine the right scheduler to use
    from those many options
    """
    if get is not None:
        if scheduler is not None:
            raise ValueError("Both get= and scheduler= provided.  Choose one")
        warn_on_get(get)
        return get

    if scheduler is not None:
        if scheduler.lower() in named_schedulers:
            return named_schedulers[scheduler.lower()]
        elif scheduler.lower() in ('dask.distributed', 'distributed'):
            from distributed.worker import get_client
            return get_client().get
        else:
            raise ValueError("Expected one of [distributed, %s]" % ', '.join(sorted(named_schedulers)))
        # else:  # try to connect to remote scheduler with this name
        #     return get_client(scheduler).get

    if config.get('scheduler', None):
        return get_scheduler(scheduler=config.get('scheduler', None))

    if config.get('get', None):
        warn_on_get(config.get('get', None))
        return config.get('get', None)

    if getattr(thread_state, 'key', False):
        from distributed.worker import get_worker
        return get_worker().client.get

    if cls is not None:
        return cls.__dask_scheduler__

    if collections:
        collections = [c for c in collections if c is not None]
    if collections:
        get = collections[0].__dask_scheduler__
        if not all(c.__dask_scheduler__ == get for c in collections):
            raise ValueError("Compute called on multiple collections with "
                             "differing default schedulers. Please specify a "
                             "`scheduler=` parameter explicitly in compute or "
                             "globally with `set_options`.")
        return get

    return None
Example #9
from contextlib import contextmanager

@contextmanager
def worker_client(timeout=None, separate_thread=True):
    """Get client for this thread

    This context manager is intended to be called within functions that we run
    on workers.  When run as a context manager it delivers a
    ``Client`` object that can submit other tasks directly from that worker.

    Parameters
    ----------
    timeout : Number or String
        Timeout after which to error out. Defaults to the
        ``distributed.comm.timeouts.connect`` configuration value.
    separate_thread : bool, optional
        Whether to run this function outside of the normal thread pool;
        defaults to True.

    Examples
    --------
    >>> def func(x):
    ...     with worker_client(timeout="10s") as c:  # connect from worker back to scheduler
    ...         a = c.submit(inc, x)     # this task can submit more tasks
    ...         b = c.submit(dec, x)
    ...         result = c.gather([a, b])  # and gather results
    ...     return result

    >>> future = client.submit(func, 1)  # submit func(1) on cluster

    See Also
    --------
    get_worker
    get_client
    secede
    """

    if timeout is None:
        timeout = dask.config.get("distributed.comm.timeouts.connect")

    timeout = dask.utils.parse_timedelta(timeout, "s")

    worker = get_worker()
    client = get_client(timeout=timeout)
    if separate_thread:
        duration = time() - thread_state.start_time
        secede()  # have this thread secede from the thread pool
        worker.loop.add_callback(
            worker.transition,
            worker.tasks[thread_state.key],
            "long-running",
            stimulus_id=f"worker-client-secede-{time()}",
            compute_duration=duration,
        )

    yield client

    if separate_thread:
        rejoin()
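A hedged end-to-end sketch of the docstring's pattern, assuming a local cluster (inc and nested are illustrative stand-ins, not from the source):

    from distributed import Client, worker_client

    def inc(x):
        return x + 1

    def nested(x):
        with worker_client(timeout="10s") as c:
            # Submit more tasks from inside a task, then gather them
            futures = [c.submit(inc, x + i) for i in range(3)]
            return c.gather(futures)

    if __name__ == "__main__":
        client = Client()  # local cluster
        print(client.submit(nested, 0).result())  # [1, 2, 3]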
Example #10
def f(x, sem, kill_address):
    with sem:
        from distributed.worker import get_worker

        worker = get_worker()
        if worker.address == kill_address:
            import os

            os.kill(os.getpid(), 15)
        return x
Example #11
 async def f():
     """Trigger a memory_monitor() and reset memory_limit"""
     w = get_worker()
     # Set a host memory limit that triggers spilling to disk
     w.memory_pause_fraction = False
     memory = w.monitor.proc.memory_info().rss
     w.memory_limit = memory - 10 ** 8
     w.memory_target_fraction = 1
     await w.memory_monitor()
     # Check that host memory is freed
     assert w.monitor.proc.memory_info().rss < memory - 10 ** 7
     w.memory_limit = memory * 10  # Un-limit
Example #12
    def __init__(self, name=None, client=None, maxsize=0):
        try:
            self.client = client or Client.current()
        except ValueError:
            # Initialise new client
            self.client = get_worker().client
        self.name = name or "queue-" + uuid.uuid4().hex
        self.maxsize = maxsize

        if self.client.asynchronous:
            self._started = asyncio.ensure_future(self._start())
        else:
            self.client.sync(self._start)
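Example #12 is the distributed.Queue constructor. A hedged usage sketch of the resulting object, assuming a local cluster:

    from distributed import Client, Queue

    client = Client()
    q = Queue(name="jobs", maxsize=10)
    q.put({"task": 1})
    print(q.get())    # {'task': 1}
    print(q.qsize())  # 0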
Example #13
    def __init__(
        self,
        max_leases=1,
        name=None,
        register=True,
        scheduler_rpc=None,
        loop=None,
    ):

        try:
            worker = get_worker()
            self.scheduler = scheduler_rpc or worker.scheduler
            self.loop = loop or worker.loop

        except ValueError:
            client = get_client()
            self.scheduler = scheduler_rpc or client.scheduler
            self.loop = loop or client.io_loop

        self.name = name or "semaphore-" + uuid.uuid4().hex
        self.max_leases = max_leases
        self.id = uuid.uuid4().hex
        self._leases = deque()

        self.refresh_leases = True

        self._registered = None
        if register:
            self._registered = self.register()

        # this should give ample time to refresh without introducing another
        # config parameter since this *must* be smaller than the timeout anyhow
        refresh_leases_interval = (parse_timedelta(
            dask.config.get("distributed.scheduler.locks.lease-timeout"),
            default="s",
        ) / 5)
        pc = PeriodicCallback(self._refresh_leases,
                              callback_time=refresh_leases_interval * 1000)
        self.refresh_callback = pc

        # Need to start the callback using IOLoop.add_callback to ensure that the
        # PC uses the correct event loop.
        self.loop.add_callback(pc.start)
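Example #13 is the distributed.Semaphore constructor, resolving its scheduler connection through the same worker-or-client probe. A hedged usage sketch, assuming a local cluster (guarded is an illustrative name):

    from distributed import Client, Semaphore

    client = Client()
    sem = Semaphore(max_leases=2, name="database")

    def guarded(x):
        # At most two tasks hold a lease at any moment, cluster-wide.
        with sem:
            return x * 2

    print(client.gather(client.map(guarded, range(6))))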
Example #14
 def __init__(self, cls, address, key, worker=None):
     super().__init__(key)
     self._cls = cls
     self._address = address
     self._future = None
     if worker:
         self._worker = worker
         self._client = None
     else:
         try:
             # TODO: `get_worker` may return the wrong worker instance for async local clusters (most tests)
             # when run outside of a task (when deserializing a key pointing to an Actor, etc.)
             self._worker = get_worker()
         except ValueError:
             self._worker = None
         try:
             self._client = get_client()
             self._future = Future(key, inform=self._worker is None)
             # ^ When running on a worker, only hold a weak reference to the key, otherwise the key could become unreleasable.
         except ValueError:
             self._client = None
Example #15
def persist(*args, **kwargs):
    """ Persist multiple Dask collections into memory

    This turns lazy Dask collections into Dask collections with the same
    metadata, but now with their results fully computed or actively computing
    in the background.

    For example a lazy dask.array built up from many lazy calls will now be a
    dask.array of the same shape, dtype, chunks, etc., but now with all of
    those previously lazy tasks either computed in memory as many small NumPy
    arrays (in the single-machine case) or asynchronously running in the
    background on a cluster (in the distributed case).

    This function operates differently if a ``dask.distributed.Client`` exists
    and is connected to a distributed scheduler.  In this case this function
    will return as soon as the task graph has been submitted to the cluster,
    but before the computations have completed.  Computations will continue
    asynchronously in the background.  When using this function with the single
    machine scheduler it blocks until the computations have finished.

    When using Dask on a single machine you should ensure that the dataset fits
    entirely within memory.

    Examples
    --------
    >>> df = dd.read_csv('/path/to/*.csv')  # doctest: +SKIP
    >>> df = df[df.name == 'Alice']  # doctest: +SKIP
    >>> df['in-debt'] = df.balance < 0  # doctest: +SKIP
    >>> df = df.persist()  # triggers computation  # doctest: +SKIP

    >>> df.value().min()  # future computations are now fast  # doctest: +SKIP
    -10
    >>> df.value().max()  # doctest: +SKIP
    100

    >>> from dask import persist  # use persist function on multiple collections
    >>> a, b = persist(a, b)  # doctest: +SKIP

    Parameters
    ----------
    *args: Dask collections
    get : callable, optional
        A scheduler ``get`` function to use. If not provided, the default
        is to check the global settings first, and then fall back to
        the collection defaults.
    optimize_graph : bool, optional
        If True [default], the graph is optimized before computation.
        Otherwise the graph is run as is. This can be useful for debugging.
    **kwargs
        Extra keywords to forward to the scheduler ``get`` function.

    Returns
    -------
    New dask collections backed by in-memory data
    """
    collections = [a for a in args if is_dask_collection(a)]
    if not collections:
        return args

    get = kwargs.pop('get', None) or _globals['get']

    if get is None and getattr(thread_state, 'key', False):
        from distributed.worker import get_worker
        get = get_worker().client.get

    if inspect.ismethod(get):
        try:
            from distributed.client import default_client
        except ImportError:
            pass
        else:
            try:
                client = default_client()
            except ValueError:
                pass
            else:
                if client.get == _globals['get']:
                    collections = client.persist(collections, **kwargs)
                    if isinstance(collections,
                                  list):  # distributed is inconsistent here
                        collections = tuple(collections)
                    else:
                        collections = (collections, )
                    results_iter = iter(collections)
                    return tuple(
                        a if not is_dask_collection(a) else next(results_iter)
                        for a in args)

    optimize_graph = kwargs.pop('optimize_graph', True)

    if not get:
        get = collections[0].__dask_scheduler__
        if not all(a.__dask_scheduler__ == get for a in collections):
            raise ValueError("Compute called on multiple collections with "
                             "differing default schedulers. Please specify a "
                             "scheduler `get` function using either "
                             "the `get` kwarg or globally with `set_options`.")

    dsk = collections_to_dsk(collections, optimize_graph, **kwargs)

    keys, postpersists = [], []
    for a in args:
        if is_dask_collection(a):
            a_keys = list(flatten(a.__dask_keys__()))
            rebuild, state = a.__dask_postpersist__()
            keys.extend(a_keys)
            postpersists.append((rebuild, a_keys, state))
        else:
            postpersists.append((None, None, a))

    results = get(dsk, keys, **kwargs)
    d = dict(zip(keys, results))
    return tuple(s if r is None else r({k: d[k]
                                        for k in ks}, *s)
                 for r, ks, s in postpersists)
Example #16
def compute(*args, **kwargs):
    """Compute several dask collections at once.

    Parameters
    ----------
    args : object
        Any number of objects. If it is a dask object, it's computed and the
        result is returned. By default, python builtin collections are also
        traversed to look for dask objects (for more information see the
        ``traverse`` keyword). Non-dask arguments are passed through unchanged.
    traverse : bool, optional
        By default dask traverses builtin python collections looking for dask
        objects passed to ``compute``. For large collections this can be
        expensive. If none of the arguments contain any dask objects, set
        ``traverse=False`` to avoid doing this traversal.
    get : callable, optional
        A scheduler ``get`` function to use. If not provided, the default is
        to check the global settings first, and then fall back to defaults for
        the collections.
    optimize_graph : bool, optional
        If True [default], the optimizations for each collection are applied
        before computation. Otherwise the graph is run as is. This can be
        useful for debugging.
    kwargs
        Extra keywords to forward to the scheduler ``get`` function.

    Examples
    --------
    >>> import dask.array as da
    >>> a = da.arange(10, chunks=2).sum()
    >>> b = da.arange(10, chunks=2).mean()
    >>> compute(a, b)
    (45, 4.5)

    By default, dask objects inside python collections will also be computed:

    >>> compute({'a': a, 'b': b, 'c': 1})  # doctest: +SKIP
    ({'a': 45, 'b': 4.5, 'c': 1},)
    """
    from dask.delayed import delayed
    traverse = kwargs.pop('traverse', True)
    if traverse:
        args = tuple(
            delayed(a) if isinstance(a, (list, set, tuple, dict,
                                         Iterator)) else a for a in args)

    optimize_graph = kwargs.pop('optimize_graph', True)
    variables = [a for a in args if is_dask_collection(a)]
    if not variables:
        return args

    get = kwargs.pop('get', None) or _globals['get']

    if get is None and getattr(thread_state, 'key', False):
        from distributed.worker import get_worker
        get = get_worker().client.get

    if not get:
        get = variables[0].__dask_scheduler__
        if not all(a.__dask_scheduler__ == get for a in variables):
            raise ValueError("Compute called on multiple collections with "
                             "differing default schedulers. Please specify a "
                             "scheduler `get` function using either "
                             "the `get` kwarg or globally with `set_options`.")

    dsk = collections_to_dsk(variables, optimize_graph, **kwargs)
    keys = [var.__dask_keys__() for var in variables]
    postcomputes = [
        a.__dask_postcompute__() if is_dask_collection(a) else (None, a)
        for a in args
    ]
    results = get(dsk, keys, **kwargs)
    results_iter = iter(results)
    return tuple(a if f is None else f(next(results_iter), *a)
                 for f, a in postcomputes)
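These older persist/compute bodies rely on the since-removed get= keyword and the _globals dict; in current dask the equivalent knob is scheduler=. A hedged sketch of the modern call:

    import dask
    import dask.array as da

    a = da.arange(10, chunks=2).sum()
    b = da.arange(10, chunks=2).mean()
    print(dask.compute(a, b, scheduler="threads"))  # (45, 4.5)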
Example #17
def persist(*args, **kwargs):
    """ Persist multiple Dask collections into memory

    This turns lazy Dask collections into Dask collections with the same
    metadata, but now with their results fully computed or actively computing
    in the background.

    For example a lazy dask.array built up from many lazy calls will now be a
    dask.array of the same shape, dtype, chunks, etc., but now with all of
    those previously lazy tasks either computed in memory as many small :class:`numpy.array`
    (in the single-machine case) or asynchronously running in the
    background on a cluster (in the distributed case).

    This function operates differently if a ``dask.distributed.Client`` exists
    and is connected to a distributed scheduler.  In this case this function
    will return as soon as the task graph has been submitted to the cluster,
    but before the computations have completed.  Computations will continue
    asynchronously in the background.  When using this function with the single
    machine scheduler it blocks until the computations have finished.

    When using Dask on a single machine you should ensure that the dataset fits
    entirely within memory.

    Examples
    --------
    >>> df = dd.read_csv('/path/to/*.csv')  # doctest: +SKIP
    >>> df = df[df.name == 'Alice']  # doctest: +SKIP
    >>> df['in-debt'] = df.balance < 0  # doctest: +SKIP
    >>> df = df.persist()  # triggers computation  # doctest: +SKIP

    >>> df.value().min()  # future computations are now fast  # doctest: +SKIP
    -10
    >>> df.value().max()  # doctest: +SKIP
    100

    >>> from dask import persist  # use persist function on multiple collections
    >>> a, b = persist(a, b)  # doctest: +SKIP

    Parameters
    ----------
    *args: Dask collections
    get : callable, optional
        A scheduler ``get`` function to use. If not provided, the default
        is to check the global settings first, and then fall back to
        the collection defaults.
    optimize_graph : bool, optional
        If True [default], the graph is optimized before computation.
        Otherwise the graph is run as is. This can be useful for debugging.
    **kwargs
        Extra keywords to forward to the scheduler ``get`` function.

    Returns
    -------
    New dask collections backed by in-memory data
    """
    collections = [a for a in args if is_dask_collection(a)]
    if not collections:
        return args

    get = kwargs.pop('get', None) or _globals['get']

    if get is None and getattr(thread_state, 'key', False):
        from distributed.worker import get_worker
        get = get_worker().client.get

    if inspect.ismethod(get):
        try:
            from distributed.client import default_client
        except ImportError:
            pass
        else:
            try:
                client = default_client()
            except ValueError:
                pass
            else:
                if client.get == _globals['get']:
                    collections = client.persist(collections, **kwargs)
                    if isinstance(collections, list):  # distributed is inconsistent here
                        collections = tuple(collections)
                    else:
                        collections = (collections,)
                    results_iter = iter(collections)
                    return tuple(a if not is_dask_collection(a)
                                 else next(results_iter)
                                 for a in args)

    optimize_graph = kwargs.pop('optimize_graph', True)

    if not get:
        get = collections[0].__dask_scheduler__
        if not all(a.__dask_scheduler__ == get for a in collections):
            raise ValueError("Compute called on multiple collections with "
                             "differing default schedulers. Please specify a "
                             "scheduler `get` function using either "
                             "the `get` kwarg or globally with `set_options`.")

    dsk = collections_to_dsk(collections, optimize_graph, **kwargs)

    keys, postpersists = [], []
    for a in args:
        if is_dask_collection(a):
            a_keys = list(flatten(a.__dask_keys__()))
            rebuild, state = a.__dask_postpersist__()
            keys.extend(a_keys)
            postpersists.append((rebuild, a_keys, state))
        else:
            postpersists.append((None, None, a))

    results = get(dsk, keys, **kwargs)
    d = dict(zip(keys, results))
    return tuple(s if r is None else r({k: d[k] for k in ks}, *s)
                 for r, ks, s in postpersists)
Example #18
def compute(*args, **kwargs):
    """Compute several dask collections at once.

    Parameters
    ----------
    args : object
        Any number of objects. If it is a dask object, it's computed and the
        result is returned. By default, python builtin collections are also
        traversed to look for dask objects (for more information see the
        ``traverse`` keyword). Non-dask arguments are passed through unchanged.
    traverse : bool, optional
        By default dask traverses builtin python collections looking for dask
        objects passed to ``compute``. For large collections this can be
        expensive. If none of the arguments contain any dask objects, set
        ``traverse=False`` to avoid doing this traversal.
    get : callable, optional
        A scheduler ``get`` function to use. If not provided, the default is
        to check the global settings first, and then fall back to defaults for
        the collections.
    optimize_graph : bool, optional
        If True [default], the optimizations for each collection are applied
        before computation. Otherwise the graph is run as is. This can be
        useful for debugging.
    kwargs
        Extra keywords to forward to the scheduler ``get`` function.

    Examples
    --------
    >>> import dask.array as da
    >>> a = da.arange(10, chunks=2).sum()
    >>> b = da.arange(10, chunks=2).mean()
    >>> compute(a, b)
    (45, 4.5)

    By default, dask objects inside python collections will also be computed:

    >>> compute({'a': a, 'b': b, 'c': 1})  # doctest: +SKIP
    ({'a': 45, 'b': 4.5, 'c': 1},)
    """
    from dask.delayed import delayed
    traverse = kwargs.pop('traverse', True)
    if traverse:
        args = tuple(delayed(a)
                     if isinstance(a, (list, set, tuple, dict, Iterator))
                     else a for a in args)

    optimize_graph = kwargs.pop('optimize_graph', True)
    variables = [a for a in args if is_dask_collection(a)]
    if not variables:
        return args

    get = kwargs.pop('get', None) or _globals['get']

    if get is None and getattr(thread_state, 'key', False):
        from distributed.worker import get_worker
        get = get_worker().client.get

    if not get:
        get = variables[0].__dask_scheduler__
        if not all(a.__dask_scheduler__ == get for a in variables):
            raise ValueError("Compute called on multiple collections with "
                             "differing default schedulers. Please specify a "
                             "scheduler `get` function using either "
                             "the `get` kwarg or globally with `set_options`.")

    dsk = collections_to_dsk(variables, optimize_graph, **kwargs)
    keys = [var.__dask_keys__() for var in variables]
    postcomputes = [a.__dask_postcompute__() if is_dask_collection(a)
                    else (None, a) for a in args]
    results = get(dsk, keys, **kwargs)
    results_iter = iter(results)
    return tuple(a if f is None else f(next(results_iter), *a)
                 for f, a in postcomputes)
Example #19
def get_scheduler(get=None, scheduler=None, collections=None, cls=None):
    """Get scheduler function

    There are various ways to specify the scheduler to use:

    1.  Passing in scheduler= parameters
    2.  Passing these into global configuration
    3.  Using defaults of a dask collection

    This function centralizes the logic to determine the right scheduler to use
    from those many options
    """
    if get:
        raise TypeError(get_err_msg)

    if scheduler is not None:
        if callable(scheduler):
            return scheduler
        elif "Client" in type(scheduler).__name__ and hasattr(scheduler, "get"):
            return scheduler.get
        elif isinstance(scheduler, str):
            scheduler = scheduler.lower()

            if scheduler in named_schedulers:
                if config.get("scheduler", None) in ("dask.distributed", "distributed"):
                    warnings.warn(
                        "Running on a single-machine scheduler when a distributed client "
                        "is active might lead to unexpected results."
                    )
                return named_schedulers[scheduler]
            elif scheduler in ("dask.distributed", "distributed"):
                from distributed.worker import get_client

                return get_client().get
            else:
                raise ValueError(
                    "Expected one of [distributed, %s]"
                    % ", ".join(sorted(named_schedulers))
                )
        elif isinstance(scheduler, Executor):
            # Get `num_workers` from `Executor`'s `_max_workers` attribute.
            # If undefined, fallback to `config` or worst case CPU_COUNT.
            num_workers = getattr(scheduler, "_max_workers", None)
            if num_workers is None:
                num_workers = config.get("num_workers", CPU_COUNT)
            assert isinstance(num_workers, Integral) and num_workers > 0
            return partial(local.get_async, scheduler.submit, num_workers)
        else:
            raise ValueError("Unexpected scheduler: %s" % repr(scheduler))
        # else:  # try to connect to remote scheduler with this name
        #     return get_client(scheduler).get

    if config.get("scheduler", None):
        return get_scheduler(scheduler=config.get("scheduler", None))

    if config.get("get", None):
        raise ValueError(get_err_msg)

    if getattr(thread_state, "key", False):
        from distributed.worker import get_worker

        return get_worker().client.get

    if cls is not None:
        return cls.__dask_scheduler__

    if collections:
        collections = [c for c in collections if c is not None]
    if collections:
        get = collections[0].__dask_scheduler__
        if not all(c.__dask_scheduler__ == get for c in collections):
            raise ValueError(
                "Compute called on multiple collections with "
                "differing default schedulers. Please specify a "
                "`scheduler=` parameter explicitly in compute or "
                "globally with `dask.config.set`."
            )
        return get

    return None
Example #20
def _get_current_task_state() -> Optional[TaskState]:
    worker = get_worker()
    logger.debug("current worker %s", f"{worker=}")
    current_task = worker.get_current_task()
    logger.debug("current task %s", f"{current_task=}")
    return worker.tasks.get(current_task)
Example #21
 def from_dask_worker(cls, log: str) -> "TaskLogEvent":
     return cls(job_id=get_worker().get_current_task(), log=log)
Example #22
 def from_dask_worker(cls, progress: float) -> "TaskProgressEvent":
     return cls(job_id=get_worker().get_current_task(), progress=progress)
Example #23
 def from_dask_worker(
     cls, state: RunningState, msg: Optional[str] = None
 ) -> "TaskStateEvent":
     return cls(job_id=get_worker().get_current_task(), state=state, msg=msg)
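Examples #20-#23 all lean on Worker.get_current_task() to stamp log, progress, and state events with the key of the task currently executing. A hedged sketch of the same idea inside a submitted task, assuming a distributed version that still provides get_current_task() (annotated is an illustrative name):

    from distributed import Client, get_worker

    def annotated(x):
        # Key of the task this worker thread is currently executing
        key = get_worker().get_current_task()
        return {"job_id": key, "result": x + 1}

    client = Client()
    print(client.submit(annotated, 1).result())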