Example #1
def test_get_scheduler():
    assert get_scheduler() is None
    assert get_scheduler(scheduler='threads') is dask.threaded.get
    assert get_scheduler(scheduler='sync') is dask.local.get_sync
    with dask.config.set(scheduler='threads'):
        assert get_scheduler(scheduler='threads') is dask.threaded.get
    assert get_scheduler() is None
Example #2
def test_get_scheduler():
    assert get_scheduler() is None
    assert get_scheduler(scheduler="threads") is dask.threaded.get
    assert get_scheduler(scheduler="sync") is dask.local.get_sync
    with dask.config.set(scheduler="threads"):
        assert get_scheduler(scheduler="threads") is dask.threaded.get
    assert get_scheduler() is None
Example #3
def test_get_scheduler():
    assert get_scheduler() is None
    assert get_scheduler(scheduler='threads') is dask.threaded.get
    assert get_scheduler(scheduler='sync') is dask.local.get_sync
    with dask.set_options(scheduler='threads'):
        assert get_scheduler(scheduler='threads') is dask.threaded.get
    assert get_scheduler() is None
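Note: dask.set_options is the pre-0.18 configuration API and has since been removed; the equivalent in current dask (as Examples #1 and #2 show) is dask.config.set. A minimal sketch of the same assertion against the modern API:

import dask
import dask.threaded
from dask.base import get_scheduler

# dask.config.set replaces the removed dask.set_options context manager
with dask.config.set(scheduler="threads"):
    assert get_scheduler() is dask.threaded.get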
Example #4
def test_get_scheduler_with_distributed_active():

    with dask.config.set(scheduler="dask.distributed"):
        warning_message = (
            "Running on a single-machine scheduler when a distributed client "
            "is active might lead to unexpected results."
        )
        with pytest.warns(UserWarning, match=warning_message) as user_warnings_a:
            get_scheduler(scheduler="threads")
            get_scheduler(scheduler="sync")
        assert len(user_warnings_a) == 2
Example #5
def _get_scheduler(get=None, collection=None):
    """ Determine the dask scheduler that is being used.

    None is returned if not dask scheduler is active.

    See also
    --------
    dask.base.get_scheduler
    """
    try:
        # dask 0.18.1 and later
        from dask.base import get_scheduler
        actual_get = get_scheduler(get, collection)
    except ImportError:
        try:
            from dask.utils import effective_get
            actual_get = effective_get(get, collection)
        except ImportError:
            return None

    try:
        from dask.distributed import Client
        if isinstance(actual_get.__self__, Client):
            return 'distributed'
    except (ImportError, AttributeError):
        try:
            import dask.multiprocessing
            if actual_get == dask.multiprocessing.get:
                return 'multiprocessing'
            else:
                return 'threaded'
        except ImportError:
            return 'threaded'
Example #6
def _get_scheduler(get=None, collection=None):
    """Determine the dask scheduler that is being used.

    None is returned if no dask scheduler is active.

    See also
    --------
    dask.base.get_scheduler
    """
    try:
        # dask 0.18.1 and later
        from dask.base import get_scheduler
        actual_get = get_scheduler(get, collection)
    except ImportError:
        try:
            from dask.utils import effective_get
            actual_get = effective_get(get, collection)
        except ImportError:
            return None

    try:
        from dask.distributed import Client
        if isinstance(actual_get.__self__, Client):
            return 'distributed'
    except (ImportError, AttributeError):
        try:
            import dask.multiprocessing
            if actual_get == dask.multiprocessing.get:
                return 'multiprocessing'
            else:
                return 'threaded'
        except ImportError:
            return 'threaded'
Example #7
def assert_divisions(ddf, scheduler=None):
    if not hasattr(ddf, "divisions"):
        return

    assert isinstance(ddf.divisions, tuple)

    if not getattr(ddf, "known_divisions", False):
        return

    def index(x):
        if is_index_like(x):
            return x
        try:
            return x.index.get_level_values(0)
        except AttributeError:
            return x.index

    get = get_scheduler(scheduler=scheduler, collections=[type(ddf)])
    results = get(ddf.dask, ddf.__dask_keys__())
    for i, df in enumerate(results[:-1]):
        if len(df):
            assert index(df).min() >= ddf.divisions[i]
            assert index(df).max() < ddf.divisions[i + 1]

    if len(results[-1]):
        assert index(results[-1]).min() >= ddf.divisions[-2]
        assert index(results[-1]).max() <= ddf.divisions[-1]
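A minimal usage sketch for the helper above, assuming its dependencies (get_scheduler, is_index_like) are already in scope as in the dask test utilities; the DataFrame is illustrative:

import pandas as pd
import dask.dataframe as dd

# from_pandas yields sorted, known divisions, so each partition's index stays
# within its divisions and the assertions above pass silently.
pdf = pd.DataFrame({"x": range(12)}, index=range(12))
ddf = dd.from_pandas(pdf, npartitions=3)
assert_divisions(ddf)                    # default scheduler
assert_divisions(ddf, scheduler="sync")  # force the synchronous scheduler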
Example #8
def test_get_scheduler():
    assert get_scheduler() is None
    assert get_scheduler(scheduler=dask.local.get_sync) is dask.local.get_sync
    assert get_scheduler(scheduler="threads") is dask.threaded.get
    assert get_scheduler(scheduler="sync") is dask.local.get_sync
    assert callable(get_scheduler(scheduler=dask.local.synchronous_executor))
    assert callable(get_scheduler(scheduler=MyExecutor()))
    with dask.config.set(scheduler="threads"):
        assert get_scheduler() is dask.threaded.get
    assert get_scheduler() is None
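The test references MyExecutor, which is not defined in this snippet; any concurrent.futures.Executor subclass should satisfy the callable(...) assertion, since get_scheduler wraps Executor instances. A hypothetical stand-in:

from concurrent.futures import Executor, Future

class MyExecutor(Executor):
    """Hypothetical minimal Executor: runs submitted work eagerly in the caller."""

    def submit(self, fn, *args, **kwargs):
        future = Future()
        try:
            future.set_result(fn(*args, **kwargs))
        except BaseException as exc:  # surface errors through the Future
            future.set_exception(exc)
        return future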
Example #9
def get_scheduler_lock(get=None, collection=None, scheduler=None):
    """Get an instance of the appropriate lock for a certain situation based on
       scheduler used."""
    from . import multiprocessing
    from .base import get_scheduler
    actual_get = get_scheduler(get=get,
                               collections=[collection],
                               scheduler=scheduler)

    if actual_get == multiprocessing.get:
        return mp.Manager().Lock()

    return SerializableLock()
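A usage sketch under this snippet's signature (historically available as dask.utils.get_scheduler_lock); the collection is illustrative and dask.dataframe must be installed:

import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"x": range(4)}), npartitions=2)

# "threads" (or "sync") resolves to a SerializableLock; only the multiprocessing
# scheduler yields a multiprocessing.Manager().Lock() usable across processes.
lock = get_scheduler_lock(collection=ddf, scheduler="threads")
with lock:
    pass  # guarded section, e.g. appending partitions to one shared HDF file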
Example #10
def _get_scheduler(get=None, collection=None) -> Optional[str]:
    """Determine the dask scheduler that is being used.

    None is returned if no dask scheduler is active.

    See also
    --------
    dask.base.get_scheduler
    """
    try:
        # Fix for a bug caused by a dask installation that doesn't include the toolz library
        # Issue: 4164
        import dask
        from dask.base import get_scheduler  # noqa: F401

        actual_get = get_scheduler(get, collection)
    except ImportError:
        return None

    try:
        from dask.distributed import Client

        if isinstance(actual_get.__self__, Client):
            return "distributed"
    except (ImportError, AttributeError):
        pass

    try:
        # As of dask=2.6, dask.multiprocessing requires cloudpickle to be installed
        # Dependency removed in https://github.com/dask/dask/pull/5511
        if actual_get is dask.multiprocessing.get:
            return "multiprocessing"
    except AttributeError:
        pass

    return "threaded"
Example #11
def effective_get(get=None, collection=None):
    """ Deprecated: see dask.base.get_scheduler """
    warnings.warn("Deprecated, see dask.base.get_scheduler instead")

    from dask.base import get_scheduler
    return get_scheduler(get=get, collections=[collection])
Example #12
def to_hdf(
    df,
    path,
    key,
    mode="a",
    append=False,
    scheduler=None,
    name_function=None,
    compute=True,
    lock=None,
    dask_kwargs={},
    **kwargs,
):
    """Store Dask Dataframe to Hierarchical Data Format (HDF) files

    This is a parallel version of the Pandas function of the same name.  Please
    see the Pandas docstring for more detailed information about shared keyword
    arguments.

    This function differs from the Pandas version by saving the many partitions
    of a Dask DataFrame in parallel, either to many files, or to many datasets
    within the same file.  You may specify this parallelism with an asterisk
    ``*`` within the filename or datapath, and an optional ``name_function``.
    The asterisk will be replaced with an increasing sequence of integers
    starting from ``0`` or with the result of calling ``name_function`` on each
    of those integers.

    This function only supports the Pandas ``'table'`` format, not the more
    specialized ``'fixed'`` format.

    Parameters
    ----------
    path : string, pathlib.Path
        Path to a target filename. Supports strings, ``pathlib.Path``, or any
        object implementing the ``__fspath__`` protocol. May contain a ``*`` to
        denote many filenames.
    key : string
        Datapath within the files.  May contain a ``*`` to denote many locations.
    name_function : function
        A function to convert the ``*`` in the above options to a string.
        Should take in a number from 0 to the number of partitions and return a
        string. (see examples below)
    compute : bool
        Whether or not to execute immediately.  If False then this returns a
        ``dask.Delayed`` value.
    lock : bool, Lock, optional
        Lock to use to prevent concurrency issues.  By default a
        ``threading.Lock``, ``multiprocessing.Lock`` or ``SerializableLock``
        will be used depending on your scheduler if a lock is required. See
        dask.utils.get_scheduler_lock for more information about lock
        selection.
    scheduler : string
        The scheduler to use, like "threads" or "processes"
    **other:
        See pandas.to_hdf for more information

    Examples
    --------
    Save Data to a single file

    >>> df.to_hdf('output.hdf', '/data')            # doctest: +SKIP

    Save data to multiple datapaths within the same file:

    >>> df.to_hdf('output.hdf', '/data-*')          # doctest: +SKIP

    Save data to multiple files:

    >>> df.to_hdf('output-*.hdf', '/data')          # doctest: +SKIP

    Save data to multiple files, using the multiprocessing scheduler:

    >>> df.to_hdf('output-*.hdf', '/data', scheduler='processes') # doctest: +SKIP

    Specify a custom naming scheme.  This writes files as
    '2000-01-01.hdf', '2000-01-02.hdf', '2000-01-03.hdf', etc.

    >>> from datetime import date, timedelta
    >>> base = date(year=2000, month=1, day=1)
    >>> def name_function(i):
    ...     ''' Convert integer 0 to n to a string '''
    ...     return str(base + timedelta(days=i))

    >>> df.to_hdf('*.hdf', '/data', name_function=name_function) # doctest: +SKIP

    Returns
    -------
    filenames : list
        Returned if ``compute`` is True. List of file names that each partition
        is saved to.
    delayed : dask.Delayed
        Returned if ``compute`` is False. Delayed object to execute ``to_hdf``
        when computed.

    See Also
    --------
    read_hdf:
    to_parquet:
    """
    name = "to-hdf-" + uuid.uuid1().hex

    pd_to_hdf = getattr(df._partition_type, "to_hdf")

    single_file = True
    single_node = True

    path = stringify_path(path)

    # if path is a string, format it using i_name
    if isinstance(path, str):
        if path.count("*") + key.count("*") > 1:
            raise ValueError(
                "A maximum of one asterisk is accepted in file path and dataset key"
            )

        fmt_obj = lambda path, i_name: path.replace("*", i_name)

        if "*" in path:
            single_file = False
    else:
        if key.count("*") > 1:
            raise ValueError("A maximum of one asterisk is accepted in dataset key")

        fmt_obj = lambda path, _: path

    if "*" in key:
        single_node = False

    if "format" in kwargs and kwargs["format"] not in ["t", "table"]:
        raise ValueError("Dask only support 'table' format in hdf files.")

    if mode not in ("a", "w", "r+"):
        raise ValueError("Mode must be one of 'a', 'w' or 'r+'")

    if name_function is None:
        name_function = build_name_function(df.npartitions - 1)

    # we guarantee partition order is preserved when it is saved and read,
    # so we require name_function to maintain the order of its input.
    if not (single_file and single_node):
        formatted_names = [name_function(i) for i in range(df.npartitions)]
        if formatted_names != sorted(formatted_names):
            warn(
                "To preserve order between partitions name_function "
                "must preserve the order of its input"
            )

    # If the user did not specify a scheduler and the write is sequential, default
    # to the sequential scheduler; otherwise let get_scheduler choose one.
    if (
        scheduler is None
        and not config.get("scheduler", None)
        and single_node
        and single_file
    ):
        scheduler = "single-threaded"

    # handle lock default based on whether we're writing to a single entity
    _actual_get = get_scheduler(collections=[df], scheduler=scheduler)
    if lock is None:
        if not single_node:
            lock = True
        elif not single_file and _actual_get is not MP_GET:
            # if we're writing to multiple files with the multiprocessing
            # scheduler we don't need to lock
            lock = True
        else:
            lock = False
    if lock:
        lock = get_scheduler_lock(df, scheduler=scheduler)

    kwargs.update({"format": "table", "mode": mode, "append": append})

    dsk = dict()

    i_name = name_function(0)
    dsk[(name, 0)] = (
        _pd_to_hdf,
        pd_to_hdf,
        lock,
        [(df._name, 0), fmt_obj(path, i_name), key.replace("*", i_name)],
        kwargs,
    )

    kwargs2 = kwargs.copy()
    if single_file:
        kwargs2["mode"] = "a"
    if single_node:
        kwargs2["append"] = True

    filenames = []
    for i in range(0, df.npartitions):
        i_name = name_function(i)
        filenames.append(fmt_obj(path, i_name))

    for i in range(1, df.npartitions):
        i_name = name_function(i)
        task = (
            _pd_to_hdf,
            pd_to_hdf,
            lock,
            [(df._name, i), fmt_obj(path, i_name), key.replace("*", i_name)],
            kwargs2,
        )
        if single_file:
            link_dep = i - 1 if single_node else 0
            task = (_link, (name, link_dep), task)
        dsk[(name, i)] = task

    dsk = merge(df.dask, dsk)
    if single_file and single_node:
        keys = [(name, df.npartitions - 1)]
    else:
        keys = [(name, i) for i in range(df.npartitions)]

    if compute:
        compute_as_if_collection(
            DataFrame, dsk, keys, scheduler=scheduler, **dask_kwargs
        )
        return filenames
    else:
        return delayed([Delayed(k, dsk) for k in keys])
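A short usage sketch covering both return paths described above; the file names and frame are illustrative, and PyTables (tables) must be installed:

import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"x": range(6)}), npartitions=3)

# compute=True (default): write immediately and get back the list of file names.
filenames = ddf.to_hdf("frame-*.hdf", "/data")

# compute=False: nothing is written until the returned Delayed is computed.
deferred = ddf.to_hdf("deferred-*.hdf", "/data", compute=False)
deferred.compute(scheduler="threads")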