Example #1
def test_workdir_simple(tmpdir):
    # Test nominal operation of WorkSpace and WorkDirs
    base_dir = str(tmpdir)
    assert_contents = functools.partial(assert_directory_contents, base_dir)

    ws = WorkSpace(base_dir)
    assert_contents([])
    a = ws.new_work_dir(name='aa')
    assert_contents(['aa', 'aa.dirlock'])
    b = ws.new_work_dir(name='bb')
    assert_contents(['aa', 'aa.dirlock', 'bb', 'bb.dirlock'])
    ws._purge_leftovers()
    assert_contents(['aa', 'aa.dirlock', 'bb', 'bb.dirlock'])

    a.release()
    assert_contents(['bb', 'bb.dirlock'])
    del b
    gc.collect()
    assert_contents([])

    # Generated temporary name with a prefix
    a = ws.new_work_dir(prefix='foo-')
    b = ws.new_work_dir(prefix='bar-')
    c = ws.new_work_dir(prefix='bar-')
    assert_contents({a.dir_path, a._lock_path,
                     b.dir_path, b._lock_path,
                     c.dir_path, c._lock_path})
    assert os.path.basename(a.dir_path).startswith('foo-')
    assert os.path.basename(b.dir_path).startswith('bar-')
    assert os.path.basename(c.dir_path).startswith('bar-')
    assert b.dir_path != c.dir_path
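These tests rely on an assert_directory_contents helper that is not shown in this listing. Below is a minimal sketch of what such a helper could look like, assuming it compares the directory listing against the expected entries and retries a few times (the trials keyword used in later examples) to absorb asynchronous cleanup; the os.path.join trick lets expected contain either bare names or absolute paths, as the snippets above do.

import glob
import os
from time import sleep


def assert_directory_contents(base_dir, expected, trials=2):
    # Absolute paths pass through os.path.join() unchanged, so ``expected``
    # may mix bare entry names and full paths.
    expected = {os.path.join(base_dir, p) for p in expected}
    for _ in range(trials):
        actual = set(glob.glob(os.path.join(base_dir, "*")))
        if actual == expected:
            break
        sleep(0.1)  # give pending finalizers a chance to run
    assert actual == expected, (sorted(actual), sorted(expected))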
Example #2
def _test_workspace_concurrency(tmpdir, timeout, max_procs):
    """
    WorkSpace concurrency test.  We merely check that no exception or
    deadlock happens.
    """
    base_dir = str(tmpdir)

    err_q = mp_context.Queue()
    purged_q = mp_context.Queue()
    stop_evt = mp_context.Event()
    ws = WorkSpace(base_dir)
    # Make sure purging only happens in the child processes
    ws._purge_leftovers = lambda: None

    # Run a bunch of child processes that will try to purge concurrently
    NPROCS = 2 if sys.platform == 'win32' else max_procs
    processes = [mp_context.Process(target=_workspace_concurrency,
                                    args=(base_dir, purged_q, err_q, stop_evt))
                 for i in range(NPROCS)]
    for p in processes:
        p.start()

    n_created = 0
    n_purged = 0
    try:
        t1 = time()
        while time() - t1 < timeout:
            # Add a bunch of locks, and simulate forgetting them.
            # The concurrent processes should try to purge them.
            for i in range(50):
                d = ws.new_work_dir(prefix='workspace-concurrency-')
                d._finalizer.detach()
                n_created += 1
            sleep(1e-2)
    finally:
        stop_evt.set()
        for p in processes:
            p.join()

    # Any errors?
    try:
        err = err_q.get_nowait()
    except Empty:
        pass
    else:
        raise err

    try:
        while True:
            n_purged += purged_q.get_nowait()
    except Empty:
        pass
    # We attempted to purge most directories at some point
    assert n_purged >= 0.5 * n_created > 0
    return n_created, n_purged
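The child-process target _workspace_concurrency is referenced but not shown. A minimal sketch of what it could look like, assuming each child opens its own WorkSpace on the shared directory and that _purge_leftovers() returns the list of purged directories (which is what the purge-count assertion above relies on).

def _workspace_concurrency(base_dir, purged_q, err_q, stop_evt):
    # Each child gets its own WorkSpace view of the shared base directory.
    ws = WorkSpace(base_dir)
    n_purged = 0
    try:
        while not stop_evt.is_set():
            # Race against the parent and the sibling processes to purge
            # directories whose finalizers were detached ("forgotten").
            n_purged += len(ws._purge_leftovers())
    except Exception as e:
        err_q.put(e)
    purged_q.put(n_purged)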
Example #3
def test_workspace_rmtree_failure(tmpdir):
    base_dir = str(tmpdir)

    ws = WorkSpace(base_dir)
    a = ws.new_work_dir(name='aa')
    shutil.rmtree(a.dir_path)
    with captured_logger('distributed.diskutils', 'ERROR', propagate=False) as sio:
        a.release()
    lines = sio.getvalue().splitlines()
    # shutil.rmtree() may call its onerror callback several times
    assert lines
    for line in lines:
        assert line.startswith("Failed to remove %r" % (a.dir_path,))
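captured_logger comes from distributed's test utilities and is not reproduced here. A simplified sketch of the idea, assuming it just attaches a temporary StreamHandler backed by a StringIO to the named logger and optionally toggles propagation for the duration of the block.

import io
import logging
from contextlib import contextmanager


@contextmanager
def captured_logger(name, level="ERROR", propagate=None):
    # Temporarily capture everything the named logger emits at ``level``.
    logger = logging.getLogger(name) if isinstance(name, str) else name
    sio = io.StringIO()
    handler = logging.StreamHandler(sio)
    old_level, old_propagate = logger.level, logger.propagate
    logger.setLevel(level)
    if propagate is not None:
        logger.propagate = propagate
    logger.addHandler(handler)
    try:
        yield sio
    finally:
        logger.removeHandler(handler)
        logger.setLevel(old_level)
        logger.propagate = old_propagate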
Example #4
def test_locking_disabled(tmpdir):
    base_dir = str(tmpdir)

    with dask.config.set({'distributed.worker.use-file-locking': False}):
        with mock.patch('distributed.diskutils.locket.lock_file') as lock_file:
            assert_contents = functools.partial(assert_directory_contents, base_dir)

            ws = WorkSpace(base_dir)
            assert_contents([])
            a = ws.new_work_dir(name='aa')
            assert_contents(['aa'])
            b = ws.new_work_dir(name='bb')
            assert_contents(['aa', 'bb'])
            ws._purge_leftovers()
            assert_contents(['aa', 'bb'])

            a.release()
            assert_contents(['bb'])
            del b
            gc.collect()
            assert_contents([])

        lock_file.assert_not_called()
Example #5
def test_two_workspaces_in_same_directory(tmpdir):
    # If handling the same directory with two WorkSpace instances,
    # things should work ok too
    base_dir = str(tmpdir)
    assert_contents = functools.partial(assert_directory_contents, base_dir)

    ws = WorkSpace(base_dir)
    assert_contents([])
    a = ws.new_work_dir(name="aa")
    assert_contents(["aa", "aa.dirlock"])

    ws2 = WorkSpace(base_dir)
    ws2._purge_leftovers()
    assert_contents(["aa", "aa.dirlock"])
    b = ws.new_work_dir(name="bb")
    assert_contents(["aa", "aa.dirlock", "bb", "bb.dirlock"])

    del ws
    del b
    gc.collect()
    assert_contents(["aa", "aa.dirlock"], trials=5)
    del a
    gc.collect()
    assert_contents([], trials=5)
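The del b; gc.collect() pattern (and the trials= retries) works because each work directory ties its on-disk cleanup to a finalizer on the WorkDir object; detaching that finalizer, as the concurrency tests do, simulates a process that vanished without cleaning up. A toy illustration of the pattern with weakref.finalize, independent of the actual WorkDir implementation:

import gc
import os
import shutil
import tempfile
import weakref


class ScratchDir:
    """Toy stand-in for a WorkDir: removes its directory when collected."""

    def __init__(self):
        self.dir_path = tempfile.mkdtemp(prefix="scratch-")
        # Runs at garbage collection or interpreter exit, whichever comes first;
        # detaching it leaves the directory behind for someone else to purge.
        self._finalizer = weakref.finalize(
            self, shutil.rmtree, self.dir_path, ignore_errors=True
        )

    def release(self):
        self._finalizer()  # explicit, idempotent cleanup


d = ScratchDir()
path = d.dir_path
del d
gc.collect()
assert not os.path.exists(path)  # directory removed by the finalizer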
Example #6
def test_locking_disabled(tmpdir):
    base_dir = str(tmpdir)

    with dask.config.set({"distributed.worker.use-file-locking": False}):
        with mock.patch("distributed.diskutils.locket.lock_file") as lock_file:
            assert_contents = functools.partial(assert_directory_contents,
                                                base_dir)

            ws = WorkSpace(base_dir)
            assert_contents([])
            a = ws.new_work_dir(name="aa")
            assert_contents(["aa"])
            b = ws.new_work_dir(name="bb")
            assert_contents(["aa", "bb"])
            ws._purge_leftovers()
            assert_contents(["aa", "bb"])

            a.release()
            assert_contents(["bb"])
            del b
            gc.collect()
            assert_contents([])

        lock_file.assert_not_called()
Example #7
def test_locking_disabled(tmpdir):
    base_dir = str(tmpdir)

    with new_config({'use-file-locking': False}):
        with mock.patch('distributed.diskutils.locket.lock_file') as lock_file:
            assert_contents = functools.partial(assert_directory_contents,
                                                base_dir)

            ws = WorkSpace(base_dir)
            assert_contents([])
            a = ws.new_work_dir(name='aa')
            assert_contents(['aa'])
            b = ws.new_work_dir(name='bb')
            assert_contents(['aa', 'bb'])
            ws._purge_leftovers()
            assert_contents(['aa', 'bb'])

            a.release()
            assert_contents(['bb'])
            del b
            gc.collect()
            assert_contents([])

        lock_file.assert_not_called()
Example #8
def test_two_workspaces_in_same_directory(tmpdir):
    # If handling the same directory with two WorkSpace instances,
    # things should work ok too
    base_dir = str(tmpdir)
    assert_contents = functools.partial(assert_directory_contents, base_dir)

    ws = WorkSpace(base_dir)
    assert_contents([])
    a = ws.new_work_dir(name='aa')
    assert_contents(['aa', 'aa.dirlock'])

    ws2 = WorkSpace(base_dir)
    ws2._purge_leftovers()
    assert_contents(['aa', 'aa.dirlock'])
    b = ws.new_work_dir(name='bb')
    assert_contents(['aa', 'aa.dirlock', 'bb', 'bb.dirlock'])

    del ws
    del b
    gc.collect()
    assert_contents(['aa', 'aa.dirlock'])
    del a
    gc.collect()
    assert_contents([])
Example #9
def test_workspace_concurrency(tmpdir):
    """WorkSpace concurrency test. We merely check that no exception or
    deadlock happens.
    """
    base_dir = str(tmpdir)

    err_q = mp_context.Queue()
    purged_q = mp_context.Queue()
    stop_evt = mp_context.Event()
    ws = WorkSpace(base_dir)
    # Make sure purging only happens in the child processes
    ws._purge_leftovers = lambda: None

    # Windows (or at least Windows GitHub CI) has been observed to be exceptionally
    # slow. Don't stress it too much.
    max_procs = 2 if WINDOWS else 16

    # Run a bunch of child processes that will try to purge concurrently
    barrier = mp_context.Barrier(parties=max_procs + 1)
    processes = [
        mp_context.Process(
            target=_workspace_concurrency,
            args=(base_dir, purged_q, err_q, stop_evt, barrier),
        )
        for _ in range(max_procs)
    ]
    for p in processes:
        p.start()
    barrier.wait()
    n_created = 0
    n_purged = 0
    t1 = time()
    try:
        # On Linux, you will typically end up with n_created > 10,000
        # On Windows, it can take 60 seconds to create 50 locks!
        while time() - t1 < 10:
            # Add a bunch of locks and simulate forgetting them.
            # The concurrent processes should try to purge them.
            for _ in range(100):
                d = ws.new_work_dir(prefix="workspace-concurrency-")
                d._finalizer.detach()
                n_created += 1

    finally:
        stop_evt.set()
        for p in processes:
            p.join()

    # Any errors?
    try:
        err = err_q.get_nowait()
    except queue.Empty:
        pass
    else:
        raise err

    try:
        while True:
            n_purged += purged_q.get_nowait()
    except queue.Empty:
        pass

    # We attempted to purge most directories at some point
    assert n_purged >= 0.5 * n_created > 0
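This newer variant hands each child a Barrier so the parent only starts creating directories once every process is running. Under the same assumptions as the sketch after Example #2, the matching child target might look like:

def _workspace_concurrency(base_dir, purged_q, err_q, stop_evt, barrier):
    ws = WorkSpace(base_dir)
    n_purged = 0
    barrier.wait()  # line up with the parent before the timed loop starts
    try:
        while not stop_evt.is_set():
            n_purged += len(ws._purge_leftovers())
    except Exception as e:
        err_q.put(e)
    purged_q.put(n_purged)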
Example #10
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    device_memory_limit,
    pid_file,
    reconnect,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    preload_argv,
    bokeh_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(tls_ca_file=tls_ca_file,
                   tls_worker_cert=tls_cert,
                   tls_worker_key=tls_key)

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if not nthreads:
        # Ensure at least one thread per worker process
        nthreads = max(1, multiprocessing.cpu_count() // nprocs)

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            pass
        else:
            if bokeh_prefix:
                result = (BokehWorker, {"prefix": bokeh_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    kwargs = {"worker_port": None, "listen_address": None}
    t = Nanny

    if not scheduler and not scheduler_file and "scheduler-address" not in config:
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if host:
        addr = uri_from_host_port(host, 0, 0)
    else:
        # Choose appropriate address for scheduler
        addr = None

    if death_timeout is not None:
        death_timeout = parse_timedelta(death_timeout, "s")

    local_dir = kwargs.get("local_dir", "dask-worker-space")
    with warn_on_duration(
            "1s",
            "Creating scratch directories is taking a surprisingly long time. "
            "This is often due to running workers on a network file system. "
            "Consider specifying a local-directory to point workers to write "
            "scratch data to a local disk.",
    ):
        _workspace = WorkSpace(os.path.abspath(local_dir))
        _workdir = _workspace.new_work_dir(prefix="worker-")
        local_dir = _workdir.dir_path

    nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            services=services,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            reconnect=reconnect,
            local_dir=local_directory,
            death_timeout=death_timeout,
            preload=(preload or []) + ["dask_cuda.initialize_context"],
            preload_argv=preload_argv,
            security=sec,
            contact_address=None,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            name=name if nprocs == 1 or not name else name + "-" + str(i),
            data=(
                DeviceHostFile,
                {
                    "device_memory_limit": get_device_total_memory(index=i)
                    if (device_memory_limit == "auto" or device_memory_limit == 0)
                    else parse_bytes(device_memory_limit),
                    "memory_limit": parse_memory_limit(
                        memory_limit, nthreads, total_cores=nprocs
                    ),
                    "local_dir": local_dir,
                },
            ),
            **kwargs,
        ) for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield [n._start(addr) for n in nannies]
        while all(n.status != "closed" for n in nannies):
            yield gen.sleep(0.2)

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")