Example #1
def test_monitor_resources():
    pytest.importorskip('psutil')
    c = Center(ip='127.0.0.1')
    c.listen(0)
    n = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')

    yield n._start()
    nn = rpc(ip=n.ip, port=n.port)
    assert n.process.is_alive()
    d = n.resource_collect()
    assert {'cpu_percent', 'memory_percent'}.issubset(d)

    assert isinstance(d['timestamp'], datetime)

    stream = yield connect(ip=n.ip, port=n.port)
    yield write(stream, {'op': 'monitor_resources', 'interval': 0.01})

    for i in range(3):
        msg = yield read(stream)
        assert isinstance(msg, dict)
        assert {'cpu_percent', 'memory_percent'}.issubset(msg)

    stream.close()
    yield n._close()
    c.stop()
Example #2
def test_nanny(s):
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)

    yield n._start(0)
    with rpc(ip=n.ip, port=n.port) as nn:
        assert isalive(n.process)  # alive
        assert s.ncores[n.worker_address] == 2

        assert s.worker_info[n.worker_address]['services']['nanny'] > 1024

        yield nn.kill()
        assert not n.process
        assert n.worker_address not in s.ncores
        assert n.worker_address not in s.worker_info

        yield nn.kill()
        assert n.worker_address not in s.ncores
        assert n.worker_address not in s.worker_info
        assert not n.process

        yield nn.instantiate()
        assert isalive(n.process)
        assert s.ncores[n.worker_address] == 2
        assert s.worker_info[n.worker_address]['services']['nanny'] > 1024

        yield nn.terminate()
        assert not n.process

    yield n._close()
Example #3
def test_worker_who_has_clears_after_failed_connection(c, s, a, b):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    futures = c.map(slowinc, range(20), delay=0.01,
                    key=['f%d' % i for i in range(20)])
    yield wait(futures)

    result = yield c.submit(sum, futures, workers=a.address)
    for dep in set(a.dep_state) - set(a.task_state):
        a.release_dep(dep, report=True)

    n_worker_address = n.worker_address
    with ignoring(CommClosedError):
        yield c._run(os._exit, 1, workers=[n_worker_address])

    while len(s.workers) > 2:
        yield gen.sleep(0.01)

    total = c.submit(sum, futures, workers=a.address)
    yield total

    assert not a.has_what.get(n_worker_address)
    assert not any(n_worker_address in s for s in a.who_has.values())

    yield n._close()
Example #4
def test_nanny_death_timeout(s):
    yield s.close()
    w = Nanny(s.address, death_timeout=1)
    yield w._start()

    yield gen.sleep(3)
    assert w.status == 'closed'
Example #5
def test_nanny():
    c = Center('127.0.0.1')
    c.listen(0)
    n = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')

    yield n._start(0)
    nn = rpc(ip=n.ip, port=n.port)
    assert n.process.is_alive()
    assert c.ncores[n.worker_address] == 2
    assert c.worker_services[n.worker_address]['nanny'] > 1024

    yield nn.kill()
    assert n.worker_address not in c.ncores
    assert n.worker_address not in c.worker_services
    assert not n.process

    yield nn.kill()
    assert n.worker_address not in c.ncores
    assert n.worker_address not in c.worker_services
    assert not n.process

    yield nn.instantiate()
    assert n.process.is_alive()
    assert c.ncores[n.worker_address] == 2
    assert c.worker_services[n.worker_address]['nanny'] > 1024

    yield nn.terminate()
    assert not n.process

    if n.process:
        n.process.terminate()

    yield n._close()
    c.stop()
Example #6
def test_many_kills(s):
    n = Nanny(s.address, ncores=2, loop=s.loop)
    yield n._start(0)
    assert n.is_alive()
    yield [n.kill() for i in range(5)]
    yield [n.kill() for i in range(5)]
    yield n._close()
Example #7
def become_distributed_worker(ip, port, nanny=False, **kwargs):
    """Task function for becoming a distributed Worker

    Parameters
    ----------
    ip: str
        The IP address of the Scheduler.
    port: int
        The port of the Scheduler.
    nanny: bool, optional
        If True, start a Nanny supervising the Worker in a separate
        process instead of running the Worker directly.
    **kwargs:
        Any additional keyword arguments will be passed to the Worker
        (or Nanny) constructor.
    """
    shell = get_ipython()
    kernel = shell.kernel
    if getattr(kernel, 'distributed_worker', None) is not None:
        kernel.log.info("Distributed worker is already running.")
        return
    from distributed import Worker, Nanny
    if nanny:
        w = Nanny(ip, port, **kwargs)
    else:
        w = Worker(ip, port, **kwargs)
    shell.user_ns['distributed_worker'] = kernel.distributed_worker = w
    w.start(0)
Example #8
def test_broken_worker_during_computation(c, s, a, b):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    L = c.map(inc, range(256))
    for i in range(8):
        L = c.map(add, *zip(*partition_all(2, L)))

    from random import random
    yield gen.sleep(random() / 2)
    with ignoring(OSError):
        n.process.terminate()
    yield gen.sleep(random() / 2)
    with ignoring(OSError):
        n.process.terminate()

    result = yield c._gather(L)
    assert isinstance(result[0], int)

    yield n._close()
Example #9
def test_scheduler_file():
    with tmpfile() as fn:
        s = Scheduler(scheduler_file=fn)
        s.start(8008)
        w = Nanny(scheduler_file=fn)
        yield w._start()
        assert set(s.workers) == {w.worker_address}
        yield w._close()
        s.stop()
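The scheduler file used above is just a small JSON document: the Scheduler
writes its contact information to fn, and the Nanny reads the address back
from the same path. A minimal sketch of inspecting it by hand (the "address"
key is assumed from current distributed behavior, not shown in the test):

import json

with open(fn) as f:          # fn as in the test above
    info = json.load(f)
print(info["address"])       # e.g. "tcp://127.0.0.1:8008"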
Example #10
def test_wait_for_scheduler():
    with captured_logger('distributed') as log:
        w = Nanny('127.0.0.1:44737')
        w._start()
        yield gen.sleep(6)

    log = log.getvalue()
    assert 'error' not in log.lower(), log
    assert 'restart' not in log.lower(), log
Example #11
    def create_and_destroy_worker(delay):
        start = time()
        while time() < start + 5:
            n = Nanny(s.address, ncores=2, loop=s.loop)
            n.start(0)

            yield gen.sleep(delay)

            yield n._close()
            print("Killed nanny")
Example #12
def test_run(s):
    pytest.importorskip('psutil')
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start()

    nn = rpc(n.address)

    response = yield nn.run(function=dumps(lambda: 1))
    assert response['status'] == 'OK'
    assert loads(response['result']) == 1
Example #13
def test_worker_uses_same_host_as_nanny(c, s):
    for host in ['tcp://0.0.0.0', 'tcp://127.0.0.2']:
        n = Nanny(s.address)
        yield n._start(host)

        def func(dask_worker):
            return dask_worker.listener.listen_address

        result = yield c.run(func)
        assert host in first(result.values())
        yield n._close()
Example #14
def test_run(s):
    pytest.importorskip('psutil')
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    yield n._start()

    with rpc(n.address) as nn:
        response = yield nn.run(function=dumps(lambda: 1))
        assert response['status'] == 'OK'
        assert response['result'] == 1

    yield n._close()
Example #15
def test_scheduler_address_config(c, s):
    with dask.config.set({'scheduler-address': s.address}):
        nanny = Nanny(loop=s.loop)
        yield nanny._start()
        assert nanny.scheduler.address == s.address

        start = time()
        while not s.workers:
            yield gen.sleep(0.1)
            assert time() < start + 10

    yield nanny._close()
Example #16
def run_nanny(q, center_port, **kwargs):
    from distributed import Nanny
    from tornado.ioloop import IOLoop, PeriodicCallback
    import logging
    with log_errors():
        IOLoop.clear_instance()
        loop = IOLoop(); loop.make_current()
        PeriodicCallback(lambda: None, 500).start()
        logging.getLogger("tornado").setLevel(logging.CRITICAL)
        worker = Nanny('127.0.0.1', center_port, ip='127.0.0.1', **kwargs)
        loop.run_sync(lambda: worker._start(0))
        q.put(worker.port)
        loop.start()
Example #17
def run_nanny(q, scheduler_q, **kwargs):
    from distributed import Nanny

    with log_errors():
        with pristine_loop() as loop:
            scheduler_addr = scheduler_q.get()
            worker = Nanny(scheduler_addr, validate=True, **kwargs)
            loop.run_sync(lambda: worker._start(0))
            q.put(worker.address)
            try:
                loop.start()
            finally:
                loop.run_sync(worker._close)
                loop.close(all_fds=True)
Example #18
def test_nanny(s):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)

    yield n._start(0)
    with rpc(n.address) as nn:
        assert n.is_alive()
        assert s.ncores[n.worker_address] == 2
        assert s.workers[n.worker_address].services['nanny'] > 1024

        yield nn.kill()
        assert not n.is_alive()
        assert n.worker_address not in s.ncores
        assert n.worker_address not in s.workers

        yield nn.kill()
        assert not n.is_alive()
        assert n.worker_address not in s.ncores
        assert n.worker_address not in s.workers

        yield nn.instantiate()
        assert n.is_alive()
        assert s.ncores[n.worker_address] == 2
        assert s.workers[n.worker_address].services['nanny'] > 1024

        yield nn.terminate()
        assert not n.is_alive()

    yield n._close()
Example #19
def test_submit_after_failed_worker_async(c, s, a, b):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)
    while len(s.workers) < 3:
        yield gen.sleep(0.1)

    L = c.map(inc, range(10))
    yield wait(L)

    s.loop.add_callback(n.kill)
    total = c.submit(sum, L)
    result = yield total
    assert result == sum(map(inc, range(10)))

    yield n._close()
Example #20
def run_nanny(q, scheduler_port, **kwargs):
    from distributed import Nanny
    from tornado.ioloop import IOLoop, PeriodicCallback
    with log_errors():
        IOLoop.clear_instance()
        loop = IOLoop(); loop.make_current()
        PeriodicCallback(lambda: None, 500).start()
        worker = Nanny('127.0.0.1', scheduler_port, ip='127.0.0.1',
                       loop=loop, validate=True, **kwargs)
        loop.run_sync(lambda: worker._start(0))
        q.put(worker.port)
        try:
            loop.start()
        finally:
            loop.run_sync(worker._close)
            loop.close(all_fds=True)
Example #21
def test_avoid_memory_monitor_if_zero_limit(c, s):
    nanny = Nanny(s.address, loop=s.loop, memory_limit=0)
    yield nanny._start()
    typ = yield c.run(lambda dask_worker: type(dask_worker.data))
    assert typ == {nanny.worker_address: dict}
    pcs = yield c.run(lambda dask_worker: list(dask_worker.periodic_callbacks))
    assert 'memory' not in pcs
    assert 'memory' not in nanny.periodic_callbacks

    future = c.submit(inc, 1)
    assert (yield future) == 2
    yield gen.sleep(0.02)

    yield c.submit(inc, 2)  # worker doesn't pause

    yield nanny._close()
Example #22
def test_nanny_process_failure():
    c = Center('127.0.0.1')
    c.listen(0)
    n = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')
    yield n._start()
    nn = rpc(ip=n.ip, port=n.port)
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    ww = rpc(ip=n.ip, port=n.worker_port)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    with ignoring(StreamClosedError):
        yield ww.compute(function=dumps(sys.exit),
                         args=dumps((0,)),
                         key='z')

    start = time()
    while n.process.is_alive():  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - start < 2

    start = time()
    while not n.process.is_alive():  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 2

    start = time()
    while n.worker_address not in c.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 2

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir
    nn.close_streams()
    c.stop()
Example #23
def test_environment_variable(c, s):
    a = Nanny(s.address, loop=s.loop, memory_limit=0, env={"FOO": "123"})
    b = Nanny(s.address, loop=s.loop, memory_limit=0, env={"FOO": "456"})
    yield [a, b]
    results = yield c.run(lambda: os.environ["FOO"])
    assert results == {a.worker_address: "123", b.worker_address: "456"}
    yield [a.close(), b.close()]
Example #24
def test_nanny_process_failure(c, s):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    yield n._start()
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    original_address = n.worker_address
    ww = rpc(n.worker_address)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    pid = n.pid
    assert pid is not None
    with ignoring(CommClosedError):
        yield c._run(os._exit, 0, workers=[n.worker_address])

    start = time()
    while n.pid == pid:  # wait while process dies and comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while not n.is_alive():  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    # assert n.worker_address != original_address  # most likely

    start = time()
    while n.worker_address not in s.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 5

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir
    ww.close_rpc()
    s.stop()
Example #26
def test_environment_variable(c, s):
    a = Nanny(s.address, loop=s.loop, memory_limit=0, env={"FOO": "123"})
    b = Nanny(s.address, loop=s.loop, memory_limit=0, env={"FOO": "456"})
    yield [a._start(), b._start()]
    results = yield c.run(lambda: os.environ['FOO'])
    assert results == {a.worker_address: "123", b.worker_address: "456"}
    yield [a._close(), b._close()]
Example #28
async def test_environment_variable(c, s):
    a = Nanny(s.address, loop=s.loop, memory_limit=0, env={"FOO": "123"})
    b = Nanny(s.address, loop=s.loop, memory_limit=0, env={"FOO": "456"})
    await asyncio.gather(a, b)
    results = await c.run(lambda: os.environ["FOO"])
    assert results == {a.worker_address: "123", b.worker_address: "456"}
    await asyncio.gather(a.close(), b.close())
Example #29
def test_broken_worker_during_computation(c, s, a, b):
    s.allowed_failures = 100
    n = Nanny(s.address, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    N = 256
    expected_result = N * (N + 1) // 2
    i = 0
    L = c.map(inc, range(N), key=["inc-%d-%d" % (i, j) for j in range(N)])
    while len(L) > 1:
        i += 1
        L = c.map(
            slowadd,
            *zip(*partition_all(2, L)),
            key=["add-%d-%d" % (i, j) for j in range(len(L) // 2)]
        )

    yield gen.sleep(random.random() / 20)
    with ignoring(CommClosedError):  # comm will be closed abruptly
        yield c._run(os._exit, 1, workers=[n.worker_address])

    yield gen.sleep(random.random() / 20)
    while len(s.workers) < 3:
        yield gen.sleep(0.01)

    with ignoring(
        CommClosedError, EnvironmentError
    ):  # perhaps new worker can't be contacted yet
        yield c._run(os._exit, 1, workers=[n.worker_address])

    [result] = yield c.gather(L)
    assert isinstance(result, int)
    assert result == expected_result

    yield n.close()
Example #30
def test_monitor_resources(s):
    pytest.importorskip('psutil')
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)

    yield n._start()
    assert isalive(n.process)
    d = n.resource_collect()
    assert {'cpu_percent', 'memory_percent'}.issubset(d)

    assert 'timestamp' in d

    stream = yield connect(ip=n.ip, port=n.port)
    yield write(stream, {'op': 'monitor_resources', 'interval': 0.01})

    for i in range(3):
        msg = yield read(stream)
        assert isinstance(msg, dict)
        assert {'cpu_percent', 'memory_percent'}.issubset(msg)

    close(stream)
    yield n._close()
    s.stop()
Example #31
def test_nanny_process_failure(s):
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start()
    nn = rpc(ip=n.ip, port=n.port)
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    original_process = n.process
    ww = rpc(ip=n.ip, port=n.worker_port)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    with ignoring(StreamClosedError):
        yield ww.compute(function=dumps(sys.exit),
                         args=dumps((0,)),
                         key='z')

    start = time()
    while n.process is original_process:  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while n.process.poll() is not None:  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while n.worker_address not in s.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 5

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir
    nn.close_streams()
    s.stop()
Example #32
def run_nanny(q, scheduler_port, **kwargs):
    from distributed import Nanny
    from tornado.ioloop import IOLoop, PeriodicCallback
    import logging
    with log_errors():
        IOLoop.clear_instance()
        loop = IOLoop()
        loop.make_current()
        PeriodicCallback(lambda: None, 500).start()
        logging.getLogger("tornado").setLevel(logging.CRITICAL)
        worker = Nanny('127.0.0.1',
                       scheduler_port,
                       ip='127.0.0.1',
                       loop=loop,
                       **kwargs)
        loop.run_sync(lambda: worker._start(0))
        q.put(worker.port)
        try:
            loop.start()
        finally:
            loop.run_sync(worker._close)
            loop.close(all_fds=True)
Example #33
def test_num_fds(s):
    psutil = pytest.importorskip("psutil")
    proc = psutil.Process()

    # Warm up
    w = yield Nanny(s.address)
    yield w.close()
    del w
    gc.collect()

    before = proc.num_fds()

    for i in range(3):
        w = yield Nanny(s.address)
        yield gen.sleep(0.1)
        yield w.close()

    start = time()
    while proc.num_fds() > before:
        print("fds:", before, proc.num_fds())
        yield gen.sleep(0.1)
        assert time() < start + 10
Example #34
def test_monitor_resources(s):
    pytest.importorskip('psutil')
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)

    yield n._start()
    assert isalive(n.process)
    d = n.resource_collect()
    assert {'cpu_percent', 'memory_percent'}.issubset(d)

    assert 'timestamp' in d

    comm = yield connect(n.address)
    yield comm.write({'op': 'monitor_resources', 'interval': 0.01})

    for i in range(3):
        msg = yield comm.read()
        assert isinstance(msg, dict)
        assert {'cpu_percent', 'memory_percent'}.issubset(msg)

    yield comm.close()
    yield n._close()
    s.stop()
Example #35
def test_file_descriptors(c, s):
    yield gen.sleep(0.1)
    psutil = pytest.importorskip("psutil")
    da = pytest.importorskip("dask.array")
    proc = psutil.Process()
    num_fds_1 = proc.num_fds()

    N = 20
    nannies = yield [Nanny(s.address, loop=s.loop) for i in range(N)]

    while len(s.nthreads) < N:
        yield gen.sleep(0.1)

    num_fds_2 = proc.num_fds()

    yield gen.sleep(0.2)

    num_fds_3 = proc.num_fds()
    assert num_fds_3 <= num_fds_2 + N  # add some heartbeats

    x = da.random.random(size=(1000, 1000), chunks=(25, 25))
    x = c.persist(x)
    yield wait(x)

    num_fds_4 = proc.num_fds()
    assert num_fds_4 <= num_fds_2 + 2 * N

    y = c.persist(x + x.T)
    yield wait(y)

    num_fds_5 = proc.num_fds()
    assert num_fds_5 < num_fds_4 + N

    yield gen.sleep(1)

    num_fds_6 = proc.num_fds()
    assert num_fds_6 < num_fds_5 + N

    yield [n.close() for n in nannies]
    yield c.close()

    assert not s.rpc.open
    for addr, occ in c.rpc.occupied.items():
        for comm in occ:
            assert comm.closed() or comm.peer_address != s.address, comm
    assert not s.stream_comms

    start = time()
    while proc.num_fds() > num_fds_1 + N:
        yield gen.sleep(0.01)
        assert time() < start + 3
Example #36
async def f():
    async with Scheduler(protocol=protocol,
                         interface='ib0',
                         dashboard_address=':8789') as s:
        async with Nanny(
                s.address,
                protocol=protocol,
                nthreads=1,
                memory_limit='32GB',
                env={'CUDA_VISIBLE_DEVICES': '2'},
        ) as w:
            async with Nanny(s.address,
                             protocol=protocol,
                             memory_limit='32gb',
                             env={'CUDA_VISIBLE_DEVICES': '3'},
                             nthreads=1) as w2:
                async with Client(s.address, asynchronous=True) as c:
                    with log_errors(pdb=True):
                        # Create a simple random array
                        N = 500000000

                        chunks = N // 100
                        rs = da.random.RandomState(
                            RandomState=cupy.random.RandomState)
                        x = rs.random(N, chunks=chunks)
                        y = rs.random(N, chunks=chunks)

                        def f(x):
                            return cudf.from_dlpack(x.toDlpack())

                        res_x = x.map_blocks(f, meta=cudf.Series())
                        res_y = y.map_blocks(f, meta=cudf.Series())
                        res = (res_x + res_y).persist()
                        await res
                        x.rechunk(chunks=N // 10).persist()
                        res = await c.compute(x.sum())
                        print(res)
                        print("FINISHED" * 100)
Example #37
def test_multiple_executors_restart(loop):
    from distributed import Nanny, rpc
    c = Center('127.0.0.1')
    c.listen(0)
    a = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')
    b = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')
    @gen.coroutine
    def f():
        yield a._start()
        yield b._start()
        while len(c.ncores) < 2:
            yield gen.sleep(0.01)

        try:
            e1 = Executor((c.ip, c.port), start=False, loop=loop)
            yield e1._start()
            e2 = Executor(e1.scheduler, start=False, loop=loop)
            yield e2._start()

            x = e1.submit(inc, 1)
            y = e2.submit(inc, 2)
            xx = yield x._result()
            yy = yield y._result()
            assert xx == 2
            assert yy == 3

            yield e1._restart()

            assert x.cancelled()
            assert y.cancelled()
        finally:
            yield a._close()
            yield b._close()
            yield e1._shutdown(fast=True)
            yield e2._shutdown(fast=True)
            c.stop()

    loop.run_sync(f)
Example #38
def test_submit_after_failed_worker_async(c, s, a, b):
    n = yield Nanny(s.address, nthreads=2, loop=s.loop)
    while len(s.workers) < 3:
        yield gen.sleep(0.1)

    L = c.map(inc, range(10))
    yield wait(L)

    s.loop.add_callback(n.kill)
    total = c.submit(sum, L)
    result = yield total
    assert result == sum(map(inc, range(10)))

    yield n.close()
Example #39
def test_broken_worker_during_computation(e, s, a, b):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    L = e.map(inc, range(256))
    for i in range(8):
        L = e.map(add, *zip(*partition_all(2, L)))

    from random import random
    yield gen.sleep(random() / 2)
    n.process.terminate()
    yield gen.sleep(random() / 2)
    n.process.terminate()

    result = yield e._gather(L)
    assert isinstance(result[0], int)

    yield n._close()
Example #40
def test_monitor_resources(s):
    pytest.importorskip('psutil')
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)

    yield n._start()
    nn = rpc(ip=n.ip, port=n.port)
    assert n.process.is_alive()
    d = n.resource_collect()
    assert {'cpu_percent', 'memory_percent'}.issubset(d)

    assert 'timestamp' in d

    stream = yield connect(ip=n.ip, port=n.port)
    yield write(stream, {'op': 'monitor_resources', 'interval': 0.01})

    for i in range(3):
        msg = yield read(stream)
        assert isinstance(msg, dict)
        assert {'cpu_percent', 'memory_percent'}.issubset(msg)

    stream.close()
    yield n._close()
    s.stop()
Example #41
async def test_nanny_port_range(c, s):
    nanny_port = "9867:9868"
    worker_port = "9869:9870"
    async with Nanny(s.address, port=nanny_port,
                     worker_port=worker_port) as n1:
        assert n1.port == 9867  # Selects first port in range
        async with Nanny(s.address, port=nanny_port,
                         worker_port=worker_port) as n2:
            assert n2.port == 9868  # Selects next port in range
            with pytest.raises(
                    ValueError,
                    match="Could not start Nanny"):  # No more ports left
                async with Nanny(s.address,
                                 port=nanny_port,
                                 worker_port=worker_port):
                    pass

            # Ensure Worker ports are in worker_port range
            def get_worker_port(dask_worker):
                return dask_worker.port

            worker_ports = await c.run(get_worker_port)
            assert list(worker_ports.values()) == parse_ports(worker_port)
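For reference, parse_ports (imported from distributed.utils in this test)
expands a "low:high" string into the inclusive list of candidate ports,
which is why the final assertion matches both workers in the range. A small
sketch of the assumed behavior:

from distributed.utils import parse_ports

assert parse_ports(9867) == [9867]                # single port
assert parse_ports("9867:9868") == [9867, 9868]   # inclusive range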
Example #42
def test_file_descriptors(c, s):
    yield gen.sleep(0.1)
    psutil = pytest.importorskip('psutil')
    da = pytest.importorskip('dask.array')
    proc = psutil.Process()
    num_fds_1 = proc.num_fds()

    N = 20
    nannies = [Nanny(s.ip, s.port, loop=s.loop) for i in range(N)]
    yield [n._start() for n in nannies]

    while len(s.ncores) < N:
        yield gen.sleep(0.1)

    num_fds_2 = proc.num_fds()

    yield gen.sleep(0.2)

    num_fds_3 = proc.num_fds()
    assert num_fds_3 <= num_fds_2 + N  # add some heartbeats

    x = da.random.random(size=(1000, 1000), chunks=(25, 25))
    x = c.persist(x)
    yield wait(x)

    num_fds_4 = proc.num_fds()
    assert num_fds_4 <= num_fds_2 + 2 * N

    y = c.persist(x + x.T)
    yield wait(y)

    num_fds_5 = proc.num_fds()
    assert num_fds_5 < num_fds_4 + N

    yield gen.sleep(1)

    num_fds_6 = proc.num_fds()
    assert num_fds_6 < num_fds_5 + N

    yield [n._close() for n in nannies]

    assert not s.rpc.open
    assert not c.rpc.open
    assert not s.stream_comms

    start = time()
    while proc.num_fds() > num_fds_1 + N:
        yield gen.sleep(0.01)
        assert time() < start + 3
Example #43
def test_avoid_memory_monitor_if_zero_limit(c, s):
    nanny = yield Nanny(s.address, loop=s.loop, memory_limit=0)
    typ = yield c.run(lambda dask_worker: type(dask_worker.data))
    assert typ == {nanny.worker_address: dict}
    pcs = yield c.run(lambda dask_worker: list(dask_worker.periodic_callbacks))
    assert "memory" not in pcs
    assert "memory" not in nanny.periodic_callbacks

    future = c.submit(inc, 1)
    assert (yield future) == 2
    yield gen.sleep(0.02)

    yield c.submit(inc, 2)  # worker doesn't pause

    yield nanny.close()
Example #44
def test_broken_worker_during_computation(c, s, a, b):
    s.allowed_failures = 100
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    N = 256
    expected_result = N * (N + 1) // 2
    i = 0
    L = c.map(inc, range(N),
              key=['inc-%d-%d' % (i, j) for j in range(N)])
    while len(L) > 1:
        i += 1
        L = c.map(slowadd, *zip(*partition_all(2, L)),
                  key=['add-%d-%d' % (i, j) for j in range(len(L) // 2)])

    yield gen.sleep(random.random() / 20)
    with ignoring(CommClosedError):  # comm will be closed abruptly
        yield c._run(os._exit, 1, workers=[n.worker_address])

    yield gen.sleep(random.random() / 20)
    while len(s.workers) < 3:
        yield gen.sleep(0.01)

    with ignoring(CommClosedError, EnvironmentError):  # perhaps new worker can't be contacted yet
        yield c._run(os._exit, 1, workers=[n.worker_address])

    [result] = yield c.gather(L)
    assert isinstance(result, int)
    assert result == expected_result

    yield n._close()
Example #45
def test_nanny_process_failure(c, s):
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start()
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    original_process = n.process
    ww = rpc(ip=n.ip, port=n.worker_port)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    with ignoring(StreamClosedError):
        yield c._run(sys.exit, 0, workers=[n.worker_address])

    start = time()
    while n.process is original_process:  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while not isalive(n.process):  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while n.worker_address not in s.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 5

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir
    ww.close_rpc()
    s.stop()
Example #47
async def test_preload_import_time(cleanup):
    text = """
from distributed.comm.registry import backends
from distributed.comm.tcp import TCPBackend

backends["foo"] = TCPBackend()
""".strip()
    try:
        async with Scheduler(port=0, preload=text, protocol="foo") as s:
            async with Nanny(s.address, preload=text, protocol="foo") as n:
                async with Client(s.address, asynchronous=True) as c:
                    await c.wait_for_workers(1)
    finally:
        from distributed.comm.registry import backends

        del backends["foo"]
Example #48
def start_worker(nthreads=None, memory_limit=None):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if memory_limit is None:
        memory_limit = int(skein.properties.container_resources.memory * 1e6)
    if nthreads is None:
        nthreads = skein.properties.container_resources.vcores

    app_client = skein.ApplicationClient.from_current()

    scheduler = app_client.kv.wait('dask.scheduler').decode()

    loop = IOLoop.current()

    # Until the config patch is merged, we can't use the nanny process since
    # there's no way to monkey patch config inside the forkserver process
    if hasattr(dask.config, 'PATH'):
        worker = Nanny(scheduler,
                       ncores=nthreads,
                       loop=loop,
                       memory_limit=memory_limit,
                       worker_port=0)

        @gen.coroutine
        def close(signalnum):
            worker._close(timeout=2)

        install_signal_handlers(loop, cleanup=close)
    else:
        worker = Worker(scheduler,
                        ncores=nthreads,
                        loop=loop,
                        memory_limit=memory_limit)

    @gen.coroutine
    def run():
        yield worker._start(None)
        while worker.status != 'closed':
            yield gen.sleep(0.2)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
Example #49
async def test_worker_preload_config(cleanup):
    text = """
def dask_setup(worker):
    worker.foo = 'setup'

def dask_teardown(worker):
    worker.foo = 'teardown'
"""
    with dask.config.set(
        {"distributed.worker.preload": text, "distributed.nanny.preload": text}
    ):
        async with Scheduler(port=0) as s:
            async with Nanny(s.address) as w:
                assert w.foo == "setup"
                async with Client(s.address, asynchronous=True) as c:
                    d = await c.run(lambda dask_worker: dask_worker.foo)
                    assert d == {w.worker_address: "setup"}
            assert w.foo == "teardown"
Example #50
def test_restart():
    from distributed import Nanny, rpc
    c = Center('127.0.0.1')
    c.listen(0)
    a = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')
    b = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')

    yield [a._start(), b._start()]

    e = Executor((c.ip, c.port), start=False, loop=IOLoop.current())
    yield e._start()

    assert e.scheduler.ncores == {a.worker_address: 2, b.worker_address: 2}

    x = e.submit(inc, 1)
    y = e.submit(inc, x)
    yield y._result()

    cc = rpc(ip=c.ip, port=c.port)
    who_has = yield cc.who_has()
    try:
        assert e.scheduler.who_has == who_has
        assert set(e.scheduler.who_has) == {x.key, y.key}

        f = yield e._restart()
        assert f is e

        assert len(e.scheduler.stacks) == 2
        assert len(e.scheduler.processing) == 2

        who_has = yield cc.who_has()
        assert not who_has
        assert not e.scheduler.who_has

        assert x.cancelled()
        assert y.cancelled()

    finally:
        yield a._close()
        yield b._close()
        yield e._shutdown(fast=True)
        c.stop()
Example #51
def test_monitor_resources():
    pytest.importorskip('psutil')
    c = Center('127.0.0.1')
    c.listen(0)
    a = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')
    b = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')
    s = Scheduler((c.ip, c.port), resource_interval=0.01, resource_log_size=3)

    yield a._start()
    yield b._start()
    yield s.sync_center()
    done = s.start()

    try:
        assert s.ncores == {
            ('127.0.0.1', a.worker_port): 2,
            ('127.0.0.1', b.worker_port): 2
        }
        assert s.nannies == {(n.ip, n.worker_port): n.port for n in [a, b]}

        while any(len(v) < 3 for v in s.resource_logs.values()):
            yield gen.sleep(0.01)

        yield gen.sleep(0.1)

        assert set(s.resource_logs) == {a.address, b.address}
        assert all(len(v) == 3 for v in s.resource_logs.values())

        d = s.diagnostic_resources(n=2)
        assert set(d) == {a.worker_address, b.worker_address}
        assert set(d[a.worker_address]).issubset({'cpu', 'memory', 'time'})
        assert all(len(v) == 2 for v in d[a.worker_address].values())

        s.put({'op': 'close'})
        yield done
    finally:
        with ignoring(TimeoutError, StreamClosedError, OSError):
            yield a._close(timeout=0.5)
        with ignoring(TimeoutError, StreamClosedError, OSError):
            yield b._close(timeout=0.5)
        c.stop()
Example #52
async def test_stress_creation_and_deletion(c, s):
    # Assertions are handled by the validate mechanism in the scheduler
    da = pytest.importorskip("dask.array")

    rng = da.random.RandomState(0)
    x = rng.random(size=(2000, 2000), chunks=(100, 100))
    y = ((x + 1).T + (x * 2) - x.mean(axis=1)).sum().round(2)
    z = c.persist(y)

    async def create_and_destroy_worker(delay):
        start = time()
        while time() < start + 5:
            async with Nanny(s.address, nthreads=2) as n:
                await asyncio.sleep(delay)
            print("Killed nanny")

    await asyncio.gather(*(create_and_destroy_worker(0.1 * i) for i in range(20)))

    async with Nanny(s.address, nthreads=2):
        assert await c.compute(z) == 8000884.93
Example #53
def test_file_descriptors(c, s):
    psutil = pytest.importorskip('psutil')
    da = pytest.importorskip('dask.array')
    proc = psutil.Process()
    num_fds_1 = proc.num_fds()

    N = 20
    nannies = [Nanny(s.ip, s.port, loop=s.loop) for i in range(N)]
    yield [n._start() for n in nannies]

    while len(s.ncores) < N:
        yield gen.sleep(0.1)

    num_fds_2 = proc.num_fds()

    yield gen.sleep(0.2)

    num_fds_3 = proc.num_fds()
    assert num_fds_3 == num_fds_2

    x = da.random.normal(10, 1, size=(1000, 1000), chunks=(10, 10))
    x = c.persist(x)
    yield _wait(x)

    num_fds_4 = proc.num_fds()
    assert num_fds_4 < num_fds_3 + N

    y = c.persist(x + x.T)
    yield _wait(y)

    num_fds_5 = proc.num_fds()
    assert num_fds_5 < num_fds_4 + N

    yield gen.sleep(1)

    num_fds_6 = proc.num_fds()
    assert num_fds_6 < num_fds_5 + N

    yield [n._close() for n in nannies]
Example #54
def worker(nthreads=None, memory_limit=None):  # pragma: nocover
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if memory_limit is None:
        memory_limit = int(skein.properties.container_resources.memory * 2**20)
    if nthreads is None:
        nthreads = skein.properties.container_resources.vcores

    app_client = skein.ApplicationClient.from_current()

    scheduler = app_client.kv.wait("dask.scheduler").decode()

    loop = IOLoop.current()

    worker = Nanny(
        scheduler,
        loop=loop,
        memory_limit=memory_limit,
        worker_port=0,
        nthreads=nthreads,
        name=skein.properties.container_id,
    )

    async def cleanup():
        await worker.close(timeout=2)

    install_signal_handlers(loop, cleanup=cleanup)

    async def run():
        await worker
        await worker.finished()

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
Example #55
def worker(nthreads=None, memory_limit=None):  # pragma: nocover
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if memory_limit is None:
        memory_limit = int(skein.properties.container_resources.memory * 2**20)
    if nthreads is None:
        nthreads = skein.properties.container_resources.vcores

    app_client = skein.ApplicationClient.from_current()

    scheduler = app_client.kv.wait('dask.scheduler').decode()

    loop = IOLoop.current()

    worker = Nanny(scheduler,
                   loop=loop,
                   memory_limit=memory_limit,
                   worker_port=0,
                   **{_NTHREADS_KEYWORD: nthreads})

    @gen.coroutine
    def close(signalnum):
        worker._close(timeout=2)

    install_signal_handlers(loop, cleanup=close)

    @gen.coroutine
    def run():
        yield worker._start(None)
        while worker.status != 'closed':
            yield gen.sleep(0.2)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
Example #56
async def test_nanny(s):
    async with Nanny(s.address, nthreads=2, loop=s.loop) as n:
        async with rpc(n.address) as nn:
            assert n.is_alive()
            assert s.nthreads[n.worker_address] == 2
            assert s.workers[n.worker_address].nanny == n.address

            await nn.kill()
            assert not n.is_alive()
            assert n.worker_address not in s.nthreads
            assert n.worker_address not in s.workers

            await nn.kill()
            assert not n.is_alive()
            assert n.worker_address not in s.nthreads
            assert n.worker_address not in s.workers

            await nn.instantiate()
            assert n.is_alive()
            assert s.nthreads[n.worker_address] == 2
            assert s.workers[n.worker_address].nanny == n.address

            await nn.terminate()
            assert not n.is_alive()
Example #57
def become_dask_worker(address, nanny=False, **kwargs):
    """Task function for becoming a dask.distributed Worker

    Parameters
    ----------
    address: str
        The URL of the dask Scheduler.
    nanny: bool, optional
        If True, start a Nanny supervising the Worker in a separate
        process instead of running the Worker directly.
    **kwargs:
        Any additional keyword arguments will be passed to the Worker
        (or Nanny) constructor.
    """
    shell = get_ipython()
    kernel = shell.kernel
    if getattr(kernel, 'dask_worker', None) is not None:
        kernel.log.info("Dask worker is already running.")
        return
    from distributed import Worker, Nanny
    if nanny:
        w = Nanny(address, **kwargs)
    else:
        w = Worker(address, **kwargs)
    shell.user_ns['dask_worker'] = shell.user_ns[
        'distributed_worker'] = kernel.distributed_worker = w
    kernel.io_loop.add_callback(w.start)
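A hedged usage sketch: a task function like this is typically shipped to
IPython engines with ipyparallel so that every engine turns itself into a
dask worker. The cluster setup and scheduler address below are placeholder
assumptions, not part of the example above:

import ipyparallel as ipp

rc = ipp.Client()      # connect to a running IPython cluster
view = rc[:]           # a DirectView over all engines
# Run the task function on every engine; each starts a Worker, or a
# supervising Nanny process when nanny=True.
view.apply_async(become_dask_worker, 'tcp://127.0.0.1:8786', nanny=True)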