def test_monitor_resources():
    pytest.importorskip('psutil')
    c = Center(ip='127.0.0.1')
    c.listen(0)
    n = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')
    yield n._start()
    nn = rpc(ip=n.ip, port=n.port)

    assert n.process.is_alive()
    d = n.resource_collect()
    assert {'cpu_percent', 'memory_percent'}.issubset(d)
    assert isinstance(d['timestamp'], datetime)

    stream = yield connect(ip=n.ip, port=n.port)
    yield write(stream, {'op': 'monitor_resources', 'interval': 0.01})

    for i in range(3):
        msg = yield read(stream)
        assert isinstance(msg, dict)
        assert {'cpu_percent', 'memory_percent'}.issubset(msg)

    stream.close()
    yield n._close()
    c.stop()
def test_nanny(s):
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start(0)
    with rpc(ip=n.ip, port=n.port) as nn:
        assert isalive(n.process)  # alive
        assert s.ncores[n.worker_address] == 2
        assert s.worker_info[n.worker_address]['services']['nanny'] > 1024

        yield nn.kill()
        assert not n.process
        assert n.worker_address not in s.ncores
        assert n.worker_address not in s.worker_info

        yield nn.kill()
        assert n.worker_address not in s.ncores
        assert n.worker_address not in s.worker_info
        assert not n.process

        yield nn.instantiate()
        assert isalive(n.process)
        assert s.ncores[n.worker_address] == 2
        assert s.worker_info[n.worker_address]['services']['nanny'] > 1024

        yield nn.terminate()
        assert not n.process

    yield n._close()
def test_worker_who_has_clears_after_failed_connection(c, s, a, b):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    futures = c.map(slowinc, range(20), delay=0.01,
                    key=['f%d' % i for i in range(20)])
    yield wait(futures)

    result = yield c.submit(sum, futures, workers=a.address)
    for dep in set(a.dep_state) - set(a.task_state):
        a.release_dep(dep, report=True)

    n_worker_address = n.worker_address
    with ignoring(CommClosedError):
        yield c._run(os._exit, 1, workers=[n_worker_address])

    while len(s.workers) > 2:
        yield gen.sleep(0.01)

    total = c.submit(sum, futures, workers=a.address)
    yield total

    assert not a.has_what.get(n_worker_address)
    assert not any(n_worker_address in s for s in a.who_has.values())

    yield n._close()
def test_nanny_death_timeout(s):
    yield s.close()
    w = Nanny(s.address, death_timeout=1)
    yield w._start()

    yield gen.sleep(3)
    assert w.status == 'closed'
def test_nanny():
    c = Center('127.0.0.1')
    c.listen(0)
    n = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')
    yield n._start(0)
    nn = rpc(ip=n.ip, port=n.port)

    assert n.process.is_alive()
    assert c.ncores[n.worker_address] == 2
    assert c.worker_services[n.worker_address]['nanny'] > 1024

    yield nn.kill()
    assert n.worker_address not in c.ncores
    assert n.worker_address not in c.worker_services
    assert not n.process

    yield nn.kill()
    assert n.worker_address not in c.ncores
    assert n.worker_address not in c.worker_services
    assert not n.process

    yield nn.instantiate()
    assert n.process.is_alive()
    assert c.ncores[n.worker_address] == 2
    assert c.worker_services[n.worker_address]['nanny'] > 1024

    yield nn.terminate()
    assert not n.process

    if n.process:
        n.process.terminate()

    yield n._close()
    c.stop()
def test_many_kills(s):
    n = Nanny(s.address, ncores=2, loop=s.loop)
    yield n._start(0)
    assert n.is_alive()
    yield [n.kill() for i in range(5)]
    yield [n.kill() for i in range(5)]
    yield n._close()
def become_distributed_worker(ip, port, nanny=False, **kwargs):
    """Task function for becoming a distributed Worker

    Parameters
    ----------
    ip: str
        The IP address of the Scheduler.
    port: int
        The port of the Scheduler.
    nanny: bool
        If True, start a Nanny (which manages the Worker in a subprocess)
        instead of an in-process Worker.
    **kwargs:
        Any additional keyword arguments will be passed to the Worker constructor.
    """
    shell = get_ipython()
    kernel = shell.kernel
    if getattr(kernel, 'distributed_worker', None) is not None:
        kernel.log.info("Distributed worker is already running.")
        return
    from distributed import Worker, Nanny
    if nanny:
        w = Nanny(ip, port, **kwargs)
    else:
        w = Worker(ip, port, **kwargs)
    shell.user_ns['distributed_worker'] = kernel.distributed_worker = w
    w.start(0)
def test_broken_worker_during_computation(c, s, a, b):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    L = c.map(inc, range(256))
    for i in range(8):
        L = c.map(add, *zip(*partition_all(2, L)))

    from random import random
    yield gen.sleep(random() / 2)
    with ignoring(OSError):
        n.process.terminate()
    yield gen.sleep(random() / 2)
    with ignoring(OSError):
        n.process.terminate()

    result = yield c._gather(L)
    assert isinstance(result[0], int)

    yield n._close()
def test_scheduler_file():
    with tmpfile() as fn:
        s = Scheduler(scheduler_file=fn)
        s.start(8008)
        w = Nanny(scheduler_file=fn)
        yield w._start()
        assert set(s.workers) == {w.worker_address}
        yield w._close()
        s.stop()
def test_wait_for_scheduler():
    with captured_logger('distributed') as log:
        w = Nanny('127.0.0.1:44737')
        w._start()
        yield gen.sleep(6)

    log = log.getvalue()
    assert 'error' not in log.lower(), log
    assert 'restart' not in log.lower(), log
def create_and_destroy_worker(delay):
    start = time()
    while time() < start + 5:
        n = Nanny(s.address, ncores=2, loop=s.loop)
        n.start(0)

        yield gen.sleep(delay)
        yield n._close()
        print("Killed nanny")
def test_run(s):
    pytest.importorskip('psutil')
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start()

    nn = rpc(n.address)
    response = yield nn.run(function=dumps(lambda: 1))
    assert response['status'] == 'OK'
    assert loads(response['result']) == 1
def test_worker_uses_same_host_as_nanny(c, s):
    for host in ['tcp://0.0.0.0', 'tcp://127.0.0.2']:
        n = Nanny(s.address)
        yield n._start(host)

        def func(dask_worker):
            return dask_worker.listener.listen_address

        result = yield c.run(func)
        assert host in first(result.values())
        yield n._close()
def test_run(s):
    pytest.importorskip('psutil')
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    yield n._start()

    with rpc(n.address) as nn:
        response = yield nn.run(function=dumps(lambda: 1))
        assert response['status'] == 'OK'
        assert response['result'] == 1

    yield n._close()
def test_scheduler_address_config(c, s):
    with dask.config.set({'scheduler-address': s.address}):
        nanny = Nanny(loop=s.loop)
        yield nanny._start()
        assert nanny.scheduler.address == s.address

        start = time()
        while not s.workers:
            yield gen.sleep(0.1)
            assert time() < start + 10

    yield nanny._close()
def run_nanny(q, center_port, **kwargs):
    from distributed import Nanny
    from tornado.ioloop import IOLoop, PeriodicCallback
    import logging

    with log_errors():
        IOLoop.clear_instance()
        loop = IOLoop()
        loop.make_current()
        PeriodicCallback(lambda: None, 500).start()
        logging.getLogger("tornado").setLevel(logging.CRITICAL)
        worker = Nanny('127.0.0.1', center_port, ip='127.0.0.1', **kwargs)
        loop.run_sync(lambda: worker._start(0))
        q.put(worker.port)
        loop.start()
def run_nanny(q, scheduler_q, **kwargs):
    from distributed import Nanny

    with log_errors():
        with pristine_loop() as loop:
            scheduler_addr = scheduler_q.get()
            worker = Nanny(scheduler_addr, validate=True, **kwargs)
            loop.run_sync(lambda: worker._start(0))
            q.put(worker.address)
            try:
                loop.start()
            finally:
                loop.run_sync(worker._close)
                loop.close(all_fds=True)
def test_nanny(s):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    yield n._start(0)

    with rpc(n.address) as nn:
        assert n.is_alive()
        assert s.ncores[n.worker_address] == 2
        assert s.workers[n.worker_address].services['nanny'] > 1024

        yield nn.kill()
        assert not n.is_alive()
        assert n.worker_address not in s.ncores
        assert n.worker_address not in s.workers

        yield nn.kill()
        assert not n.is_alive()
        assert n.worker_address not in s.ncores
        assert n.worker_address not in s.workers

        yield nn.instantiate()
        assert n.is_alive()
        assert s.ncores[n.worker_address] == 2
        assert s.workers[n.worker_address].services['nanny'] > 1024

        yield nn.terminate()
        assert not n.is_alive()

    yield n._close()
def test_submit_after_failed_worker_async(c, s, a, b):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)
    while len(s.workers) < 3:
        yield gen.sleep(0.1)

    L = c.map(inc, range(10))
    yield wait(L)

    s.loop.add_callback(n.kill)
    total = c.submit(sum, L)
    result = yield total
    assert result == sum(map(inc, range(10)))

    yield n._close()
def run_nanny(q, scheduler_port, **kwargs):
    from distributed import Nanny
    from tornado.ioloop import IOLoop, PeriodicCallback

    with log_errors():
        IOLoop.clear_instance()
        loop = IOLoop()
        loop.make_current()
        PeriodicCallback(lambda: None, 500).start()
        worker = Nanny('127.0.0.1', scheduler_port, ip='127.0.0.1',
                       loop=loop, validate=True, **kwargs)
        loop.run_sync(lambda: worker._start(0))
        q.put(worker.port)
        try:
            loop.start()
        finally:
            loop.run_sync(worker._close)
            loop.close(all_fds=True)
def test_avoid_memory_monitor_if_zero_limit(c, s):
    nanny = Nanny(s.address, loop=s.loop, memory_limit=0)
    yield nanny._start()
    typ = yield c.run(lambda dask_worker: type(dask_worker.data))
    assert typ == {nanny.worker_address: dict}
    pcs = yield c.run(lambda dask_worker: list(dask_worker.periodic_callbacks))
    assert 'memory' not in pcs
    assert 'memory' not in nanny.periodic_callbacks

    future = c.submit(inc, 1)
    assert (yield future) == 2

    yield gen.sleep(0.02)

    yield c.submit(inc, 2)  # worker doesn't pause

    yield nanny._close()
def test_nanny_process_failure():
    c = Center('127.0.0.1')
    c.listen(0)
    n = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')
    yield n._start()
    nn = rpc(ip=n.ip, port=n.port)
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    ww = rpc(ip=n.ip, port=n.worker_port)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    with ignoring(StreamClosedError):
        yield ww.compute(function=dumps(sys.exit),
                         args=dumps((0,)),
                         key='z')

    start = time()
    while n.process.is_alive():  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - start < 2

    start = time()
    while not n.process.is_alive():  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 2

    start = time()
    while n.worker_address not in c.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 2

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir
    nn.close_streams()
    c.stop()
def test_environment_variable(c, s): a = Nanny(s.address, loop=s.loop, memory_limit=0, env={"FOO": "123"}) b = Nanny(s.address, loop=s.loop, memory_limit=0, env={"FOO": "456"}) yield [a, b] results = yield c.run(lambda: os.environ["FOO"]) assert results == {a.worker_address: "123", b.worker_address: "456"} yield [a.close(), b.close()]
def test_nanny_process_failure(c, s):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    yield n._start()
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    original_address = n.worker_address
    ww = rpc(n.worker_address)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    pid = n.pid
    assert pid is not None
    with ignoring(CommClosedError):
        yield c._run(os._exit, 0, workers=[n.worker_address])

    start = time()
    while n.pid == pid:  # wait while process dies and comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while not n.is_alive():  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    # assert n.worker_address != original_address  # most likely

    start = time()
    while n.worker_address not in s.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 5

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir
    ww.close_rpc()
    s.stop()
def test_environment_variable(c, s): a = Nanny(s.address, loop=s.loop, memory_limit=0, env={"FOO": "123"}) b = Nanny(s.address, loop=s.loop, memory_limit=0, env={"FOO": "456"}) yield [a._start(), b._start()] results = yield c.run(lambda: os.environ['FOO']) assert results == {a.worker_address: "123", b.worker_address: "456"} yield [a._close(), b._close()]
async def test_environment_variable(c, s):
    a = Nanny(s.address, loop=s.loop, memory_limit=0, env={"FOO": "123"})
    b = Nanny(s.address, loop=s.loop, memory_limit=0, env={"FOO": "456"})
    await asyncio.gather(a, b)
    results = await c.run(lambda: os.environ["FOO"])
    assert results == {a.worker_address: "123", b.worker_address: "456"}
    await asyncio.gather(a.close(), b.close())
def test_broken_worker_during_computation(c, s, a, b):
    s.allowed_failures = 100
    n = Nanny(s.address, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    N = 256
    expected_result = N * (N + 1) // 2
    i = 0
    L = c.map(inc, range(N), key=["inc-%d-%d" % (i, j) for j in range(N)])
    while len(L) > 1:
        i += 1
        L = c.map(
            slowadd,
            *zip(*partition_all(2, L)),
            key=["add-%d-%d" % (i, j) for j in range(len(L) // 2)]
        )

    yield gen.sleep(random.random() / 20)
    with ignoring(CommClosedError):  # comm will be closed abruptly
        yield c._run(os._exit, 1, workers=[n.worker_address])

    yield gen.sleep(random.random() / 20)
    while len(s.workers) < 3:
        yield gen.sleep(0.01)

    with ignoring(
        CommClosedError, EnvironmentError
    ):  # perhaps new worker can't be contacted yet
        yield c._run(os._exit, 1, workers=[n.worker_address])

    [result] = yield c.gather(L)
    assert isinstance(result, int)
    assert result == expected_result

    yield n.close()
def test_monitor_resources(s):
    pytest.importorskip('psutil')
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start()
    assert isalive(n.process)
    d = n.resource_collect()
    assert {'cpu_percent', 'memory_percent'}.issubset(d)
    assert 'timestamp' in d

    stream = yield connect(ip=n.ip, port=n.port)
    yield write(stream, {'op': 'monitor_resources', 'interval': 0.01})

    for i in range(3):
        msg = yield read(stream)
        assert isinstance(msg, dict)
        assert {'cpu_percent', 'memory_percent'}.issubset(msg)

    close(stream)
    yield n._close()
    s.stop()
def test_nanny_process_failure(s):
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start()
    nn = rpc(ip=n.ip, port=n.port)
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    original_process = n.process
    ww = rpc(ip=n.ip, port=n.worker_port)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    with ignoring(StreamClosedError):
        yield ww.compute(function=dumps(sys.exit),
                         args=dumps((0,)),
                         key='z')

    start = time()
    while n.process is original_process:  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while n.process.poll() is not None:  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while n.worker_address not in s.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 5

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir
    nn.close_streams()
    s.stop()
def run_nanny(q, scheduler_port, **kwargs):
    from distributed import Nanny
    from tornado.ioloop import IOLoop, PeriodicCallback
    import logging

    with log_errors():
        IOLoop.clear_instance()
        loop = IOLoop()
        loop.make_current()
        PeriodicCallback(lambda: None, 500).start()
        logging.getLogger("tornado").setLevel(logging.CRITICAL)
        worker = Nanny('127.0.0.1', scheduler_port, ip='127.0.0.1',
                       loop=loop, **kwargs)
        loop.run_sync(lambda: worker._start(0))
        q.put(worker.port)
        try:
            loop.start()
        finally:
            loop.run_sync(worker._close)
            loop.close(all_fds=True)
def test_num_fds(s): psutil = pytest.importorskip("psutil") proc = psutil.Process() # Warm up w = yield Nanny(s.address) yield w.close() del w gc.collect() before = proc.num_fds() for i in range(3): w = yield Nanny(s.address) yield gen.sleep(0.1) yield w.close() start = time() while proc.num_fds() > before: print("fds:", before, proc.num_fds()) yield gen.sleep(0.1) assert time() < start + 10
def test_monitor_resources(s):
    pytest.importorskip('psutil')
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    yield n._start()
    assert isalive(n.process)
    d = n.resource_collect()
    assert {'cpu_percent', 'memory_percent'}.issubset(d)
    assert 'timestamp' in d

    comm = yield connect(n.address)
    yield comm.write({'op': 'monitor_resources', 'interval': 0.01})

    for i in range(3):
        msg = yield comm.read()
        assert isinstance(msg, dict)
        assert {'cpu_percent', 'memory_percent'}.issubset(msg)

    yield comm.close()
    yield n._close()
    s.stop()
def test_file_descriptors(c, s): yield gen.sleep(0.1) psutil = pytest.importorskip("psutil") da = pytest.importorskip("dask.array") proc = psutil.Process() num_fds_1 = proc.num_fds() N = 20 nannies = yield [Nanny(s.address, loop=s.loop) for i in range(N)] while len(s.nthreads) < N: yield gen.sleep(0.1) num_fds_2 = proc.num_fds() yield gen.sleep(0.2) num_fds_3 = proc.num_fds() assert num_fds_3 <= num_fds_2 + N # add some heartbeats x = da.random.random(size=(1000, 1000), chunks=(25, 25)) x = c.persist(x) yield wait(x) num_fds_4 = proc.num_fds() assert num_fds_4 <= num_fds_2 + 2 * N y = c.persist(x + x.T) yield wait(y) num_fds_5 = proc.num_fds() assert num_fds_5 < num_fds_4 + N yield gen.sleep(1) num_fds_6 = proc.num_fds() assert num_fds_6 < num_fds_5 + N yield [n.close() for n in nannies] yield c.close() assert not s.rpc.open for addr, occ in c.rpc.occupied.items(): for comm in occ: assert comm.closed() or comm.peer_address != s.address, comm assert not s.stream_comms start = time() while proc.num_fds() > num_fds_1 + N: yield gen.sleep(0.01) assert time() < start + 3
async def f():
    async with Scheduler(protocol=protocol, interface='ib0',
                         dashboard_address=':8789') as s:
        async with Nanny(
            s.address,
            protocol=protocol,
            nthreads=1,
            memory_limit='32GB',
            env={'CUDA_VISIBLE_DEVICES': '2'},
        ) as w:
            async with Nanny(
                s.address,
                protocol=protocol,
                memory_limit='32gb',
                env={'CUDA_VISIBLE_DEVICES': '3'},
                nthreads=1,
            ) as w2:
                async with Client(s.address, asynchronous=True) as c:
                    with log_errors(pdb=True):
                        # Create a simple random array
                        N = 500000000
                        chunks = N // 100
                        rs = da.random.RandomState(
                            RandomState=cupy.random.RandomState)
                        x = rs.random(N, chunks=chunks)
                        y = rs.random(N, chunks=chunks)

                        def f(x):
                            return cudf.from_dlpack(x.toDlpack())

                        res_x = x.map_blocks(f, meta=cudf.Series())
                        res_y = y.map_blocks(f, meta=cudf.Series())

                        res = (res_x + res_y).persist()
                        await res

                        x.rechunk(chunks=N // 10).persist()
                        res = await c.compute(x.sum())
                        print(res)
                        print("FINISHED" * 100)
def test_multiple_executors_restart(loop):
    from distributed import Nanny, rpc
    c = Center('127.0.0.1')
    c.listen(0)
    a = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')
    b = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')

    @gen.coroutine
    def f():
        yield a._start()
        yield b._start()

        while len(c.ncores) < 2:
            yield gen.sleep(0.01)

        try:
            e1 = Executor((c.ip, c.port), start=False, loop=loop)
            yield e1._start()
            e2 = Executor(e1.scheduler, start=False, loop=loop)
            yield e2._start()

            x = e1.submit(inc, 1)
            y = e2.submit(inc, 2)
            xx = yield x._result()
            yy = yield y._result()
            assert xx == 2
            assert yy == 3

            yield e1._restart()

            assert x.cancelled()
            assert y.cancelled()
        finally:
            yield a._close()
            yield b._close()
            yield e1._shutdown(fast=True)
            yield e2._shutdown(fast=True)
            c.stop()

    loop.run_sync(f)
def test_submit_after_failed_worker_async(c, s, a, b):
    n = yield Nanny(s.address, nthreads=2, loop=s.loop)
    while len(s.workers) < 3:
        yield gen.sleep(0.1)

    L = c.map(inc, range(10))
    yield wait(L)

    s.loop.add_callback(n.kill)
    total = c.submit(sum, L)
    result = yield total
    assert result == sum(map(inc, range(10)))

    yield n.close()
def test_broken_worker_during_computation(e, s, a, b):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    L = e.map(inc, range(256))
    for i in range(8):
        L = e.map(add, *zip(*partition_all(2, L)))

    from random import random
    yield gen.sleep(random() / 2)
    n.process.terminate()
    yield gen.sleep(random() / 2)
    n.process.terminate()

    result = yield e._gather(L)
    assert isinstance(result[0], int)

    yield n._close()
def test_monitor_resources(s):
    pytest.importorskip('psutil')
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start()
    nn = rpc(ip=n.ip, port=n.port)

    assert n.process.is_alive()
    d = n.resource_collect()
    assert {'cpu_percent', 'memory_percent'}.issubset(d)
    assert 'timestamp' in d

    stream = yield connect(ip=n.ip, port=n.port)
    yield write(stream, {'op': 'monitor_resources', 'interval': 0.01})

    for i in range(3):
        msg = yield read(stream)
        assert isinstance(msg, dict)
        assert {'cpu_percent', 'memory_percent'}.issubset(msg)

    stream.close()
    yield n._close()
    s.stop()
async def test_nanny_port_range(c, s):
    nanny_port = "9867:9868"
    worker_port = "9869:9870"
    async with Nanny(s.address, port=nanny_port, worker_port=worker_port) as n1:
        assert n1.port == 9867  # Selects first port in range
        async with Nanny(s.address, port=nanny_port, worker_port=worker_port) as n2:
            assert n2.port == 9868  # Selects next port in range

            with pytest.raises(
                ValueError, match="Could not start Nanny"
            ):  # No more ports left
                async with Nanny(s.address, port=nanny_port, worker_port=worker_port):
                    pass

            # Ensure Worker ports are in worker_port range
            def get_worker_port(dask_worker):
                return dask_worker.port

            worker_ports = await c.run(get_worker_port)
            assert list(worker_ports.values()) == parse_ports(worker_port)
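# Illustrative sketch (not part of the original test module): the "low:high"
# port strings above are expanded by parse_ports into an inclusive list of
# candidate ports, which is what the final assertion relies on.  This assumes
# parse_ports is importable from distributed.utils as in recent releases.
def _example_parse_ports():
    from distributed.utils import parse_ports

    # "9867:9868" covers exactly two ports, so the third Nanny in the test
    # above has nothing left to bind to and raises ValueError.
    assert parse_ports("9867:9868") == [9867, 9868]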
def test_file_descriptors(c, s):
    yield gen.sleep(0.1)
    psutil = pytest.importorskip('psutil')
    da = pytest.importorskip('dask.array')
    proc = psutil.Process()
    num_fds_1 = proc.num_fds()

    N = 20
    nannies = [Nanny(s.ip, s.port, loop=s.loop) for i in range(N)]
    yield [n._start() for n in nannies]

    while len(s.ncores) < N:
        yield gen.sleep(0.1)

    num_fds_2 = proc.num_fds()

    yield gen.sleep(0.2)

    num_fds_3 = proc.num_fds()
    assert num_fds_3 <= num_fds_2 + N  # add some heartbeats

    x = da.random.random(size=(1000, 1000), chunks=(25, 25))
    x = c.persist(x)
    yield wait(x)

    num_fds_4 = proc.num_fds()
    assert num_fds_4 <= num_fds_2 + 2 * N

    y = c.persist(x + x.T)
    yield wait(y)

    num_fds_5 = proc.num_fds()
    assert num_fds_5 < num_fds_4 + N

    yield gen.sleep(1)

    num_fds_6 = proc.num_fds()
    assert num_fds_6 < num_fds_5 + N

    yield [n._close() for n in nannies]

    assert not s.rpc.open
    assert not c.rpc.open
    assert not s.stream_comms

    start = time()
    while proc.num_fds() > num_fds_1 + N:
        yield gen.sleep(0.01)
        assert time() < start + 3
def test_avoid_memory_monitor_if_zero_limit(c, s):
    nanny = yield Nanny(s.address, loop=s.loop, memory_limit=0)
    typ = yield c.run(lambda dask_worker: type(dask_worker.data))
    assert typ == {nanny.worker_address: dict}
    pcs = yield c.run(lambda dask_worker: list(dask_worker.periodic_callbacks))
    assert "memory" not in pcs
    assert "memory" not in nanny.periodic_callbacks

    future = c.submit(inc, 1)
    assert (yield future) == 2

    yield gen.sleep(0.02)

    yield c.submit(inc, 2)  # worker doesn't pause

    yield nanny.close()
def test_broken_worker_during_computation(c, s, a, b):
    s.allowed_failures = 100
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    N = 256
    expected_result = N * (N + 1) // 2
    i = 0
    L = c.map(inc, range(N), key=['inc-%d-%d' % (i, j) for j in range(N)])
    while len(L) > 1:
        i += 1
        L = c.map(slowadd, *zip(*partition_all(2, L)),
                  key=['add-%d-%d' % (i, j) for j in range(len(L) // 2)])

    yield gen.sleep(random.random() / 20)
    with ignoring(CommClosedError):  # comm will be closed abruptly
        yield c._run(os._exit, 1, workers=[n.worker_address])

    yield gen.sleep(random.random() / 20)
    while len(s.workers) < 3:
        yield gen.sleep(0.01)

    with ignoring(CommClosedError, EnvironmentError):  # perhaps new worker can't be contacted yet
        yield c._run(os._exit, 1, workers=[n.worker_address])

    [result] = yield c.gather(L)
    assert isinstance(result, int)
    assert result == expected_result

    yield n._close()
def test_nanny_process_failure(c, s):
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start()
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    original_process = n.process
    ww = rpc(ip=n.ip, port=n.worker_port)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    with ignoring(StreamClosedError):
        yield c._run(sys.exit, 0, workers=[n.worker_address])

    start = time()
    while n.process is original_process:  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while not isalive(n.process):  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while n.worker_address not in s.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 5

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir
    ww.close_rpc()
    s.stop()
async def test_preload_import_time(cleanup):
    text = """
from distributed.comm.registry import backends
from distributed.comm.tcp import TCPBackend

backends["foo"] = TCPBackend()
""".strip()
    try:
        async with Scheduler(port=0, preload=text, protocol="foo") as s:
            async with Nanny(s.address, preload=text, protocol="foo") as n:
                async with Client(s.address, asynchronous=True) as c:
                    await c.wait_for_workers(1)
    finally:
        from distributed.comm.registry import backends

        del backends["foo"]
def start_worker(nthreads=None, memory_limit=None):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if memory_limit is None:
        memory_limit = int(skein.properties.container_resources.memory * 1e6)
    if nthreads is None:
        nthreads = skein.properties.container_resources.vcores

    app_client = skein.ApplicationClient.from_current()

    scheduler = app_client.kv.wait('dask.scheduler').decode()

    loop = IOLoop.current()

    # Until the config patch is merged, we can't use the nanny process since
    # there's no way to monkey patch config inside the forkserver process
    if hasattr(dask.config, 'PATH'):
        worker = Nanny(scheduler, ncores=nthreads, loop=loop,
                       memory_limit=memory_limit, worker_port=0)

        @gen.coroutine
        def close(signalnum):
            worker._close(timeout=2)

        install_signal_handlers(loop, cleanup=close)
    else:
        worker = Worker(scheduler, ncores=nthreads, loop=loop,
                        memory_limit=memory_limit)

    @gen.coroutine
    def run():
        yield worker._start(None)
        while worker.status != 'closed':
            yield gen.sleep(0.2)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
async def test_worker_preload_config(cleanup):
    text = """
def dask_setup(worker):
    worker.foo = 'setup'

def dask_teardown(worker):
    worker.foo = 'teardown'
"""
    with dask.config.set(
        {"distributed.worker.preload": text, "distributed.nanny.preload": text}
    ):
        async with Scheduler(port=0) as s:
            async with Nanny(s.address) as w:
                assert w.foo == "setup"
                async with Client(s.address, asynchronous=True) as c:
                    d = await c.run(lambda dask_worker: dask_worker.foo)
                    assert d == {w.worker_address: "setup"}
            assert w.foo == "teardown"
def test_restart():
    from distributed import Nanny, rpc
    c = Center('127.0.0.1')
    c.listen(0)
    a = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')
    b = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')
    yield [a._start(), b._start()]

    e = Executor((c.ip, c.port), start=False, loop=IOLoop.current())
    yield e._start()

    assert e.scheduler.ncores == {a.worker_address: 2, b.worker_address: 2}

    x = e.submit(inc, 1)
    y = e.submit(inc, x)
    yield y._result()

    cc = rpc(ip=c.ip, port=c.port)
    who_has = yield cc.who_has()
    try:
        assert e.scheduler.who_has == who_has
        assert set(e.scheduler.who_has) == {x.key, y.key}

        f = yield e._restart()
        assert f is e

        assert len(e.scheduler.stacks) == 2
        assert len(e.scheduler.processing) == 2

        who_has = yield cc.who_has()
        assert not who_has
        assert not e.scheduler.who_has

        assert x.cancelled()
        assert y.cancelled()
    finally:
        yield a._close()
        yield b._close()
        yield e._shutdown(fast=True)
        c.stop()
def test_monitor_resources():
    pytest.importorskip('psutil')
    c = Center('127.0.0.1')
    c.listen(0)
    a = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')
    b = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')
    s = Scheduler((c.ip, c.port), resource_interval=0.01, resource_log_size=3)

    yield a._start()
    yield b._start()
    yield s.sync_center()
    done = s.start()

    try:
        assert s.ncores == {('127.0.0.1', a.worker_port): 2,
                            ('127.0.0.1', b.worker_port): 2}
        assert s.nannies == {(n.ip, n.worker_port): n.port for n in [a, b]}

        while any(len(v) < 3 for v in s.resource_logs.values()):
            yield gen.sleep(0.01)

        yield gen.sleep(0.1)

        assert set(s.resource_logs) == {a.address, b.address}
        assert all(len(v) == 3 for v in s.resource_logs.values())

        d = s.diagnostic_resources(n=2)
        assert set(d) == {a.worker_address, b.worker_address}
        assert set(d[a.worker_address]).issubset({'cpu', 'memory', 'time'})
        assert all(len(v) == 2 for v in d[a.worker_address].values())

        s.put({'op': 'close'})
        yield done
    finally:
        with ignoring(TimeoutError, StreamClosedError, OSError):
            yield a._close(timeout=0.5)
        with ignoring(TimeoutError, StreamClosedError, OSError):
            yield b._close(timeout=0.5)
        c.stop()
async def test_stress_creation_and_deletion(c, s):
    # Assertions are handled by the validate mechanism in the scheduler
    da = pytest.importorskip("dask.array")

    rng = da.random.RandomState(0)
    x = rng.random(size=(2000, 2000), chunks=(100, 100))
    y = ((x + 1).T + (x * 2) - x.mean(axis=1)).sum().round(2)
    z = c.persist(y)

    async def create_and_destroy_worker(delay):
        start = time()
        while time() < start + 5:
            async with Nanny(s.address, nthreads=2) as n:
                await asyncio.sleep(delay)
            print("Killed nanny")

    await asyncio.gather(*(create_and_destroy_worker(0.1 * i) for i in range(20)))

    async with Nanny(s.address, nthreads=2):
        assert await c.compute(z) == 8000884.93
def test_file_descriptors(c, s):
    psutil = pytest.importorskip('psutil')
    da = pytest.importorskip('dask.array')
    proc = psutil.Process()
    num_fds_1 = proc.num_fds()

    N = 20
    nannies = [Nanny(s.ip, s.port, loop=s.loop) for i in range(N)]
    yield [n._start() for n in nannies]

    while len(s.ncores) < N:
        yield gen.sleep(0.1)

    num_fds_2 = proc.num_fds()

    yield gen.sleep(0.2)

    num_fds_3 = proc.num_fds()
    assert num_fds_3 == num_fds_2

    x = da.random.normal(10, 1, size=(1000, 1000), chunks=(10, 10))
    x = c.persist(x)
    yield _wait(x)

    num_fds_4 = proc.num_fds()
    assert num_fds_4 < num_fds_3 + N

    y = c.persist(x + x.T)
    yield _wait(y)

    num_fds_5 = proc.num_fds()
    assert num_fds_5 < num_fds_4 + N

    yield gen.sleep(1)

    num_fds_6 = proc.num_fds()
    assert num_fds_6 < num_fds_5 + N

    yield [n._close() for n in nannies]
def worker(nthreads=None, memory_limit=None): # pragma: nocover enable_proctitle_on_current() enable_proctitle_on_children() if memory_limit is None: memory_limit = int(skein.properties.container_resources.memory * 2**20) if nthreads is None: nthreads = skein.properties.container_resources.vcores app_client = skein.ApplicationClient.from_current() scheduler = app_client.kv.wait("dask.scheduler").decode() loop = IOLoop.current() worker = Nanny( scheduler, loop=loop, memory_limit=memory_limit, worker_port=0, nthreads=nthreads, name=skein.properties.container_id, ) async def cleanup(): await worker.close(timeout=2) install_signal_handlers(loop, cleanup=cleanup) async def run(): await worker await worker.finished() try: loop.run_sync(run) except (KeyboardInterrupt, TimeoutError): pass
def worker(nthreads=None, memory_limit=None):  # pragma: nocover
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if memory_limit is None:
        memory_limit = int(skein.properties.container_resources.memory * 2**20)
    if nthreads is None:
        nthreads = skein.properties.container_resources.vcores

    app_client = skein.ApplicationClient.from_current()

    scheduler = app_client.kv.wait('dask.scheduler').decode()

    loop = IOLoop.current()

    worker = Nanny(scheduler, loop=loop, memory_limit=memory_limit,
                   worker_port=0, **{_NTHREADS_KEYWORD: nthreads})

    @gen.coroutine
    def close(signalnum):
        worker._close(timeout=2)

    install_signal_handlers(loop, cleanup=close)

    @gen.coroutine
    def run():
        yield worker._start(None)
        while worker.status != 'closed':
            yield gen.sleep(0.2)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
async def test_nanny(s):
    async with Nanny(s.address, nthreads=2, loop=s.loop) as n:
        async with rpc(n.address) as nn:
            assert n.is_alive()
            assert s.nthreads[n.worker_address] == 2
            assert s.workers[n.worker_address].nanny == n.address

            await nn.kill()
            assert not n.is_alive()
            assert n.worker_address not in s.nthreads
            assert n.worker_address not in s.workers

            await nn.kill()
            assert not n.is_alive()
            assert n.worker_address not in s.nthreads
            assert n.worker_address not in s.workers

            await nn.instantiate()
            assert n.is_alive()
            assert s.nthreads[n.worker_address] == 2
            assert s.workers[n.worker_address].nanny == n.address

            await nn.terminate()
            assert not n.is_alive()
def become_dask_worker(address, nanny=False, **kwargs):
    """Task function for becoming a dask.distributed Worker

    Parameters
    ----------
    address: str
        The URL of the dask Scheduler.
    nanny: bool
        If True, start a Nanny (which manages the Worker in a subprocess)
        instead of an in-process Worker.
    **kwargs:
        Any additional keyword arguments will be passed to the Worker constructor.
    """
    shell = get_ipython()
    kernel = shell.kernel
    if getattr(kernel, 'dask_worker', None) is not None:
        kernel.log.info("Dask worker is already running.")
        return
    from distributed import Worker, Nanny
    if nanny:
        w = Nanny(address, **kwargs)
    else:
        w = Worker(address, **kwargs)
    shell.user_ns['dask_worker'] = shell.user_ns[
        'distributed_worker'] = kernel.distributed_worker = w
    kernel.io_loop.add_callback(w.start)
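# Illustrative usage sketch (an assumption, not part of the original module):
# become_dask_worker runs inside an IPython kernel, so it is normally shipped
# to ipyparallel engines rather than called locally.  The helper name and the
# scheduler address below are placeholders.
def _example_engines_to_workers(scheduler_address="tcp://127.0.0.1:8786"):
    """Hypothetical helper: turn every ipyparallel engine into a dask worker."""
    from ipyparallel import Client

    rc = Client()
    view = rc[:]  # a DirectView over all connected engines
    # Run become_dask_worker remotely on each engine; nanny=True takes the
    # Nanny branch in the function above so each worker gets a watchdog process.
    view.apply_sync(become_dask_worker, scheduler_address, nanny=True)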