def test_nanny(s):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    yield n._start(0)
    with rpc(n.address) as nn:
        assert isalive(n.process)  # alive
        assert s.ncores[n.worker_address] == 2
        assert s.worker_info[n.worker_address]['services']['nanny'] > 1024

        yield nn.kill()
        assert not n.process
        assert n.worker_address not in s.ncores
        assert n.worker_address not in s.worker_info

        yield nn.kill()
        assert n.worker_address not in s.ncores
        assert n.worker_address not in s.worker_info
        assert not n.process

        yield nn.instantiate()
        assert isalive(n.process)
        assert s.ncores[n.worker_address] == 2
        assert s.worker_info[n.worker_address]['services']['nanny'] > 1024

        yield nn.terminate()
        assert not n.process

    yield n._close()

def test_nanny(s):
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start(0)
    with rpc(ip=n.ip, port=n.port) as nn:
        assert isalive(n.process)  # alive
        assert s.ncores[n.worker_address] == 2
        assert s.worker_info[n.worker_address]['services']['nanny'] > 1024

        yield nn.kill()
        assert not n.process
        assert n.worker_address not in s.ncores
        assert n.worker_address not in s.worker_info

        yield nn.kill()
        assert n.worker_address not in s.ncores
        assert n.worker_address not in s.worker_info
        assert not n.process

        yield nn.instantiate()
        assert isalive(n.process)
        assert s.ncores[n.worker_address] == 2
        assert s.worker_info[n.worker_address]['services']['nanny'] > 1024

        yield nn.terminate()
        assert not n.process

    yield n._close()

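# The coroutine tests above receive a live Scheduler (s) -- and, further
# down, a Client (c) and Workers (a, b) -- as arguments. In the
# distributed test suite this wiring typically comes from the
# utils_test.gen_cluster decorator. A minimal sketch of driving such a
# test that way, assuming the ncores=[] form that starts a scheduler
# with no workers (decorator arguments and import path are assumptions):
from distributed import Nanny
from distributed.utils_test import gen_cluster

@gen_cluster(ncores=[])
def test_nanny_start_stop(s):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    yield n._start(0)
    assert n.worker_address in s.ncores
    yield n._close()
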
def test_failed_worker_without_warning(c, s, a, b):
    L = c.map(inc, range(10))
    yield _wait(L)

    original_process = a.process
    a.process.terminate()
    start = time()
    while a.process is original_process and not isalive(a.process):
        yield gen.sleep(0.01)
        assert time() - start < 10

    yield gen.sleep(0.5)

    start = time()
    while len(s.ncores) < 2:
        yield gen.sleep(0.01)
        assert time() - start < 10

    yield _wait(L)

    L2 = c.map(inc, range(10, 20))
    yield _wait(L2)
    assert all(len(keys) > 0 for keys in s.has_what.values())
    ncores2 = s.ncores.copy()

    yield c._restart()

    L = c.map(inc, range(10))
    yield _wait(L)
    assert all(len(keys) > 0 for keys in s.has_what.values())

    assert not (set(ncores2) & set(s.ncores))  # no overlap

def test_no_reconnect(nanny, loop):
    with popen(['dask-worker', '127.0.0.1:8786', '--no-reconnect', nanny]) as worker:
        with popen(['dask-scheduler']) as sched:
            sleep(1)
        start = time()
        while isalive(worker):
            sleep(0.1)
            assert time() < start + 10

def test_no_reconnect(nanny, loop):
    with popen(['dask-scheduler', '--no-bokeh']) as sched:
        wait_for_port('127.0.0.1:8786')
        with popen(['dask-worker', '127.0.0.1:8786', '--no-reconnect',
                    nanny, '--no-bokeh']) as worker:
            sleep(2)
            terminate_process(sched)
            start = time()
            while isalive(worker):
                sleep(0.1)
                assert time() < start + 10

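# The tests in this file call isalive() both on multiprocessing-style
# process objects (n.process) and on subprocess handles returned by
# popen(). A minimal sketch of such a helper, assuming exactly those two
# process types -- the real test suite may define it differently:
def isalive(proc):
    if proc is None:
        return False
    if hasattr(proc, 'poll'):        # subprocess.Popen-like handle
        return proc.poll() is None   # poll() is None while running
    return proc.is_alive()           # multiprocessing.Process
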
def test_monitor_resources(s):
    pytest.importorskip('psutil')
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    yield n._start()
    assert isalive(n.process)
    d = n.resource_collect()
    assert {'cpu_percent', 'memory_percent'}.issubset(d)
    assert 'timestamp' in d

    comm = yield connect(n.address)
    yield comm.write({'op': 'monitor_resources', 'interval': 0.01})

    for i in range(3):
        msg = yield comm.read()
        assert isinstance(msg, dict)
        assert {'cpu_percent', 'memory_percent'}.issubset(msg)

    yield comm.close()
    yield n._close()
    s.stop()

def test_monitor_resources(s):
    pytest.importorskip('psutil')
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start()
    assert isalive(n.process)
    d = n.resource_collect()
    assert {'cpu_percent', 'memory_percent'}.issubset(d)
    assert 'timestamp' in d

    stream = yield connect(ip=n.ip, port=n.port)
    yield write(stream, {'op': 'monitor_resources', 'interval': 0.01})

    for i in range(3):
        msg = yield read(stream)
        assert isinstance(msg, dict)
        assert {'cpu_percent', 'memory_percent'}.issubset(msg)

    close(stream)
    yield n._close()
    s.stop()

def test_nanny_process_failure(s):
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start()
    nn = rpc(ip=n.ip, port=n.port)
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    original_process = n.process
    ww = rpc(ip=n.ip, port=n.worker_port)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    with ignoring(StreamClosedError):
        yield ww.compute(function=dumps(sys.exit),
                         args=dumps((0,)),
                         key='z')

    start = time()
    while n.process is original_process:  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while not isalive(n.process):  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while n.worker_address not in s.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 5

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir
    nn.close_streams()
    s.stop()

def test_nanny_process_failure(c, s):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    yield n._start()
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    original_process = n.process
    ww = rpc(n.worker_address)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    with ignoring(CommClosedError):
        yield c._run(sys.exit, 0, workers=[n.worker_address])

    start = time()
    while n.process is original_process:  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while not isalive(n.process):  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while n.worker_address not in s.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 5

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir
    ww.close_rpc()
    s.stop()

def test_nanny_process_failure(s):
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start()
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    original_process = n.process
    ww = rpc(ip=n.ip, port=n.worker_port)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    with ignoring(StreamClosedError):
        yield ww.compute(function=dumps(sys.exit),
                         args=dumps((0,)),
                         key='z')

    start = time()
    while n.process is original_process:  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while not isalive(n.process):  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while n.worker_address not in s.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 5

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir
    ww.close_rpc()
    s.stop()

def main(scheduler, host, worker_port, http_port, nanny_port, nthreads, nprocs,
         nanny, name, memory_limit, pid_file, temp_filename, reconnect):
    if nanny:
        port = nanny_port
    else:
        port = worker_port

    try:
        scheduler_host, scheduler_port = scheduler.split(':')
        scheduler_ip = socket.gethostbyname(scheduler_host)
        scheduler_port = int(scheduler_port)
    except (IndexError, ValueError):
        logger.info("Usage: dask-worker scheduler_host:scheduler_port")

    if nprocs > 1 and worker_port != 0:
        logger.error("Failed to launch worker. You cannot use the --port "
                     "argument when nprocs > 1.")
        exit(1)

    if nprocs > 1 and name:
        logger.error("Failed to launch worker. You cannot use the --name "
                     "argument when nprocs > 1.")
        exit(1)

    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {('http', http_port): HTTPWorker}

    loop = IOLoop.current()

    if memory_limit == 'auto':
        import psutil
        memory_limit = psutil.virtual_memory().total * 0.60

    if memory_limit:
        memory_limit = float(memory_limit)
        if memory_limit < 1.0:
            import psutil
            memory_limit = psutil.virtual_memory().total * memory_limit
        memory_limit /= nprocs
        memory_limit = int(memory_limit)

    if nanny:
        kwargs = {'worker_port': worker_port}
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs['service_ports'] = {'nanny': nanny_port}
        t = Worker

    if host is not None:
        ip = socket.gethostbyname(host)
    else:
        # look up the ip address of a local interface on a network that
        # can reach the scheduler
        ip = get_ip(scheduler_ip, scheduler_port)

    nannies = [t(scheduler_ip, scheduler_port, ncores=nthreads, ip=ip,
                 services=services, name=name, loop=loop,
                 memory_limit=memory_limit, reconnect=reconnect, **kwargs)
               for i in range(nprocs)]

    for n in nannies:
        n.start(port)
        if t is Nanny:
            global_nannies.append(n)

    if temp_filename:
        @gen.coroutine
        def f():
            while nannies[0].status != 'running':
                yield gen.sleep(0.01)
            import json
            msg = {'port': nannies[0].port,
                   'local_directory': nannies[0].local_dir}
            with open(temp_filename, 'w') as f:
                json.dump(msg, f)

        loop.add_callback(f)

    @gen.coroutine
    def run():
        while all(n.status != 'closed' for n in nannies):
            yield gen.sleep(0.2)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
        loop.close()

    loop2 = IOLoop()

    @gen.coroutine
    def f():
        scheduler = rpc(ip=nannies[0].scheduler.ip,
                        port=nannies[0].scheduler.port)
        if nanny:
            yield gen.with_timeout(
                timedelta(seconds=2),
                All([scheduler.unregister(address=n.worker_address, close=True)
                     for n in nannies if n.process and n.worker_port]),
                io_loop=loop2)

    loop2.run_sync(f)

    if nanny:
        for n in nannies:
            if isalive(n.process):
                n.process.terminate()

    if nanny:
        start = time()
        while (any(isalive(n.process) for n in nannies) and
               time() < start + 1):
            sleep(0.1)

    for nanny in nannies:
        nanny.stop()

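# Worked example of the memory_limit normalization in main() above:
# 'auto' becomes 60% of total RAM, a fraction below 1.0 is interpreted
# as a share of total RAM, and the result is split evenly across the
# nprocs worker processes. A sketch assuming 16e9 bytes of RAM and
# nprocs=4 (the numbers are illustrative, not from the source):
total_memory = 16e9
memory_limit = 0.5                          # fraction < 1.0
memory_limit = total_memory * memory_limit  # -> 8e9 bytes overall
memory_limit /= 4                           # -> 2e9 bytes per process
assert int(memory_limit) == 2000000000
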
def main(scheduler, host, worker_port, http_port, nanny_port, nthreads, nprocs,
         nanny, name, memory_limit, pid_file, temp_filename, reconnect,
         resources, bokeh, bokeh_port, local_directory, scheduler_file,
         interface, death_timeout, preload):
    if nanny:
        port = nanny_port
    else:
        port = worker_port

    if nprocs > 1 and worker_port != 0:
        logger.error("Failed to launch worker. You cannot use the --port "
                     "argument when nprocs > 1.")
        exit(1)

    if nprocs > 1 and name:
        logger.error("Failed to launch worker. You cannot use the --name "
                     "argument when nprocs > 1.")
        exit(1)

    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {('http', http_port): HTTPWorker}

    if bokeh:
        try:
            from distributed.bokeh.worker import BokehWorker
        except ImportError:
            pass
        else:
            services[('bokeh', bokeh_port)] = BokehWorker

    if resources:
        resources = resources.replace(',', ' ').split()
        resources = dict(pair.split('=') for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    if nanny:
        kwargs = {'worker_port': worker_port}
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs['service_ports'] = {'nanny': nanny_port}
        t = Worker

    if scheduler_file:
        while not os.path.exists(scheduler_file):
            sleep(0.01)
        for i in range(10):
            try:
                with open(scheduler_file) as f:
                    cfg = json.load(f)
                scheduler = cfg['address']
                break
            except (ValueError, KeyError):  # race with scheduler on file
                sleep(0.01)

    if not scheduler:
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    nannies = [t(scheduler, ncores=nthreads, services=services, name=name,
                 loop=loop, resources=resources, memory_limit=memory_limit,
                 reconnect=reconnect, local_dir=local_directory,
                 death_timeout=death_timeout, preload=preload, **kwargs)
               for i in range(nprocs)]

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    for n in nannies:
        if host:
            n.start((host, port))
        else:
            n.start(port)
        if t is Nanny:
            global_nannies.append(n)

    if temp_filename:
        @gen.coroutine
        def f():
            while nannies[0].status != 'running':
                yield gen.sleep(0.01)
            import json
            msg = {'port': nannies[0].port,
                   'local_directory': nannies[0].local_dir}
            with open(temp_filename, 'w') as f:
                json.dump(msg, f)

        loop.add_callback(f)

    @gen.coroutine
    def run():
        while all(n.status != 'closed' for n in nannies):
            yield gen.sleep(0.2)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
        loop.close()

    # Clean exit: unregister all workers from scheduler
    loop2 = IOLoop()

    @gen.coroutine
    def f():
        with rpc(nannies[0].scheduler.address) as scheduler:
            if nanny:
                yield gen.with_timeout(
                    timeout=timedelta(seconds=2),
                    future=All([scheduler.unregister(address=n.worker_address,
                                                     close=True)
                                for n in nannies
                                if n.process and n.worker_address]),
                    io_loop=loop2)

    loop2.run_sync(f)

    if nanny:
        for n in nannies:
            if isalive(n.process):
                n.process.terminate()

    if nanny:
        start = time()
        while (any(isalive(n.process) for n in nannies) and
               time() < start + 1):
            sleep(0.1)

    for nanny in nannies:
        nanny.stop()

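# Small demonstration of the --resources parsing in main() above: the
# flag accepts comma- or space-separated KEY=VALUE pairs and coerces the
# values to floats (valmap comes from toolz; the example string is
# illustrative):
from toolz import valmap

spec = 'GPU=2,MEM=1e9'
pairs = spec.replace(',', ' ').split()   # ['GPU=2', 'MEM=1e9']
parsed = valmap(float, dict(pair.split('=') for pair in pairs))
assert parsed == {'GPU': 2.0, 'MEM': 1000000000.0}
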
def main(scheduler, host, worker_port, http_port, nanny_port, nthreads, nprocs,
         no_nanny, name, memory_limit, pid_file, temp_filename):
    if no_nanny:
        port = worker_port
    else:
        port = nanny_port

    try:
        scheduler_host, scheduler_port = scheduler.split(':')
        scheduler_ip = socket.gethostbyname(scheduler_host)
        scheduler_port = int(scheduler_port)
    except (IndexError, ValueError):
        logger.info("Usage: dask-worker scheduler_host:scheduler_port")

    if nprocs > 1 and worker_port != 0:
        logger.error("Failed to launch worker. You cannot use the --port "
                     "argument when nprocs > 1.")
        exit(1)

    if nprocs > 1 and name:
        logger.error("Failed to launch worker. You cannot use the --name "
                     "argument when nprocs > 1.")
        exit(1)

    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {('http', http_port): HTTPWorker}

    loop = IOLoop.current()

    if memory_limit == 'auto':
        import psutil
        memory_limit = psutil.virtual_memory().total * 0.60

    if memory_limit:
        memory_limit = float(memory_limit)
        if memory_limit < 1.0:
            import psutil
            memory_limit = psutil.virtual_memory().total * memory_limit
        memory_limit /= nprocs
        memory_limit = int(memory_limit)

    if no_nanny:
        kwargs = {}
        if nanny_port:
            kwargs['service_ports'] = {'nanny': nanny_port}
        t = Worker
    else:
        kwargs = {'worker_port': worker_port}
        t = Nanny

    if host is not None:
        ip = socket.gethostbyname(host)
    else:
        # look up the ip address of a local interface on a network that
        # can reach the scheduler
        ip = get_ip(scheduler_ip, scheduler_port)

    nannies = [t(scheduler_ip, scheduler_port, ncores=nthreads, ip=ip,
                 services=services, name=name, loop=loop,
                 memory_limit=memory_limit, **kwargs)
               for i in range(nprocs)]

    for nanny in nannies:
        nanny.start(port)
        if t is Nanny:
            global_nannies.append(nanny)

    if temp_filename:
        @gen.coroutine
        def f():
            while nannies[0].status != 'running':
                yield gen.sleep(0.01)
            import json
            msg = {'port': nannies[0].port,
                   'local_directory': nannies[0].local_dir}
            with open(temp_filename, 'w') as f:
                json.dump(msg, f)

        loop.add_callback(f)

    loop.start()
    logger.info("End worker")
    loop.close()

    loop2 = IOLoop()

    @gen.coroutine
    def f():
        scheduler = rpc(ip=nannies[0].scheduler.ip,
                        port=nannies[0].scheduler.port)
        if not no_nanny:
            yield gen.with_timeout(
                timedelta(seconds=2),
                All([scheduler.unregister(address=n.worker_address, close=True)
                     for n in nannies if n.process and n.worker_port]),
                io_loop=loop2)

    loop2.run_sync(f)

    if not no_nanny:
        for n in nannies:
            n.process.terminate()

    if not no_nanny:
        start = time()
        while (any(isalive(n.process) for n in nannies) and
               time() < start + 1):
            sleep(0.1)

    for nanny in nannies:
        nanny.stop()

def main(scheduler, host, worker_port, http_port, nanny_port, nthreads, nprocs,
         nanny, name, memory_limit, pid_file, temp_filename, reconnect,
         resources, bokeh, bokeh_port):
    if nanny:
        port = nanny_port
    else:
        port = worker_port

    try:
        scheduler_host, scheduler_port = scheduler.split(':')
        scheduler_ip = socket.gethostbyname(scheduler_host)
        scheduler_port = int(scheduler_port)
    except (IndexError, ValueError):
        logger.info("Usage: dask-worker scheduler_host:scheduler_port")

    if nprocs > 1 and worker_port != 0:
        logger.error("Failed to launch worker. You cannot use the --port "
                     "argument when nprocs > 1.")
        exit(1)

    if nprocs > 1 and name:
        logger.error("Failed to launch worker. You cannot use the --name "
                     "argument when nprocs > 1.")
        exit(1)

    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {('http', http_port): HTTPWorker}

    if bokeh:
        try:
            from distributed.bokeh.worker import BokehWorker
        except ImportError:
            pass
        else:
            services[('bokeh', bokeh_port)] = BokehWorker

    if resources:
        resources = resources.replace(',', ' ').split()
        resources = dict(pair.split('=') for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    if nanny:
        kwargs = {'worker_port': worker_port}
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs['service_ports'] = {'nanny': nanny_port}
        t = Worker

    if host is not None:
        ip = socket.gethostbyname(host)
    else:
        # look up the ip address of a local interface on a network that
        # can reach the scheduler
        ip = get_ip(scheduler_ip, scheduler_port)

    nannies = [t(scheduler_ip, scheduler_port, ncores=nthreads, ip=ip,
                 services=services, name=name, loop=loop, resources=resources,
                 memory_limit=memory_limit, reconnect=reconnect, **kwargs)
               for i in range(nprocs)]

    for n in nannies:
        n.start(port)
        if t is Nanny:
            global_nannies.append(n)

    if temp_filename:
        @gen.coroutine
        def f():
            while nannies[0].status != 'running':
                yield gen.sleep(0.01)
            import json
            msg = {'port': nannies[0].port,
                   'local_directory': nannies[0].local_dir}
            with open(temp_filename, 'w') as f:
                json.dump(msg, f)

        loop.add_callback(f)

    @gen.coroutine
    def run():
        while all(n.status != 'closed' for n in nannies):
            yield gen.sleep(0.2)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
        loop.close()

    loop2 = IOLoop()

    @gen.coroutine
    def f():
        scheduler = rpc(ip=nannies[0].scheduler.ip,
                        port=nannies[0].scheduler.port)
        if nanny:
            yield gen.with_timeout(
                timedelta(seconds=2),
                All([scheduler.unregister(address=n.worker_address, close=True)
                     for n in nannies if n.process and n.worker_port]),
                io_loop=loop2)

    loop2.run_sync(f)

    if nanny:
        for n in nannies:
            if isalive(n.process):
                n.process.terminate()

    if nanny:
        start = time()
        while (any(isalive(n.process) for n in nannies) and
               time() < start + 1):
            sleep(0.1)

    for nanny in nannies:
        nanny.stop()