def test_broken_worker_during_computation(c, s, a, b):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
    assert time() < start + 5

    L = c.map(inc, range(256))
    for i in range(8):
        L = c.map(add, *zip(*partition_all(2, L)))

    from random import random
    yield gen.sleep(random() / 2)
    with ignoring(OSError):
        n.process.terminate()
    yield gen.sleep(random() / 2)
    with ignoring(OSError):
        n.process.terminate()

    result = yield c._gather(L)
    assert isinstance(result[0], int)

    yield n._close()
def test_bokeh():
    pytest.importorskip('bokeh')

    try:
        proc = Popen(['dscheduler'], stdout=PIPE, stderr=PIPE)
        e = Executor('127.0.0.1:%d' % Scheduler.default_port)

        while True:
            line = proc.stderr.readline()
            if b'Bokeh UI' in line:
                break

        start = time()
        while True:
            try:
                for name in [socket.gethostname(), 'localhost',
                             '127.0.0.1', get_ip()]:
                    response = requests.get('http://%s:8787/status/' % name)
                    assert response.ok
                break
            except:
                sleep(0.1)
                assert time() < start + 5
    finally:
        with ignoring(Exception):
            e.shutdown()
        with ignoring(Exception):
            os.kill(proc.pid, signal.SIGINT)
def test_nanny_worker_ports(loop):
    try:
        worker = Popen(['dworker', '127.0.0.1:8989', '--host', '127.0.0.1',
                        '--worker-port', '8788', '--nanny-port', '8789'],
                       stdout=PIPE, stderr=PIPE)
        sched = Popen(['dscheduler', '--port', '8989'],
                      stdout=PIPE, stderr=PIPE)
        with Executor('127.0.0.1:8989', loop=loop) as e:
            start = time()
            while True:
                d = sync(e.loop, e.scheduler.identity)
                if d['workers']:
                    break
                else:
                    assert time() - start < 5
                    sleep(0.1)
            assert d['workers']['127.0.0.1:8788']['services']['nanny'] == 8789
    finally:
        with ignoring(Exception):
            w = rpc('127.0.0.1:8789')
            sync(loop, w.terminate)
        with ignoring(Exception):
            os.kill(sched.pid, signal.SIGINT)
        with ignoring(Exception):
            worker.kill()
def test_bokeh_non_standard_ports():
    pytest.importorskip('bokeh')

    try:
        proc = Popen(['dscheduler', '--port', '3448',
                      '--http-port', '4824', '--bokeh-port', '4832'],
                     stdout=PIPE, stderr=PIPE)
        e = Executor('127.0.0.1:3448')

        while True:
            line = proc.stderr.readline()
            if b'Bokeh UI' in line:
                break

        start = time()
        while True:
            try:
                response = requests.get('http://localhost:4832/status/')
                assert response.ok
                break
            except:
                sleep(0.1)
                assert time() < start + 5
    finally:
        with ignoring(Exception):
            e.shutdown()
        with ignoring(Exception):
            os.kill(proc.pid, signal.SIGINT)
def cluster(nworkers=2):
    _port[0] += 1
    cport = _port[0]
    center = Process(target=run_center, args=(cport,))
    workers = []
    for i in range(nworkers):
        _port[0] += 1
        port = _port[0]
        proc = Process(target=run_worker, args=(port, cport),
                       kwargs={'ncores': 1})
        workers.append({'port': port, 'proc': proc})

    center.start()
    for worker in workers:
        worker['proc'].start()

    sock = connect_sync('127.0.0.1', cport)
    while True:
        write_sync(sock, {'op': 'ncores'})
        ncores = read_sync(sock)
        if len(ncores) == nworkers:
            break

    try:
        yield {'proc': center, 'port': cport}, workers
    finally:
        for port in [cport] + [w['port'] for w in workers]:
            with ignoring(socket.error):
                sock = connect_sync('127.0.0.1', port)
                write_sync(sock, dict(op='terminate', close=True))
                response = read_sync(sock)
                sock.close()
        for proc in [center] + [w['proc'] for w in workers]:
            with ignoring(Exception):
                proc.terminate()
def cluster(nworkers=2): _port[0] += 1 cport = _port[0] center = Process(target=run_center, args=(cport,)) workers = [] for i in range(nworkers): _port[0] += 1 port = _port[0] proc = Process(target=run_worker, args=(port, cport), kwargs={"ncores": 1}) workers.append({"port": port, "proc": proc}) center.start() for worker in workers: worker["proc"].start() sock = connect_sync("127.0.0.1", cport) while True: write_sync(sock, {"op": "ncores"}) ncores = read_sync(sock) if len(ncores) == nworkers: break try: yield {"proc": center, "port": cport}, workers finally: for port in [cport] + [w["port"] for w in workers]: with ignoring(socket.error): sock = connect_sync("127.0.0.1", port) write_sync(sock, dict(op="terminate", close=True)) response = read_sync(sock) sock.close() for proc in [center] + [w["proc"] for w in workers]: with ignoring(Exception): proc.terminate()
def test_bokeh():
    pytest.importorskip('bokeh')

    try:
        proc = Popen(['dscheduler'], stdout=PIPE, stderr=PIPE)
        e = Executor('127.0.0.1:%d' % Scheduler.default_port)

        while True:
            line = proc.stderr.readline()
            if b'Start Bokeh UI' in line:
                break

        start = time()
        while True:
            try:
                for name in [socket.gethostname(), 'localhost',
                             '127.0.0.1', get_ip()]:
                    response = requests.get('http://%s:8787/status/' % name)
                    assert response.ok
                break
            except:
                sleep(0.1)
                assert time() < start + 5
    finally:
        with ignoring(Exception):
            e.shutdown()
        with ignoring(Exception):
            os.kill(proc.pid, signal.SIGINT)
def test_broken_worker_during_computation(c, s, a, b):
    s.allowed_failures = 100
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
    assert time() < start + 5

    L = c.map(inc, range(256))
    for i in range(8):
        L = c.map(add, *zip(*partition_all(2, L)))

    from random import random
    yield gen.sleep(random() / 2)
    with ignoring(CommClosedError):  # comm will be closed abruptly
        yield c._run(os._exit, 1, workers=[n.worker_address])

    yield gen.sleep(random() / 2)
    # perhaps the new worker can't be contacted yet
    with ignoring(CommClosedError, EnvironmentError):
        yield c._run(os._exit, 1, workers=[n.worker_address])

    result = yield c._gather(L)
    assert isinstance(result[0], int)

    yield n._close()
def test_no_bokeh():
    pytest.importorskip('bokeh')
    try:
        proc = Popen(['dscheduler', '--no-bokeh'], stdout=PIPE, stderr=PIPE)
        e = Executor('127.0.0.1:%d' % Scheduler.default_port)
        for i in range(3):
            assert b'bokeh' not in next(proc.stderr)
    finally:
        with ignoring(Exception):
            e.shutdown()
        with ignoring(Exception):
            os.kill(proc.pid, signal.SIGINT)
async def start_scheduler(gateway, security, exit_on_failure=True):
    loop = IOLoop.current()
    plugin = GatewaySchedulerPlugin(gateway, loop)

    services = {("gateway", 0): (GatewaySchedulerService, {"plugin": plugin})}
    dashboard = False
    with ignoring(ImportError):
        from distributed.dashboard.scheduler import BokehScheduler

        services[("dashboard", 0)] = (BokehScheduler, {})
        dashboard = True

    scheduler = Scheduler(loop=loop, services=services, security=security)
    scheduler.add_plugin(plugin)
    await scheduler

    host = urlparse(scheduler.address).hostname
    gateway_port = scheduler.services["gateway"].port
    api_address = "http://%s:%d" % (host, gateway_port)

    if dashboard:
        dashboard_port = scheduler.services["dashboard"].port
        dashboard_address = "http://%s:%d" % (host, dashboard_port)
    else:
        dashboard_address = ""

    try:
        await gateway.send_addresses(scheduler.address, dashboard_address, api_address)
    except Exception as exc:
        logger.error("Failed to send addresses to gateway", exc_info=exc)
        if exit_on_failure:
            sys.exit(1)

    return scheduler
def f():
    nn = rpc(ip=n.ip, port=n.port)
    yield n._start()
    ww = rpc(ip=n.ip, port=n.worker_port)
    yield ww.update_data(data={'x': 1, 'y': 2})
    with ignoring(StreamClosedError):
        yield ww.compute(function=sys.exit, args=(0,), key='z')

    start = time()
    while n.process.is_alive():  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - start < 2

    start = time()
    while not n.process.is_alive():  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 2

    start = time()
    while n.worker_address not in c.ncores:
        yield gen.sleep(0.01)
        assert time() - start < 2

    yield n._close()
    c.stop()
def adapt(self, **kwargs): """ Turn on adaptivity For keyword arguments see dask_drmaa.adaptive.Adaptive Examples -------- >>> cluster.adapt(minimum=0, maximum=10, interval='500ms') See Also -------- Cluster: an interface for other clusters to inherit from """ from .adaptive import Adaptive with ignoring(AttributeError): self._adaptive.stop() if not hasattr(self, '_adaptive_options'): self._adaptive_options = {} self._adaptive_options.update(kwargs) self._adaptive = Adaptive( self, self.scheduler, **self._adaptive_options ) return self._adaptive
def test_worker_who_has_clears_after_failed_connection(c, s, a, b):
    n = yield Nanny(s.address, nthreads=2, loop=s.loop)

    start = time()
    while len(s.nthreads) < 3:
        yield gen.sleep(0.01)
    assert time() < start + 5

    futures = c.map(slowinc, range(20), delay=0.01,
                    key=["f%d" % i for i in range(20)])
    yield wait(futures)

    result = yield c.submit(sum, futures, workers=a.address)
    for dep in set(a.dep_state) - set(a.task_state):
        a.release_dep(dep, report=True)

    n_worker_address = n.worker_address
    with ignoring(CommClosedError):
        yield c._run(os._exit, 1, workers=[n_worker_address])

    while len(s.workers) > 2:
        yield gen.sleep(0.01)

    total = c.submit(sum, futures, workers=a.address)
    yield total

    assert not a.has_what.get(n_worker_address)
    assert not any(n_worker_address in s for s in a.who_has.values())

    yield n.close()
def scale_cb(b):
    with log_errors():
        n = request.value
        with ignoring(AttributeError):
            self._adaptive.stop()
        self.scale(n)
        update()
def test_failed_worker_without_warning(c, s, a, b):
    L = c.map(inc, range(10))
    yield wait(L)

    original_pid = a.pid
    with ignoring(CommClosedError):
        yield c._run(os._exit, 1, workers=[a.worker_address])
    start = time()
    while a.pid == original_pid:
        yield gen.sleep(0.01)
        assert time() - start < 10

    yield gen.sleep(0.5)

    start = time()
    while len(s.ncores) < 2:
        yield gen.sleep(0.01)
        assert time() - start < 10

    yield wait(L)

    L2 = c.map(inc, range(10, 20))
    yield wait(L2)
    assert all(len(keys) > 0 for keys in s.has_what.values())
    ncores2 = dict(s.ncores)

    yield c._restart()

    L = c.map(inc, range(10))
    yield wait(L)
    assert all(len(keys) > 0 for keys in s.has_what.values())

    assert not (set(ncores2) & set(s.ncores))  # no overlap
def f():
    nn = rpc(ip=n.ip, port=n.port)
    yield n._start()
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    ww = rpc(ip=n.ip, port=n.worker_port)
    yield ww.update_data(data={'x': 1, 'y': 2})
    with ignoring(StreamClosedError):
        yield ww.compute(function=sys.exit, args=(0,), key='z')

    start = time()
    while n.process.is_alive():  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - start < 2

    start = time()
    while not n.process.is_alive():  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 2

    start = time()
    while n.worker_address not in c.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 2

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir
    c.stop()
def test_worker_who_has_clears_after_failed_connection(c, s, a, b):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
    assert time() < start + 5

    futures = c.map(slowinc, range(20), delay=0.01,
                    key=['f%d' % i for i in range(20)])
    yield wait(futures)

    result = yield c.submit(sum, futures, workers=a.address)
    for dep in set(a.dep_state) - set(a.task_state):
        a.release_dep(dep, report=True)

    n_worker_address = n.worker_address
    with ignoring(CommClosedError):
        yield c._run(os._exit, 1, workers=[n_worker_address])

    while len(s.workers) > 2:
        yield gen.sleep(0.01)

    total = c.submit(sum, futures, workers=a.address)
    yield total

    assert not a.has_what.get(n_worker_address)
    assert not any(n_worker_address in s for s in a.who_has.values())

    yield n._close()
def test_bokeh(loop):
    from distributed.http import HTTPScheduler
    import requests

    with LocalCluster(scheduler_port=0, silence_logs=False, loop=loop,
                      diagnostics_port=4724,
                      services={('http', 0): HTTPScheduler}) as c:
        start = time()
        while True:
            with ignoring(Exception):
                response = requests.get('http://127.0.0.1:%d/status/' %
                                        c.diagnostics.port)
                if response.ok:
                    break
            assert time() < start + 20
            sleep(0.01)

    start = time()
    while not raises(lambda: requests.get('http://127.0.0.1:%d/status/' % 4724)):
        assert time() < start + 10
        sleep(0.01)
def test_failed_worker_without_warning(c, s, a, b):
    L = c.map(inc, range(10))
    yield wait(L)

    original_pid = a.pid
    with ignoring(CommClosedError):
        yield c._run(os._exit, 1, workers=[a.worker_address])
    start = time()
    while a.pid == original_pid:
        yield gen.sleep(0.01)
        assert time() - start < 10

    yield gen.sleep(0.5)

    start = time()
    while len(s.nthreads) < 2:
        yield gen.sleep(0.01)
        assert time() - start < 10

    yield wait(L)

    L2 = c.map(inc, range(10, 20))
    yield wait(L2)
    assert all(len(keys) > 0 for keys in s.has_what.values())
    nthreads2 = dict(s.nthreads)

    yield c.restart()

    L = c.map(inc, range(10))
    yield wait(L)
    assert all(len(keys) > 0 for keys in s.has_what.values())

    assert not (set(nthreads2) & set(s.nthreads))  # no overlap
async def start_scheduler(
    gateway,
    security,
    adaptive_period=3,
    heartbeat_period=15,
    idle_timeout=0,
    exit_on_failure=True,
):
    loop = IOLoop.current()
    services = {
        ("gateway", 0): (
            GatewaySchedulerService,
            {
                "gateway": gateway,
                "adaptive_period": adaptive_period,
                "heartbeat_period": heartbeat_period,
                "idle_timeout": idle_timeout,
            },
        )
    }
    with ignoring(ImportError):
        from distributed.dashboard.scheduler import BokehScheduler

        services[("dashboard", 0)] = (BokehScheduler, {})
    scheduler = Scheduler(loop=loop, services=services, security=security)
    return await scheduler
def main(host, port, http_port, bokeh_port, bokeh_internal_port, show, _bokeh,
         bokeh_whitelist, prefix, use_xheaders, pid_file):
    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)
        atexit.register(del_pid_file)

    if sys.platform.startswith('linux'):
        import resource   # module fails importing on Windows
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    addr = uri_from_host_port(host, port, 8786)

    loop = IOLoop.current()
    logger.info('-' * 47)

    services = {('http', http_port): HTTPScheduler}
    if _bokeh:
        with ignoring(ImportError):
            from distributed.bokeh.scheduler import BokehScheduler
            services[('bokeh', bokeh_internal_port)] = BokehScheduler
    scheduler = Scheduler(loop=loop, services=services)
    scheduler.start(addr)

    bokeh_proc = None
    if _bokeh:
        try:
            from distributed.bokeh.application import BokehWebInterface
            bokeh_proc = BokehWebInterface(http_port=http_port,
                                           tcp_port=scheduler.port,
                                           bokeh_port=bokeh_port,
                                           bokeh_whitelist=bokeh_whitelist,
                                           show=show, prefix=prefix,
                                           use_xheaders=use_xheaders,
                                           quiet=False)
        except ImportError:
            logger.info("Please install Bokeh to get Web UI")
        except Exception as e:
            logger.warn("Could not start Bokeh web UI", exc_info=True)

    logger.info('-' * 47)
    try:
        loop.start()
        loop.close()
    finally:
        scheduler.stop()
        if bokeh_proc:
            bokeh_proc.close()

        logger.info("End scheduler at %r", addr)
def request_cb(b):
    with log_errors():
        arg = request.value
        with ignoring(AttributeError):
            self._adaptive.stop()
        local_kwargs = dict()
        local_kwargs[kwarg] = arg
        self.scale(**local_kwargs)
def cluster(nworkers=2, nanny=False):
    if nanny:
        _run_worker = run_nanny
    else:
        _run_worker = run_worker
    _port[0] += 1
    cport = _port[0]
    center = Process(target=run_center, args=(cport,))
    workers = []
    for i in range(nworkers):
        _port[0] += 1
        port = _port[0]
        proc = Process(target=_run_worker, args=(port, cport),
                       kwargs={'ncores': 1,
                               'local_dir': '_test_worker-%d' % port})
        workers.append({'port': port, 'proc': proc})

    center.start()
    for worker in workers:
        worker['proc'].start()

    sock = connect_sync('127.0.0.1', cport)
    start = time()
    try:
        while True:
            write_sync(sock, {'op': 'ncores'})
            ncores = read_sync(sock)
            if len(ncores) == nworkers:
                break
            if time() - start > 5:
                raise Exception("Timeout on cluster creation")

        yield {'proc': center, 'port': cport}, workers
    finally:
        logger.debug("Closing out test cluster")
        for port in [cport] + [w['port'] for w in workers]:
            with ignoring(socket.error):
                sock = connect_sync('127.0.0.1', port)
                write_sync(sock, dict(op='terminate', close=True))
                response = read_sync(sock)
                sock.close()
        for proc in [center] + [w['proc'] for w in workers]:
            with ignoring(Exception):
                proc.terminate()
        for fn in glob('_test_worker-*'):
            shutil.rmtree(fn)
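# Usage sketch (assumption): the yield/try/finally shape of `cluster` above
# suggests it is meant to be used as a context manager, e.g. via
# contextlib.contextmanager. The wrapping and the test function below are
# illustrative, not the library's own API; connect_sync/write_sync/read_sync
# are the same helpers the fixture itself uses.
from contextlib import contextmanager

cluster_ctx = contextmanager(cluster)

def test_cluster_comes_up():
    with cluster_ctx(nworkers=2, nanny=True) as (center, workers):
        sock = connect_sync('127.0.0.1', center['port'])
        write_sync(sock, {'op': 'ncores'})
        ncores = read_sync(sock)
        assert len(ncores) == 2  # both workers registered with the center
        sock.close()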
def stop_workers(self, workers):
    if not workers:
        return
    workers = list(map(int, workers))
    jobs = [self.jobs[w] for w in workers]
    self._call([self._cancelcmd] + list(jobs))
    for w in workers:
        with ignoring(KeyError):
            del self.jobs[w]
def stop_workers(self, workers): """ Stop a list of workers""" if not workers: return workers = list(map(int, workers)) jobs = [self.jobs[w] for w in workers] self._call([self.cancel_command] + list(jobs)) for w in workers: with ignoring(KeyError): del self.jobs[w]
def g():
    c = Center('127.0.0.1', 8017)
    c.listen(c.port)
    a = Worker('127.0.0.1', 8018, c.ip, c.port, ncores=1)
    yield a._start()
    b = Worker('127.0.0.1', 8019, c.ip, c.port, ncores=1)
    yield b._start()

    while len(c.ncores) < 2:
        yield gen.sleep(0.01)

    try:
        yield f(c, a, b)
    finally:
        with ignoring(Exception):
            yield a._close()
        with ignoring(Exception):
            yield b._close()
        c.stop()
def g():
    c = Center('127.0.0.1', 8017)
    c.listen(c.port)
    a = Worker('127.0.0.1', 8018, c.ip, c.port, ncores=2)
    yield a._start()
    b = Worker('127.0.0.1', 8019, c.ip, c.port, ncores=1)
    yield b._start()

    while len(c.ncores) < 2:
        yield gen.sleep(0.01)

    try:
        yield f(c, a, b)
    finally:
        with ignoring(Exception):
            yield a._close()
        with ignoring(Exception):
            yield b._close()
        c.stop()
def cluster(nworkers=2, nanny=False): if nanny: _run_worker = run_nanny else: _run_worker = run_worker _port[0] += 1 cport = _port[0] center = Process(target=run_center, args=(cport,)) workers = [] for i in range(nworkers): _port[0] += 1 port = _port[0] proc = Process(target=_run_worker, args=(port, cport), kwargs={"ncores": 1}) workers.append({"port": port, "proc": proc}) center.start() for worker in workers: worker["proc"].start() sock = connect_sync("127.0.0.1", cport) start = time() try: while True: write_sync(sock, {"op": "ncores"}) ncores = read_sync(sock) if len(ncores) == nworkers: break if time() - start > 5: raise Exception("Timeout on cluster creation") yield {"proc": center, "port": cport}, workers finally: logger.debug("Closing out test cluster") for port in [cport] + [w["port"] for w in workers]: with ignoring(socket.error): sock = connect_sync("127.0.0.1", port) write_sync(sock, dict(op="terminate", close=True)) response = read_sync(sock) sock.close() for proc in [center] + [w["proc"] for w in workers]: with ignoring(Exception): proc.terminate()
def test_active_holds_tasks(e, s, w):
    future = e.submit(slowinc, 1, delay=0.2)
    yield gen.sleep(0.1)
    assert future.key in w.active
    yield future._result()
    assert future.key not in w.active

    future = e.submit(throws, 1)
    with ignoring(Exception):
        yield _wait([future])
    assert not w.active
def _close(self):
    if self.status == 'closed':
        return

    logging.info('Stopping workers...')
    self.workers.close()

    with ignoring(gen.TimeoutError, CommClosedError, OSError):
        logging.info('Stopping scheduler...')
        yield self.scheduler.close(fast=True)

    self.status = 'closed'
def adapt(self, **kwargs):
    with ignoring(AttributeError):
        self._adaptive.stop()
    if not hasattr(self, '_adaptive_options'):
        self._adaptive_options = {}
    self._adaptive_options.update(kwargs)
    self._adaptive = _ImprovedAdaptive(self.scheduler, self,
                                       **self._adaptive_options)
    return self._adaptive
def test_broken_worker_during_computation(c, s, a, b):
    s.allowed_failures = 100
    n = Nanny(s.address, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
    assert time() < start + 5

    N = 256
    expected_result = N * (N + 1) // 2
    i = 0
    L = c.map(inc, range(N), key=["inc-%d-%d" % (i, j) for j in range(N)])
    while len(L) > 1:
        i += 1
        L = c.map(
            slowadd,
            *zip(*partition_all(2, L)),
            key=["add-%d-%d" % (i, j) for j in range(len(L) // 2)]
        )

    yield gen.sleep(random.random() / 20)
    with ignoring(CommClosedError):  # comm will be closed abruptly
        yield c._run(os._exit, 1, workers=[n.worker_address])

    yield gen.sleep(random.random() / 20)
    while len(s.workers) < 3:
        yield gen.sleep(0.01)

    with ignoring(
        CommClosedError, EnvironmentError
    ):  # perhaps new worker can't be contacted yet
        yield c._run(os._exit, 1, workers=[n.worker_address])

    [result] = yield c.gather(L)
    assert isinstance(result, int)
    assert result == expected_result

    yield n.close()
def test_stress_scatter_death(c, s, *workers):
    import random

    s.allowed_failures = 1000
    np = pytest.importorskip("numpy")
    L = yield c.scatter([np.random.random(10000) for i in range(len(workers))])
    yield c.replicate(L, n=2)

    adds = [
        delayed(slowadd, pure=True)(
            random.choice(L),
            random.choice(L),
            delay=0.05,
            dask_key_name="slowadd-1-%d" % i,
        )
        for i in range(50)
    ]

    adds = [
        delayed(slowadd, pure=True)(a, b, delay=0.02, dask_key_name="slowadd-2-%d" % i)
        for i, (a, b) in enumerate(sliding_window(2, adds))
    ]

    futures = c.compute(adds)
    L = adds = None

    alive = list(workers)

    from distributed.scheduler import logger

    for i in range(7):
        yield gen.sleep(0.1)
        try:
            s.validate_state()
        except Exception as exc:  # don't shadow the client `c`
            logger.exception(exc)
            if config.get("log-on-err"):
                import pdb

                pdb.set_trace()
            else:
                raise

        w = random.choice(alive)
        yield w.close()
        alive.remove(w)

    with ignoring(CancelledError):
        yield c.gather(futures)

    futures = None
def g(): c = Center("127.0.0.1", 8017) c.listen(c.port) a = Worker("127.0.0.2", 8018, c.ip, c.port, ncores=2) yield a._start() b = Worker("127.0.0.3", 8019, c.ip, c.port, ncores=1) yield b._start() start = time() try: while len(c.ncores) < 2: yield gen.sleep(0.01) if time() - start > 5: raise Exception("Cluster creation timeout") yield f(c, a, b) finally: logger.debug("Closing out test cluster") with ignoring(): yield a._close() with ignoring(): yield b._close() c.stop()
def adapt(self, minimum_cores=None, maximum_cores=None,
          minimum_memory=None, maximum_memory=None, **kwargs):
    """ Turn on adaptivity

    For keyword arguments see dask.distributed.Adaptive.

    Instead of the ``minimum`` and ``maximum`` parameters, which apply to
    the number of workers, one can use the following parameters if the
    Cluster object implements a ``jobqueue_worker_spec`` attribute:

    Parameters
    ----------
    minimum_cores: int
        Minimum number of cores for the cluster
    maximum_cores: int
        Maximum number of cores for the cluster
    minimum_memory: str
        Minimum amount of memory for the cluster
    maximum_memory: str
        Maximum amount of memory for the cluster

    Examples
    --------
    >>> cluster.adapt(minimum=0, maximum=10, interval='500ms')
    >>> cluster.adapt(minimum_cores=24, maximum_cores=96)
    >>> cluster.adapt(minimum_memory='60 GB', maximum_memory='1 TB')
    """
    with ignoring(AttributeError):
        self._adaptive.stop()
    if not hasattr(self, "_adaptive_options"):
        self._adaptive_options = {}
    if "minimum" not in kwargs:
        if minimum_cores is not None:
            kwargs["minimum"] = self._get_nb_workers_from_cores(minimum_cores)
        elif minimum_memory is not None:
            kwargs["minimum"] = self._get_nb_workers_from_memory(minimum_memory)
    if "maximum" not in kwargs:
        if maximum_cores is not None:
            kwargs["maximum"] = self._get_nb_workers_from_cores(maximum_cores)
        elif maximum_memory is not None:
            kwargs["maximum"] = self._get_nb_workers_from_memory(maximum_memory)
    self._adaptive_options.update(kwargs)
    self._adaptive = Adaptive(self.scheduler, self, **self._adaptive_options)
    return self._adaptive
def test_broken_worker_during_computation(c, s, a, b):
    s.allowed_failures = 100
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
    assert time() < start + 5

    N = 256
    expected_result = N * (N + 1) // 2
    i = 0
    L = c.map(inc, range(N), key=['inc-%d-%d' % (i, j) for j in range(N)])
    while len(L) > 1:
        i += 1
        L = c.map(slowadd, *zip(*partition_all(2, L)),
                  key=['add-%d-%d' % (i, j) for j in range(len(L) // 2)])

    yield gen.sleep(random.random() / 20)
    with ignoring(CommClosedError):  # comm will be closed abruptly
        yield c._run(os._exit, 1, workers=[n.worker_address])

    yield gen.sleep(random.random() / 20)
    while len(s.workers) < 3:
        yield gen.sleep(0.01)

    # perhaps the new worker can't be contacted yet
    with ignoring(CommClosedError, EnvironmentError):
        yield c._run(os._exit, 1, workers=[n.worker_address])

    [result] = yield c.gather(L)
    assert isinstance(result, int)
    assert result == expected_result

    yield n._close()
def test_start_diagnostics(loop):
    from distributed.http import HTTPScheduler
    import requests

    with LocalCluster(scheduler_port=0, silence_logs=False, loop=loop) as c:
        c.start_diagnostics_server(show=False, port=3748)

        start = time()
        while True:
            with ignoring(Exception):
                response = requests.get('http://127.0.0.1:%d/status/' %
                                        c.diagnostics.port)
                if response.ok:
                    break
            assert time() < start + 20
            sleep(0.01)
def scheduler():  # pragma: nocover
    app_client = skein.ApplicationClient.from_current()

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if sys.platform.startswith('linux'):
        import resource   # module fails importing on Windows
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    addr = 'tcp://'
    loop = IOLoop.current()

    services = {}
    bokeh = False
    with ignoring(ImportError):
        try:
            from distributed.dashboard.scheduler import BokehScheduler
        except ImportError:
            # Old import location
            from distributed.bokeh.scheduler import BokehScheduler
        services[('bokeh', 0)] = (BokehScheduler, {})
        bokeh = True

    scheduler = Scheduler(loop=loop, services=services)
    scheduler.start(addr)

    install_signal_handlers(loop)

    # Set dask.dashboard before dask.scheduler since the YarnCluster object
    # waits on dask.scheduler only
    if bokeh:
        bokeh_port = scheduler.services['bokeh'].port
        bokeh_host = urlparse(scheduler.address).hostname
        bokeh_address = 'http://%s:%d' % (bokeh_host, bokeh_port)
        app_client.kv['dask.dashboard'] = bokeh_address.encode()

    app_client.kv['dask.scheduler'] = scheduler.address.encode()

    try:
        loop.start()
        loop.close()
    finally:
        scheduler.stop()
def test_BokehWebInterface(loop):
    with LocalCluster(2, loop=loop, scheduler_port=0,
                      services={('http', 0): HTTPScheduler}) as c:
        w = BokehWebInterface(tcp_port=c.scheduler.port,
                              http_port=c.scheduler.services['http'].port,
                              bokeh_port=8787)
        start = time()
        while True:
            with ignoring(Exception):
                response = requests.get('http://127.0.0.1:8787/status/')
                if response.ok:
                    break
            assert time() < start + 5
            sleep(0.01)
def test_start_diagnostics(loop):
    pytest.importorskip('bokeh')
    from distributed.http import HTTPScheduler
    import requests

    with LocalCluster(scheduler_port=0, silence_logs=False, loop=loop,
                      diagnostics_port=None) as c:
        c.start_diagnostics_server(show=False, port=3748)

        start = time()
        while True:
            with ignoring(Exception):
                response = requests.get('http://127.0.0.1:%d/status/' %
                                        c.diagnostics.port)
                if response.ok:
                    break
            assert time() < start + 20
            sleep(0.01)
def main():
    app_client = skein.ApplicationClient.from_current()

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if sys.platform.startswith('linux'):
        import resource   # module fails importing on Windows
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    addr = uri_from_host_port('', None, 0)

    loop = IOLoop.current()

    services = {}
    bokeh = False
    with ignoring(ImportError):
        from distributed.bokeh.scheduler import BokehScheduler
        services[('bokeh', 0)] = (BokehScheduler, {})
        bokeh = True

    scheduler = Scheduler(loop=loop, services=services)
    scheduler.start(addr)

    install_signal_handlers(loop)

    app_client.kv['dask.scheduler'] = scheduler.address.encode()

    if bokeh:
        bokeh_port = scheduler.services['bokeh'].port
        bokeh_host = urlparse(scheduler.address).hostname
        bokeh_address = 'http://%s:%d' % (bokeh_host, bokeh_port)
        app_client.kv['dask.dashboard'] = bokeh_address.encode()

    try:
        loop.start()
        loop.close()
    finally:
        scheduler.stop()
def test_BokehWebInterface(loop):
    with LocalCluster(2, loop=loop, scheduler_port=0,
                      services={('http', 0): HTTPScheduler},
                      diagnostics_port=None) as c:
        with pytest.raises(Exception):
            response = requests.get('http://127.0.0.1:8787/status/')
        with BokehWebInterface(scheduler_address=c.scheduler.address,
                               http_port=c.scheduler.services['http'].port,
                               bokeh_port=8787) as w:
            start = time()
            while True:
                with ignoring(Exception):
                    response = requests.get('http://127.0.0.1:8787/status/')
                    if response.ok:
                        break
                assert time() < start + 5
                sleep(0.01)
        with pytest.raises(Exception):
            response = requests.get('http://127.0.0.1:8787/status/')
def test_nanny_process_failure(c, s):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    yield n._start()
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    original_address = n.worker_address
    ww = rpc(n.worker_address)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    pid = n.pid
    assert pid is not None
    with ignoring(CommClosedError):
        yield c._run(os._exit, 0, workers=[n.worker_address])

    start = time()
    while n.pid == pid:  # wait while process dies and comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while not n.is_alive():  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    # assert n.worker_address != original_address  # most likely

    start = time()
    while n.worker_address not in s.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 5

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir

    ww.close_rpc()
    s.stop()
def test_nanny_process_failure():
    c = Center('127.0.0.1')
    c.listen(0)
    n = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')
    yield n._start()
    nn = rpc(ip=n.ip, port=n.port)
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    ww = rpc(ip=n.ip, port=n.worker_port)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    with ignoring(StreamClosedError):
        yield ww.compute(function=dumps(sys.exit), args=dumps((0,)), key='z')

    start = time()
    while n.process.is_alive():  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - start < 2

    start = time()
    while not n.process.is_alive():  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 2

    start = time()
    while n.worker_address not in c.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 2

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir

    nn.close_streams()
    c.stop()
def test_nanny_process_failure(s):
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start()
    nn = rpc(ip=n.ip, port=n.port)
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    original_process = n.process
    ww = rpc(ip=n.ip, port=n.worker_port)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    with ignoring(StreamClosedError):
        yield ww.compute(function=dumps(sys.exit), args=dumps((0,)), key='z')

    start = time()
    while n.process is original_process:  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while n.process.poll() is not None:  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while n.worker_address not in s.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 5

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir

    nn.close_streams()
    s.stop()
def test_nanny_process_failure(c, s):
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start()
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    original_process = n.process
    ww = rpc(ip=n.ip, port=n.worker_port)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    with ignoring(StreamClosedError):
        yield c._run(sys.exit, 0, workers=[n.worker_address])

    start = time()
    while n.process is original_process:  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while not isalive(n.process):  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while n.worker_address not in s.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 5

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir

    ww.close_rpc()
    s.stop()
def g():
    c = Center('127.0.0.1', 8017)
    c.listen(c.port)
    a = Worker('127.0.0.2', 8018, c.ip, c.port, ncores=2)
    yield a._start()
    b = Worker('127.0.0.3', 8019, c.ip, c.port, ncores=1)
    yield b._start()

    start = time()
    try:
        while len(c.ncores) < 2:
            yield gen.sleep(0.01)
            if time() - start > 5:
                raise Exception("Cluster creation timeout")

        yield f(c, a, b)
    finally:
        logger.debug("Closing out test cluster")

        for w in [a, b]:
            with ignoring(TimeoutError, StreamClosedError):
                yield w._close()
            if os.path.exists(w.local_dir):
                shutil.rmtree(w.local_dir)

        c.stop()
def test_non_anonymous_access():
    with ignoring(NoCredentialsError):
        fs = S3FileSystem(anon=False)
        fs.ls('distributed-test')
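# For reference, a minimal sketch of the `ignoring` helper that every snippet
# above relies on. This matches its documented behavior (suppress only the
# named exception types, equivalent to contextlib.suppress in modern Python);
# the exact upstream implementation may differ.
import os
from contextlib import contextmanager

@contextmanager
def ignoring(*exceptions):
    try:
        yield
    except exceptions:
        pass  # swallow only the requested exception types

# Example: a failed cleanup step must not mask the actual test outcome.
with ignoring(OSError):
    os.remove('/tmp/nonexistent-scratch-file')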