def __init__(self, device_memory_limit=None, memory_limit=None, local_dir="dask-worker-space"):
    path = os.path.join(local_dir, "storage")

    self.host_func = dict()
    self.disk_func = Func(
        partial(serialize_bytes, on_error="raise"), deserialize_bytes, File(path)
    )
    self.host_buffer = Buffer(self.host_func, self.disk_func, memory_limit, weight=weight)

    self.device_func = dict()
    self.device_host_func = Func(_serialize_if_device, _deserialize_if_device, self.host_buffer)
    self.device_buffer = Buffer(
        self.device_func, self.device_host_func, device_memory_limit, weight=weight
    )

    self.device = self.device_buffer.fast.d
    self.host = self.host_buffer.fast.d
    self.disk = self.host_buffer.slow.d

    # For Worker compatibility only, where `fast` is host memory buffer
    self.fast = self.host_buffer.fast
def __init__(
    self,
    device_memory_limit=None,
    memory_limit=None,
    local_directory="dask-worker-space",
):
    path = os.path.join(local_directory, "storage")

    self.host_func = dict()
    self.disk_func = Func(serialize_bytelist, deserialize_bytes, File(path))
    self.host_buffer = Buffer(self.host_func, self.disk_func, memory_limit, weight=weight)

    self.device_keys = set()
    self.device_func = dict()
    self.device_host_func = Func(device_to_host, host_to_device, self.host_buffer)
    self.device_buffer = Buffer(
        self.device_func, self.device_host_func, device_memory_limit, weight=weight
    )

    self.device = self.device_buffer.fast.d
    self.host = self.host_buffer.fast.d
    self.disk = self.host_buffer.slow.d

    # For Worker compatibility only, where `fast` is host memory buffer
    self.fast = self.host_buffer.fast
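# A minimal, self-contained sketch of the host -> disk spilling pattern that the
# __init__ variants above compose with zict. It is an illustration only: pickle
# stands in for the real dask serializers (serialize_bytelist / deserialize_bytes),
# sys.getsizeof stands in for the real weight function, and the limit and the
# temporary path are arbitrary assumptions.
import os
import pickle
import sys
import tempfile

from zict import Buffer, File, Func

path = os.path.join(tempfile.mkdtemp(), "storage")

host = dict()                                        # fast tier: plain in-memory mapping
disk = Func(pickle.dumps, pickle.loads, File(path))  # slow tier: serialized files on disk

# Keys move from `host` to `disk` once their combined weight exceeds the limit `n`.
spill = Buffer(host, disk, n=1024, weight=lambda k, v: sys.getsizeof(v))

spill["x"] = b"a" * 2000   # heavier than the fast limit, lands directly in `disk`
spill["y"] = 1             # small value stays in `host`
assert "x" in disk and "y" in host
assert spill["x"] == b"a" * 2000  # reads go through deserialization transparently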
def test_simple():
    a = dict()
    b = dict()
    buff = Buffer(a, b, n=10, weight=lambda k, v: v)

    buff["x"] = 1
    buff["y"] = 2

    assert buff["x"] == 1
    assert buff["y"] == 2
    assert a == {"x": 1, "y": 2}
    assert buff.fast.total_weight == 3

    buff["z"] = 8
    assert a == {"y": 2, "z": 8}
    assert b == {"x": 1}

    assert buff["x"] == 1
    assert a == {"x": 1, "z": 8}
    assert b == {"y": 2}

    assert "x" in buff
    assert "y" in buff
    assert "missing" not in buff

    buff["y"] = 1
    assert a == {"x": 1, "y": 1, "z": 8}
    assert buff.fast.total_weight == 10
    assert b == {}

    del buff["z"]
    assert a == {"x": 1, "y": 1}
    assert buff.fast.total_weight == 2
    assert b == {}

    del buff["y"]
    assert a == {"x": 1}
    assert buff.fast.total_weight == 1
    assert b == {}

    assert "y" not in buff

    buff["a"] = 5
    assert set(buff) == set(buff.keys()) == {"a", "x"}

    fast_keys = set(buff.fast)
    slow_keys = set(buff.slow)
    assert not (fast_keys & slow_keys)
    assert fast_keys | slow_keys == set(buff)

    # Overweight element stays in slow mapping
    buff["b"] = 1000
    assert "b" in buff.slow
    assert set(buff.fast) == fast_keys
    assert set(buff.slow) == {"b"} | slow_keys

    assert "b" in buff
    assert buff["b"] == 1000
def test_simple():
    a = dict()
    b = dict()
    buff = Buffer(a, b, n=10, weight=lambda k, v: v)

    buff['x'] = 1
    buff['y'] = 2

    assert buff['x'] == 1
    assert buff['y'] == 2
    assert a == {'x': 1, 'y': 2}
    assert buff.fast.total_weight == 3

    buff['z'] = 8
    assert a == {'y': 2, 'z': 8}
    assert b == {'x': 1}

    assert buff['x'] == 1
    assert a == {'x': 1, 'z': 8}
    assert b == {'y': 2}

    assert 'x' in buff
    assert 'y' in buff
    assert 'missing' not in buff

    buff['y'] = 1
    assert a == {'x': 1, 'y': 1, 'z': 8}
    assert buff.fast.total_weight == 10
    assert b == {}

    del buff['z']
    assert a == {'x': 1, 'y': 1}
    assert buff.fast.total_weight == 2
    assert b == {}

    del buff['y']
    assert a == {'x': 1}
    assert buff.fast.total_weight == 1
    assert b == {}

    assert 'y' not in buff

    buff['a'] = 5
    assert set(buff) == set(buff.keys()) == {'a', 'x'}

    fast_keys = set(buff.fast)
    slow_keys = set(buff.slow)
    assert not (fast_keys & slow_keys)
    assert fast_keys | slow_keys == set(buff)

    # Overweight element stays in slow mapping
    buff['b'] = 1000
    assert 'b' in buff.slow
    assert set(buff.fast) == fast_keys
    assert set(buff.slow) == {'b'} | slow_keys

    assert 'b' in buff
    assert buff['b'] == 1000
def __init__(
    self,
    device_memory_limit=None,
    memory_limit=None,
    local_directory=None,
    log_spilling=False,
):
    self.disk_func_path = os.path.join(
        local_directory or dask.config.get("temporary-directory") or os.getcwd(),
        "dask-worker-space",
        "storage",
    )
    os.makedirs(self.disk_func_path, exist_ok=True)

    self.host_func = dict()
    self.disk_func = Func(
        functools.partial(serialize_bytelist, on_error="raise"),
        deserialize_bytes,
        File(self.disk_func_path),
    )

    host_buffer_kwargs = {}
    device_buffer_kwargs = {}
    buffer_class = Buffer
    if log_spilling is True:
        buffer_class = LoggedBuffer
        host_buffer_kwargs = {"fast_name": "Host", "slow_name": "Disk"}
        device_buffer_kwargs = {"fast_name": "Device", "slow_name": "Host"}

    if memory_limit == 0:
        self.host_buffer = self.host_func
    else:
        self.host_buffer = buffer_class(
            self.host_func,
            self.disk_func,
            memory_limit,
            weight=lambda k, v: safe_sizeof(v),
            **host_buffer_kwargs,
        )

    self.device_keys = set()
    self.device_func = dict()
    self.device_host_func = Func(device_to_host, host_to_device, self.host_buffer)
    self.device_buffer = Buffer(
        self.device_func,
        self.device_host_func,
        device_memory_limit,
        weight=lambda k, v: safe_sizeof(v),
        **device_buffer_kwargs,
    )

    self.device = self.device_buffer.fast.d
    self.host = self.host_buffer if memory_limit == 0 else self.host_buffer.fast.d
    self.disk = None if memory_limit == 0 else self.host_buffer.slow.d

    # For Worker compatibility only, where `fast` is host memory buffer
    self.fast = self.host_buffer if memory_limit == 0 else self.host_buffer.fast
def __init__(
    self,
    device_memory_limit=None,
    memory_limit=None,
    local_directory=None,
    jit_unspill=False,
):
    if local_directory is None:
        local_directory = dask.config.get("temporary-directory") or os.getcwd()

    if not os.path.exists(local_directory):
        os.makedirs(local_directory, exist_ok=True)
    local_directory = os.path.join(local_directory, "dask-worker-space")
    self.disk_func_path = os.path.join(local_directory, "storage")

    self.host_func = dict()
    self.disk_func = Func(
        functools.partial(serialize_bytelist, on_error="raise"),
        deserialize_bytes,
        File(self.disk_func_path),
    )
    if memory_limit == 0:
        self.host_buffer = self.host_func
    else:
        self.host_buffer = Buffer(
            self.host_func, self.disk_func, memory_limit, weight=weight
        )

    self.device_keys = set()
    self.device_func = dict()
    if jit_unspill:
        self.device_host_func = Func(
            pxy_obj_device_to_host, pxy_obj_host_to_device, self.host_buffer
        )
    else:
        self.device_host_func = Func(device_to_host, host_to_device, self.host_buffer)
    self.device_buffer = Buffer(
        self.device_func, self.device_host_func, device_memory_limit, weight=weight
    )

    self.device = self.device_buffer.fast.d
    self.host = self.host_buffer if memory_limit == 0 else self.host_buffer.fast.d
    self.disk = None if memory_limit == 0 else self.host_buffer.slow.d

    # For Worker compatibility only, where `fast` is host memory buffer
    self.fast = self.host_buffer if memory_limit == 0 else self.host_buffer.fast
def test_mapping():
    """
    Test mapping interface for Buffer().
    """
    a = {}
    b = {}
    buff = Buffer(a, b, n=2)
    utils_test.check_mapping(buff)
    utils_test.check_closing(buff)
def test_simple():
    a = dict()
    b = dict()
    buff = Buffer(a, b, n=10, weight=lambda k, v: v)

    buff['x'] = 1
    buff['y'] = 2

    assert buff['x'] == 1
    assert buff['y'] == 2
    assert a == {'x': 1, 'y': 2}
    assert buff.fast.total_weight == 3

    buff['z'] = 8
    assert a == {'y': 2, 'z': 8}
    assert b == {'x': 1}

    assert buff['x'] == 1
    assert a == {'x': 1, 'z': 8}
    assert b == {'y': 2}

    assert 'x' in buff
    assert 'y' in buff
    assert 'missing' not in buff

    del buff['z']
    assert a == {'x': 1}
    assert b == {'y': 2}

    del buff['y']
    assert a == {'x': 1}
    assert b == {}

    assert 'y' not in buff

    buff['a'] = 5
    assert set(buff) == set(buff.keys()) == {'a', 'x'}

    fast_keys = set(buff.fast)
    buff['b'] = 1000
    assert 'b' in buff.slow
    assert set(buff.fast) == fast_keys
def __init__(
    self,
    device_memory_limit=None,
    memory_limit=None,
    local_directory=None,
):
    if local_directory is None:
        local_directory = dask.config.get("temporary-directory") or os.getcwd()
    os.makedirs(local_directory, exist_ok=True)
    local_directory = os.path.join(local_directory, "dask-worker-space")
    self.disk_func_path = os.path.join(local_directory, "storage")

    self.host_func = dict()
    self.disk_func = Func(
        serialize_bytelist, deserialize_bytes, File(self.disk_func_path)
    )
    self.host_buffer = Buffer(
        self.host_func, self.disk_func, memory_limit, weight=weight
    )

    self.device_keys = set()
    self.device_func = dict()
    self.device_host_func = Func(device_to_host, host_to_device, self.host_buffer)
    self.device_buffer = Buffer(
        self.device_func, self.device_host_func, device_memory_limit, weight=weight
    )

    self.device = self.device_buffer.fast.d
    self.host = self.host_buffer.fast.d
    self.disk = self.host_buffer.slow.d

    # For Worker compatibility only, where `fast` is host memory buffer
    self.fast = self.host_buffer.fast
def setup(self, worker):
    self.cache = Buffer(
        fast={},
        slow=Func(
            dump=blosc.pack_array,
            load=blosc.unpack_array,
            d=Buffer(
                fast={},
                slow=LRU(
                    n=self._maxdisk,
                    d=File(os.path.join(worker.local_directory, 'cache')),
                    weight=lambda k, v: len(v),
                ),
                n=self._maxcompressed,
                weight=lambda k, v: len(v),
            ),
        ),
        n=self._maxmem,
        weight=lambda k, v: v.nbytes,
    )
    self.lock = Lock()
    self.hits = 0
    self.misses = 0
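# A standalone sketch of the three-tier cache assembled in setup() above:
# uncompressed arrays in memory, blosc-compressed bytes in memory, then
# LRU-capped files on disk. The limits and the temporary directory are arbitrary
# illustration values; it assumes numpy and the blosc package used by the snippet.
import os
import tempfile

import blosc
import numpy as np
from zict import Buffer, File, Func, LRU

cache_dir = os.path.join(tempfile.mkdtemp(), "cache")

cache = Buffer(
    fast={},                                   # tier 1: raw ndarrays
    slow=Func(
        dump=blosc.pack_array,                 # compress on the way down
        load=blosc.unpack_array,               # decompress on the way back up
        d=Buffer(
            fast={},                           # tier 2: compressed bytes in memory
            slow=LRU(                          # tier 3: compressed bytes on disk
                n=100_000_000,
                d=File(cache_dir),
                weight=lambda k, v: len(v),
            ),
            n=10_000_000,
            weight=lambda k, v: len(v),
        ),
    ),
    n=1_000_000,                               # ~1 MB of uncompressed arrays
    weight=lambda k, v: v.nbytes,
)

cache["a"] = np.arange(10_000)                 # small: stays uncompressed in memory
cache["b"] = np.zeros(1_000_000)               # large: compressed and pushed down a tier
assert np.array_equal(cache["b"], np.zeros(1_000_000))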
def test_callbacks():
    f2s = []

    def f2s_cb(k, v):
        f2s.append(k)

    s2f = []

    def s2f_cb(k, v):
        s2f.append(k)

    a = dict()
    b = dict()
    buff = Buffer(
        a,
        b,
        n=10,
        weight=lambda k, v: v,
        fast_to_slow_callbacks=f2s_cb,
        slow_to_fast_callbacks=s2f_cb,
    )

    buff["x"] = 1
    buff["y"] = 2

    assert buff["x"] == 1
    assert buff["y"] == 2
    assert not f2s
    assert not s2f

    buff["z"] = 8

    assert f2s == ["x"]
    assert s2f == []
    buff["z"]
    assert f2s == ["x"]
    assert s2f == []
    buff["x"]
    assert f2s == ["x", "y"]
    assert s2f == ["x"]
def test_setitem_avoid_fast_slow_duplicate():
    a = dict()
    b = dict()
    buff = Buffer(a, b, n=10, weight=lambda k, v: v)
    for first, second in [(1, 12), (12, 1)]:
        buff["a"] = first
        assert buff["a"] == first
        buff["a"] = second
        assert buff["a"] == second

        fast_keys = set(buff.fast)
        slow_keys = set(buff.slow)
        assert not (fast_keys & slow_keys)
        assert fast_keys | slow_keys == set(buff)

        del buff["a"]
        assert "a" not in buff
        assert "a" not in a
        assert "a" not in b
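# The fast_to_slow/slow_to_fast callbacks exercised in test_callbacks() are enough
# to build a simple spill logger. This is a hypothetical sketch of that idea, not
# the LoggedBuffer class referenced above; the function names, log format, and the
# sizeof-based weight are all assumptions.
import logging
import sys

from zict import Buffer

logger = logging.getLogger("spill")


def log_fast_to_slow(key, value):
    logger.info("spilled %r (%d bytes) to the slow store", key, sys.getsizeof(value))


def log_slow_to_fast(key, value):
    logger.info("read %r (%d bytes) back into the fast store", key, sys.getsizeof(value))


def make_logged_buffer(fast, slow, n):
    """Build a Buffer that logs every movement between its two tiers."""
    return Buffer(
        fast,
        slow,
        n=n,
        weight=lambda k, v: sys.getsizeof(v),
        fast_to_slow_callbacks=log_fast_to_slow,
        slow_to_fast_callbacks=log_slow_to_fast,
    )


buff = make_logged_buffer({}, {}, n=100)  # e.g. wrap two plain dicts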
class Worker(Server): """ Worker Node Workers perform two functions: 1. **Serve data** from a local dictionary 2. **Perform computation** on that data and on data from peers Additionally workers keep a scheduler informed of their data and use that scheduler to gather data from other workers when necessary to perform a computation. You can start a worker with the ``dask-worker`` command line application:: $ dask-worker scheduler-ip:port **State** * **data:** ``{key: object}``: Dictionary mapping keys to actual values * **active:** ``{key}``: Set of keys currently under computation * **ncores:** ``int``: Number of cores used by this worker process * **executor:** ``concurrent.futures.ThreadPoolExecutor``: Executor used to perform computation * **local_dir:** ``path``: Path on local machine to store temporary files * **scheduler:** ``rpc``: Location of scheduler. See ``.ip/.port`` attributes. * **name:** ``string``: Alias * **services:** ``{str: Server}``: Auxiliary web servers running on this worker * **service_ports:** ``{str: port}``: Examples -------- Create schedulers and workers in Python: >>> from distributed import Scheduler, Worker >>> c = Scheduler('192.168.0.100', 8786) # doctest: +SKIP >>> w = Worker(c.ip, c.port) # doctest: +SKIP >>> yield w._start(port=8786) # doctest: +SKIP Or use the command line:: $ dask-scheduler Start scheduler at 127.0.0.1:8786 $ dask-worker 127.0.0.1:8786 Start worker at: 127.0.0.1:8786 Registered with scheduler at: 127.0.0.1:8787 See Also -------- distributed.scheduler.Scheduler: """ def __init__(self, scheduler_ip, scheduler_port, ip=None, ncores=None, loop=None, local_dir=None, services=None, service_ports=None, name=None, heartbeat_interval=1000, memory_limit=None, **kwargs): self.ip = ip or get_ip() self._port = 0 self.ncores = ncores or _ncores self.local_dir = local_dir or tempfile.mkdtemp(prefix='worker-') if not os.path.exists(self.local_dir): os.mkdir(self.local_dir) if memory_limit: try: from zict import Buffer, File, Func except ImportError: raise ImportError("Please `pip install zict` for spill-to-disk workers") path = os.path.join(self.local_dir, 'storage') storage = Func(dumps_to_disk, loads_from_disk, File(path)) self.data = Buffer({}, storage, int(float(memory_limit)), weight) else: self.data = dict() self.loop = loop or IOLoop.current() self.status = None self.executor = ThreadPoolExecutor(self.ncores) self.scheduler = rpc(ip=scheduler_ip, port=scheduler_port) self.active = set() self.name = name self.heartbeat_interval = heartbeat_interval self.heartbeat_active = False self.execution_state = {'scheduler': self.scheduler.address, 'ioloop': self.loop, 'worker': self} self._last_disk_io = None self._last_net_io = None self._ipython_kernel = None if self.local_dir not in sys.path: sys.path.insert(0, self.local_dir) self.services = {} self.service_ports = service_ports or {} for k, v in (services or {}).items(): if isinstance(k, tuple): k, port = k else: port = 0 self.services[k] = v(self, io_loop=self.loop) self.services[k].listen(port) self.service_ports[k] = self.services[k].port handlers = {'compute': self.compute, 'gather': self.gather, 'compute-stream': self.compute_stream, 'run': self.run, 'get_data': self.get_data, 'update_data': self.update_data, 'delete_data': self.delete_data, 'terminate': self.terminate, 'ping': pingpong, 'health': self.host_health, 'upload_file': self.upload_file, 'start_ipython': self.start_ipython, 'keys': self.keys, } super(Worker, self).__init__(handlers, io_loop=self.loop, **kwargs) self.heartbeat_callback 
= PeriodicCallback(self.heartbeat, self.heartbeat_interval, io_loop=self.loop) self.loop.add_callback(self.heartbeat_callback.start) @gen.coroutine def heartbeat(self): if not self.heartbeat_active: self.heartbeat_active = True logger.debug("Heartbeat: %s" % self.address) try: yield self.scheduler.register(address=self.address, name=self.name, ncores=self.ncores, now=time(), info=self.process_health(), host_info=self.host_health(), services=self.service_ports, **self.process_health()) finally: self.heartbeat_active = False else: logger.debug("Heartbeat skipped: channel busy") @gen.coroutine def _start(self, port=0): self.listen(port) self.name = self.name or self.address for k, v in self.services.items(): v.listen(0) self.service_ports[k] = v.port logger.info(' Start worker at: %20s:%d', self.ip, self.port) for k, v in self.service_ports.items(): logger.info(' %16s at: %20s:%d' % (k, self.ip, v)) logger.info('Waiting to connect to: %20s:%d', self.scheduler.ip, self.scheduler.port) while True: try: resp = yield self.scheduler.register( ncores=self.ncores, address=(self.ip, self.port), keys=list(self.data), name=self.name, nbytes=valmap(sizeof, self.data), now=time(), host_info=self.host_health(), services=self.service_ports, **self.process_health()) break except (OSError, StreamClosedError): logger.debug("Unable to register with scheduler. Waiting") yield gen.sleep(0.5) if resp != 'OK': raise ValueError(resp) logger.info(' Registered to: %20s:%d', self.scheduler.ip, self.scheduler.port) self.status = 'running' def start(self, port=0): self.loop.add_callback(self._start, port) def identity(self, stream): return {'type': type(self).__name__, 'id': self.id, 'scheduler': (self.scheduler.ip, self.scheduler.port)} @gen.coroutine def _close(self, report=True, timeout=10): self.heartbeat_callback.stop() with ignoring(RPCClosed, StreamClosedError): if report: yield gen.with_timeout(timedelta(seconds=timeout), self.scheduler.unregister(address=(self.ip, self.port)), io_loop=self.loop) self.scheduler.close_rpc() self.stop() self.executor.shutdown() if os.path.exists(self.local_dir): shutil.rmtree(self.local_dir) for k, v in self.services.items(): v.stop() self.status = 'closed' self.stop() @gen.coroutine def terminate(self, stream, report=True): yield self._close(report=report) raise Return('OK') @property def address(self): return '%s:%d' % (self.ip, self.port) @property def address_tuple(self): return (self.ip, self.port) @gen.coroutine def gather(self, stream=None, who_has=None): who_has = {k: [coerce_to_address(addr) for addr in v] for k, v in who_has.items() if k not in self.data} try: result = yield gather_from_workers(who_has) except KeyError as e: logger.warn("Could not find data", e) raise Return({'status': 'missing-data', 'keys': e.args}) else: self.data.update(result) raise Return({'status': 'OK'}) def deserialize(self, function=None, args=None, kwargs=None, task=None): """ Deserialize task inputs and regularize to func, args, kwargs """ if task is not None: task = loads(task) if function is not None: function = loads(function) if args: args = loads(args) if kwargs: kwargs = loads(kwargs) if task is not None: assert not function and not args and not kwargs function = execute_task args = (task,) return function, args or (), kwargs or {} @gen.coroutine def gather_many(self, msgs): """ Gather the data for many compute messages at once Returns ------- good: the input messages for which we have data bad: a dict of task keys for which we could not find data data: The scope in which to run tasks 
len(remote): the number of new keys we've gathered """ diagnostics = {} who_has = merge(msg['who_has'] for msg in msgs if 'who_has' in msg) start = time() local = {k: self.data[k] for k in who_has if k in self.data} stop = time() if stop - start > 0.005: diagnostics['disk_load_start'] = start diagnostics['disk_load_stop'] = stop who_has = {k: v for k, v in who_has.items() if k not in local} start = time() remote, bad_data = yield gather_from_workers(who_has, permissive=True) if remote: self.data.update(remote) yield self.scheduler.add_keys(address=self.address, keys=list(remote)) stop = time() if remote: diagnostics['transfer_start'] = start diagnostics['transfer_stop'] = stop data = merge(local, remote) if bad_data: missing = {msg['key']: {k for k in msg['who_has'] if k in bad_data} for msg in msgs if 'who_has' in msg} bad = {k: v for k, v in missing.items() if v} good = [msg for msg in msgs if not missing.get(msg['key'])] else: good, bad = msgs, {} raise Return([good, bad, data, len(remote), diagnostics]) @gen.coroutine def _ready_task(self, function=None, key=None, args=(), kwargs={}, task=None, who_has=None): who_has = who_has or {} diagnostics = {} start = time() data = {k: self.data[k] for k in who_has if k in self.data} stop = time() if stop - start > 0.005: diagnostics['disk_load_start'] = start diagnostics['disk_load_stop'] = stop who_has = {k: set(map(coerce_to_address, v)) for k, v in who_has.items() if k not in self.data} if who_has: try: logger.info("gather %d keys from peers", len(who_has)) diagnostics['transfer_start'] = time() other = yield gather_from_workers(who_has) diagnostics['transfer_stop'] = time() self.data.update(other) yield self.scheduler.add_keys(address=self.address, keys=list(other)) data.update(other) except KeyError as e: logger.warn("Could not find data for %s", key) raise Return({'status': 'missing-data', 'keys': e.args, 'key': key}) try: start = default_timer() function, args, kwargs = self.deserialize(function, args, kwargs, task) diagnostics['deserialization'] = default_timer() - start except Exception as e: logger.warn("Could not deserialize task", exc_info=True) emsg = error_message(e) emsg['key'] = key raise Return(emsg) # Fill args with data args2 = pack_data(args, data) kwargs2 = pack_data(kwargs, data) raise Return({'status': 'OK', 'function': function, 'args': args2, 'kwargs': kwargs2, 'diagnostics': diagnostics, 'key': key}) @gen.coroutine def executor_submit(self, key, function, *args, **kwargs): """ Safely run function in thread pool executor We've run into issues running concurrent.future futures within tornado. Apparently it's advantageous to use timeouts and periodic callbacks to ensure things run smoothly. This can get tricky, so we pull it off into an separate method. 
""" job_counter[0] += 1 # logger.info("%s:%d Starts job %d, %s", self.ip, self.port, i, key) future = self.executor.submit(function, *args, **kwargs) pc = PeriodicCallback(lambda: logger.debug("future state: %s - %s", key, future._state), 1000, io_loop=self.loop); pc.start() try: yield future finally: pc.stop() pass result = future.result() # logger.info("Finish job %d, %s", i, key) raise gen.Return(result) @gen.coroutine def compute_stream(self, stream): with log_errors(): logger.debug("Open compute stream") bstream = BatchedSend(interval=2, loop=self.loop) bstream.start(stream) closed = False last = gen.sleep(0) while not closed: try: msgs = yield read(stream) except StreamClosedError: break if not isinstance(msgs, list): msgs = [msgs] batch = [] for msg in msgs: op = msg.pop('op', None) if op == 'close': closed = True break elif op == 'compute-task': batch.append(msg) logger.debug("%s asked to compute %s", self.address, msg['key']) else: logger.warning("Unknown operation %s, %s", op, msg) # self.loop.add_callback(self.compute_many, bstream, msgs) last = self.compute_many(bstream, msgs) try: yield last # TODO: there might be more than one lingering except (RPCClosed, RuntimeError): pass yield bstream.close() logger.info("Close compute stream") @gen.coroutine def compute_many(self, bstream, msgs, report=False): good, bad, data, num_transferred, diagnostics = yield self.gather_many(msgs) for msg in msgs: msg.pop('who_has', None) if bad: logger.warn("Could not find data for %s", sorted(bad)) for k, v in bad.items(): bstream.send({'status': 'missing-data', 'key': k, 'keys': list(v)}) if good: futures = [self.compute_one(data, report=report, **msg) for msg in good] wait_iterator = gen.WaitIterator(*futures) result = yield wait_iterator.next() if diagnostics: result.update(diagnostics) bstream.send(result) while not wait_iterator.done(): msg = yield wait_iterator.next() bstream.send(msg) @gen.coroutine def compute_one(self, data, key=None, function=None, args=None, kwargs=None, report=False, task=None): logger.debug("Compute one on %s", key) self.active.add(key) diagnostics = dict() try: start = default_timer() function, args, kwargs = self.deserialize(function, args, kwargs, task) diagnostics['deserialization'] = default_timer() - start except Exception as e: logger.warn("Could not deserialize task", exc_info=True) emsg = error_message(e) emsg['key'] = key raise Return(emsg) # Fill args with data args2 = pack_data(args, data) kwargs2 = pack_data(kwargs, data) # Log and compute in separate thread result = yield self.executor_submit(key, apply_function, function, args2, kwargs2, self.execution_state, key) result['key'] = key result.update(diagnostics) if result['status'] == 'OK': self.data[key] = result.pop('result') if report: response = yield self.scheduler.add_keys(keys=[key], address=(self.ip, self.port)) if not response == 'OK': logger.warn('Could not report results to scheduler: %s', str(response)) else: logger.warn(" Compute Failed\n" "Function: %s\n" "args: %s\n" "kwargs: %s\n", str(funcname(function))[:1000], convert_args_to_str(args, max_len=1000), convert_kwargs_to_str(kwargs, max_len=1000), exc_info=True) logger.debug("Send compute response to scheduler: %s, %s", key, result) try: self.active.remove(key) except KeyError: pass raise Return(result) @gen.coroutine def compute(self, stream=None, function=None, key=None, args=(), kwargs={}, task=None, who_has=None, report=True): """ Execute function """ self.active.add(key) # Ready function for computation msg = yield 
self._ready_task(function=function, key=key, args=args, kwargs=kwargs, task=task, who_has=who_has) if msg['status'] != 'OK': try: self.active.remove(key) except KeyError: pass raise Return(msg) else: function = msg['function'] args = msg['args'] kwargs = msg['kwargs'] # Log and compute in separate thread result = yield self.executor_submit(key, apply_function, function, args, kwargs, self.execution_state, key) result['key'] = key result.update(msg['diagnostics']) if result['status'] == 'OK': self.data[key] = result.pop('result') if report: response = yield self.scheduler.add_keys(address=(self.ip, self.port), keys=[key]) if not response == 'OK': logger.warn('Could not report results to scheduler: %s', str(response)) else: logger.warn(" Compute Failed\n" "Function: %s\n" "args: %s\n" "kwargs: %s\n", str(funcname(function))[:1000], convert_args_to_str(args, max_len=1000), convert_kwargs_to_str(kwargs, max_len=1000), exc_info=True) logger.debug("Send compute response to scheduler: %s, %s", key, get_msg_safe_str(msg)) try: self.active.remove(key) except KeyError: pass raise Return(result) def run(self, stream, function=None, args=(), kwargs={}): return run(self, stream, function=function, args=args, kwargs=kwargs) @gen.coroutine def update_data(self, stream=None, data=None, report=True, deserialize=True): if deserialize: data = valmap(loads, data) self.data.update(data) if report: response = yield self.scheduler.add_keys( address=(self.ip, self.port), keys=list(data)) assert response == 'OK' info = {'nbytes': {k: sizeof(v) for k, v in data.items()}, 'status': 'OK'} raise Return(info) @gen.coroutine def delete_data(self, stream, keys=None, report=True): if keys: for key in keys: if key in self.data: del self.data[key] logger.info("Deleted %d keys", len(keys)) if report: logger.debug("Reporting loss of keys to scheduler") yield self.scheduler.remove_keys(address=self.address, keys=list(keys)) raise Return('OK') def get_data(self, stream, keys=None): return {k: dumps(self.data[k]) for k in keys if k in self.data} def start_ipython(self, stream): """Start an IPython kernel Returns Jupyter connection info dictionary. 
""" from ._ipython_utils import start_ipython if self._ipython_kernel is None: self._ipython_kernel = start_ipython( ip=self.ip, ns={'worker': self}, log=logger, ) return self._ipython_kernel.get_connection_info() def upload_file(self, stream, filename=None, data=None, load=True): out_filename = os.path.join(self.local_dir, filename) if isinstance(data, unicode): data = data.encode() with open(out_filename, 'wb') as f: f.write(data) f.flush() if load: try: name, ext = os.path.splitext(filename) if ext in ('.py', '.pyc'): logger.info("Reload module %s from .py file", name) name = name.split('-')[0] reload(import_module(name)) if ext == '.egg': sys.path.append(out_filename) pkgs = pkg_resources.find_distributions(out_filename) for pkg in pkgs: logger.info("Load module %s from egg", pkg.project_name) reload(import_module(pkg.project_name)) if not pkgs: logger.warning("Found no packages in egg file") except Exception as e: logger.exception(e) return {'status': 'error', 'exception': dumps(e)} return {'status': 'OK', 'nbytes': len(data)} def process_health(self, stream=None): d = {'active': len(self.active), 'stored': len(self.data)} return d def host_health(self, stream=None): """ Information about worker """ d = {'time': time()} try: import psutil mem = psutil.virtual_memory() d.update({'cpu': psutil.cpu_percent(), 'memory': mem.total, 'memory_percent': mem.percent}) net_io = psutil.net_io_counters() if self._last_net_io: d['network-send'] = net_io.bytes_sent - self._last_net_io.bytes_sent d['network-recv'] = net_io.bytes_recv - self._last_net_io.bytes_recv else: d['network-send'] = 0 d['network-recv'] = 0 self._last_net_io = net_io try: disk_io = psutil.disk_io_counters() except RuntimeError: # This happens when there is no physical disk in worker pass else: if self._last_disk_io: d['disk-read'] = disk_io.read_bytes - self._last_disk_io.read_bytes d['disk-write'] = disk_io.write_bytes - self._last_disk_io.write_bytes else: d['disk-read'] = 0 d['disk-write'] = 0 self._last_disk_io = disk_io except ImportError: pass return d def keys(self, stream=None): return list(self.data)
def __init__(self, scheduler_ip, scheduler_port, ip=None, ncores=None, loop=None, local_dir=None, services=None, service_ports=None, name=None, heartbeat_interval=1000, memory_limit=None, **kwargs): self.ip = ip or get_ip() self._port = 0 self.ncores = ncores or _ncores self.local_dir = local_dir or tempfile.mkdtemp(prefix='worker-') if not os.path.exists(self.local_dir): os.mkdir(self.local_dir) if memory_limit: try: from zict import Buffer, File, Func except ImportError: raise ImportError("Please `pip install zict` for spill-to-disk workers") path = os.path.join(self.local_dir, 'storage') storage = Func(dumps_to_disk, loads_from_disk, File(path)) self.data = Buffer({}, storage, int(float(memory_limit)), weight) else: self.data = dict() self.loop = loop or IOLoop.current() self.status = None self.executor = ThreadPoolExecutor(self.ncores) self.scheduler = rpc(ip=scheduler_ip, port=scheduler_port) self.active = set() self.name = name self.heartbeat_interval = heartbeat_interval self.heartbeat_active = False self.execution_state = {'scheduler': self.scheduler.address, 'ioloop': self.loop, 'worker': self} self._last_disk_io = None self._last_net_io = None self._ipython_kernel = None if self.local_dir not in sys.path: sys.path.insert(0, self.local_dir) self.services = {} self.service_ports = service_ports or {} for k, v in (services or {}).items(): if isinstance(k, tuple): k, port = k else: port = 0 self.services[k] = v(self, io_loop=self.loop) self.services[k].listen(port) self.service_ports[k] = self.services[k].port handlers = {'compute': self.compute, 'gather': self.gather, 'compute-stream': self.compute_stream, 'run': self.run, 'get_data': self.get_data, 'update_data': self.update_data, 'delete_data': self.delete_data, 'terminate': self.terminate, 'ping': pingpong, 'health': self.host_health, 'upload_file': self.upload_file, 'start_ipython': self.start_ipython, 'keys': self.keys, } super(Worker, self).__init__(handlers, io_loop=self.loop, **kwargs) self.heartbeat_callback = PeriodicCallback(self.heartbeat, self.heartbeat_interval, io_loop=self.loop) self.loop.add_callback(self.heartbeat_callback.start)
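# Distilled from the Worker.__init__ above: when a memory_limit is given, worker
# data becomes a zict Buffer that spills to disk, otherwise a plain dict. This is
# a hedged sketch, not distributed's API: build_worker_data is a hypothetical
# helper, and pickle / sys.getsizeof stand in for the real
# dumps_to_disk / loads_from_disk / weight functions.
import os
import pickle
import sys
import tempfile


def build_worker_data(local_dir, memory_limit=None):
    if memory_limit:
        try:
            from zict import Buffer, File, Func
        except ImportError:
            raise ImportError("Please `pip install zict` for spill-to-disk workers")
        path = os.path.join(local_dir, "storage")
        storage = Func(pickle.dumps, pickle.loads, File(path))
        # memory_limit may arrive as a string such as "4e9", hence int(float(...))
        return Buffer({}, storage, int(float(memory_limit)), weight=lambda k, v: sys.getsizeof(v))
    return dict()


data = build_worker_data(tempfile.mkdtemp(), memory_limit="100e6")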
class DeviceHostFile(ZictBase):
    """Manages serialization/deserialization of objects.

    Three LRU cache levels are controlled, for device, host and disk.
    Each level takes care of serializing objects once its limit has been
    reached and passes them to the subsequent level. Similarly, each cache
    may deserialize an object on access, storing it back in the appropriate
    cache depending on the type of object being deserialized.

    Parameters
    ----------
    device_memory_limit: int
        Number of bytes of CUDA device memory for device LRU cache,
        spills to host cache once filled.
    memory_limit: int
        Number of bytes of host memory for host LRU cache, spills to
        disk once filled. Setting this to 0 means unlimited host memory
        and implies no spilling to disk.
    local_directory: path
        Path where to store serialized objects on disk
    log_spilling: bool
        If True, all spilling operations will be logged directly to
        distributed.worker with an INFO loglevel. This will eventually be
        replaced by a Dask configuration flag.
    """

    def __init__(
        self,
        device_memory_limit=None,
        memory_limit=None,
        local_directory=None,
        log_spilling=False,
    ):
        if local_directory is None:
            local_directory = dask.config.get("temporary-directory") or os.getcwd()

        if local_directory and not os.path.exists(local_directory):
            os.makedirs(local_directory, exist_ok=True)
        local_directory = os.path.join(local_directory, "dask-worker-space")

        self.disk_func_path = os.path.join(local_directory, "storage")

        self.host_func = dict()
        self.disk_func = Func(
            functools.partial(serialize_bytelist, on_error="raise"),
            deserialize_bytes,
            File(self.disk_func_path),
        )

        host_buffer_kwargs = {}
        device_buffer_kwargs = {}
        buffer_class = Buffer
        if log_spilling is True:
            buffer_class = LoggedBuffer
            host_buffer_kwargs = {"fast_name": "Host", "slow_name": "Disk"}
            device_buffer_kwargs = {"fast_name": "Device", "slow_name": "Host"}

        if memory_limit == 0:
            self.host_buffer = self.host_func
        else:
            self.host_buffer = buffer_class(
                self.host_func,
                self.disk_func,
                memory_limit,
                weight=lambda k, v: safe_sizeof(v),
                **host_buffer_kwargs,
            )

        self.device_keys = set()
        self.device_func = dict()
        self.device_host_func = Func(device_to_host, host_to_device, self.host_buffer)
        self.device_buffer = Buffer(
            self.device_func,
            self.device_host_func,
            device_memory_limit,
            weight=lambda k, v: safe_sizeof(v),
            **device_buffer_kwargs,
        )

        self.device = self.device_buffer.fast.d
        self.host = self.host_buffer if memory_limit == 0 else self.host_buffer.fast.d
        self.disk = None if memory_limit == 0 else self.host_buffer.slow.d

        # For Worker compatibility only, where `fast` is host memory buffer
        self.fast = self.host_buffer if memory_limit == 0 else self.host_buffer.fast

    def __setitem__(self, key, value):
        if key in self.device_buffer:
            # Make sure we register the removal of an existing key
            del self[key]

        if is_device_object(value):
            self.device_keys.add(key)
            self.device_buffer[key] = value
        else:
            self.host_buffer[key] = value

    def __getitem__(self, key):
        if key in self.device_keys:
            return self.device_buffer[key]
        elif key in self.host_buffer:
            return self.host_buffer[key]
        else:
            raise KeyError(key)

    def __len__(self):
        return len(self.device_buffer)

    def __iter__(self):
        return iter(self.device_buffer)

    def __delitem__(self, key):
        self.device_keys.discard(key)
        del self.device_buffer[key]

    def set_address(self, addr):
        if isinstance(self.host_buffer, LoggedBuffer):
            self.host_buffer.set_address(addr)
        self.device_buffer.set_address(addr)

    def get_total_spilling_time(self):
        ret = {}
        if isinstance(self.device_buffer, LoggedBuffer):
            ret = {**ret, **self.device_buffer.get_total_spilling_time()}
        if isinstance(self.host_buffer, LoggedBuffer):
            ret = {**ret, **self.host_buffer.get_total_spilling_time()}
        return ret
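# A usage sketch for the DeviceHostFile class above, assuming the dask_cuda version
# shown here is installed and a GPU with cupy is available; the byte limits and the
# temporary directory are arbitrary illustration values.
import tempfile

import cupy
import numpy as np
from dask_cuda.device_host_file import DeviceHostFile

dhf = DeviceHostFile(
    device_memory_limit=1024,          # spill device -> host past 1 KiB
    memory_limit=1024,                 # spill host -> disk past 1 KiB
    local_directory=tempfile.mkdtemp(),
)

dhf["gpu"] = cupy.arange(100)          # device object, tracked in device_keys
dhf["cpu"] = np.arange(100)            # host object, goes straight to the host buffer

assert "gpu" in dhf.device_keys and "cpu" not in dhf.device_keys
assert isinstance(dhf["gpu"], cupy.ndarray)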
def __init__(self, scheduler_ip, scheduler_port, ip=None, ncores=None, loop=None, local_dir=None, services=None, service_ports=None, name=None, heartbeat_interval=5000, memory_limit=TOTAL_MEMORY, **kwargs): self.ip = ip or get_ip() self._port = 0 self.ncores = ncores or _ncores self.local_dir = local_dir or tempfile.mkdtemp(prefix='worker-') if not os.path.exists(self.local_dir): os.mkdir(self.local_dir) self.memory_limit = memory_limit if memory_limit: try: from zict import Buffer, File, Func except ImportError: raise ImportError("Please `pip install zict` for spill-to-disk workers") path = os.path.join(self.local_dir, 'storage') storage = Func(dumps_to_disk, loads_from_disk, File(path)) self.data = Buffer({}, storage, int(float(memory_limit)), weight) else: self.data = dict() self.loop = loop or IOLoop.current() self.status = None self.executor = ThreadPoolExecutor(self.ncores) self.scheduler = rpc(ip=scheduler_ip, port=scheduler_port) self.active = set() self.name = name self.heartbeat_interval = heartbeat_interval self.heartbeat_active = False self.execution_state = {'scheduler': self.scheduler.address, 'ioloop': self.loop, 'worker': self} self._last_disk_io = None self._last_net_io = None self._ipython_kernel = None if self.local_dir not in sys.path: sys.path.insert(0, self.local_dir) self.services = {} self.service_ports = service_ports or {} for k, v in (services or {}).items(): if isinstance(k, tuple): k, port = k else: port = 0 self.services[k] = v(self, io_loop=self.loop) self.services[k].listen(port) self.service_ports[k] = self.services[k].port handlers = {'compute': self.compute, 'gather': self.gather, 'compute-stream': self.compute_stream, 'run': self.run, 'get_data': self.get_data, 'update_data': self.update_data, 'delete_data': self.delete_data, 'terminate': self.terminate, 'ping': pingpong, 'health': self.host_health, 'upload_file': self.upload_file, 'start_ipython': self.start_ipython, 'keys': self.keys, } super(Worker, self).__init__(handlers, io_loop=self.loop, **kwargs) self.heartbeat_callback = PeriodicCallback(self.heartbeat, self.heartbeat_interval, io_loop=self.loop) self.loop.add_callback(self.heartbeat_callback.start)
class Worker(Server): """ Worker Node Workers perform two functions: 1. **Serve data** from a local dictionary 2. **Perform computation** on that data and on data from peers Additionally workers keep a scheduler informed of their data and use that scheduler to gather data from other workers when necessary to perform a computation. You can start a worker with the ``dask-worker`` command line application:: $ dask-worker scheduler-ip:port **State** * **data:** ``{key: object}``: Dictionary mapping keys to actual values * **active:** ``{key}``: Set of keys currently under computation * **ncores:** ``int``: Number of cores used by this worker process * **executor:** ``concurrent.futures.ThreadPoolExecutor``: Executor used to perform computation * **local_dir:** ``path``: Path on local machine to store temporary files * **scheduler:** ``rpc``: Location of scheduler. See ``.ip/.port`` attributes. * **name:** ``string``: Alias * **services:** ``{str: Server}``: Auxiliary web servers running on this worker * **service_ports:** ``{str: port}``: Examples -------- Create schedulers and workers in Python: >>> from distributed import Scheduler, Worker >>> c = Scheduler('192.168.0.100', 8786) # doctest: +SKIP >>> w = Worker(c.ip, c.port) # doctest: +SKIP >>> yield w._start(port=8786) # doctest: +SKIP Or use the command line:: $ dask-scheduler Start scheduler at 127.0.0.1:8786 $ dask-worker 127.0.0.1:8786 Start worker at: 127.0.0.1:8786 Registered with scheduler at: 127.0.0.1:8787 See Also -------- distributed.scheduler.Scheduler: """ def __init__(self, scheduler_ip, scheduler_port, ip=None, ncores=None, loop=None, local_dir=None, services=None, service_ports=None, name=None, heartbeat_interval=5000, memory_limit=TOTAL_MEMORY, **kwargs): self.ip = ip or get_ip() self._port = 0 self.ncores = ncores or _ncores self.local_dir = local_dir or tempfile.mkdtemp(prefix='worker-') if not os.path.exists(self.local_dir): os.mkdir(self.local_dir) self.memory_limit = memory_limit if memory_limit: try: from zict import Buffer, File, Func except ImportError: raise ImportError("Please `pip install zict` for spill-to-disk workers") path = os.path.join(self.local_dir, 'storage') storage = Func(dumps_to_disk, loads_from_disk, File(path)) self.data = Buffer({}, storage, int(float(memory_limit)), weight) else: self.data = dict() self.loop = loop or IOLoop.current() self.status = None self.executor = ThreadPoolExecutor(self.ncores) self.scheduler = rpc(ip=scheduler_ip, port=scheduler_port) self.active = set() self.name = name self.heartbeat_interval = heartbeat_interval self.heartbeat_active = False self.execution_state = {'scheduler': self.scheduler.address, 'ioloop': self.loop, 'worker': self} self._last_disk_io = None self._last_net_io = None self._ipython_kernel = None if self.local_dir not in sys.path: sys.path.insert(0, self.local_dir) self.services = {} self.service_ports = service_ports or {} for k, v in (services or {}).items(): if isinstance(k, tuple): k, port = k else: port = 0 self.services[k] = v(self, io_loop=self.loop) self.services[k].listen(port) self.service_ports[k] = self.services[k].port handlers = {'compute': self.compute, 'gather': self.gather, 'compute-stream': self.compute_stream, 'run': self.run, 'get_data': self.get_data, 'update_data': self.update_data, 'delete_data': self.delete_data, 'terminate': self.terminate, 'ping': pingpong, 'health': self.host_health, 'upload_file': self.upload_file, 'start_ipython': self.start_ipython, 'keys': self.keys, } super(Worker, self).__init__(handlers, 
io_loop=self.loop, **kwargs) self.heartbeat_callback = PeriodicCallback(self.heartbeat, self.heartbeat_interval, io_loop=self.loop) self.loop.add_callback(self.heartbeat_callback.start) @property def worker_address(self): """ For API compatibility with Nanny """ return self.address @gen.coroutine def heartbeat(self): if not self.heartbeat_active: self.heartbeat_active = True logger.debug("Heartbeat: %s" % self.address) try: yield self.scheduler.register(address=self.address, name=self.name, ncores=self.ncores, now=time(), host_info=self.host_health(), services=self.service_ports, memory_limit=self.memory_limit, **self.process_health()) finally: self.heartbeat_active = False else: logger.debug("Heartbeat skipped: channel busy") @gen.coroutine def _start(self, port=0): self.listen(port) self.name = self.name or self.address for k, v in self.services.items(): v.listen(0) self.service_ports[k] = v.port logger.info(' Start worker at: %20s:%d', self.ip, self.port) for k, v in self.service_ports.items(): logger.info(' %16s at: %20s:%d' % (k, self.ip, v)) logger.info('Waiting to connect to: %20s:%d', self.scheduler.ip, self.scheduler.port) while True: try: resp = yield self.scheduler.register( ncores=self.ncores, address=(self.ip, self.port), keys=list(self.data), name=self.name, nbytes=valmap(sizeof, self.data), now=time(), host_info=self.host_health(), services=self.service_ports, memory_limit=self.memory_limit, **self.process_health()) break except (OSError, StreamClosedError): logger.debug("Unable to register with scheduler. Waiting") yield gen.sleep(0.5) if resp != 'OK': raise ValueError(resp) logger.info(' Registered to: %20s:%d', self.scheduler.ip, self.scheduler.port) self.status = 'running' def start(self, port=0): self.loop.add_callback(self._start, port) def identity(self, stream): return {'type': type(self).__name__, 'id': self.id, 'scheduler': (self.scheduler.ip, self.scheduler.port), 'ncores': self.ncores, 'memory_limit': self.memory_limit} @gen.coroutine def _close(self, report=True, timeout=10): self.heartbeat_callback.stop() with ignoring(RPCClosed, StreamClosedError): if report: yield gen.with_timeout(timedelta(seconds=timeout), self.scheduler.unregister(address=(self.ip, self.port)), io_loop=self.loop) self.scheduler.close_rpc() self.stop() self.executor.shutdown() if os.path.exists(self.local_dir): shutil.rmtree(self.local_dir) for k, v in self.services.items(): v.stop() self.status = 'closed' self.stop() @gen.coroutine def terminate(self, stream, report=True): yield self._close(report=report) raise Return('OK') @property def address(self): return '%s:%d' % (self.ip, self.port) @property def address_tuple(self): return (self.ip, self.port) @gen.coroutine def gather(self, stream=None, who_has=None): who_has = {k: [coerce_to_address(addr) for addr in v] for k, v in who_has.items() if k not in self.data} try: result = yield gather_from_workers(who_has) except KeyError as e: logger.warn("Could not find data", e) raise Return({'status': 'missing-data', 'keys': e.args}) else: self.data.update(result) raise Return({'status': 'OK'}) def deserialize(self, function=None, args=None, kwargs=None, task=None): """ Deserialize task inputs and regularize to func, args, kwargs """ if task is not None: task = loads(task) if function is not None: function = loads(function) if args: args = loads(args) if kwargs: kwargs = loads(kwargs) if task is not None: assert not function and not args and not kwargs function = execute_task args = (task,) return function, args or (), kwargs or {} @gen.coroutine 
def gather_many(self, msgs): """ Gather the data for many compute messages at once Returns ------- good: the input messages for which we have data bad: a dict of task keys for which we could not find data data: The scope in which to run tasks len(remote): the number of new keys we've gathered """ diagnostics = {} who_has = merge(msg['who_has'] for msg in msgs if 'who_has' in msg) start = time() local = {k: self.data[k] for k in who_has if k in self.data} stop = time() if stop - start > 0.005: diagnostics['disk_load_start'] = start diagnostics['disk_load_stop'] = stop who_has = {k: v for k, v in who_has.items() if k not in local} start = time() remote, bad_data = yield gather_from_workers(who_has, permissive=True) if remote: self.data.update(remote) yield self.scheduler.add_keys(address=self.address, keys=list(remote)) stop = time() if remote: diagnostics['transfer_start'] = start diagnostics['transfer_stop'] = stop data = merge(local, remote) if bad_data: missing = {msg['key']: {k for k in msg['who_has'] if k in bad_data} for msg in msgs if 'who_has' in msg} bad = {k: v for k, v in missing.items() if v} good = [msg for msg in msgs if not missing.get(msg['key'])] else: good, bad = msgs, {} raise Return([good, bad, data, len(remote), diagnostics]) @gen.coroutine def _ready_task(self, function=None, key=None, args=(), kwargs={}, task=None, who_has=None): who_has = who_has or {} diagnostics = {} start = time() data = {k: self.data[k] for k in who_has if k in self.data} stop = time() if stop - start > 0.005: diagnostics['disk_load_start'] = start diagnostics['disk_load_stop'] = stop who_has = {k: set(map(coerce_to_address, v)) for k, v in who_has.items() if k not in self.data} if who_has: try: logger.info("gather %d keys from peers", len(who_has)) diagnostics['transfer_start'] = time() other = yield gather_from_workers(who_has) diagnostics['transfer_stop'] = time() self.data.update(other) yield self.scheduler.add_keys(address=self.address, keys=list(other)) data.update(other) except KeyError as e: logger.warn("Could not find data for %s", key) raise Return({'status': 'missing-data', 'keys': e.args, 'key': key}) try: start = default_timer() function, args, kwargs = self.deserialize(function, args, kwargs, task) diagnostics['deserialization'] = default_timer() - start except Exception as e: logger.warn("Could not deserialize task", exc_info=True) emsg = error_message(e) emsg['key'] = key raise Return(emsg) # Fill args with data args2 = pack_data(args, data) kwargs2 = pack_data(kwargs, data) raise Return({'status': 'OK', 'function': function, 'args': args2, 'kwargs': kwargs2, 'diagnostics': diagnostics, 'key': key}) @gen.coroutine def executor_submit(self, key, function, *args, **kwargs): """ Safely run function in thread pool executor We've run into issues running concurrent.future futures within tornado. Apparently it's advantageous to use timeouts and periodic callbacks to ensure things run smoothly. This can get tricky, so we pull it off into an separate method. 
""" job_counter[0] += 1 # logger.info("%s:%d Starts job %d, %s", self.ip, self.port, i, key) future = self.executor.submit(function, *args, **kwargs) pc = PeriodicCallback(lambda: logger.debug("future state: %s - %s", key, future._state), 1000, io_loop=self.loop); pc.start() try: yield future finally: pc.stop() pass result = future.result() # logger.info("Finish job %d, %s", i, key) raise gen.Return(result) @gen.coroutine def compute_stream(self, stream): with log_errors(): logger.debug("Open compute stream") bstream = BatchedSend(interval=2, loop=self.loop) bstream.start(stream) closed = False last = gen.sleep(0) while not closed: try: msgs = yield read(stream) except StreamClosedError: break if not isinstance(msgs, list): msgs = [msgs] batch = [] for msg in msgs: op = msg.pop('op', None) if op == 'close': closed = True break elif op == 'compute-task': batch.append(msg) logger.debug("%s asked to compute %s", self.address, msg['key']) else: logger.warning("Unknown operation %s, %s", op, msg) # self.loop.add_callback(self.compute_many, bstream, msgs) last = self.compute_many(bstream, msgs) try: yield last # TODO: there might be more than one lingering except (RPCClosed, RuntimeError): pass yield bstream.close() logger.info("Close compute stream") @gen.coroutine def compute_many(self, bstream, msgs, report=False): good, bad, data, num_transferred, diagnostics = yield self.gather_many(msgs) for msg in msgs: msg.pop('who_has', None) if bad: logger.warn("Could not find data for %s", sorted(bad)) for k, v in bad.items(): bstream.send({'status': 'missing-data', 'key': k, 'keys': list(v)}) if good: futures = [self.compute_one(data, report=report, **msg) for msg in good] wait_iterator = gen.WaitIterator(*futures) result = yield wait_iterator.next() if diagnostics: result.update(diagnostics) bstream.send(result) while not wait_iterator.done(): msg = yield wait_iterator.next() bstream.send(msg) @gen.coroutine def compute_one(self, data, key=None, function=None, args=None, kwargs=None, report=False, task=None): logger.debug("Compute one on %s", key) self.active.add(key) diagnostics = dict() try: start = default_timer() function, args, kwargs = self.deserialize(function, args, kwargs, task) diagnostics['deserialization'] = default_timer() - start except Exception as e: logger.warn("Could not deserialize task", exc_info=True) emsg = error_message(e) emsg['key'] = key raise Return(emsg) # Fill args with data args2 = pack_data(args, data) kwargs2 = pack_data(kwargs, data) # Log and compute in separate thread result = yield self.executor_submit(key, apply_function, function, args2, kwargs2, self.execution_state, key) result['key'] = key result.update(diagnostics) if result['status'] == 'OK': self.data[key] = result.pop('result') if report: response = yield self.scheduler.add_keys(keys=[key], address=(self.ip, self.port)) if not response == 'OK': logger.warn('Could not report results to scheduler: %s', str(response)) else: logger.warn(" Compute Failed\n" "Function: %s\n" "args: %s\n" "kwargs: %s\n", str(funcname(function))[:1000], convert_args_to_str(args, max_len=1000), convert_kwargs_to_str(kwargs, max_len=1000), exc_info=True) logger.debug("Send compute response to scheduler: %s, %s", key, result) try: self.active.remove(key) except KeyError: pass raise Return(result) @gen.coroutine def compute(self, stream=None, function=None, key=None, args=(), kwargs={}, task=None, who_has=None, report=True): """ Execute function """ self.active.add(key) # Ready function for computation msg = yield 
self._ready_task(function=function, key=key, args=args, kwargs=kwargs, task=task, who_has=who_has) if msg['status'] != 'OK': try: self.active.remove(key) except KeyError: pass raise Return(msg) else: function = msg['function'] args = msg['args'] kwargs = msg['kwargs'] # Log and compute in separate thread result = yield self.executor_submit(key, apply_function, function, args, kwargs, self.execution_state, key) result['key'] = key result.update(msg['diagnostics']) if result['status'] == 'OK': self.data[key] = result.pop('result') if report: response = yield self.scheduler.add_keys(address=(self.ip, self.port), keys=[key]) if not response == 'OK': logger.warn('Could not report results to scheduler: %s', str(response)) else: logger.warn(" Compute Failed\n" "Function: %s\n" "args: %s\n" "kwargs: %s\n", str(funcname(function))[:1000], convert_args_to_str(args, max_len=1000), convert_kwargs_to_str(kwargs, max_len=1000), exc_info=True) logger.debug("Send compute response to scheduler: %s, %s", key, get_msg_safe_str(msg)) try: self.active.remove(key) except KeyError: pass raise Return(result) def run(self, stream, function=None, args=(), kwargs={}): return run(self, stream, function=function, args=args, kwargs=kwargs) @gen.coroutine def update_data(self, stream=None, data=None, report=True, deserialize=True): if deserialize: data = valmap(loads, data) self.data.update(data) if report: response = yield self.scheduler.add_keys( address=(self.ip, self.port), keys=list(data)) assert response == 'OK' info = {'nbytes': {k: sizeof(v) for k, v in data.items()}, 'status': 'OK'} raise Return(info) @gen.coroutine def delete_data(self, stream, keys=None, report=True): if keys: for key in keys: if key in self.data: del self.data[key] logger.info("Deleted %d keys", len(keys)) if report: logger.debug("Reporting loss of keys to scheduler") yield self.scheduler.remove_keys(address=self.address, keys=list(keys)) raise Return('OK') def get_data(self, stream, keys=None): return {k: dumps(self.data[k]) for k in keys if k in self.data} def start_ipython(self, stream): """Start an IPython kernel Returns Jupyter connection info dictionary. 
""" from ._ipython_utils import start_ipython if self._ipython_kernel is None: self._ipython_kernel = start_ipython( ip=self.ip, ns={'worker': self}, log=logger, ) return self._ipython_kernel.get_connection_info() def upload_file(self, stream, filename=None, data=None, load=True): out_filename = os.path.join(self.local_dir, filename) if isinstance(data, unicode): data = data.encode() with open(out_filename, 'wb') as f: f.write(data) f.flush() if load: try: name, ext = os.path.splitext(filename) if ext in ('.py', '.pyc'): logger.info("Reload module %s from .py file", name) name = name.split('-')[0] reload(import_module(name)) if ext == '.egg': sys.path.append(out_filename) pkgs = pkg_resources.find_distributions(out_filename) for pkg in pkgs: logger.info("Load module %s from egg", pkg.project_name) reload(import_module(pkg.project_name)) if not pkgs: logger.warning("Found no packages in egg file") except Exception as e: logger.exception(e) return {'status': 'error', 'exception': dumps(e)} return {'status': 'OK', 'nbytes': len(data)} def process_health(self, stream=None): d = {'active': len(self.active), 'stored': len(self.data)} return d def host_health(self, stream=None): """ Information about worker """ d = {'time': time()} try: import psutil mem = psutil.virtual_memory() d.update({'cpu': psutil.cpu_percent(), 'memory': mem.total, 'memory_percent': mem.percent}) net_io = psutil.net_io_counters() if self._last_net_io: d['network-send'] = net_io.bytes_sent - self._last_net_io.bytes_sent d['network-recv'] = net_io.bytes_recv - self._last_net_io.bytes_recv else: d['network-send'] = 0 d['network-recv'] = 0 self._last_net_io = net_io try: disk_io = psutil.disk_io_counters() except RuntimeError: # This happens when there is no physical disk in worker pass else: if self._last_disk_io: d['disk-read'] = disk_io.read_bytes - self._last_disk_io.read_bytes d['disk-write'] = disk_io.write_bytes - self._last_disk_io.write_bytes else: d['disk-read'] = 0 d['disk-write'] = 0 self._last_disk_io = disk_io except ImportError: pass return d def keys(self, stream=None): return list(self.data)