def check_workers(actor_id, ttl):
    """Check health of all workers for an actor.

    Issues a synchronous status check over each worker's channel; workers that
    do not respond (or respond unexpectedly) have their containers removed and
    are deleted from the store. Healthy workers that have been idle longer
    than ``ttl`` seconds are shut down.

    :param actor_id: id of the actor whose workers should be checked.
    :param ttl: worker time-to-live, in seconds; a negative value means
        workers are never reaped for idleness.
    :return: None
    """
    print("Checking health for actors: {}".format(actor_id))
    workers = Worker.get_workers(actor_id)
    print("workers: {}".format(workers))
    for _, worker in workers.items():
        # if the worker has only been requested, it will not have a host_id.
        if 'host_id' not in worker:
            # @todo- we will skip for now, but we need something more robust in case the worker is never claimed.
            continue
        # ignore workers on different hosts
        if not Config.get('spawner', 'host_id') == worker['host_id']:
            continue
        # first check if worker is responsive; if not, will need to manually kill
        print("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(name=worker['ch_name'])
        try:
            print("Issuing status check to channel: {}".format(worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            print("Worker did not respond, removing container and deleting worker.")
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            Worker.delete_worker(actor_id, worker['ch_name'])
            continue
        finally:
            # fix: the channel was never closed on any path, leaking a
            # connection on every health check. best-effort close here.
            try:
                ch.close()
            except Exception:
                pass
        if not result == 'ok':
            print("Worker responded unexpectedly: {}, deleting worker.".format(result))
            rm_container(worker['cid'])
            Worker.delete_worker(actor_id, worker['ch_name'])
        else:
            print("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life
            print("Infinite ttl configured; leaving worker")
            return
        if worker['status'] == codes.READY and \
           worker['last_execution'] + ttl < time.time():
            # shutdown worker
            print("Shutting down worker beyond ttl.")
            shutdown_worker(worker['ch_name'])
        else:
            print("Worker still has life.")
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor.

    Issues a synchronous status check over each worker's channel; workers that
    do not respond (or respond unexpectedly) have their containers removed and
    are deleted from the store. Healthy workers that have been idle longer
    than ``ttl`` seconds are shut down.

    :param actor_id: id of the actor whose workers should be checked.
    :param ttl: worker time-to-live, in seconds; a negative value means
        workers are never reaped for idleness.
    :return: None
    """
    print("Checking health for actors: {}".format(actor_id))
    workers = Worker.get_workers(actor_id)
    print("workers: {}".format(workers))
    for _, worker in workers.items():
        # fix: a worker that has only been requested (never claimed by a
        # spawner) has no host_id yet; the original raised KeyError below.
        if 'host_id' not in worker:
            continue
        # ignore workers on different hosts
        if not Config.get('spawner', 'host_id') == worker['host_id']:
            continue
        # first check if worker is responsive; if not, will need to manually kill
        print("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(name=worker['ch_name'])
        try:
            print("Issuing status check to channel: {}".format(worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            print("Worker did not respond, removing container and deleting worker.")
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            Worker.delete_worker(actor_id, worker['ch_name'])
            continue
        finally:
            # fix: the channel was never closed, leaking a connection on
            # every health check. best-effort close here.
            try:
                ch.close()
            except Exception:
                pass
        if not result == 'ok':
            print("Worker responded unexpectedly: {}, deleting worker.".format(result))
            rm_container(worker['cid'])
            Worker.delete_worker(actor_id, worker['ch_name'])
        else:
            print("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life
            print("Infinite ttl configured; leaving worker")
            return
        if worker['status'] == codes.READY and \
           worker['last_execution'] + ttl < time.time():
            # shutdown worker
            print("Shutting down worker beyond ttl.")
            shutdown_worker(worker['ch_name'])
        else:
            print("Worker still has life.")
def hard_delete_worker(actor_id, worker_id, worker_container_id=None, reason_str=None):
    """
    Hard delete of worker from the db. Will also try to hard remove the worker
    container id, if one is passed, but does not stop for errors.
    :param actor_id: db_id of the actor.
    :param worker_id: id of the worker
    :param worker_container_id: Docker container id of the worker container (optional)
    :param reason_str: The reason the worker is being hard deleted (optional, for the logs only).
    :return: None
    """
    # fix: the original f-string fragments joined as "...{id};reason: ..." with
    # no separating space, garbling the log line.
    logger.error(f"Top of hard_delete_worker for actor_id: {actor_id}; "
                 f"worker_id: {worker_id}; "
                 f"worker_container_id: {worker_container_id}; "
                 f"reason: {reason_str}")
    # hard delete from worker db --
    try:
        Worker.delete_worker(actor_id, worker_id)
        logger.info(f"worker {worker_id} deleted from store")
    except Exception as e:
        logger.error(
            f"Got exception trying to delete worker: {worker_id}; exception: {e}"
        )
    # also try to delete container -- best effort; log and move on if Docker fails.
    if worker_container_id:
        try:
            rm_container(worker_container_id)
            logger.info(f"worker {worker_id} container deleted from docker")
        except Exception as e:
            logger.error(
                f"Got exception trying to delete worker container; worker: {worker_id}; "
                f"container: {worker_container_id}; exception: {e}")
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor.

    Issues a synchronous status check over each worker's channel. Unresponsive
    workers have their container removed, are deleted from the store, and have
    their channel deleted (to drop the un-acked message). Healthy workers get
    their last-health-check time updated and are shut down if idle beyond
    ``ttl`` seconds; workers in ERROR status are shut down unconditionally.

    :param actor_id: id of the actor whose workers should be checked.
    :param ttl: worker time-to-live, in seconds; negative means infinite life.
    :return: None
    """
    logger.info("Checking health for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception trying to retrieve workers: {}".format(e))
        return None
    logger.debug("workers: {}".format(workers))
    # environment variable takes precedence over the config file for host_id.
    host_id = os.environ.get('SPAWNER_HOST_ID', Config.get('spawner', 'host_id'))
    logger.debug("host_id: {}".format(host_id))
    for worker in workers:
        # if the worker has only been requested, it will not have a host_id.
        if 'host_id' not in worker:
            # @todo- we will skip for now, but we need something more robust in case the worker is never claimed.
            continue
        # ignore workers on different hosts
        if not host_id == worker['host_id']:
            continue
        # first check if worker is responsive; if not, will need to manually kill
        logger.info("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(worker_id=worker['id'])
        worker_id = worker.get('id')
        result = None
        try:
            logger.debug("Issuing status check to channel: {}".format(
                worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            logger.info(
                "Worker did not respond, removing container and deleting worker."
            )
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            try:
                Worker.delete_worker(actor_id, worker_id)
                logger.info("worker {} deleted from store".format(worker_id))
            except Exception as e:
                logger.error(
                    "Got exception trying to delete worker: {}".format(e))
            # if the put_sync timed out and we removed the worker, we also need to delete the channel
            # otherwise the un-acked message will remain.
            try:
                ch.delete()
            except Exception as e:
                logger.error(
                    "Got exception: {} while trying to delete worker channel for worker: {}"
                    .format(e, worker_id))
            # fix: without this continue, execution fell through to the
            # `else` branch below (result is still None) and updated the
            # health time / ran the TTL shutdown path for a worker that was
            # just deleted. the finally-close still runs before continuing.
            continue
        finally:
            try:
                ch.close()
            except Exception as e:
                logger.error(
                    "Got an error trying to close the worker channel for dead worker. Exception: {}"
                    .format(e))
        if result and not result == 'ok':
            logger.error(
                "Worker responded unexpectedly: {}, deleting worker.".format(
                    result))
            try:
                rm_container(worker['cid'])
                Worker.delete_worker(actor_id, worker_id)
            except Exception as e:
                logger.error(
                    "Got error removing/deleting worker: {}".format(e))
        else:
            # worker is healthy so update last health check:
            Worker.update_worker_health_time(actor_id, worker_id)
            logger.info("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life
            logger.info("Infinite ttl configured; leaving worker")
            return
        # we don't shut down workers that are currently running:
        if not worker['status'] == codes.BUSY:
            last_execution = worker.get('last_execution_time', 0)
            # if worker has made zero executions, use the create_time
            if last_execution == 0:
                last_execution = worker.get('create_time', datetime.datetime.min)
            logger.debug("using last_execution: {}".format(last_execution))
            # fix: the original used `try: assert type(...) == ... except:`
            # (bare except + assert, which is stripped under -O); an explicit
            # type check preserves the exact fallback behavior.
            if type(last_execution) is not datetime.datetime:
                logger.error(
                    "Time received for TTL measurements is not of type datetime."
                )
                last_execution = datetime.datetime.min
            if last_execution + datetime.timedelta(
                    seconds=ttl) < datetime.datetime.utcnow():
                # shutdown worker
                logger.info("Shutting down worker beyond ttl.")
                shutdown_worker(actor_id, worker['id'])
            else:
                logger.info("Still time left for this worker.")
        if worker['status'] == codes.ERROR:
            # shutdown worker
            logger.info("Shutting down worker in error status.")
            shutdown_worker(actor_id, worker['id'])
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor.

    Issues a synchronous status check over each worker's channel. Unresponsive
    workers have their container removed and are deleted from the store.
    Healthy workers get their last-health-check time updated and are shut down
    if idle beyond ``ttl`` seconds.

    :param actor_id: id of the actor whose workers should be checked.
    :param ttl: worker time-to-live, in seconds; negative means infinite life.
    :return: None
    """
    logger.info("Checking health for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception trying to retrieve workers: {}".format(e))
        return None
    logger.debug("workers: {}".format(workers))
    for _, worker in workers.items():
        # if the worker has only been requested, it will not have a host_id.
        if 'host_id' not in worker:
            # @todo- we will skip for now, but we need something more robust in case the worker is never claimed.
            continue
        # ignore workers on different hosts
        if not Config.get('spawner', 'host_id') == worker['host_id']:
            continue
        # first check if worker is responsive; if not, will need to manually kill
        logger.info("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(worker_id=worker['id'])
        worker_id = worker.get('id')
        # fix: `result` was unbound on the timeout path, so the
        # `if not result == 'ok'` check below raised UnboundLocalError.
        result = None
        try:
            logger.debug("Issuing status check to channel: {}".format(
                worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            logger.info(
                "Worker did not respond, removing container and deleting worker."
            )
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            try:
                Worker.delete_worker(actor_id, worker_id)
            except Exception as e:
                logger.error(
                    "Got exception trying to delete worker: {}".format(e))
            # fix: without this continue, execution fell through and updated
            # the health time / ran the TTL shutdown path for a worker that
            # was just deleted. the finally-close still runs before continuing.
            continue
        finally:
            try:
                ch.close()
            except Exception as e:
                logger.error(
                    "Got an error trying to close the worker channel for dead worker. Exception: {}"
                    .format(e))
        if not result == 'ok':
            logger.error(
                "Worker responded unexpectedly: {}, deleting worker.".format(
                    result))
            try:
                rm_container(worker['cid'])
                Worker.delete_worker(actor_id, worker_id)
            except Exception as e:
                logger.error(
                    "Got error removing/deleting worker: {}".format(e))
        else:
            # worker is healthy so update last health check:
            Worker.update_worker_health_time(actor_id, worker_id)
            logger.info("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life
            logger.info("Infinite ttl configured; leaving worker")
            return
        # we don't shut down workers that are currently running:
        if not worker['status'] == codes.BUSY:
            last_execution = int(float(worker.get('last_execution_time', 0)))
            if last_execution + ttl < time.time():
                # shutdown worker
                logger.info("Shutting down worker beyond ttl.")
                shutdown_worker(worker['id'])
            else:
                logger.info("Still time left for this worker.")
        # NOTE(review): this elif is unreachable -- an ERROR status already
        # satisfies `not status == BUSY` above. A sibling version handles
        # ERROR with a separate `if`; confirm intended behavior before changing.
        elif worker['status'] == codes.ERROR:
            # shutdown worker
            logger.info("Shutting down worker in error status.")
            shutdown_worker(worker['id'])
        else:
            logger.debug("Worker not in READY status, will postpone.")