def start_worker(self, image, tenant, worker_id):
    """Start a worker container and wait for its ack that the image was pulled.

    Args:
        image: image passed through to ``run_worker``.
        tenant: tenant recorded on the resulting ``Worker``.
        worker_id: id passed through to ``run_worker``.

    Returns:
        A ``(ch, reply_to, worker)`` tuple: the channel the ack arrived on,
        the ``reply_to`` field of the ack message and the ``Worker`` built
        from the ``run_worker`` result.

    Raises:
        SpawnerException: if the worker reports an error, or replies with a
            status other than ``ok``.
    """
    ch = WorkerChannel()
    # start an actor executor container and wait for a confirmation that image was pulled.
    worker_dict = run_worker(image, ch.name, worker_id)
    worker = Worker(tenant=tenant, **worker_dict)
    print("worker started successfully, waiting on ack that image was pulled...")
    result = ch.get()
    if result.get('status') == 'error':
        # there was a problem pulling the image; put the actor in an error state:
        # (the message must be built with str.format; a comma here would create a tuple)
        msg = "got an error back from the worker. Message: {}".format(result)
        print(msg)
        if 'msg' in result:
            raise SpawnerException(message=result['msg'])
        else:
            raise SpawnerException(message="Internal error starting worker process.")
    elif result['value']['status'] == 'ok':
        print("received ack from worker.")
        return ch, result['reply_to'], worker
    else:
        msg = "Got an error status from worker: {}. Raising an exception.".format(str(result))
        print(msg)
        raise SpawnerException(msg)
def shutdown_worker(ch_name):
    """Gracefully shut down a single worker.

    Puts a ``"stop"`` message on the worker channel named ``ch_name`` and
    logs before and after doing so.
    """
    logger.debug("shutdown_worker called for ch_name: {}".format(ch_name))
    WorkerChannel(name=ch_name).put("stop")
    sent_msg = "A 'stop' message was sent to worker channel: {}".format(ch_name)
    logger.info(sent_msg)
def delete(self, actor_id, ch_name):
    """Schedule the worker identified by ``actor_id``/``ch_name`` to be stopped.

    The worker is looked up with ``get_worker``; a ``WorkerException`` from
    that lookup is re-raised as ``APIException(message, 404)``. Otherwise a
    ``"stop"`` message is put on the worker's channel and the ``ok(...)``
    response containing the worker is returned.
    """
    try:
        found = get_worker(actor_id, ch_name)
    except WorkerException as err:
        raise APIException(err.message, 404)
    # the lookup succeeded: ask the worker to stop via its channel
    worker_channel = WorkerChannel(name=ch_name)
    worker_channel.put("stop")
    return ok(result=found, msg="Worker scheduled to be stopped.")
def start_worker(self, image):
    """Start a worker for ``image`` and wait for its first message.

    Returns ``(channel, reply_to, worker)`` when the message carries an
    ``ok`` status; otherwise prints the message and raises
    ``SpawnerException``.
    """
    channel = WorkerChannel()
    # launch the actor executor container, then block until it confirms the image pull.
    worker = run_worker(image, channel._name)
    print("worker started successfully, waiting on ack that image was pulled...")
    ack = channel.get()
    if ack['value']['status'] != 'ok':
        print("Got an error status from worker: {}. Raising an exception.".format(str(ack)))
        raise SpawnerException()
    print("received ack from worker.")
    return channel, ack['reply_to'], worker
def start_worker(self, image, tenant):
    """Start a worker for ``image`` and wait for its first message.

    The dict returned by ``run_worker`` is expanded into a ``Worker`` together
    with ``tenant``. Returns ``(channel, reply_to, worker)`` when the message
    carries an ``ok`` status; otherwise prints the message and raises
    ``SpawnerException``.
    """
    channel = WorkerChannel()
    # launch the actor executor container, then block until it confirms the image pull.
    worker = Worker(tenant=tenant, **run_worker(image, channel.name))
    print("worker started successfully, waiting on ack that image was pulled...")
    ack = channel.get()
    if ack["value"]["status"] != "ok":
        print("Got an error status from worker: {}. Raising an exception.".format(str(ack)))
        raise SpawnerException()
    print("received ack from worker.")
    return channel, ack["reply_to"], worker
def start_worker(self, image, tenant, worker_id):
    """Start a worker container and wait for its ack that the image was pulled.

    Args:
        image: image passed through to ``run_worker``.
        tenant: tenant recorded on the resulting ``Worker``.
        worker_id: id used for the spawner/worker channel, for ``run_worker``
            and to derive the worker's ``ch_name``.

    Returns:
        A ``(ch, reply_to, worker)`` tuple: the spawner/worker channel, the
        ``reply_to`` field of the ack message and the ``Worker`` object.

    Raises:
        SpawnerException: if the worker reports an error, or replies with a
            status other than ``ok``.
    """
    ch = SpawnerWorkerChannel(worker_id=worker_id)
    # start an actor executor container and wait for a confirmation that image was pulled.
    worker_dict = run_worker(image, worker_id)
    worker_dict['ch_name'] = WorkerChannel.get_name(worker_id)
    worker = Worker(tenant=tenant, **worker_dict)
    logger.info("worker started successfully, waiting on ack that image was pulled...")
    result = ch.get()
    logger.debug("Got response back from worker. Response: {}".format(result))
    if result.get('status') == 'error':
        # there was a problem pulling the image; put the actor in an error state:
        # (the message must be built with str.format; a comma here would create a tuple)
        msg = "Got an error back from the worker. Message: {}".format(result)
        logger.info(msg)
        if 'msg' in result:
            raise SpawnerException(message=result['msg'])
        else:
            logger.error("Spawner received invalid message from worker. 'msg' field missing. "
                         "Message: {}".format(result))
            raise SpawnerException(message="Internal error starting worker process.")
    elif result['value']['status'] == 'ok':
        logger.debug("received ack from worker.")
        return ch, result['reply_to'], worker
    else:
        msg = "Got an error status from worker: {}. Raising an exception.".format(str(result))
        # the format string needs a placeholder, otherwise the message is never logged
        logger.error("Spawner received an invalid message from worker. Message: {}".format(result))
        raise SpawnerException(msg)
def shutdown_worker(worker_id, delete_actor_ch=True):
    """Gracefully shut down a single worker.

    Sends ``"stop"`` on the worker's channel, or ``"stop-no-delete"`` when
    ``delete_actor_ch`` is falsy, then closes the channel.
    """
    logger.debug("top of shutdown_worker for worker_id: {}".format(worker_id))
    worker_channel = WorkerChannel(worker_id=worker_id)
    command = "stop" if delete_actor_ch else "stop-no-delete"
    worker_channel.put(command)
    logger.info("A 'stop' message was sent to worker: {}".format(worker_id))
    worker_channel.close()
def main(worker_ch_name, image):
    """Worker entry point: pull the image, hand-shake with the spawner, subscribe.

    A ``DockerError`` during the pull is reported on the worker channel and
    re-raised. After the pull the worker sends an ``ok`` status and waits for
    the spawner's reply; an ``error`` reply raises ``WorkerException``.
    Otherwise the actor is set to ``READY`` and ``subscribe`` is called with
    the tenant, actor id and (optional) client credentials from the reply.
    """
    worker_ch = WorkerChannel(name=worker_ch_name)

    # step 1: pull the image from docker hub
    try:
        print("Worker pulling image {}...".format(image))
        pull_image(image)
    except DockerError as e:
        # let the spawner know the pull failed, then abort
        worker_ch.put({'status': 'error', 'msg': str(e)})
        raise e
    print("Image pulled successfully")

    # step 2: report success and block until the spawner says to subscribe
    print("Worker waiting on message from spawner...")
    result = worker_ch.put_sync({'status': 'ok'})
    if result['status'] == 'error':
        print("Worker received error message from spawner: {}. Quiting...".format(str(result)))
        raise WorkerException(str(result))

    actor_id = result.get('actor_id')
    tenant = result.get('tenant')
    print("Worker received ok from spawner. Message: {}, actor_id:{}".format(result, actor_id))

    # step 3: collect client credentials, all None unless the spawner sent client: yes
    client_keys = ('api_server', 'client_id', 'client_secret', 'access_token', 'refresh_token')
    if result.get('client') == 'yes':
        client_values = [result.get(key) for key in client_keys]
    else:
        print("Did not get client:yes, got client:{}".format(result.get('client')))
        client_values = [None] * len(client_keys)
    api_server, client_id, client_secret, access_token, refresh_token = client_values

    Actor.set_status(actor_id, READY)
    subscribe(tenant,
              actor_id,
              api_server,
              client_id,
              client_secret,
              access_token,
              refresh_token,
              worker_ch)
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor.

    For every worker of ``actor_id`` that runs on this spawner's host, a
    ``status`` request is sent on the worker's channel. Workers that do not
    answer within 5 seconds, or answer with anything other than ``'ok'``,
    have their container removed and are deleted. Healthy workers in
    ``READY`` status whose last execution is more than ``ttl`` seconds ago
    are shut down.

    Args:
        actor_id: id of the actor whose workers are checked.
        ttl: idle time-to-live in seconds; a negative value means workers
            are never shut down for being idle.
    """
    print("Checking health for actors: {}".format(actor_id))
    workers = Worker.get_workers(actor_id)
    print("workers: {}".format(workers))
    # the host id does not change between workers, so look it up once
    host_id = Config.get('spawner', 'host_id')
    for _, worker in workers.items():
        # if the worker has only been requested, it will not have a host_id.
        if 'host_id' not in worker:
            # @todo- we will skip for now, but we need something more robust in case the worker is never claimed.
            continue
        # ignore workers on different hosts
        if not host_id == worker['host_id']:
            continue
        # first check if worker is responsive; if not, will need to manually kill
        print("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(name=worker['ch_name'])
        try:
            print("Issuing status check to channel: {}".format(worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            print("Worker did not respond, removing container and deleting worker.")
            try:
                rm_container(worker['cid'])
            except DockerError:
                # best effort: the worker record is deleted even if the container removal fails
                pass
            Worker.delete_worker(actor_id, worker['ch_name'])
            continue
        if not result == 'ok':
            print("Worker responded unexpectedly: {}, deleting worker.".format(result))
            rm_container(worker['cid'])
            Worker.delete_worker(actor_id, worker['ch_name'])
            # the worker is gone; there is nothing left to apply the ttl to
            continue
        print("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life
            print("Infinite ttl configured; leaving worker")
            # keep going so the remaining workers still get a health check
            continue
        if worker['status'] == codes.READY and \
                worker['last_execution'] + ttl < time.time():
            # shutdown worker
            print("Shutting down worker beyond ttl.")
            shutdown_worker(worker['ch_name'])
        else:
            print("Worker still has life.")
def stop_workers(self, actor_id):
    """Stop existing workers; used when updating an actor's image.

    Does nothing when ``workers_store`` has no workers for ``actor_id``.
    Otherwise the actor message channel is closed first, and each worker is
    then sent a ``'stop'`` message on its own channel.
    """
    try:
        current_workers = workers_store[actor_id]
    except KeyError:
        current_workers = {}
    if len(current_workers.items()) == 0:
        return
    # close the actor msg channel first so the old workers cannot pull new messages.
    ActorMsgChannel(actor_id).close()
    # then ask every worker to shut down gracefully:
    for worker in current_workers.values():
        WorkerChannel(name=worker['ch_name']).put('stop')
def main(worker_ch_name, worker_id, image):
    """Worker entry point: pull the image, hand-shake with the spawner, subscribe.

    A ``DockerError`` during the pull is reported on the worker channel and
    re-raised. After the pull the worker sends an ``ok`` status and waits for
    the spawner's reply; an ``error`` reply raises ``WorkerException``.
    Otherwise the actor is set to ``READY`` and ``subscribe`` is called with
    the tenant, actor id, ``worker_id`` and (optional) client credentials
    from the reply.
    """
    worker_ch = WorkerChannel(name=worker_ch_name)

    # step 1: pull the image from docker hub
    try:
        print("Worker pulling image {}...".format(image))
        pull_image(image)
    except DockerError as e:
        # let the spawner know the pull failed, then abort
        worker_ch.put({'status': 'error', 'msg': str(e)})
        raise e
    print("Image pulled successfully")

    # step 2: report success and block until the spawner says to subscribe
    print("Worker waiting on message from spawner...")
    result = worker_ch.put_sync({'status': 'ok'})
    if result['status'] == 'error':
        print("Worker received error message from spawner: {}. Quiting...".format(str(result)))
        raise WorkerException(str(result))

    actor_id = result.get('actor_id')
    tenant = result.get('tenant')
    print("Worker received ok from spawner. Message: {}, actor_id:{}".format(result, actor_id))

    # step 3: collect client credentials, all None unless the spawner sent client: yes
    client_keys = ('api_server', 'client_id', 'client_secret', 'access_token', 'refresh_token')
    if result.get('client') == 'yes':
        client_values = [result.get(key) for key in client_keys]
    else:
        print("Did not get client:yes, got client:{}".format(result.get('client')))
        client_values = [None] * len(client_keys)
    api_server, client_id, client_secret, access_token, refresh_token = client_values

    Actor.set_status(actor_id, READY)
    subscribe(tenant,
              actor_id,
              worker_id,
              api_server,
              client_id,
              client_secret,
              access_token,
              refresh_token,
              worker_ch)
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor.

    For every worker of ``actor_id`` that runs on this spawner's host, a
    ``status`` request is sent on the worker's channel. Workers that do not
    answer within 5 seconds, or answer with anything other than ``'ok'``,
    have their container removed and are deleted. Healthy workers in
    ``READY`` status whose last execution is more than ``ttl`` seconds ago
    are shut down.

    Args:
        actor_id: id of the actor whose workers are checked.
        ttl: idle time-to-live in seconds; a negative value means workers
            are never shut down for being idle.
    """
    print("Checking health for actors: {}".format(actor_id))
    workers = Worker.get_workers(actor_id)
    print("workers: {}".format(workers))
    # the host id does not change between workers, so look it up once
    host_id = Config.get('spawner', 'host_id')
    for _, worker in workers.items():
        # a worker record without a host_id cannot be matched to a host; skip it
        # instead of failing the whole pass with a KeyError.
        if 'host_id' not in worker:
            continue
        # ignore workers on different hosts
        if not host_id == worker['host_id']:
            continue
        # first check if worker is responsive; if not, will need to manually kill
        print("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(name=worker['ch_name'])
        try:
            print("Issuing status check to channel: {}".format(worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            print("Worker did not respond, removing container and deleting worker.")
            try:
                rm_container(worker['cid'])
            except DockerError:
                # best effort: the worker record is deleted even if the container removal fails
                pass
            Worker.delete_worker(actor_id, worker['ch_name'])
            continue
        if not result == 'ok':
            print("Worker responded unexpectedly: {}, deleting worker.".format(result))
            rm_container(worker['cid'])
            Worker.delete_worker(actor_id, worker['ch_name'])
            # the worker is gone; there is nothing left to apply the ttl to
            continue
        print("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life
            print("Infinite ttl configured; leaving worker")
            # keep going so the remaining workers still get a health check
            continue
        if worker['status'] == codes.READY and \
                worker['last_execution'] + ttl < time.time():
            # shutdown worker
            print("Shutting down worker beyond ttl.")
            shutdown_worker(worker['ch_name'])
        else:
            print("Worker still has life.")
def stop_workers(self, actor_id, worker_ids):
    """Stop existing workers; used when updating an actor's image.

    Does nothing when ``workers_store`` has no workers for ``actor_id``.
    Otherwise the actor message channel is closed first, and every worker
    whose id is not listed in ``worker_ids`` is sent a ``'stop'`` message.
    """
    try:
        current_workers = workers_store[actor_id]
    except KeyError:
        current_workers = {}
    if len(current_workers.items()) == 0:
        return
    # close the actor msg channel first so the old workers cannot pull new messages.
    ActorMsgChannel(actor_id).close()
    # then ask the old workers to shut down gracefully:
    for worker in current_workers.values():
        if worker['id'] in worker_ids:
            # this is one of the new workers; leave it running
            continue
        WorkerChannel(name=worker['ch_name']).put('stop')
def main():
    """Main function for the worker process.

    Reads the worker configuration from environment variables (``worker_id``,
    ``image``, ``actor_id``, ``revision`` and the optional client/tenant
    settings), waits for one message from the spawner on the
    ``SpawnerWorkerChannel``, deletes that channel and then calls
    ``subscribe`` with a ``WorkerChannel`` for this worker.

    Exits via ``sys.exit()`` when ``revision`` is missing or is not an integer.
    """
    worker_id = os.environ.get('worker_id')
    image = os.environ.get('image')
    actor_id = os.environ.get('actor_id')
    revision = os.environ.get('revision')
    try:
        revision = int(revision)
    except (TypeError, ValueError):
        # int(None) raises TypeError (variable not set); a non-numeric string raises ValueError
        logger.error(f"worker did not get an integer revision number; got: {revision}; "
                     f"worker {actor_id}+{worker_id} exiting.")
        sys.exit()
    client_id = os.environ.get('client_id', None)
    client_access_token = os.environ.get('client_access_token', None)
    client_refresh_token = os.environ.get('client_refresh_token', None)
    tenant = os.environ.get('tenant', None)
    api_server = os.environ.get('api_server', None)
    client_secret = os.environ.get('client_secret', None)

    logger.info(f"Top of main() for worker: {worker_id}, image: {image}; revision: {revision}; "
                f"actor_id: {actor_id}; client_id:{client_id}; tenant: {tenant}; api_server: {api_server}")
    spawner_worker_ch = SpawnerWorkerChannel(worker_id=worker_id)

    logger.debug("Worker waiting on message from spawner...")
    result = spawner_worker_ch.get_one()
    logger.debug("Worker received reply from spawner. result: {}.".format(result))

    # should be OK to close the spawner_worker_ch on the worker side since spawner was first client
    # to open it.
    spawner_worker_ch.delete()
    logger.debug('spawner_worker_ch closed.')

    if not client_id:
        logger.info("Did not get client id.")
    else:
        logger.info(f"Got a client; client_id: {client_id}")

    logger.info(f"Actor {actor_id} status set to READY. subscribing to inbox.")
    worker_ch = WorkerChannel(worker_id=worker_id)
    subscribe(tenant,
              actor_id,
              image,
              revision,
              worker_id,
              api_server,
              client_id,
              client_secret,
              client_access_token,
              client_refresh_token,
              worker_ch)
def stop_workers(self, actor_id, worker_ids):
    """Stop existing workers; used when updating an actor's image.

    Args:
        actor_id: id of the actor whose workers should be stopped.
        worker_ids: ids of the newly started workers; workers whose id is in
            this collection are left running.

    When ``workers_store`` holds workers for the actor, the actor message
    channel is closed first and every other worker is sent a
    ``'stop-no-delete'`` message on its own channel.
    """
    logger.debug("Top of stop_workers() for actor: {}.".format(actor_id))
    try:
        workers_dict = workers_store[actor_id]
    except KeyError:
        logger.debug("workers_store had no workers for actor: {}".format(actor_id))
        workers_dict = {}

    # if there are existing workers, we need to close the actor message channel and
    # gracefully shutdown the existing worker processes.
    if len(workers_dict.items()) > 0:
        logger.info("Found {} workers to stop.".format(len(workers_dict.items())))
        # first, close the actor msg channel to prevent any new messages from being pulled
        # by the old workers.
        actor_ch = ActorMsgChannel(actor_id)
        actor_ch.close()
        logger.info("Actor channel closed for actor: {}".format(actor_id))
        # now, send messages to workers for a graceful shutdown:
        for _, worker in workers_dict.items():
            # don't stop the new workers:
            if worker['id'] not in worker_ids:
                ch = WorkerChannel(worker_id=worker['id'])
                # since this is an update, there are new workers being started, so
                # don't delete the actor msg channel:
                ch.put('stop-no-delete')
                logger.info("Sent 'stop-no-delete' message to worker_id: {}".format(worker['id']))
                ch.close()
            else:
                # this branch is reached when the worker IS one of the new workers
                logger.debug("skipping worker {} as it is in worker_ids.".format(worker))
    else:
        logger.info("No workers to stop.")
def shutdown_worker(worker_id):
    """Gracefully shut down a single worker.

    Puts a ``"stop"`` message on the channel for ``worker_id`` and then
    closes the channel.
    """
    logger.debug("top of shutdown_worker for worker_id: {}".format(worker_id))
    worker_channel = WorkerChannel(worker_id=worker_id)
    worker_channel.put("stop")
    sent_msg = "A 'stop' message was sent to worker: {}".format(worker_id)
    logger.info(sent_msg)
    worker_channel.close()
def stop_workers(self, actor_id):
    """Stop existing workers; used when updating an actor's image.

    The workers are read as JSON from ``workers_store[actor_id]``. When there
    are any, the actor message channel is closed first and each worker is
    then sent a ``'stop'`` message on its own channel.
    """
    try:
        workers = json.loads(workers_store[actor_id])
        print("Found existing workers: {}".format(str(workers)))
    except KeyError:
        print("No existing workers.")
        workers = {}

    if len(workers) == 0:
        return
    # close the actor msg channel first so the old workers cannot pull new messages.
    ActorMsgChannel(actor_id).close()
    # then ask every worker to shut down gracefully:
    for worker in workers:
        WorkerChannel(name=worker['ch_name']).put('stop')
def start_worker(self, image, tenant, actor_id, worker_id):
    """Start a worker container and wait for its ack that the image was pulled.

    ``run_worker`` is retried every 5 seconds while it fails with a
    ``DockerError`` whose message contains ``'read timeout'``; any other
    ``DockerError`` makes the spawner kill the worker and give up.

    Args:
        image: image passed through to ``run_worker``.
        tenant: tenant recorded on the resulting ``Worker``.
        actor_id: id of the actor the worker belongs to.
        worker_id: id of the worker being started.

    Returns:
        A ``(ch, reply_to, worker)`` tuple: the spawner/worker channel, the
        ``reply_to`` field of the ack message and the ``Worker`` object.

    Raises:
        SpawnerException: if the container cannot be started, if the worker
            reports an error, or if it replies with a status other than ``ok``.
    """
    ch = SpawnerWorkerChannel(worker_id=worker_id)
    # start an actor executor container and wait for a confirmation that image was pulled.
    attempts = 0
    while True:
        try:
            worker_dict = run_worker(image, actor_id, worker_id)
        except DockerError as e:
            logger.error("Spawner got a docker exception from run_worker; Exception: {}".format(e))
            if 'read timeout' in e.message:
                logger.info("Exception was a read timeout; trying run_worker again..")
                time.sleep(5)
                attempts = attempts + 1
                if attempts > 20:
                    msg = "Spawner continued to get DockerError for 20 attempts. Exception: {}".format(e)
                    logger.critical(msg)
                    raise SpawnerException(msg)
                continue
            else:
                logger.info("Exception was NOT a read timeout; quiting on this worker.")
                # delete this worker from the workers store:
                try:
                    self.kill_worker(actor_id, worker_id)
                # use a distinct name: rebinding `e` here would unbind the DockerError
                # when this handler exits and break the raise below.
                except WorkerException as worker_exc:
                    logger.info("Got WorkerException from delete_worker(). "
                                "worker_id: {}"
                                "Exception: {}".format(worker_id, worker_exc))
                raise SpawnerException(message="Unable to start worker; error: {}".format(e))
        break
    worker_dict['ch_name'] = WorkerChannel.get_name(worker_id)
    worker = Worker(tenant=tenant, **worker_dict)
    logger.info("worker started successfully, waiting on ack that image was pulled...")
    result = ch.get()
    logger.debug("Got response back from worker. Response: {}".format(result))
    if result.get('status') == 'error':
        # there was a problem pulling the image; put the actor in an error state:
        # (the message must be built with str.format; a comma here would create a tuple)
        msg = "Got an error back from the worker. Message: {}".format(result)
        logger.info(msg)
        if 'msg' in result:
            raise SpawnerException(message=result['msg'])
        else:
            logger.error("Spawner received invalid message from worker. 'msg' field missing. "
                         "Message: {}".format(result))
            raise SpawnerException(message="Internal error starting worker process.")
    elif result['value']['status'] == 'ok':
        logger.debug("received ack from worker.")
        return ch, result['reply_to'], worker
    else:
        msg = "Got an error status from worker: {}. Raising an exception.".format(str(result))
        # the format string needs a placeholder, otherwise the message is never logged
        logger.error("Spawner received an invalid message from worker. Message: {}".format(result))
        raise SpawnerException(msg)
def main(worker_ch_name, image):
    """Worker entry point: pull the image, hand-shake with the spawner, subscribe.

    A ``DockerError`` during the pull is reported on the worker channel and
    re-raised. After the pull the worker sends an ``ok`` status and waits for
    the spawner's reply; an ``error`` reply raises ``WorkerException``.
    Otherwise the actor named in the reply is set to ``READY`` and
    ``subscribe`` is called for it.
    """
    worker_ch = WorkerChannel(name=worker_ch_name)

    # step 1: pull the image from docker hub
    try:
        print("Worker pulling image {}...".format(image))
        pull_image(image)
    except DockerError as e:
        # let the spawner know the pull failed, then abort
        worker_ch.put({"status": "error", "msg": str(e)})
        raise e
    print("Image pulled successfully")

    # step 2: report success and block until the spawner says to subscribe
    print("Worker waiting on message from spawner...")
    reply = worker_ch.put_sync({"status": "ok"})
    if reply["status"] == "error":
        print("Worker received error message from spawner: {}. Quiting...".format(str(reply)))
        raise WorkerException(str(reply))

    # step 3: mark the actor ready and start consuming its messages
    actor_id = reply.get("actor_id")
    print("Worker received ok from spawner. Message: {}, actor_id:{}".format(reply, actor_id))
    Actor.set_status(actor_id, READY)
    subscribe(actor_id, worker_ch)
def shutdown_worker(actor_id, worker_id, delete_actor_ch=True):
    """Gracefully shut down a single worker.

    actor_id (str) - the dbid of the associated actor.
    worker_id - id of the worker to stop.
    delete_actor_ch - when falsy, ``"stop-no-delete"`` is sent instead of ``"stop"``.

    A failure to update the worker status is logged and does not prevent the
    stop message from being sent.
    """
    logger.debug("top of shutdown_worker for worker_id: {}".format(worker_id))
    # mark the worker as SHUTDOWN_REQUESTED before telling it to stop:
    try:
        Worker.update_worker_status(actor_id, worker_id, SHUTDOWN_REQUESTED)
    except Exception as e:
        logger.error(f"worker got exception trying to update status to SHUTODWN_REQUESTED. actor_id: {actor_id};"
                     f"worker_id: {worker_id}; exception: {e}")
    worker_channel = WorkerChannel(worker_id=worker_id)
    command = "stop" if delete_actor_ch else "stop-no-delete"
    worker_channel.put(command)
    logger.info("A 'stop' message was sent to worker: {}".format(worker_id))
    worker_channel.close()
def start_worker(self,
                 image,
                 tenant,
                 actor_id,
                 worker_id,
                 client_id,
                 client_access_token,
                 client_refresh_token,
                 ch,
                 api_server,
                 client_secret):
    """Pull the image, start the worker container and register the worker.

    The worker status is moved to ``PULLING_IMAGE`` and then
    ``CREATING_CONTAINER``. ``run_worker`` is retried every 5 seconds while it
    fails with a ``DockerError`` whose message contains ``'read timeout'``;
    any other ``DockerError`` makes the spawner kill the worker and give up.
    On success the actor is set to ``READY`` if it is not already, the worker
    is added with ``Worker.add_worker`` and ``'READY'`` is put on ``ch``.

    Args:
        image: image to pull and run.
        tenant: tenant recorded on the ``Worker`` and passed to ``run_worker``.
        actor_id: id of the actor the worker belongs to.
        worker_id: id of the worker being started.
        client_id, client_access_token, client_refresh_token, api_server,
            client_secret: passed through unchanged to ``run_worker``.
        ch: channel on which ``'READY'`` is sent once the worker is registered.

    Raises:
        DockerError: if pulling the image fails.
        SpawnerException: if the worker container cannot be started.
    """
    # start an actor executor container and wait for a confirmation that image was pulled.
    attempts = 0
    Worker.update_worker_status(actor_id, worker_id, PULLING_IMAGE)
    try:
        logger.debug("Worker pulling image {}...".format(image))
        pull_image(image)
    except DockerError as e:
        # this is not necessarily an error state: the user simply could have provided an
        # image name that does not exist in the registry. This is the first time we would
        # find that out.
        logger.info("worker got a DockerError trying to pull image. Error: {}.".format(e))
        raise
    logger.info("Image {} pulled successfully.".format(image))
    # Done pulling image
    # Run Worker Container
    while True:
        try:
            Worker.update_worker_status(actor_id, worker_id, CREATING_CONTAINER)
            logger.debug('spawner creating worker container')
            worker_dict = run_worker(
                image,
                actor_id,
                worker_id,
                client_id,
                client_access_token,
                client_refresh_token,
                tenant,
                api_server,
                client_secret
            )
            logger.debug(f'finished run worker; worker dict: {worker_dict}')
        except DockerError as e:
            logger.error("Spawner got a docker exception from run_worker; Exception: {}".format(e))
            if 'read timeout' in e.message:
                logger.info("Exception was a read timeout; trying run_worker again..")
                time.sleep(5)
                attempts = attempts + 1
                if attempts > 20:
                    msg = "Spawner continued to get DockerError for 20 attempts. Exception: {}".format(e)
                    logger.critical(msg)
                    # todo - should we be calling kill_worker here? (it is called in the
                    # exception block of the else below)
                    raise SpawnerException(msg)
                continue
            else:
                logger.info("Exception was NOT a read timeout; quiting on this worker.")
                # delete this worker from the workers store:
                try:
                    self.kill_worker(actor_id, worker_id)
                # use a distinct name: rebinding `e` here would unbind the DockerError
                # when this handler exits and break the raise below.
                except WorkerException as worker_exc:
                    logger.info("Got WorkerException from delete_worker(). "
                                "worker_id: {}"
                                "Exception: {}".format(worker_id, worker_exc))
                raise SpawnerException(message="Unable to start worker; error: {}".format(e))
        break
    logger.debug('finished loop')
    worker_dict['ch_name'] = WorkerChannel.get_name(worker_id)

    # if the actor is not already in READY status, set actor status to READY before worker status has been
    # set to READY.
    # it is possible the actor status is already READY because this request is the autoscaler starting a new worker
    # for an existing actor.
    actor = Actor.from_db(actors_store[actor_id])
    if not actor.status == READY:
        try:
            Actor.set_status(actor_id, READY, status_message=" ")
        except KeyError:
            # it is possible the actor was already deleted during worker start up; if
            # so, the worker should have a stop message waiting for it. starting subscribe
            # as usual should allow this process to work as expected.
            pass

    # finalize worker with READY status
    worker = Worker(tenant=tenant, **worker_dict)
    logger.info("calling add_worker for worker: {}.".format(worker))
    Worker.add_worker(actor_id, worker)

    ch.put('READY')  # step 4
    logger.info('sent message through channel')
def main(worker_id, image):
    """Main function for the worker process.

    Pulls ``image``, reports the outcome to the spawner on the
    ``SpawnerWorkerChannel`` and waits for its reply. A ``DockerError`` during
    the pull is reported and re-raised; an ``error`` reply raises
    ``WorkerException``. Otherwise the actor is set to ``READY`` (a
    ``KeyError`` from that call is ignored) and ``subscribe`` is called with
    the tenant, actor id, worker id and (optional) client credentials.
    """
    logger.info("Entering main() for worker: {}, image: {}".format(worker_id, image))
    spawner_worker_ch = SpawnerWorkerChannel(worker_id=worker_id)

    # step 1: pull the image from docker hub
    try:
        logger.info("Worker pulling image {}...".format(image))
        pull_image(image)
    except DockerError as e:
        # tell the spawner the pull failed and abort. this is not necessarily an error
        # state: the user simply could have provided an image name that does not exist in
        # the registry. This is the first time we would find that out.
        logger.info("worker got a DockerError trying to pull image. Error: {}.".format(e))
        spawner_worker_ch.put({'status': 'error', 'msg': str(e)})
        raise e
    logger.info("Image {} pulled successfully.".format(image))

    # step 2: report the successful pull and, in the same call, wait for the spawner
    # to say it is time to subscribe to the actor channel
    logger.debug("Worker waiting on message from spawner...")
    result = spawner_worker_ch.put_sync({'status': 'ok'})
    logger.info("Worker received reply from spawner. result: {}.".format(result))

    # should be OK to close the spawner_worker_ch on the worker side since spawner was first client
    # to open it.
    spawner_worker_ch.close()
    if result['status'] == 'error':
        # we do not expect to get an error response at this point. this needs investigation
        logger.error("Worker received error message from spawner: {}. Quiting...".format(str(result)))
        raise WorkerException(str(result))

    actor_id = result.get('actor_id')
    tenant = result.get('tenant')
    logger.info("Worker received ok from spawner. Message: {}, actor_id:{}".format(result, actor_id))

    # step 3: collect client credentials, all None unless the spawner sent client: yes
    client_keys = ('api_server', 'client_id', 'client_secret', 'access_token', 'refresh_token')
    if result.get('client') == 'yes':
        logger.info("Got client: yes, result: {}".format(result))
        client_values = [result.get(key) for key in client_keys]
    else:
        logger.info("Did not get client:yes, got result:{}".format(result))
        client_values = [None] * len(client_keys)
    api_server, client_id, client_secret, access_token, refresh_token = client_values

    try:
        Actor.set_status(actor_id, READY, status_message=" ")
    except KeyError:
        # it is possible the actor was already deleted during worker start up; if
        # so, the worker should have a stop message waiting for it. starting subscribe
        # as usual should allow this process to work as expected.
        pass
    logger.info("Actor status set to READY. subscribing to inbox.")
    worker_ch = WorkerChannel(worker_id=worker_id)
    subscribe(tenant,
              actor_id,
              worker_id,
              api_server,
              client_id,
              client_secret,
              access_token,
              refresh_token,
              worker_ch)
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor.

    For each worker of ``actor_id`` this function:

    * hard-deletes workers that have no ``host_id`` and either no
      ``create_time`` or one older than 5 minutes, then moves on to the next
      worker (nothing below applies without a ``host_id``);
    * skips workers whose ``host_id`` differs from this host's id;
    * hard-deletes workers stuck in a shutdown status for more than
      5 minutes and workers without a health check in the last 60 minutes;
    * puts a ``'status'`` message on the worker's channel;
    * unless ``ttl`` is negative, shuts down non-busy workers whose last
      execution (or creation) is more than ``ttl`` seconds ago, and workers
      in ``ERROR`` status.

    Args:
        actor_id: id of the actor whose workers are checked.
        ttl: idle time-to-live in seconds; a negative value means infinite life.

    Returns:
        None. Returns early (also with None) when the workers cannot be retrieved.
    """
    logger.info("Checking health for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception trying to retrieve workers: {}".format(e))
        return None
    logger.debug("workers: {}".format(workers))
    host_id = os.environ.get('SPAWNER_HOST_ID', Config.get('spawner', 'host_id'))
    logger.debug("host_id: {}".format(host_id))
    for worker in workers:
        worker_id = worker['id']
        worker_status = worker.get('status')
        # if the worker has only been requested, it will not have a host_id. it is possible
        # the worker will ultimately get scheduled on a different host; however, if there is
        # some issue and the worker is "stuck" in the early phases, we should remove it..
        if 'host_id' not in worker:
            # check for an old create time
            worker_create_t = worker.get('create_time')
            # in versions prior to 1.9, worker create_time was not set until after it was READY
            if not worker_create_t:
                hard_delete_worker(actor_id,
                                   worker_id,
                                   reason_str='Worker did not have a host_id or create_time field.')
            # if still no host after 5 minutes, delete it
            # (elif: a missing create_time cannot be compared against a datetime)
            elif worker_create_t < get_current_utc_time() - datetime.timedelta(minutes=5):
                hard_delete_worker(actor_id,
                                   worker_id,
                                   reason_str='Worker did not have a host_id and had '
                                              'old create_time field.')
            # every remaining check needs worker['host_id'], so move on to the next worker
            continue

        # ignore workers on different hosts because this health agent cannot interact with the
        # docker daemon responsible for the worker container..
        if not host_id == worker['host_id']:
            continue

        # we need to delete any worker that is in SHUTDOWN REQUESTED or SHUTTING down for too long
        if worker_status == codes.SHUTDOWN_REQUESTED or worker_status == codes.SHUTTING_DOWN:
            worker_last_health_check_time = worker.get('last_health_check_time')
            if not worker_last_health_check_time:
                worker_last_health_check_time = worker.get('create_time')
            if not worker_last_health_check_time:
                hard_delete_worker(actor_id,
                                   worker_id,
                                   reason_str='Worker in SHUTDOWN and no health checks.')
            elif worker_last_health_check_time < get_current_utc_time() - datetime.timedelta(minutes=5):
                hard_delete_worker(actor_id,
                                   worker_id,
                                   reason_str='Worker in SHUTDOWN for too long.')

        # check if the worker has not responded to a health check recently; we use a relatively long period
        # (60 minutes) of idle health checks in case there is an issue with sending health checks through rabbitmq.
        # this needs to be watched closely though...
        worker_last_health_check_time = worker.get('last_health_check_time')
        if not worker_last_health_check_time or \
                (worker_last_health_check_time < get_current_utc_time() - datetime.timedelta(minutes=60)):
            hard_delete_worker(actor_id,
                               worker_id,
                               reason_str='Worker has not health checked for too long.')

        # first send worker a health check
        logger.info(f"sending worker {worker_id} a health check")
        ch = WorkerChannel(worker_id=worker_id)
        try:
            logger.debug("Issuing status check to channel: {}".format(worker['ch_name']))
            ch.put('status')
        except Exception as e:
            # deliberately broad: a failed health check must not abort the whole pass
            logger.error(f"Got exception of type {type(e)} trying to send worker {worker_id} a "
                         f"health check. e: {e}")
        finally:
            try:
                ch.close()
            except Exception as e:
                logger.error("Got an error trying to close the worker channel for dead worker. "
                             "Exception: {}".format(e))

        # now check if the worker has been idle beyond the max worker_ttl configured for this abaco:
        if ttl < 0:
            # ttl < 0 means infinite life
            logger.info("Infinite ttl configured; leaving worker")
            continue
        # we don't shut down workers that are currently running:
        if not worker['status'] == codes.BUSY:
            last_execution = worker.get('last_execution_time', 0)
            # if worker has made zero executions, use the create_time
            if last_execution == 0:
                last_execution = worker.get('create_time', datetime.datetime.min)
            logger.debug("using last_execution: {}".format(last_execution))
            # explicit check instead of `assert`, which is stripped when running with -O
            if not isinstance(last_execution, datetime.datetime):
                logger.error("Time received for TTL measurements is not of type datetime.")
                last_execution = datetime.datetime.min
            if last_execution + datetime.timedelta(seconds=ttl) < datetime.datetime.utcnow():
                # shutdown worker
                logger.info("Shutting down worker beyond ttl.")
                shutdown_worker(actor_id, worker['id'])
            else:
                logger.info("Still time left for this worker.")

        if worker['status'] == codes.ERROR:
            # shutdown worker
            logger.info("Shutting down worker in error status.")
            shutdown_worker(actor_id, worker['id'])
def shutdown_worker(ch_name):
    """Gracefully shut down a single worker.

    Puts a ``"stop"`` message on the worker channel named ``ch_name``.
    """
    worker_channel = WorkerChannel(name=ch_name)
    worker_channel.put("stop")
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor.

    For every worker of the actor that is claimed by this host:

    1. Send a synchronous 'status' message. If the worker does not answer
       within the timeout, remove its container, delete it from the store
       and delete its channel, then move on to the next worker.
    2. If the worker answers with anything other than 'ok', remove and
       delete it, then move on to the next worker.
    3. Otherwise record the successful health check and, unless an
       infinite ttl is configured, shut the worker down when it has been
       idle longer than ``ttl`` seconds or is in ERROR status.

    Args:
        actor_id: id of the actor whose workers are checked.
        ttl: maximum idle time in seconds; a negative value means workers
            are never shut down for idleness.

    Returns:
        None. Failures are logged rather than raised, except for
        non-timeout errors raised by the status check itself.
    """
    logger.info("Checking health for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception trying to retrieve workers: {}".format(e))
        return None
    logger.debug("workers: {}".format(workers))
    host_id = os.environ.get('SPAWNER_HOST_ID',
                             Config.get('spawner', 'host_id'))
    logger.debug("host_id: {}".format(host_id))
    for worker in workers:
        # if the worker has only been requested, it will not have a host_id.
        if 'host_id' not in worker:
            # @todo- we will skip for now, but we need something more robust
            # in case the worker is never claimed.
            continue
        # ignore workers on different hosts
        if not host_id == worker['host_id']:
            continue

        # first check if worker is responsive; if not, will need to manually kill
        logger.info("Checking health for worker: {}".format(worker))
        worker_id = worker['id']
        ch = WorkerChannel(worker_id=worker_id)
        result = None
        try:
            logger.debug("Issuing status check to channel: {}".format(
                worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            logger.info(
                "Worker did not respond, removing container and deleting worker."
            )
            try:
                rm_container(worker['cid'])
            except DockerError:
                # best effort: the container may already be gone.
                pass
            try:
                Worker.delete_worker(actor_id, worker_id)
                logger.info("worker {} deleted from store".format(worker_id))
            except Exception as e:
                logger.error(
                    "Got exception trying to delete worker: {}".format(e))
            # if the put_sync timed out and we removed the worker, we also
            # need to delete the channel otherwise the un-acked message will
            # remain.
            try:
                ch.delete()
            except Exception as e:
                logger.error(
                    "Got exception: {} while trying to delete worker channel for worker: {}"
                    .format(e, worker_id))
            # The worker is gone: it must not be stamped as healthy or be
            # considered for a ttl shutdown. The finally clause below still
            # runs before the loop advances.
            continue
        finally:
            try:
                ch.close()
            except Exception as e:
                logger.error(
                    "Got an error trying to close the worker channel for dead worker. \nException: {}"
                    .format(e))

        if result and not result == 'ok':
            logger.error(
                "Worker responded unexpectedly: {}, deleting worker.".format(
                    result))
            try:
                rm_container(worker['cid'])
                Worker.delete_worker(actor_id, worker_id)
            except Exception as e:
                logger.error(
                    "Got error removing/deleting worker: {}".format(e))
            # The worker was just removed; nothing left to evaluate for it.
            continue

        # worker is healthy so update last health check:
        Worker.update_worker_health_time(actor_id, worker_id)
        logger.info("Worker ok.")

        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life. Only skip *this* worker: the
            # remaining workers still need their health check.
            logger.info("Infinite ttl configured; leaving worker")
            continue

        needs_shutdown = False
        # we don't shut down workers that are currently running:
        if not worker['status'] == codes.BUSY:
            last_execution = worker.get('last_execution_time', 0)
            # if worker has made zero executions, use the create_time
            if last_execution == 0:
                last_execution = worker.get('create_time',
                                            datetime.datetime.min)
            logger.debug("using last_execution: {}".format(last_execution))
            # Explicit type check (an assert would vanish under `python -O`).
            if not isinstance(last_execution, datetime.datetime):
                logger.error(
                    "Time received for TTL measurements is not of type datetime."
                )
                last_execution = datetime.datetime.min
            idle_deadline = last_execution + datetime.timedelta(seconds=ttl)
            if idle_deadline < datetime.datetime.utcnow():
                logger.info("Shutting down worker beyond ttl.")
                needs_shutdown = True
            else:
                logger.info("Still time left for this worker.")

        if worker['status'] == codes.ERROR:
            logger.info("Shutting down worker in error status.")
            needs_shutdown = True

        # Issue a single shutdown request even when both conditions hold.
        if needs_shutdown:
            shutdown_worker(actor_id, worker_id)
# call the main() function; if it raises, ask this worker (through its own
# channel) to stop without deleting the actor message channel, then clear
# the keep_running flag.
try:
    main()
except Exception as e:
    # Bind worker_id before the lookup so the error handler below can
    # safely format it even if the lookup itself fails.
    worker_id = ''
    try:
        worker_id = os.environ.get('worker_id')
    except Exception:
        logger.error(
            f"worker main thread got exception trying to get worker id from environment."
            f"not able to send stop-no-delete message to itself."
            f"worker_id: {worker_id}.")
    if worker_id:
        ch = None
        try:
            ch = WorkerChannel(worker_id=worker_id)
            # since this is an exception, we don't know that the actor has
            # been deleted; don't delete the actor msg channel:
            ch.put('stop-no-delete')
            logger.info(
                f"Worker main loop sent 'stop-no-delete' message to itself; worker_id: {worker_id}."
            )
            msg = "worker caught exception from main loop. worker exiting. " \
                  "Exception: {} worker_id: {}".format(e, worker_id)
            logger.info(msg)
        except Exception as send_err:
            # Distinct name so the main-loop exception `e` is not shadowed.
            logger.error(
                f"worker main thread got exception trying to send stop-no-delete message to itself;"
                f"worker_id: {worker_id}. e: {send_err}")
        finally:
            # Release the channel whether or not the message was sent.
            if ch is not None:
                try:
                    ch.close()
                except Exception as close_err:
                    logger.error(
                        f"worker main thread got exception trying to close its worker channel; "
                        f"worker_id: {worker_id}. e: {close_err}")
    else:
        # Without a worker id no stop message can be sent; at least record
        # why the main loop ended.
        logger.error(
            f"worker main loop raised an exception and no worker_id is available; e: {e}")
keep_running = False
def shutdown_workers(actor_id):
    """Graceful shutdown of all workers for an actor.

    Looks up the actor's workers and sends the "stop" message on each
    worker's channel, closing every channel after use so connections are
    not leaked.

    Args:
        actor_id: id of the actor whose workers should be stopped.
    """
    for worker in get_workers(actor_id):
        ch = WorkerChannel(name=worker["ch_name"])
        try:
            ch.put("stop")
        finally:
            # One channel is opened per worker; release each one even if
            # the put fails.
            ch.close()
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor.

    For every worker of the actor that is claimed by this host, send a
    synchronous 'status' message. Workers that time out or answer with
    anything other than 'ok' have their container removed and are deleted
    from the store. Healthy workers get their health-check time updated
    and, unless an infinite ttl is configured, are shut down when they are
    in ERROR status or have been idle longer than ``ttl`` seconds.

    Args:
        actor_id: id of the actor whose workers are checked.
        ttl: maximum idle time in seconds; a negative value means workers
            are never shut down by this check.

    Returns:
        None. Failures are logged rather than raised, except for
        non-timeout errors raised by the status check itself.
    """
    logger.info("Checking health for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception trying to retrieve workers: {}".format(e))
        return None
    logger.debug("workers: {}".format(workers))
    # Only the worker descriptions are needed, not their keys.
    for worker in workers.values():
        # if the worker has only been requested, it will not have a host_id.
        if 'host_id' not in worker:
            # @todo- we will skip for now, but we need something more robust
            # in case the worker is never claimed.
            continue
        # ignore workers on different hosts
        if not Config.get('spawner', 'host_id') == worker['host_id']:
            continue

        # first check if worker is responsive; if not, will need to manually kill
        logger.info("Checking health for worker: {}".format(worker))
        worker_id = worker['id']
        ch = WorkerChannel(worker_id=worker_id)
        try:
            logger.debug("Issuing status check to channel: {}".format(
                worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            logger.info(
                "Worker did not respond, removing container and deleting worker."
            )
            try:
                rm_container(worker['cid'])
            except DockerError:
                # best effort: the container may already be gone.
                pass
            try:
                Worker.delete_worker(actor_id, worker_id)
            except Exception as e:
                logger.error(
                    "Got exception trying to delete worker: {}".format(e))
            # No reply was received, so there is no `result` to inspect and
            # the worker no longer exists. The finally clause below still
            # runs before the loop advances.
            continue
        finally:
            try:
                ch.close()
            except Exception as e:
                logger.error(
                    "Got an error trying to close the worker channel for dead worker. \nException: {}"
                    .format(e))

        if not result == 'ok':
            logger.error(
                "Worker responded unexpectedly: {}, deleting worker.".format(
                    result))
            try:
                rm_container(worker['cid'])
                Worker.delete_worker(actor_id, worker_id)
            except Exception as e:
                logger.error(
                    "Got error removing/deleting worker: {}".format(e))
            # The worker was just removed; nothing left to evaluate for it.
            continue

        # worker is healthy so update last health check:
        Worker.update_worker_health_time(actor_id, worker_id)
        logger.info("Worker ok.")

        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life. Only skip *this* worker: the
            # remaining workers still need their health check.
            logger.info("Infinite ttl configured; leaving worker")
            continue

        status = worker['status']
        # ERROR must be tested before the "not BUSY" case, which would
        # otherwise capture it and leave this branch unreachable.
        if status == codes.ERROR:
            logger.info("Shutting down worker in error status.")
            shutdown_worker(worker['id'])
        elif not status == codes.BUSY:
            # we don't shut down workers that are currently running
            last_execution = int(float(worker.get('last_execution_time', 0)))
            if last_execution + ttl < time.time():
                logger.info("Shutting down worker beyond ttl.")
                shutdown_worker(worker['id'])
            else:
                logger.info("Still time left for this worker.")
        else:
            logger.debug("Worker not in READY status, will postpone.")