def process_worker_ch(tenant, worker_ch, actor_id, worker_id, actor_ch, ag_client):
    """
    Target for a thread to listen on the worker channel for a message to stop processing.
    :param worker_ch:
    :return:
    """
    global keep_running
    logger.info("Worker subscribing to worker channel...")
    while True:
        try:
            msg = worker_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        logger.debug("Received message in worker channel: {}".format(msg))
        logger.debug("Type(msg)={}".format(type(msg)))
        if type(msg) == dict:
            value = msg.get('value', '')
            if value == 'status':
                # this is a health check; return 'ok' to the reply_to channel.
                logger.debug("received health check. returning 'ok'.")
                ch = msg['reply_to']
                ch.put('ok')
        elif msg == 'stop':
            logger.info("Received stop message, stopping worker...")
            # first, delete an associated client.
            # it's possible this worker was not passed a client,
            # but if it was, we need to delete it before shutting down.
            if ag_client:
                logger.info("Requesting client {} be deleted.".format(ag_client.api_key))
                secret = os.environ.get('_abaco_secret')
                clients_ch = ClientsChannel()
                msg = clients_ch.request_delete_client(tenant=tenant,
                                                       actor_id=actor_id,
                                                       worker_id=worker_id,
                                                       client_id=ag_client.api_key,
                                                       secret=secret)
                if msg['status'] == 'ok':
                    logger.info("Delete request completed successfully.")
                else:
                    logger.error("Error deleting client. Message: {}".format(msg['message']))
            else:
                logger.info("Did not receive client. Not issuing delete. Exiting.")
            try:
                Worker.delete_worker(actor_id, worker_id)
            except WorkerException as e:
                logger.info("Got WorkerException from delete_worker(). Exception: {}".format(e))
            keep_running = False
            actor_ch.close()
            logger.info("Closing actor channel for actor: {}".format(actor_id))
            logger.info("Worker is now exiting.")
            sys.exit()
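# --- Hedged usage sketch (not part of the source) ---------------------------
# process_worker_ch is written as a thread target, so the worker's startup
# code would launch it on a background thread roughly as below. The tenant,
# channel, id, and client variables are hypothetical placeholders for the
# objects the real worker constructs before subscribing.
import threading

listener = threading.Thread(target=process_worker_ch,
                            args=(tenant, worker_ch, actor_id, worker_id,
                                  actor_ch, ag_client))
listener.daemon = True  # don't block process exit on the listener thread
listener.start()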
def kill_worker(self, actor_id, worker_id):
    try:
        Worker.delete_worker(actor_id, worker_id)
    except WorkerException as e:
        logger.info("Got WorkerException from delete_worker(). "
                    "worker_id: {}; "
                    "Exception: {}".format(worker_id, e))
    except Exception as e:
        logger.error("Got an unexpected exception from delete_worker(). "
                     "worker_id: {}; "
                     "Exception: {}".format(worker_id, e))
def kill_worker(self, actor_id, worker_id):
    logger.debug(f"top of kill_worker: {actor_id}_{worker_id}")
    try:
        Worker.delete_worker(actor_id, worker_id)
        logger.debug(f"worker deleted; {actor_id}_{worker_id}")
    except WorkerException as e:
        logger.info("Got WorkerException from delete_worker(). "
                    "worker_id: {}; "
                    "Exception: {}".format(worker_id, e))
    except Exception as e:
        logger.error("Got an unexpected exception from delete_worker(). "
                     "worker_id: {}; "
                     "Exception: {}".format(worker_id, e))
def process_worker_ch(tenant, worker_ch, actor_id, actor_ch, ag_client):
    """
    Target for a thread to listen on the worker channel for a message to stop processing.
    :param worker_ch:
    :return:
    """
    global keep_running
    print("Worker subscribing to worker channel...")
    while True:
        try:
            msg = worker_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        print("Received message in worker channel: {}".format(msg))
        print("Type(msg)={}".format(type(msg)))
        if type(msg) == dict:
            value = msg.get('value', '')
            if value == 'status':
                # this is a health check; return 'ok' to the reply_to channel.
                ch = msg['reply_to']
                ch.put('ok')
        elif msg == 'stop':
            print("Received stop message, stopping worker...")
            # first, delete an associated client.
            # it's possible this worker was not passed a client,
            # but if it was, we need to delete it before shutting down.
            if ag_client:
                print("Requesting client {} be deleted.".format(ag_client.api_key))
                secret = os.environ.get('_abaco_secret')
                clients_ch = ClientsChannel()
                msg = clients_ch.request_delete_client(tenant=tenant,
                                                       actor_id=actor_id,
                                                       worker_id=worker_ch.name,
                                                       client_id=ag_client.api_key,
                                                       secret=secret)
                if msg['status'] == 'ok':
                    print("Delete request completed successfully.")
                else:
                    print("Error deleting client. Message: {}".format(msg['message']))
            else:
                print("Did not receive client. Not issuing delete. Exiting.")
            try:
                Worker.delete_worker(actor_id, worker_ch.name)
            except WorkerException:
                pass
            keep_running = False
            actor_ch.close()
            sys.exit()
def shutdown_workers(actor_id):
    """Graceful shutdown of all workers for an actor. Pass db_id as the `actor_id` argument."""
    workers = Worker.get_workers(actor_id)
    # @TODO - this code is not thread safe. we need to update the workers state in a transaction:
    for _, worker in workers.items():
        # if there is already a channel, tell the worker to shut itself down
        if 'ch_name' in worker:
            shutdown_worker(worker['ch_name'])
        else:
            # otherwise, just remove from the db:
            try:
                worker_id = worker.get('id')
                Worker.delete_worker(actor_id, worker_id)
            except WorkerException as e:
                print("Got a WorkerException trying to delete worker with id: {}; "
                      "exception: {}".format(worker_id, e))
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor."""
    print("Checking health for actor: {}".format(actor_id))
    workers = Worker.get_workers(actor_id)
    print("workers: {}".format(workers))
    for _, worker in workers.items():
        # if the worker has only been requested, it will not have a host_id.
        if 'host_id' not in worker:
            # @todo - we will skip for now, but we need something more robust in case the worker is never claimed.
            continue
        # ignore workers on different hosts
        if not Config.get('spawner', 'host_id') == worker['host_id']:
            continue
        # first check if the worker is responsive; if not, we will need to manually kill it
        print("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(name=worker['ch_name'])
        try:
            print("Issuing status check to channel: {}".format(worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            print("Worker did not respond; removing container and deleting worker.")
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            Worker.delete_worker(actor_id, worker['ch_name'])
            continue
        if not result == 'ok':
            print("Worker responded unexpectedly: {}; deleting worker.".format(result))
            rm_container(worker['cid'])
            Worker.delete_worker(actor_id, worker['ch_name'])
        else:
            print("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life
            print("Infinite ttl configured; leaving worker")
            return
        if worker['status'] == codes.READY and \
           worker['last_execution'] + ttl < time.time():
            # shut down the worker
            print("Shutting down worker beyond ttl.")
            shutdown_worker(worker['ch_name'])
        else:
            print("Worker still has life.")
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor."""
    print("Checking health for actor: {}".format(actor_id))
    workers = Worker.get_workers(actor_id)
    print("workers: {}".format(workers))
    for _, worker in workers.items():
        # ignore workers on different hosts
        if not Config.get('spawner', 'host_id') == worker['host_id']:
            continue
        # first check if the worker is responsive; if not, we will need to manually kill it
        print("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(name=worker['ch_name'])
        try:
            print("Issuing status check to channel: {}".format(worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            print("Worker did not respond; removing container and deleting worker.")
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            Worker.delete_worker(actor_id, worker['ch_name'])
            continue
        if not result == 'ok':
            print("Worker responded unexpectedly: {}; deleting worker.".format(result))
            rm_container(worker['cid'])
            Worker.delete_worker(actor_id, worker['ch_name'])
        else:
            print("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life
            print("Infinite ttl configured; leaving worker")
            return
        if worker['status'] == codes.READY and \
           worker['last_execution'] + ttl < time.time():
            # shut down the worker
            print("Shutting down worker beyond ttl.")
            shutdown_worker(worker['ch_name'])
        else:
            print("Worker still has life.")
def hard_delete_worker(actor_id, worker_id, worker_container_id=None, reason_str=None):
    """
    Hard delete of a worker from the db. Will also try to hard remove the worker container,
    if a container id is passed, but does not stop for errors.

    :param actor_id: db_id of the actor.
    :param worker_id: id of the worker.
    :param worker_container_id: Docker container id of the worker container (optional).
    :param reason_str: The reason the worker is being hard deleted (optional; for the logs only).
    :return: None
    """
    logger.error(f"Top of hard_delete_worker for actor_id: {actor_id}; "
                 f"worker_id: {worker_id}; "
                 f"worker_container_id: {worker_container_id}; "
                 f"reason: {reason_str}")
    # hard delete from the worker db --
    try:
        Worker.delete_worker(actor_id, worker_id)
        logger.info(f"worker {worker_id} deleted from store")
    except Exception as e:
        logger.error(f"Got exception trying to delete worker: {worker_id}; exception: {e}")

    # also try to delete the container --
    if worker_container_id:
        try:
            rm_container(worker_container_id)
            logger.info(f"worker {worker_id} container deleted from docker")
        except Exception as e:
            logger.error(f"Got exception trying to delete worker container; worker: {worker_id}; "
                         f"container: {worker_container_id}; exception: {e}")
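# --- Hedged usage sketch (not part of the source) ---------------------------
# A caller holding a worker record might invoke hard_delete_worker as below.
# The `worker` dict and `actor_db_id` value are hypothetical; the 'id' and
# 'cid' fields mirror the keys used by the health-check code in this module.
worker = {'id': 'worker-123', 'cid': 'abc123def456'}
hard_delete_worker(actor_db_id,
                   worker['id'],
                   worker_container_id=worker.get('cid'),
                   reason_str="worker unresponsive to health check")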
def shutdown_workers(actor_id):
    """Graceful shutdown of all workers for an actor. Pass db_id as the `actor_id` argument."""
    logger.debug("shutdown_workers() called for actor: {}".format(actor_id))
    workers = Worker.get_workers(actor_id)
    # @TODO - this code is not thread safe. we need to update the workers state in a transaction:
    for _, worker in workers.items():
        # if there is already a channel, tell the worker to shut itself down
        if 'ch_name' in worker:
            logger.info("worker had channel: {}. calling shutdown_worker() "
                        "for worker: {}".format(worker['ch_name'], worker.get('id')))
            shutdown_worker(worker['ch_name'])
        else:
            # otherwise, just remove from the db:
            try:
                logger.info("worker: {} did not have a channel. "
                            "Deleting worker.".format(worker.get('id')))
                worker_id = worker.get('id')
                Worker.delete_worker(actor_id, worker_id)
            except WorkerException as e:
                logger.error("Got a WorkerException trying to delete worker with id: {}; "
                             "exception: {}".format(worker_id, e))
def process(self, cmd):
    """Main spawner method for processing a command from the CommandChannel."""
    logger.info("top of process; cmd: {}".format(cmd))
    actor_id = cmd['actor_id']
    try:
        actor = Actor.from_db(actors_store[actor_id])
    except Exception as e:
        msg = f"Exception in spawner trying to retrieve actor object from store. Aborting. Exception: {e}"
        logger.error(msg)
        return
    worker_id = cmd['worker_id']
    image = cmd['image']
    tenant = cmd['tenant']
    stop_existing = cmd.get('stop_existing', True)
    num_workers = 1
    logger.debug("spawner command params: actor_id: {}; worker_id: {}; image: {}; tenant: {}; "
                 "stop_existing: {}; num_workers: {}".format(actor_id, worker_id, image, tenant,
                                                             stop_existing, num_workers))
    # if the worker was sent a delete request before the spawner received this message to create the worker,
    # the status will be SHUTDOWN_REQUESTED, not REQUESTED. in that case, we simply abort and remove the
    # worker from the collection.
    try:
        logger.debug("spawner checking worker's status for SHUTDOWN_REQUESTED")
        worker = Worker.get_worker(actor_id, worker_id)
        logger.debug(f"spawner got worker; worker: {worker}")
    except Exception as e:
        logger.error(f"spawner got exception trying to retrieve worker. "
                     f"actor_id: {actor_id}; worker_id: {worker_id}; e: {e}")
        return
    status = worker.get('status')
    if not status == REQUESTED:
        logger.debug(f"worker was NOT in REQUESTED status. status: {status}")
        if status == SHUTDOWN_REQUESTED or status == SHUTTING_DOWN or status == ERROR:
            logger.debug(f"worker status was {status}; spawner deleting worker and returning..")
            try:
                Worker.delete_worker(actor_id, worker_id)
                logger.debug("spawner deleted worker because it was SHUTDOWN_REQUESTED.")
                return
            except Exception as e:
                logger.error(f"spawner got exception trying to delete a worker in SHUTDOWN_REQUESTED status. "
                             f"actor_id: {actor_id}; worker_id: {worker_id}; e: {e}")
                return
        else:
            logger.error(f"spawner found worker in unexpected status: {status}. Not processing command and returning.")
            return

    # worker status was REQUESTED; moving on to SPAWNER_SETUP ----
    Worker.update_worker_status(actor_id, worker_id, SPAWNER_SETUP)
    logger.debug("spawner has updated worker status to SPAWNER_SETUP; worker_id: {}".format(worker_id))
    client_id = None
    client_secret = None
    client_access_token = None
    client_refresh_token = None
    api_server = None

    # ---- OAuth client generation for the worker -------
    # check if the tenant and instance are configured for client generation -
    try:
        generate_clients = Config.get('workers', f'{tenant}_generate_clients').lower()
    except:
        logger.debug(f"Did not find a {tenant}_generate_clients config. Looking for a global config.")
        generate_clients = Config.get('workers', 'generate_clients').lower()
    logger.debug(f"final generate_clients: {generate_clients}")
    if generate_clients == "true":
        logger.debug("client generation was configured to be available; now checking the actor's token attr.")
        # updated 1.3.0 -- check whether the actor requires a token:
        if actor.token:
            logger.debug("spawner starting client generation")
            client_id, \
            client_access_token, \
            client_refresh_token, \
            api_server, \
            client_secret = self.client_generation(actor_id, worker_id, tenant)
        else:
            logger.debug("actor's token attribute was False. Not generating client.")

    ch = SpawnerWorkerChannel(worker_id=worker_id)
    logger.debug("spawner attempting to start worker; worker_id: {}".format(worker_id))
    try:
        worker = self.start_worker(
            image,
            tenant,
            actor_id,
            worker_id,
            client_id,
            client_access_token,
            client_refresh_token,
            ch,
            api_server,
            client_secret
        )
    except Exception as e:
        msg = "Spawner got an exception from call to start_worker. Exception: {}".format(e)
        logger.error(msg)
        self.error_out_actor(actor_id, worker_id, msg)
        if client_id:
            self.delete_client(tenant, actor_id, worker_id, client_id, client_secret)
        return
    logger.debug("Returned from start_worker; created new worker: {}".format(worker))
    ch.close()
    logger.debug("Client channel closed")

    if stop_existing:
        logger.info("Stopping existing workers: {}".format(worker_id))
        # TODO - update status to stop_requested
        self.stop_workers(actor_id, [worker_id])
def process_worker_ch(tenant, worker_ch, actor_id, worker_id, actor_ch, ag_client):
    """
    Target for a thread to listen on the worker channel for a message to stop processing.
    :param worker_ch:
    :return:
    """
    global keep_running
    logger.info("Worker subscribing to worker channel...")
    while True:
        msg = worker_ch.get_one()
        logger.debug("Received message in worker channel: {}".format(msg))
        logger.debug("Type(msg)={}".format(type(msg)))
        if type(msg) == dict:
            value = msg.get('value', '')
            if value == 'status':
                # this is a health check; return 'ok' to the reply_to channel.
                logger.debug("received health check. returning 'ok'.")
                ch = msg['reply_to']
                ch.put('ok')
                # @TODO -
                # delete the anonymous channel from this thread, but sleep first to avoid the race condition.
                time.sleep(1.5)
                ch.delete()
                # NOT doing this for now -- deleting the entire anon channel instead (see above).
                # clean up the event queue on this anonymous channel. this should be fixed in channelpy.
                # ch._queue._event_queue
        elif msg == 'stop' or msg == 'stop-no-delete':
            logger.info("Worker with worker_id: {} (actor_id: {}) received stop message, "
                        "stopping worker...".format(worker_id, actor_id))
            # when an actor's image is updated, old workers are deleted while new workers are
            # created. deleting the actor msg channel in this case leads to race conditions
            delete_actor_ch = True
            if msg == 'stop-no-delete':
                logger.info("Got stop-no-delete; will not delete actor_ch.")
                delete_actor_ch = False
            # first, delete an associated client.
            # it's possible this worker was not passed a client,
            # but if it was, we need to delete it before shutting down.
            if ag_client:
                logger.info("Requesting client {} be deleted.".format(ag_client.api_key))
                secret = os.environ.get('_abaco_secret')
                clients_ch = ClientsChannel()
                msg = clients_ch.request_delete_client(tenant=tenant,
                                                       actor_id=actor_id,
                                                       worker_id=worker_id,
                                                       client_id=ag_client.api_key,
                                                       secret=secret)
                if msg['status'] == 'ok':
                    logger.info("Client delete request completed successfully for "
                                "worker_id: {}, client_id: {}.".format(worker_id, ag_client.api_key))
                else:
                    logger.error("Error deleting client for "
                                 "worker_id: {}, client_id: {}. Message: {}".format(worker_id,
                                                                                    ag_client.api_key,
                                                                                    msg['message']))
                clients_ch.close()
            else:
                logger.info("Did not receive client. Not issuing delete. Exiting.")
            try:
                Worker.delete_worker(actor_id, worker_id)
            except WorkerException as e:
                logger.info("Got WorkerException from delete_worker(). "
                            "worker_id: {}; "
                            "Exception: {}".format(worker_id, e))
            keep_running = False
            # delete associated channels:
            if delete_actor_ch:
                actor_ch.delete()
            worker_ch.delete()
            logger.info("WorkerChannel deleted and ActorMsgChannel closed for actor: {} "
                        "worker_id: {}".format(actor_id, worker_id))
            logger.info("Worker with worker_id: {} is now exiting.".format(worker_id))
            _thread.interrupt_main()
            logger.info("main thread interrupted.")
            os._exit(0)
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor."""
    logger.info("Checking health for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception trying to retrieve workers: {}".format(e))
        return None
    logger.debug("workers: {}".format(workers))
    host_id = os.environ.get('SPAWNER_HOST_ID', Config.get('spawner', 'host_id'))
    logger.debug("host_id: {}".format(host_id))
    for worker in workers:
        # if the worker has only been requested, it will not have a host_id.
        if 'host_id' not in worker:
            # @todo - we will skip for now, but we need something more robust in case the worker is never claimed.
            continue
        # ignore workers on different hosts
        if not host_id == worker['host_id']:
            continue
        # first check if the worker is responsive; if not, we will need to manually kill it
        logger.info("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(worker_id=worker['id'])
        worker_id = worker.get('id')
        result = None
        try:
            logger.debug("Issuing status check to channel: {}".format(worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            logger.info("Worker did not respond; removing container and deleting worker.")
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            try:
                Worker.delete_worker(actor_id, worker_id)
                logger.info("worker {} deleted from store".format(worker_id))
            except Exception as e:
                logger.error("Got exception trying to delete worker: {}".format(e))
            # if the put_sync timed out and we removed the worker, we also need to delete the channel;
            # otherwise the un-acked message will remain.
            try:
                ch.delete()
            except Exception as e:
                logger.error("Got exception: {} while trying to delete worker channel "
                             "for worker: {}".format(e, worker_id))
        finally:
            try:
                ch.close()
            except Exception as e:
                logger.error("Got an error trying to close the worker channel for dead worker. "
                             "Exception: {}".format(e))
        if result and not result == 'ok':
            logger.error("Worker responded unexpectedly: {}; deleting worker.".format(result))
            try:
                rm_container(worker['cid'])
                Worker.delete_worker(actor_id, worker_id)
            except Exception as e:
                logger.error("Got error removing/deleting worker: {}".format(e))
        else:
            # worker is healthy, so update the last health check time:
            Worker.update_worker_health_time(actor_id, worker_id)
            logger.info("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life
            logger.info("Infinite ttl configured; leaving worker")
            return
        # we don't shut down workers that are currently running:
        if not worker['status'] == codes.BUSY:
            last_execution = worker.get('last_execution_time', 0)
            # if the worker has made zero executions, use the create_time
            if last_execution == 0:
                last_execution = worker.get('create_time', datetime.datetime.min)
            logger.debug("using last_execution: {}".format(last_execution))
            try:
                assert type(last_execution) == datetime.datetime
            except:
                logger.error("Time received for TTL measurements is not of type datetime.")
                last_execution = datetime.datetime.min
            if last_execution + datetime.timedelta(seconds=ttl) < datetime.datetime.utcnow():
                # shut down the worker
                logger.info("Shutting down worker beyond ttl.")
                shutdown_worker(actor_id, worker['id'])
            else:
                logger.info("Still time left for this worker.")
        if worker['status'] == codes.ERROR:
            # shut down the worker
            logger.info("Shutting down worker in error status.")
            shutdown_worker(actor_id, worker['id'])
def process_worker_ch(tenant, worker_ch, actor_id, worker_id, actor_ch, ag_client):
    """
    Target for a thread to listen on the worker channel for a message to stop processing.
    :param worker_ch:
    :return:
    """
    global keep_running
    logger.info("Worker subscribing to worker channel...")
    while keep_running:
        msg, msg_obj = worker_ch.get_one()
        # receiving the message is enough to ack it -- resiliency is currently handled in the calling code.
        msg_obj.ack()
        logger.debug("Received message in worker channel: {}".format(msg))
        logger.debug("Type(msg)={}".format(type(msg)))
        if type(msg) == dict:
            value = msg.get('value', '')
            if value == 'status':
                # this is a health check; return 'ok' to the reply_to channel.
                logger.debug("received health check. returning 'ok'.")
                ch = msg['reply_to']
                ch.put('ok')
                # @TODO -
                # delete the anonymous channel from this thread, but sleep first to avoid the race condition.
                time.sleep(1.5)
                ch.delete()
                # NOT doing this for now -- deleting the entire anon channel instead (see above).
                # clean up the event queue on this anonymous channel. this should be fixed in channelpy.
                # ch._queue._event_queue
        elif msg == 'force_quit':
            logger.info("Worker with worker_id: {} (actor_id: {}) received a force_quit message, "
                        "forcing the execution to halt...".format(worker_id, actor_id))
            globals.force_quit = True
        elif msg == 'stop' or msg == 'stop-no-delete':
            logger.info("Worker with worker_id: {} (actor_id: {}) received stop message, "
                        "stopping worker...".format(worker_id, actor_id))
            # set the worker status to SHUTTING_DOWN:
            try:
                Worker.update_worker_status(actor_id, worker_id, SHUTTING_DOWN)
            except Exception as e:
                logger.error(f"worker got exception trying to update status to SHUTTING_DOWN. "
                             f"actor_id: {actor_id}; worker_id: {worker_id}; exception: {e}")
            globals.keep_running = False
            # when an actor's image is updated, old workers are deleted while new workers are
            # created. deleting the actor msg channel in this case leads to race conditions
            delete_actor_ch = True
            if msg == 'stop-no-delete':
                logger.info("Got stop-no-delete; will not delete actor_ch.")
                delete_actor_ch = False
            # if a `stop` was sent, the actor is being deleted, and so we want to immediately shut down processing.
            else:
                globals.force_quit = True
            # first, delete an associated client.
            # it's possible this worker was not passed a client,
            # but if it was, we need to delete it before shutting down.
            if ag_client:
                logger.info("Requesting client {} be deleted.".format(ag_client.api_key))
                secret = os.environ.get('_abaco_secret')
                clients_ch = ClientsChannel()
                msg = clients_ch.request_delete_client(tenant=tenant,
                                                       actor_id=actor_id,
                                                       worker_id=worker_id,
                                                       client_id=ag_client.api_key,
                                                       secret=secret)
                if msg['status'] == 'ok':
                    logger.info("Client delete request completed successfully for "
                                "worker_id: {}, client_id: {}.".format(worker_id, ag_client.api_key))
                else:
                    logger.error("Error deleting client for "
                                 "worker_id: {}, client_id: {}. Message: {}".format(worker_id,
                                                                                    ag_client.api_key,
                                                                                    msg['message']))
                clients_ch.close()
            else:
                logger.info("Did not receive client. Not issuing delete. Exiting.")
            try:
                Worker.delete_worker(actor_id, worker_id)
            except WorkerException as e:
                logger.info("Got WorkerException from delete_worker(). "
                            "worker_id: {}; "
                            "Exception: {}".format(worker_id, e))
            # delete associated channels:
            # it is possible the actor channel was already deleted, in which case we just keep processing
            if delete_actor_ch:
                try:
                    actor_ch.delete()
                    logger.info("ActorChannel deleted for actor: {} worker_id: {}".format(actor_id, worker_id))
                except Exception as e:
                    logger.info("Got exception deleting ActorChannel for actor: {} "
                                "worker_id: {}; exception: {}".format(actor_id, worker_id, e))
            try:
                worker_ch.delete()
                logger.info("WorkerChannel deleted for actor: {} worker_id: {}".format(actor_id, worker_id))
            except Exception as e:
                logger.info("Got exception deleting WorkerChannel for actor: {} "
                            "worker_id: {}; exception: {}".format(actor_id, worker_id, e))
            logger.info("Worker with worker_id: {} is now exiting.".format(worker_id))
            _thread.interrupt_main()
            logger.info("main thread interrupted, issuing os._exit()...")
            os._exit(0)
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor."""
    logger.info("Checking health for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception trying to retrieve workers: {}".format(e))
        return None
    logger.debug("workers: {}".format(workers))
    for _, worker in workers.items():
        # if the worker has only been requested, it will not have a host_id.
        if 'host_id' not in worker:
            # @todo - we will skip for now, but we need something more robust in case the worker is never claimed.
            continue
        # ignore workers on different hosts
        if not Config.get('spawner', 'host_id') == worker['host_id']:
            continue
        # first check if the worker is responsive; if not, we will need to manually kill it
        logger.info("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(worker_id=worker['id'])
        worker_id = worker.get('id')
        # initialize result so the timeout path below does not leave it unbound:
        result = None
        try:
            logger.debug("Issuing status check to channel: {}".format(worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            logger.info("Worker did not respond; removing container and deleting worker.")
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            try:
                Worker.delete_worker(actor_id, worker_id)
            except Exception as e:
                logger.error("Got exception trying to delete worker: {}".format(e))
        finally:
            try:
                ch.close()
            except Exception as e:
                logger.error("Got an error trying to close the worker channel for dead worker. "
                             "Exception: {}".format(e))
        if result and not result == 'ok':
            logger.error("Worker responded unexpectedly: {}; deleting worker.".format(result))
            try:
                rm_container(worker['cid'])
                Worker.delete_worker(actor_id, worker_id)
            except Exception as e:
                logger.error("Got error removing/deleting worker: {}".format(e))
        else:
            # worker is healthy, so update the last health check time:
            Worker.update_worker_health_time(actor_id, worker_id)
            logger.info("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life
            logger.info("Infinite ttl configured; leaving worker")
            return
        # we don't shut down workers that are currently running:
        if not worker['status'] == codes.BUSY:
            last_execution = int(float(worker.get('last_execution_time', 0)))
            if last_execution + ttl < time.time():
                # shut down the worker
                logger.info("Shutting down worker beyond ttl.")
                shutdown_worker(worker['id'])
            else:
                logger.info("Still time left for this worker.")
        elif worker['status'] == codes.ERROR:
            # shut down the worker
            logger.info("Shutting down worker in error status.")
            shutdown_worker(worker['id'])
        else:
            logger.debug("Worker not in READY status, will postpone.")
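# --- Hedged sketch of a health-check driver (not part of the source) --------
# A loop like the one below would call check_workers() for every actor.
# get_actor_ids() and the 'worker_ttl' config key are assumptions standing in
# for however the surrounding agent enumerates actors and reads the ttl.
def run_health_checks():
    ttl = int(Config.get('workers', 'worker_ttl'))  # assumed config key
    for actor_id in get_actor_ids():  # hypothetical helper
        check_workers(actor_id, ttl)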