def get_agave(self, tenant, actor_owner):
    """
    Generate an agavepy client representing a specific user owning an actor.
    The `actor_owner` should be the username associated with the owner of the actor.
    """
    # these are the credentials of the abaco service account. this account should have the abaco and
    # impersonator roles.
    username = self.credentials[tenant.upper()]['username']
    password = self.credentials[tenant.upper()]['password']
    if username == '' or password == '':
        msg = 'Client service credentials not defined for tenant {}'.format(tenant)
        logger.error(msg)
        raise ClientException(msg)
    api_server = get_api_server(tenant)
    verify = get_tenant_verify(tenant)
    # generate an Agave client set up for admin_password representing the actor owner:
    logger.info("Attempting to generate an agave client.")
    try:
        return api_server, Agave(api_server=api_server,
                                 username=username,
                                 password=password,
                                 token_username=actor_owner,
                                 verify=verify)
    except Exception as e:
        msg = "Got exception trying to instantiate Agave object; exception: {}".format(e)
        logger.error(msg)
        raise ClientException(msg)
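
# --- Illustrative usage sketch (not part of the service code) ----------------------
# A minimal sketch of how get_agave() might be called. The ClientGenerator class name,
# the tenant id and the username below are assumptions for illustration only.
def _example_get_agave_usage():
    client_gen = ClientGenerator()  # hypothetical: loads the per-tenant service credentials
    api_server, ag = client_gen.get_agave('TACC-PROD', actor_owner='jdoe')
    # `ag` authenticates with the abaco service account but issues tokens on behalf of
    # 'jdoe' via the token_username (impersonation) field.
    return api_server, ag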
def subscribe(tenant, actor_id, image, worker_id, api_server, client_id, client_secret,
              access_token, refresh_token, worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when messages arrive. Also subscribes to the worker channel for future communications.
    :return:
    """
    logger.debug("Top of subscribe().")
    actor_ch = ActorMsgChannel(actor_id)
    try:
        leave_containers = Config.get('workers', 'leave_containers')
    except configparser.NoOptionError:
        logger.info("No leave_containers value configured.")
        leave_containers = False
    if hasattr(leave_containers, 'lower'):
        leave_containers = leave_containers.lower() == "true"
    logger.info("leave_containers: {}".format(leave_containers))
    ag = None
    if api_server and client_id and client_secret and access_token and refresh_token:
        logger.info("Creating agave client.")
        verify = get_tenant_verify(tenant)
        ag = Agave(api_server=api_server,
                   token=access_token,
                   refresh_token=refresh_token,
                   api_key=client_id,
                   api_secret=client_secret,
                   verify=verify)
    else:
        logger.info("Not creating agave client.")
    logger.info("Starting the process worker channel thread.")
    t = threading.Thread(target=process_worker_ch, args=(tenant, worker_ch, actor_id, worker_id, actor_ch, ag))
    t.start()
    logger.info("Worker subscribing to actor channel.")
    # keep track of whether we need to update the worker's status back to READY; otherwise, we
    # will hit redis with an UPDATE every time the subscription loop times out (i.e., every 2s)
    update_worker_status = True
    # shared global tracking whether this worker should keep running; shared between this thread and
    # the "worker channel processing" thread.
    global keep_running
    # main subscription loop -- processing messages from actor's mailbox
    while keep_running:
        if update_worker_status:
            Worker.update_worker_status(actor_id, worker_id, READY)
            update_worker_status = False
        try:
            msg = actor_ch.get_one()
        except channelpy.ChannelClosedException:
            logger.info("Channel closed, worker exiting...")
            keep_running = False
            sys.exit()
        logger.info("worker {} processing new msg.".format(worker_id))
        try:
            Worker.update_worker_status(actor_id, worker_id, BUSY)
        except Exception as e:
            logger.error("unexpected exception from call to update_worker_status. "
                         "actor_id: {}; worker_id: {}; status: {}; exception: {}".format(actor_id, worker_id, BUSY, e))
            raise e
        update_worker_status = True
        logger.info("Received message {}. Starting actor container...".format(msg))
        # the msg object is a dictionary with an entry called message and an arbitrary
        # set of k:v pairs coming in from the query parameters.
        message = msg.pop('message', '')
        actor = Actor.from_db(actors_store[actor_id])
        execution_id = msg['_abaco_execution_id']
        content_type = msg['_abaco_Content_Type']
        mounts = actor.mounts
        logger.debug("actor mounts: {}".format(mounts))
        # for results, create a socket in the configured directory.
        try:
            socket_host_path_dir = Config.get('workers', 'socket_host_path_dir')
        except (configparser.NoSectionError, configparser.NoOptionError):
            logger.error("No socket_host_path configured. Cannot manage results data.")
            Actor.set_status(actor_id, ERROR, msg="Abaco instance not configured for results data.")
            continue
        socket_host_path = '{}.sock'.format(os.path.join(socket_host_path_dir, worker_id, execution_id))
        logger.info("Create socket at path: {}".format(socket_host_path))
        # add the socket as a mount:
        mounts.append({'host_path': socket_host_path,
                       'container_path': '/_abaco_results.sock',
                       'format': 'ro'})
        # for binary data, create a fifo in the configured directory. The configured
        # fifo_host_path_dir is equal to the fifo path in the worker container:
        fifo_host_path = None
        if content_type == 'application/octet-stream':
            try:
                fifo_host_path_dir = Config.get('workers', 'fifo_host_path_dir')
            except (configparser.NoSectionError, configparser.NoOptionError):
                logger.error("No fifo_host_path configured. Cannot manage binary data.")
                Actor.set_status(actor_id, ERROR, msg="Abaco instance not configured for binary data.")
                continue
            fifo_host_path = os.path.join(fifo_host_path_dir, worker_id, execution_id)
            try:
                os.mkfifo(fifo_host_path)
                logger.info("Created fifo at path: {}".format(fifo_host_path))
            except Exception as e:
                logger.error("Could not create fifo_path. Exception: {}".format(e))
                raise e
            # add the fifo as a mount:
            mounts.append({'host_path': fifo_host_path,
                           'container_path': '/_abaco_binary_data',
                           'format': 'ro'})
        # the execution object was created by the controller, but we need to add the worker id to it now that we
        # know which worker will be working on the execution.
        logger.debug("Adding worker_id to execution.")
        Execution.add_worker_id(actor_id, execution_id, worker_id)
        # privileged dictates whether the actor container runs in privileged mode and if docker daemon is mounted.
        privileged = False
        if type(actor['privileged']) == bool and actor['privileged']:
            privileged = True
        logger.debug("privileged: {}".format(privileged))
        # retrieve the default environment registered with the actor.
        environment = actor['default_environment']
        logger.debug("Actor default environment: {}".format(environment))
        # construct the user field from the actor's uid and gid:
        user = get_container_user(actor)
        logger.debug("Final user value: {}".format(user))
        # overlay the default_environment registered for the actor with the msg
        # dictionary
        environment.update(msg)
        environment['_abaco_access_token'] = ''
        environment['_abaco_actor_dbid'] = actor_id
        environment['_abaco_actor_id'] = actor.id
        environment['_abaco_actor_state'] = actor.state
        logger.debug("Overlayed environment: {}".format(environment))
        # if we have an agave client, get a fresh set of tokens:
        if ag:
            try:
                ag.token.refresh()
                token = ag.token.token_info['access_token']
                environment['_abaco_access_token'] = token
                logger.info("Refreshed the tokens. Passed {} to the environment.".format(token))
            except Exception as e:
                logger.error("Got an exception trying to get an access token: {}".format(e))
        else:
            logger.info("Agave client `ag` is None -- not passing access token.")
        logger.info("Passing updated environment: {}".format(environment))
        try:
            stats, logs, final_state, exit_code, start_time = execute_actor(actor_id,
                                                                            worker_id,
                                                                            execution_id,
                                                                            image,
                                                                            message,
                                                                            user,
                                                                            environment,
                                                                            privileged,
                                                                            mounts,
                                                                            leave_containers,
                                                                            fifo_host_path,
                                                                            socket_host_path)
        except DockerStartContainerError as e:
            logger.error("Got DockerStartContainerError: {}".format(e))
            Actor.set_status(actor_id, ERROR, "Error executing container: {}".format(e))
            continue
        # Add the completed stats to the execution
        logger.info("Actor container finished successfully. Got stats object: {}".format(str(stats)))
        Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats, final_state, exit_code, start_time)
        logger.info("Added execution: {}".format(execution_id))
        # Add the logs to the execution
        Execution.set_logs(execution_id, logs)
        logger.info("Added execution logs.")
        # Update the worker's last updated and last execution fields:
        try:
            Worker.update_worker_execution_time(actor_id, worker_id)
        except KeyError:
            # it is possible that this worker was sent a graceful shutdown command in the other thread
            # and that spawner has already removed this worker from the store.
            logger.info("worker {} got unexpected key error trying to update its execution time. "
                        "Worker better be shutting down! keep_running: {}".format(worker_id, keep_running))
            if keep_running:
                logger.error("worker couldn't update its execution time but keep_running is still true!")
        logger.info("worker time stamps updated.")
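
# --- Illustrative data sketch (assumption, not part of the worker module) -----------
# A minimal sketch of the shape of the inputs subscribe() assembles before calling
# execute_actor(): the mount entries and the _abaco_* environment keys mirror the ones
# constructed above; the concrete paths and values are made up for illustration.
def _example_execution_inputs():
    mounts = [
        {'host_path': '/host/sockets/worker1/exec1.sock',   # hypothetical socket_host_path
         'container_path': '/_abaco_results.sock',
         'format': 'ro'},
    ]
    environment = {
        'MY_DEFAULT_VAR': 'registered with the actor',      # from the actor's default_environment
        'foo': 'bar',                                        # arbitrary k:v from the message query parameters
        '_abaco_access_token': '',                           # filled in after the token refresh, if an ag client exists
        '_abaco_actor_dbid': 'TACC-PROD_abc123',
        '_abaco_actor_id': 'abc123',
        '_abaco_actor_state': {},
    }
    return mounts, environment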
def clean_up_apim_clients(tenant):
    """Check the list of clients registered in APIM and remove any that are associated with retired workers."""
    username = os.environ.get('_abaco_{}_username'.format(tenant), '')
    password = os.environ.get('_abaco_{}_password'.format(tenant), '')
    if not username:
        msg = "Health process did not get a username for tenant {}; " \
              "returning from clean_up_apim_clients".format(tenant)
        if tenant in ['SD2E', 'TACC-PROD']:
            logger.error(msg)
        else:
            logger.info(msg)
        return None
    if not password:
        msg = "Health process did not get a password for tenant {}; " \
              "returning from clean_up_apim_clients".format(tenant)
        if tenant in ['SD2E', 'TACC-PROD']:
            logger.error(msg)
        else:
            logger.info(msg)
        return None
    api_server = get_api_server(tenant)
    verify = get_tenant_verify(tenant)
    ag = Agave(api_server=api_server,
               username=username,
               password=password,
               verify=verify)
    logger.debug("health process created an ag for tenant: {}".format(tenant))
    try:
        cs = ag.clients.list()
        clients = cs.json()['result']
    except Exception as e:
        msg = "Health process got an exception trying to retrieve clients; exception: {}".format(e)
        logger.error(msg)
        return None
    for client in clients:
        # check if the name of the client is an abaco hash (i.e., a worker id). if not, we ignore it from the beginning
        name = client.get('name')
        if not is_hashid(name):
            logger.debug("client {} is not an abaco hash id; skipping.".format(name))
            continue
        # we know this client came from a worker, so we need to check to see if the worker is still active;
        # first check if the worker even exists; if it does, the id will be the client name:
        worker = get_worker(name)
        if not worker:
            logger.info("no worker associated with id: {}; deleting client.".format(name))
            delete_client(ag, name)
            logger.info("client {} deleted by health process.".format(name))
            continue
        # if the worker exists, we should check the status:
        status = worker.get('status')
        if status == codes.ERROR:
            logger.info("worker {} was in ERROR status so deleting client; worker: {}.".format(name, worker))
            delete_client(ag, name)
            logger.info("client {} deleted by health process.".format(name))
        else:
            logger.debug("worker {} still active; not deleting client.".format(worker))
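
# --- Illustrative sketch (not part of the health module) ----------------------------
# The reaping decision that clean_up_apim_clients() applies to each APIM client record,
# restated as a pure function for clarity. The function name is hypothetical; `worker`
# is the record (or None) returned by get_worker() for the client name.
def _should_delete_client(name, worker, is_hashid, error_status):
    if not is_hashid(name):
        return False                               # not created by an abaco worker; leave it alone
    if not worker:
        return True                                # worker is gone; its client is orphaned
    return worker.get('status') == error_status    # worker exists but is in ERROR status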
def subscribe(tenant, actor_id, image, worker_id, api_server, client_id, client_secret,
              access_token, refresh_token, worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when messages arrive. Also launches a separate thread which ultimately subscribes to the
    worker channel for future communications.
    :return:
    """
    logger.debug("Top of subscribe(). worker_id: {}".format(worker_id))
    actor_ch = ActorMsgChannel(actor_id)

    # establish configs for this worker -------
    try:
        leave_containers = Config.get('workers', 'leave_containers')
    except configparser.NoOptionError:
        logger.debug("No leave_containers value configured.")
        leave_containers = False
    if hasattr(leave_containers, 'lower'):
        leave_containers = leave_containers.lower() == "true"
    logger.debug("leave_containers: {}".format(leave_containers))

    try:
        mem_limit = Config.get('workers', 'mem_limit')
    except configparser.NoOptionError:
        logger.debug("No mem_limit value configured.")
        mem_limit = "-1"
    mem_limit = str(mem_limit)

    try:
        max_cpus = Config.get('workers', 'max_cpus')
    except configparser.NoOptionError:
        logger.debug("No max_cpus value configured.")
        max_cpus = "-1"
    logger.debug("max_cpus: {}".format(max_cpus))

    # instantiate an OAuth client python object if credentials were passed -----
    ag = None
    if api_server and client_id and client_secret and access_token and refresh_token:
        logger.info("Creating agave client.")
        verify = get_tenant_verify(tenant)
        ag = Agave(api_server=api_server,
                   token=access_token,
                   refresh_token=refresh_token,
                   api_key=client_id,
                   api_secret=client_secret,
                   verify=verify)
    else:
        logger.info("Not creating agave client.")

    # start a separate thread for handling messages sent to the worker channel ----
    logger.info("Starting the process worker channel thread.")
    t = threading.Thread(target=process_worker_ch, args=(tenant, worker_ch, actor_id, worker_id, actor_ch, ag))
    t.start()

    # subscribe to the actor message queue -----
    logger.info("Worker subscribing to actor channel. worker_id: {}".format(worker_id))
    # keep track of whether we need to update the worker's status back to READY; otherwise, we
    # will hit redis with an UPDATE every time the subscription loop times out (i.e., every 2s)
    update_worker_status = True

    # global tracks whether this worker should keep running.
    globals.keep_running = True

    # consecutive_errors tracks the number of consecutive times a worker has gotten an error trying to process a
    # message. Even though the message will be requeued, we do not want the worker to continue processing
    # indefinitely when a compute node is unhealthy.
    consecutive_errors = 0

    # main subscription loop -- processing messages from actor's mailbox
    while globals.keep_running:
        logger.debug("top of keep_running; worker id: {}".format(worker_id))
        if update_worker_status:
            Worker.update_worker_status(actor_id, worker_id, READY)
            logger.debug("updated worker status to READY in SUBSCRIBE; worker id: {}".format(worker_id))
            update_worker_status = False
        try:
            msg, msg_obj = actor_ch.get_one()
        except channelpy.ChannelClosedException:
            logger.info("Channel closed, worker exiting. worker id: {}".format(worker_id))
            globals.keep_running = False
            sys.exit()
        logger.info("worker {} processing new msg.".format(worker_id))

        try:
            Worker.update_worker_status(actor_id, worker_id, BUSY)
        except Exception as e:
            logger.error("unexpected exception from call to update_worker_status. Nacking message. "
                         "actor_id: {}; worker_id: {}; status: {}; exception: {}".format(actor_id, worker_id, BUSY, e))
            logger.info("worker exiting. worker_id: {}".format(worker_id))
            msg_obj.nack(requeue=True)
            raise e
        update_worker_status = True
        logger.info("Received message {}. Starting actor container. worker id: {}".format(msg, worker_id))

        # the msg object is a dictionary with an entry called message and an arbitrary
        # set of k:v pairs coming in from the query parameters.
        message = msg.pop('message', '')
        try:
            actor = Actor.from_db(actors_store[actor_id])
            execution_id = msg['_abaco_execution_id']
            content_type = msg['_abaco_Content_Type']
            mounts = actor.mounts
            logger.debug("actor mounts: {}".format(mounts))
        except Exception as e:
            logger.error("unexpected exception retrieving actor, execution, content-type, mounts. Nacking message. "
                         "actor_id: {}; worker_id: {}; status: {}; exception: {}".format(actor_id, worker_id, BUSY, e))
            msg_obj.nack(requeue=True)
            logger.info("worker exiting. worker_id: {}".format(worker_id))
            raise e

        # for results, create a socket in the configured directory.
        try:
            socket_host_path_dir = Config.get('workers', 'socket_host_path_dir')
        except (configparser.NoSectionError, configparser.NoOptionError) as e:
            logger.error("No socket_host_path configured. Cannot manage results data. Nacking message.")
            Actor.set_status(actor_id, ERROR,
                             status_message="Abaco instance not configured for results data.")
            msg_obj.nack(requeue=True)
            logger.info("worker exiting. worker_id: {}".format(worker_id))
            raise e
        socket_host_path = '{}.sock'.format(os.path.join(socket_host_path_dir, worker_id, execution_id))
        logger.info("Create socket at path: {}".format(socket_host_path))
        # add the socket as a mount:
        mounts.append({'host_path': socket_host_path,
                       'container_path': '/_abaco_results.sock',
                       'format': 'ro'})
        # for binary data, create a fifo in the configured directory. The configured
        # fifo_host_path_dir is equal to the fifo path in the worker container:
        fifo_host_path = None
        if content_type == 'application/octet-stream':
            try:
                fifo_host_path_dir = Config.get('workers', 'fifo_host_path_dir')
            except (configparser.NoSectionError, configparser.NoOptionError) as e:
                logger.error("No fifo_host_path configured. Cannot manage binary data. Nacking message.")
                Actor.set_status(actor_id, ERROR,
                                 status_message="Abaco instance not configured for binary data.")
                msg_obj.nack(requeue=True)
                logger.info("worker exiting. worker_id: {}".format(worker_id))
                raise e
            fifo_host_path = os.path.join(fifo_host_path_dir, worker_id, execution_id)
            try:
                os.mkfifo(fifo_host_path)
                logger.info("Created fifo at path: {}".format(fifo_host_path))
            except Exception as e:
                logger.error("Could not create fifo_path. Nacking message. Exception: {}".format(e))
                msg_obj.nack(requeue=True)
                logger.info("worker exiting. worker_id: {}".format(worker_id))
                raise e
            # add the fifo as a mount:
            mounts.append({'host_path': fifo_host_path,
                           'container_path': '/_abaco_binary_data',
                           'format': 'ro'})

        # the execution object was created by the controller, but we need to add the worker id to it now that we
        # know which worker will be working on the execution.
        logger.debug("Adding worker_id to execution. worker_id: {}".format(worker_id))
        try:
            Execution.add_worker_id(actor_id, execution_id, worker_id)
        except Exception as e:
            logger.error("Unexpected exception adding worker_id to the Execution. Nacking message. "
                         "Exception: {}".format(e))
            msg_obj.nack(requeue=True)
            logger.info("worker exiting. worker_id: {}".format(worker_id))
            raise e

        # privileged dictates whether the actor container runs in privileged mode and if docker daemon is mounted.
        privileged = False
        if type(actor['privileged']) == bool and actor['privileged']:
            privileged = True
        logger.debug("privileged: {}; worker_id: {}".format(privileged, worker_id))

        # overlay resource limits if set on actor:
        if actor.mem_limit:
            mem_limit = actor.mem_limit
        if actor.max_cpus:
            max_cpus = actor.max_cpus

        # retrieve the default environment registered with the actor.
        environment = actor['default_environment']
        logger.debug("Actor default environment: {}".format(environment))

        # construct the user field from the actor's uid and gid:
        user = get_container_user(actor)
        logger.debug("Final user value: {}".format(user))

        # overlay the default_environment registered for the actor with the msg
        # dictionary
        environment.update(msg)
        environment['_abaco_access_token'] = ''
        environment['_abaco_actor_dbid'] = actor_id
        environment['_abaco_actor_id'] = actor.id
        environment['_abaco_worker_id'] = worker_id
        environment['_abaco_container_repo'] = actor.image
        environment['_abaco_actor_state'] = actor.state
        environment['_abaco_actor_name'] = actor.name or 'None'
        logger.debug("Overlayed environment: {}; worker_id: {}".format(environment, worker_id))

        # if we have an agave client, get a fresh set of tokens:
        if ag:
            try:
                ag.token.refresh()
                token = ag.token.token_info['access_token']
                environment['_abaco_access_token'] = token
                logger.info("Refreshed the tokens. Passed {} to the environment.".format(token))
            except Exception as e:
                logger.error("Got an exception trying to get an access token. Stopping worker and nacking message. "
                             "Exception: {}".format(e))
                msg_obj.nack(requeue=True)
                logger.info("worker exiting. worker_id: {}".format(worker_id))
                raise e
        else:
            logger.info("Agave client `ag` is None -- not passing access token; worker_id: {}".format(worker_id))

        logger.info("Passing updated environment: {}".format(environment))
        logger.info("About to execute actor; worker_id: {}".format(worker_id))
        try:
            stats, logs, final_state, exit_code, start_time = execute_actor(actor_id,
                                                                            worker_id,
                                                                            execution_id,
                                                                            image,
                                                                            message,
                                                                            user,
                                                                            environment,
                                                                            privileged,
                                                                            mounts,
                                                                            leave_containers,
                                                                            fifo_host_path,
                                                                            socket_host_path,
                                                                            mem_limit,
                                                                            max_cpus)
        except DockerStartContainerError as e:
            logger.error("Worker {} got DockerStartContainerError: {} trying to start actor for execution {}. "
                         "Placing message back on queue.".format(worker_id, e, execution_id))
            # if we failed to start the actor container, we leave the worker up and re-queue the original message
            msg_obj.nack(requeue=True)
            logger.debug('message requeued.')
            consecutive_errors += 1
            if consecutive_errors > MAX_WORKER_CONSECUTIVE_ERRORS:
                logger.error("Worker {} failed to successfully start actor for execution {} {} consecutive times; "
                             "Exception: {}. Putting the actor in error status and shutting "
                             "down workers.".format(worker_id, execution_id, MAX_WORKER_CONSECUTIVE_ERRORS, e))
                Actor.set_status(actor_id, ERROR, "Error executing container: {}".format(e))
                shutdown_workers(actor_id, delete_actor_ch=False)
                # wait for worker to be shutdown..
                time.sleep(60)
                break
            else:
                # sleep five seconds before getting a message again to give time for the compute
                # node and/or docker health to recover
                time.sleep(5)
                continue
        except DockerStopContainerError as e:
            logger.error("Worker {} was not able to stop actor for execution: {}; Exception: {}. "
                         "Putting the actor in error status and shutting down workers.".format(worker_id, execution_id, e))
            Actor.set_status(actor_id, ERROR, "Error executing container: {}".format(e))
            # since the error was with stopping the actor, we will consider this message "processed"; this choice
            # could be reconsidered/changed
            msg_obj.ack()
            shutdown_workers(actor_id, delete_actor_ch=False)
            # wait for worker to be shutdown..
            time.sleep(60)
            break
        except Exception as e:
            logger.error("Worker {} got an unexpected exception trying to run actor for execution: {}. "
                         "Putting the actor in error status and shutting down workers. "
                         "Exception: {}; type: {}".format(worker_id, execution_id, e, type(e)))
            Actor.set_status(actor_id, ERROR, "Error executing container: {}".format(e))
            # the execute_actor function raises a DockerStartContainerError if it met an exception before starting the
            # actor container; if the container was started, then another exception should be raised. Therefore,
            # we can assume here that the container was at least started and we can ack the message.
            msg_obj.ack()
            shutdown_workers(actor_id, delete_actor_ch=False)
            # wait for worker to be shutdown..
            time.sleep(60)
            break
        # ack the message
        msg_obj.ack()
        logger.debug("container finished successfully; worker_id: {}".format(worker_id))

        # Add the completed stats to the execution
        logger.info("Actor container finished successfully. Got stats object: {}".format(str(stats)))
        Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats, final_state, exit_code, start_time)
        logger.info("Added execution: {}; worker_id: {}".format(execution_id, worker_id))

        # Add the logs to the execution
        try:
            Execution.set_logs(execution_id, logs)
            logger.debug("Successfully added execution logs.")
        except Exception as e:
            msg = "Got exception trying to set logs for execution {}; " \
                  "Exception: {}; worker_id: {}".format(execution_id, e, worker_id)
            logger.error(msg)

        # Update the worker's last updated and last execution fields:
        try:
            Worker.update_worker_execution_time(actor_id, worker_id)
            logger.debug("worker execution time updated. worker_id: {}".format(worker_id))
        except KeyError:
            # it is possible that this worker was sent a graceful shutdown command in the other thread
            # and that spawner has already removed this worker from the store.
            logger.info("worker {} got unexpected key error trying to update its execution time. "
                        "Worker better be shutting down! keep_running: {}".format(worker_id, globals.keep_running))
            if globals.keep_running:
                logger.error("worker couldn't update its execution time but keep_running is still true!")

        # we completed an execution successfully; reset the consecutive_errors counter
        consecutive_errors = 0
        logger.info("worker time stamps updated; worker_id: {}".format(worker_id))

    logger.info("globals.keep_running no longer true. worker is now exiting. worker id: {}".format(worker_id))
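
# --- Illustrative summary (assumption, for reference only) --------------------------
# The ack/nack policy the subscription loop above applies to each message, keyed by
# where a failure happens. nack(requeue=True) returns the message to the actor's queue
# for another worker; ack marks it as processed. The constant name is hypothetical.
EXAMPLE_MESSAGE_DISPOSITION = {
    'status/DB/config/fifo error before the container starts': 'nack(requeue=True); worker exits',
    'DockerStartContainerError': 'nack(requeue=True); shut down workers after MAX_WORKER_CONSECUTIVE_ERRORS',
    'DockerStopContainerError': 'ack; actor set to ERROR and workers shut down',
    'any other exception during execution': 'ack; actor set to ERROR and workers shut down',
    'successful execution': 'ack; execution finalized and logs stored',
}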