def shutdown_worker(actor_id, worker_id, delete_actor_ch=True):
    """Gracefully shut down a single worker.
    actor_id (str) - the dbid of the associated actor.
    """
    logger.debug("top of shutdown_worker for worker_id: {}".format(worker_id))
    # set the worker status to SHUTDOWN_REQUESTED:
    try:
        Worker.update_worker_status(actor_id, worker_id, SHUTDOWN_REQUESTED)
    except Exception as e:
        logger.error(f"worker got exception trying to update status to SHUTDOWN_REQUESTED. actor_id: {actor_id}; "
                     f"worker_id: {worker_id}; exception: {e}")
    ch = WorkerChannel(worker_id=worker_id)
    if not delete_actor_ch:
        ch.put("stop-no-delete")
    else:
        ch.put("stop")
    logger.info("A 'stop' message was sent to worker: {}".format(worker_id))
    ch.close()
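# Hedged illustration of the stop-message handshake above, with the standard
# library's queue.Queue standing in for WorkerChannel. The names DemoChannel and
# send_stop are hypothetical, not Abaco APIs; this is a sketch of the protocol,
# not the implementation.
import queue

class DemoChannel:
    """Stand-in for WorkerChannel: put/get/close over an in-memory queue."""
    def __init__(self):
        self._q = queue.Queue()
    def put(self, msg):
        self._q.put(msg)
    def get(self):
        return self._q.get()
    def close(self):
        pass

def send_stop(ch, delete_actor_ch=True):
    # mirrors shutdown_worker's branching on delete_actor_ch: "stop-no-delete"
    # tells the worker to exit without deleting the shared actor message channel.
    ch.put("stop" if delete_actor_ch else "stop-no-delete")
    ch.close()

demo_ch = DemoChannel()
send_stop(demo_ch, delete_actor_ch=False)
assert demo_ch.get() == "stop-no-delete"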
def client_generation(self, actor_id, worker_id, tenant):
    client_ch = ClientsChannel()
    try:
        client_msg = client_ch.request_client(
            tenant=tenant,
            actor_id=actor_id,
            worker_id=worker_id,
            secret=self.secret
        )
    except Exception as e:
        logger.error("Got an exception (e.g., a ChannelTimeoutException) trying to generate a client for "
                     "actor_id: {}; worker_id: {}; exception: {}".format(actor_id, worker_id, e))
        # put worker in an error state and return
        self.error_out_actor(actor_id, worker_id, "Abaco was unable to generate an OAuth client for a new "
                                                  "worker for this actor. System administrators have been notified.")
        client_ch.close()
        Worker.update_worker_status(actor_id, worker_id, ERROR)
        logger.critical("Client generation FAILED.")
        raise e
    client_ch.close()
    if client_msg.get('status') == 'error':
        logger.error("Error generating client: {}".format(client_msg.get('message')))
        self.error_out_actor(actor_id, worker_id, "Abaco was unable to generate an OAuth client for a new "
                                                  "worker for this actor. System administrators have been notified.")
        Worker.update_worker_status(actor_id, worker_id, ERROR)
        raise SpawnerException("Error generating client")  # TODO - clean up error message
    # else, client was generated successfully:
    else:
        logger.info("Got a client: {}, {}, {}".format(client_msg['client_id'],
                                                      client_msg['access_token'],
                                                      client_msg['refresh_token']))
        return client_msg['client_id'], \
               client_msg['access_token'], \
               client_msg['refresh_token'], \
               client_msg['api_server'], \
               client_msg['client_secret']
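# Hedged sketch of the ClientsChannel request/reply contract inferred from the
# code above: request_client returns a dict with a 'status' field plus the five
# credential fields on success. The payload below is made up for illustration.
def unpack_client_msg(client_msg):
    if client_msg.get('status') == 'error':
        raise RuntimeError("Error generating client: {}".format(client_msg.get('message')))
    # callers unpack these five values in exactly this order:
    return (client_msg['client_id'],
            client_msg['access_token'],
            client_msg['refresh_token'],
            client_msg['api_server'],
            client_msg['client_secret'])

demo_msg = {'status': 'ok', 'client_id': 'cid', 'access_token': 'at',
            'refresh_token': 'rt', 'api_server': 'https://api.example.org',
            'client_secret': 'secret'}
client_id, access_token, refresh_token, api_server, client_secret = unpack_client_msg(demo_msg)
assert api_server == 'https://api.example.org'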
def execute_actor(actor_id, worker_id, worker_ch, image, msg, d={}, privileged=False):
    """
    Creates and runs an actor container and supervises the execution, collecting statistics
    about resource consumption from the Docker daemon.

    :param actor_id: the dbid of the actor; for updating worker status
    :param worker_id: the worker id; also for updating worker status
    :param worker_ch: NO LONGER USED.
    :param image: the actor's image; worker must have already downloaded this image to the
    local docker registry.
    :param msg: the message being passed to the actor.
    :param d: dictionary representing the environment to instantiate within the actor container.
    :param privileged: whether this actor is "privileged"; i.e., its container should run in
    privileged mode with the docker daemon mounted.
    :return: result (dict), logs (str), container_state (dict), exit_code - `result`: statistics
    about resource consumption; `logs`: output from docker logs.
    """
    result = {'cpu': 0, 'io': 0, 'runtime': 0}
    cli = docker.AutoVersionClient(base_url=dd)
    d['MSG'] = msg
    binds = {}
    volumes = []
    # if container is privileged, mount the docker daemon so that additional
    # containers can be started.
    if privileged:
        binds = {'/var/run/docker.sock': {
            'bind': '/var/run/docker.sock',
            'ro': False}}
        volumes = ['/var/run/docker.sock']
    host_config = cli.create_host_config(binds=binds, privileged=privileged)
    container = cli.create_container(image=image,
                                     environment=d,
                                     volumes=volumes,
                                     host_config=host_config)
    try:
        cli.start(container=container.get('Id'))
    except Exception as e:
        # if there was an error starting the container, user will need to debug
        raise DockerStartContainerError("Could not start container {}. Exception {}".format(
            container.get('Id'), str(e)))
    start = timeit.default_timer()
    Worker.update_worker_status(actor_id, worker_id, BUSY)
    running = True
    # create a separate cli for checking stats objects since these should be fast and we don't want to wait
    stats_cli = docker.AutoVersionClient(base_url=dd, timeout=1)
    try:
        stats_obj = stats_cli.stats(container=container.get('Id'), decode=True)
    except ReadTimeout:
        # if the container execution is so fast that the initial stats object cannot be created,
        # we skip the running loop and return a minimal stats object
        result['cpu'] = 1
        result['runtime'] = 1
        return result
    while running:
        try:
            print("waiting on a stats obj: {}".format(timeit.default_timer()))
            stats = next(stats_obj)
        except ReadTimeoutError:
            print("next(stats) just timed out: {}".format(timeit.default_timer()))
            # container stopped before another stats record could be read; just ignore and move on
            running = False
            break
        try:
            result['cpu'] += stats['cpu_stats']['cpu_usage']['total_usage']
            result['io'] += stats['network']['rx_bytes']
        except KeyError:
            # as of docker 1.9, the stats object returns bytes that must be decoded
            # and the network key is now 'networks' with multiple subkeys.
            print("got a stats obj: {}".format(timeit.default_timer()))
            if type(stats) == bytes:
                stats = json.loads(stats.decode("utf-8"))
            result['cpu'] += stats['cpu_stats']['cpu_usage']['total_usage']
            # even running docker 1.9, there seems to be a race condition where the 'networks' key
            # doesn't always get populated.
            try:
                result['io'] += stats['networks']['eth0']['rx_bytes']
            except KeyError:
                pass
            print("Recorded a stats obj: {}".format(timeit.default_timer()))
        if running:
            try:
                print("waiting on cli.wait: {}".format(timeit.default_timer()))
                cli.wait(container=container.get('Id'), timeout=1)
                print("container finished: {}".format(timeit.default_timer()))
                running = False
            except ReadTimeout:
                print("cli.wait just timed out: {}".format(timeit.default_timer()))
                # the wait timed out so check if we are beyond the max_run_time
                runtime = timeit.default_timer() - start
                if max_run_time > 0 and max_run_time < runtime:
                    print("hit runtime limit: {}".format(timeit.default_timer()))
                    cli.stop(container.get('Id'))
                    running = False
                    print("container stopped: {}".format(timeit.default_timer()))
    stop = timeit.default_timer()
    # get info from container execution, including exit code
    try:
        container_info = cli.inspect_container(container.get('Id'))
        try:
            container_state = container_info['State']
            try:
                exit_code = container_state['ExitCode']
            except KeyError:
                print("Could not determine ExitCode for container {}.".format(container.get('Id')))
                exit_code = 'undetermined'
        except KeyError:
            print("Could not determine final state for container {}.".format(container.get('Id')))
            container_state = {'unavailable': True}
            exit_code = 'undetermined'
    except docker.errors.APIError as e:
        print("Could not inspect container {}".format(container.get('Id')))
        # default values so the return statement below does not raise a NameError:
        container_state = {'unavailable': True}
        exit_code = 'undetermined'
    # get logs from container
    logs = cli.logs(container.get('Id'))
    # remove container, ignore errors
    try:
        cli.remove_container(container=container)
        print("Container removed.")
    except Exception as e:
        print("Exception trying to remove actor: {}".format(e))
    result['runtime'] = int(stop - start)
    return result, logs, container_state, exit_code
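# Hedged, stdlib-only illustration of the stats normalization execute_actor
# performs: depending on the Docker version, a stats record may arrive as raw
# bytes and may use either the old 'network' key or the newer 'networks' map.
# This is a simplification (bytes are decoded up front instead of inside the
# fallback), and the sample payloads are made up.
import json

def accumulate(result, stats):
    if isinstance(stats, bytes):
        stats = json.loads(stats.decode("utf-8"))
    result['cpu'] += stats['cpu_stats']['cpu_usage']['total_usage']
    try:
        result['io'] += stats['network']['rx_bytes']           # docker < 1.9
    except KeyError:
        # 'networks' is a map of interface -> stats in docker >= 1.9, and may
        # be absent entirely due to the race condition noted above.
        result['io'] += stats.get('networks', {}).get('eth0', {}).get('rx_bytes', 0)
    return result

old_style = {'cpu_stats': {'cpu_usage': {'total_usage': 10}}, 'network': {'rx_bytes': 5}}
new_style = b'{"cpu_stats": {"cpu_usage": {"total_usage": 7}}, "networks": {"eth0": {"rx_bytes": 3}}}'
r = accumulate({'cpu': 0, 'io': 0}, old_style)
r = accumulate(r, new_style)
assert r == {'cpu': 17, 'io': 8}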
def start_worker(self, image, tenant, actor_id, worker_id, client_id, client_access_token,
                 client_refresh_token, ch, api_server, client_secret):
    # start an actor executor container and wait for a confirmation that image was pulled.
    attempts = 0
    # worker = get_worker(worker_id)
    # worker['status'] = PULLING_IMAGE
    Worker.update_worker_status(actor_id, worker_id, PULLING_IMAGE)
    try:
        logger.debug("Worker pulling image {}...".format(image))
        pull_image(image)
    except DockerError as e:
        # return a message to the spawner that there was an error pulling image and abort.
        # this is not necessarily an error state: the user simply could have provided an
        # image name that does not exist in the registry. This is the first time we would
        # find that out.
        logger.info("worker got a DockerError trying to pull image. Error: {}.".format(e))
        raise e
    logger.info("Image {} pulled successfully.".format(image))
    # Done pulling image
    # Run Worker Container
    while True:
        try:
            Worker.update_worker_status(actor_id, worker_id, CREATING_CONTAINER)
            logger.debug('spawner creating worker container')
            worker_dict = run_worker(
                image,
                actor_id,
                worker_id,
                client_id,
                client_access_token,
                client_refresh_token,
                tenant,
                api_server,
                client_secret
            )
            logger.debug(f'finished run worker; worker dict: {worker_dict}')
        except DockerError as e:
            logger.error("Spawner got a docker exception from run_worker; Exception: {}".format(e))
            if 'read timeout' in e.message:
                logger.info("Exception was a read timeout; trying run_worker again..")
                time.sleep(5)
                attempts = attempts + 1
                if attempts > 20:
                    msg = "Spawner continued to get DockerError for 20 attempts. Exception: {}".format(e)
                    logger.critical(msg)
                    # todo - should we be calling kill_worker here? (it is called in the exception block of the else below)
                    raise SpawnerException(msg)
                continue
            else:
                logger.info("Exception was NOT a read timeout; quitting on this worker.")
                # delete this worker from the workers store:
                try:
                    self.kill_worker(actor_id, worker_id)
                except WorkerException as e:
                    logger.info("Got WorkerException from delete_worker(). "
                                "worker_id: {}; "
                                "Exception: {}".format(worker_id, e))
                raise SpawnerException(message="Unable to start worker; error: {}".format(e))
        break
    logger.debug('finished loop')
    worker_dict['ch_name'] = WorkerChannel.get_name(worker_id)
    # if the actor is not already in READY status, set actor status to READY before worker status has been
    # set to READY.
    # it is possible the actor status is already READY because this request is the autoscaler starting a new worker
    # for an existing actor.
    actor = Actor.from_db(actors_store[actor_id])
    if not actor.status == READY:
        try:
            Actor.set_status(actor_id, READY, status_message=" ")
        except KeyError:
            # it is possible the actor was already deleted during worker start up; if
            # so, the worker should have a stop message waiting for it. starting subscribe
            # as usual should allow this process to work as expected.
            pass
    # finalize worker with READY status
    worker = Worker(tenant=tenant, **worker_dict)
    logger.info("calling add_worker for worker: {}.".format(worker))
    Worker.add_worker(actor_id, worker)
    ch.put('READY')  # step 4
    logger.info('sent message through channel')
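# Hedged sketch of the retry policy start_worker uses for transient Docker read
# timeouts: retry with a short sleep, fail fast on non-retryable errors, and give
# up after a fixed number of attempts. All names here (call_with_retries,
# RetryExhausted, flaky_call) are illustrative, not Abaco APIs.
import time

class RetryExhausted(Exception):
    pass

def call_with_retries(fn, max_attempts=20, delay=0.01, retryable=('read timeout',)):
    attempts = 0
    while True:
        try:
            return fn()
        except RuntimeError as e:
            if not any(s in str(e) for s in retryable):
                raise  # non-retryable: fail fast, as start_worker does for other DockerErrors
            attempts += 1
            if attempts > max_attempts:
                raise RetryExhausted("still failing after {} attempts: {}".format(max_attempts, e))
            time.sleep(delay)

calls = {'n': 0}
def flaky_call():
    calls['n'] += 1
    if calls['n'] < 3:
        raise RuntimeError("read timeout talking to the docker daemon")
    return "ok"

assert call_with_retries(flaky_call) == "ok"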
def process(self, cmd):
    """Main spawner method for processing a command from the CommandChannel."""
    logger.info("top of process; cmd: {}".format(cmd))
    actor_id = cmd['actor_id']
    try:
        actor = Actor.from_db(actors_store[actor_id])
    except Exception as e:
        msg = f"Exception in spawner trying to retrieve actor object from store. Aborting. Exception: {e}"
        logger.error(msg)
        return
    worker_id = cmd['worker_id']
    image = cmd['image']
    tenant = cmd['tenant']
    stop_existing = cmd.get('stop_existing', True)
    num_workers = 1
    logger.debug("spawner command params: actor_id: {} worker_id: {} image: {} tenant: {} "
                 "stop_existing: {} num_workers: {}".format(actor_id, worker_id, image, tenant,
                                                            stop_existing, num_workers))
    # if the worker was sent a delete request before spawner received this message to create the worker,
    # the status will be SHUTDOWN_REQUESTED, not REQUESTED. in that case, we simply abort and remove the
    # worker from the collection.
    try:
        logger.debug("spawner checking worker's status for SHUTDOWN_REQUESTED")
        worker = Worker.get_worker(actor_id, worker_id)
        logger.debug(f"spawner got worker; worker: {worker}")
    except Exception as e:
        logger.error(f"spawner got exception trying to retrieve worker. "
                     f"actor_id: {actor_id}; worker_id: {worker_id}; e: {e}")
        return
    status = worker.get('status')
    if not status == REQUESTED:
        logger.debug(f"worker was NOT in REQUESTED status. status: {status}")
        if status == SHUTDOWN_REQUESTED or status == SHUTTING_DOWN or status == ERROR:
            logger.debug(f"worker status was {status}; spawner deleting worker and returning..")
            try:
                Worker.delete_worker(actor_id, worker_id)
                logger.debug("spawner deleted worker because it was SHUTDOWN_REQUESTED.")
                return
            except Exception as e:
                logger.error(f"spawner got exception trying to delete a worker in SHUTDOWN_REQUESTED status. "
                             f"actor_id: {actor_id}; worker_id: {worker_id}; e: {e}")
                return
        else:
            logger.error(f"spawner found worker in unexpected status: {status}. Not processing command and returning.")
            return

    # worker status was REQUESTED; moving on to SPAWNER_SETUP ----
    Worker.update_worker_status(actor_id, worker_id, SPAWNER_SETUP)
    logger.debug("spawner has updated worker status to SPAWNER_SETUP; worker_id: {}".format(worker_id))
    client_id = None
    client_secret = None
    client_access_token = None
    client_refresh_token = None
    api_server = None

    # ---- OAuth client generation for the worker -------
    # check whether the tenant and instance are configured for client generation:
    try:
        generate_clients = Config.get('workers', f'{tenant}_generate_clients').lower()
    except Exception:
        logger.debug(f"Did not find a {tenant}_generate_clients config. Looking for a global config.")
        generate_clients = Config.get('workers', 'generate_clients').lower()
    logger.debug(f"final generate_clients: {generate_clients}")
    if generate_clients == "true":
        logger.debug("client generation was configured to be available; now checking the actor's token attr.")
        # updated 1.3.0 -- check whether the actor requires a token:
        if actor.token:
            logger.debug("spawner starting client generation")
            client_id, \
            client_access_token, \
            client_refresh_token, \
            api_server, \
            client_secret = self.client_generation(actor_id, worker_id, tenant)
        else:
            logger.debug("actor's token attribute was False. Not generating client.")

    ch = SpawnerWorkerChannel(worker_id=worker_id)
    logger.debug("spawner attempting to start worker; worker_id: {}".format(worker_id))
    try:
        worker = self.start_worker(
            image,
            tenant,
            actor_id,
            worker_id,
            client_id,
            client_access_token,
            client_refresh_token,
            ch,
            api_server,
            client_secret
        )
    except Exception as e:
        msg = "Spawner got an exception from call to start_worker. Exception: {}".format(e)
        logger.error(msg)
        self.error_out_actor(actor_id, worker_id, msg)
        if client_id:
            self.delete_client(tenant, actor_id, worker_id, client_id, client_secret)
        return

    logger.debug("Returned from start_worker; Created new worker: {}".format(worker))
    ch.close()
    logger.debug("Client channel closed")
    if stop_existing:
        logger.info("Stopping existing workers: {}".format(worker_id))
        # TODO - update status to stop_requested
        self.stop_workers(actor_id, [worker_id])
def subscribe(tenant,
              actor_id,
              worker_id,
              api_server,
              client_id,
              client_secret,
              access_token,
              refresh_token,
              worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when messages arrive. Also subscribes to the worker channel for future communications.
    :return:
    """
    logger.debug("Top of subscribe().")
    actor_ch = ActorMsgChannel(actor_id)
    try:
        leave_containers = Config.get('workers', 'leave_containers')
    except configparser.NoOptionError:
        logger.info("No leave_containers value configured.")
        leave_containers = False
    if hasattr(leave_containers, 'lower'):
        leave_containers = leave_containers.lower() == "true"
    logger.info("leave_containers: {}".format(leave_containers))
    ag = None
    if api_server and client_id and client_secret and access_token and refresh_token:
        logger.info("Creating agave client.")
        verify = get_tenant_verify(tenant)
        ag = Agave(api_server=api_server,
                   token=access_token,
                   refresh_token=refresh_token,
                   api_key=client_id,
                   api_secret=client_secret,
                   verify=verify)
    else:
        logger.info("Not creating agave client.")
    logger.info("Starting the process worker channel thread.")
    t = threading.Thread(target=process_worker_ch, args=(tenant, worker_ch, actor_id, worker_id, actor_ch, ag))
    t.start()
    logger.info("Worker subscribing to actor channel.")
    # keep track of whether we need to update the worker's status back to READY; otherwise, we
    # will hit redis with an UPDATE every time the subscription loop times out (i.e., every 2s)
    update_worker_status = True
    # shared global tracking whether this worker should keep running; shared between this thread and
    # the "worker channel processing" thread.
    global keep_running
    # main subscription loop -- processing messages from actor's mailbox
    while keep_running:
        if update_worker_status:
            Worker.update_worker_status(actor_id, worker_id, READY)
            update_worker_status = False
        try:
            msg = actor_ch.get_one()
        except channelpy.ChannelClosedException:
            logger.info("Channel closed, worker exiting...")
            keep_running = False
            sys.exit()
        logger.info("worker {} processing new msg.".format(worker_id))
        try:
            Worker.update_worker_status(actor_id, worker_id, BUSY)
        except Exception as e:
            logger.error("unexpected exception from call to update_worker_status. "
                         "actor_id: {}; worker_id: {}; status: {}; exception: {}".format(actor_id, worker_id, BUSY, e))
            raise e
        update_worker_status = True
        logger.info("Received message {}. Starting actor container...".format(msg))
        # the msg object is a dictionary with an entry called message and an arbitrary
        # set of k:v pairs coming in from the query parameters.
        message = msg.pop('message', '')
        actor = Actor.from_db(actors_store[actor_id])
        execution_id = msg['_abaco_execution_id']
        content_type = msg['_abaco_Content_Type']
        mounts = actor.mounts
        logger.debug("actor mounts: {}".format(mounts))
        # for results, create a socket in the configured directory.
        try:
            socket_host_path_dir = Config.get('workers', 'socket_host_path_dir')
        except (configparser.NoSectionError, configparser.NoOptionError):
            logger.error("No socket_host_path configured. Cannot manage results data.")
            Actor.set_status(actor_id, ERROR, msg="Abaco instance not configured for results data.")
            continue
        socket_host_path = '{}.sock'.format(os.path.join(socket_host_path_dir, worker_id, execution_id))
        logger.info("Create socket at path: {}".format(socket_host_path))
        # add the socket as a mount:
        mounts.append({'host_path': socket_host_path,
                       'container_path': '/_abaco_results.sock',
                       'format': 'ro'})
        # for binary data, create a fifo in the configured directory. The configured
        # fifo_host_path_dir is equal to the fifo path in the worker container:
        fifo_host_path = None
        if content_type == 'application/octet-stream':
            try:
                fifo_host_path_dir = Config.get('workers', 'fifo_host_path_dir')
            except (configparser.NoSectionError, configparser.NoOptionError):
                logger.error("No fifo_host_path configured. Cannot manage binary data.")
                Actor.set_status(actor_id, ERROR, msg="Abaco instance not configured for binary data.")
                continue
            fifo_host_path = os.path.join(fifo_host_path_dir, worker_id, execution_id)
            try:
                os.mkfifo(fifo_host_path)
                logger.info("Created fifo at path: {}".format(fifo_host_path))
            except Exception as e:
                logger.error("Could not create fifo_path. Exception: {}".format(e))
                raise e
            # add the fifo as a mount:
            mounts.append({'host_path': fifo_host_path,
                           'container_path': '/_abaco_binary_data',
                           'format': 'ro'})
        # the execution object was created by the controller, but we need to add the worker id to it now that we
        # know which worker will be working on the execution.
        logger.debug("Adding worker_id to execution.")
        Execution.add_worker_id(actor_id, execution_id, worker_id)
        # privileged dictates whether the actor container runs in privileged mode and if docker daemon is mounted.
        privileged = False
        if type(actor['privileged']) == bool and actor['privileged']:
            privileged = True
        logger.debug("privileged: {}".format(privileged))
        # retrieve the default environment registered with the actor.
        environment = actor['default_environment']
        logger.debug("Actor default environment: {}".format(environment))
        # construct the user field from the actor's uid and gid:
        user = get_container_user(actor)
        logger.debug("Final user value: {}".format(user))
        # overlay the default_environment registered for the actor with the msg
        # dictionary
        environment.update(msg)
        environment['_abaco_access_token'] = ''
        environment['_abaco_actor_dbid'] = actor_id
        environment['_abaco_actor_id'] = actor.id
        environment['_abaco_actor_state'] = actor.state
        logger.debug("Overlayed environment: {}".format(environment))
        # if we have an agave client, get a fresh set of tokens:
        if ag:
            try:
                ag.token.refresh()
                token = ag.token.token_info['access_token']
                environment['_abaco_access_token'] = token
                logger.info("Refreshed the tokens. Passed {} to the environment.".format(token))
            except Exception as e:
                logger.error("Got an exception trying to get an access token: {}".format(e))
        else:
            logger.info("Agave client `ag` is None -- not passing access token.")
        logger.info("Passing updated environment: {}".format(environment))
        try:
            stats, logs, final_state, exit_code, start_time = execute_actor(actor_id,
                                                                            worker_id,
                                                                            execution_id,
                                                                            image,
                                                                            message,
                                                                            user,
                                                                            environment,
                                                                            privileged,
                                                                            mounts,
                                                                            leave_containers,
                                                                            fifo_host_path,
                                                                            socket_host_path)
        except DockerStartContainerError as e:
            logger.error("Got DockerStartContainerError: {}".format(e))
            Actor.set_status(actor_id, ERROR, "Error executing container: {}".format(e))
            continue
        # Add the completed stats to the execution
        logger.info("Actor container finished successfully. Got stats object: {}".format(str(stats)))
        Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats, final_state, exit_code, start_time)
        logger.info("Added execution: {}".format(execution_id))
        # Add the logs to the execution
        Execution.set_logs(execution_id, logs)
        logger.info("Added execution logs.")
        # Update the worker's last updated and last execution fields:
        try:
            Worker.update_worker_execution_time(actor_id, worker_id)
        except KeyError:
            # it is possible that this worker was sent a graceful shutdown command in the other thread
            # and that spawner has already removed this worker from the store.
            logger.info("worker {} got unexpected key error trying to update its execution time. "
                        "Worker better be shutting down! keep_running: {}".format(worker_id, keep_running))
            if keep_running:
                logger.error("worker couldn't update its execution time but keep_running is still true!")
        logger.info("worker time stamps updated.")
def subscribe(tenant, actor_id, api_server, client_id, client_secret, access_token, refresh_token, worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when messages arrive. Also subscribes to the worker channel for future communications.
    :return:
    """
    actor_ch = ActorMsgChannel(actor_id)
    ag = None
    if api_server and client_id and client_secret and access_token and refresh_token:
        ag = Agave(api_server=api_server,
                   token=access_token,
                   refresh_token=refresh_token,
                   api_key=client_id,
                   api_secret=client_secret)
    else:
        print("Not creating agave client.")
    t = threading.Thread(target=process_worker_ch, args=(tenant, worker_ch, actor_id, actor_ch, ag))
    t.start()
    print("Worker subscribing to actor channel...")
    global keep_running
    while keep_running:
        Worker.update_worker_status(actor_id, worker_ch.name, READY)
        try:
            msg = actor_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        except channelpy.ChannelClosedException:
            print("Channel closed, worker exiting...")
            keep_running = False
            sys.exit()
        print("Received message {}. Starting actor container...".format(str(msg)))
        # the msg object is a dictionary with an entry called message and an arbitrary
        # set of k:v pairs coming in from the query parameters.
        message = msg.pop('message', '')
        actor = Actor.from_db(actors_store[actor_id])
        execution_id = msg['_abaco_execution_id']
        privileged = False
        if actor['privileged'] == 'TRUE':
            privileged = True
        environment = actor['default_environment']
        print("Actor default environment: {}".format(environment))
        print("Actor privileged: {}".format(privileged))
        # overlay the default_environment registered for the actor with the msg
        # dictionary
        environment.update(msg)
        environment['_abaco_access_token'] = ''
        # if we have an agave client, get a fresh set of tokens:
        if ag:
            try:
                ag.token.refresh()
                token = ag.token.token_info['access_token']
                environment['_abaco_access_token'] = token
                print("Refreshed the tokens. Passed {} to the environment.".format(token))
            except Exception as e:
                print("Got an exception trying to get an access token: {}".format(e))
        else:
            print("Agave client `ag` is None -- not passing access token.")
        print("Passing updated environment: {}".format(environment))
        try:
            stats, logs = execute_actor(actor_id, worker_ch, image, message, environment, privileged)
        except DockerStartContainerError as e:
            print("Got DockerStartContainerError: {}".format(str(e)))
            Actor.set_status(actor_id, ERROR)
            continue
        # add the execution to the actor store
        print("Actor container finished successfully. Got stats object: {}".format(str(stats)))
        Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats)
        print("Added execution: {}".format(execution_id))
        Execution.set_logs(execution_id, logs)
        Worker.update_worker_execution_time(actor_id, worker_ch.name)
def execute_actor(actor_id, worker_ch, image, msg, d={}, privileged=False):
    result = {'cpu': 0,
              'io': 0,
              'runtime': 0}
    cli = docker.AutoVersionClient(base_url=dd)
    d['MSG'] = msg
    binds = {}
    volumes = []
    # if container is privileged, mount the docker daemon so that additional
    # containers can be started.
    if privileged:
        binds = {'/var/run/docker.sock': {
            'bind': '/var/run/docker.sock',
            'ro': False}}
        volumes = ['/var/run/docker.sock']
    host_config = cli.create_host_config(binds=binds, privileged=privileged)
    container = cli.create_container(image=image,
                                     environment=d,
                                     volumes=volumes,
                                     host_config=host_config)
    try:
        cli.start(container=container.get('Id'))
    except Exception as e:
        # if there was an error starting the container, user will need to debug
        raise DockerStartContainerError("Could not start container {}. Exception {}".format(
            container.get('Id'), str(e)))
    start = timeit.default_timer()
    Worker.update_worker_status(actor_id, worker_ch.name, BUSY)
    running = True
    # create a separate cli for checking stats objects since these should be fast and we don't want to wait
    stats_cli = docker.AutoVersionClient(base_url=dd, timeout=1)
    try:
        stats_obj = stats_cli.stats(container=container.get('Id'), decode=True)
    except ReadTimeout:
        # if the container execution is so fast that the initial stats object cannot be created,
        # we skip the running loop and return a minimal stats object
        result['cpu'] = 1
        result['runtime'] = 1
        return result
    while running:
        try:
            print("waiting on a stats obj: {}".format(timeit.default_timer()))
            stats = next(stats_obj)
        except ReadTimeoutError:
            print("next(stats) just timed out: {}".format(timeit.default_timer()))
            # container stopped before another stats record could be read; just ignore and move on
            running = False
            break
        try:
            result['cpu'] += stats['cpu_stats']['cpu_usage']['total_usage']
            result['io'] += stats['network']['rx_bytes']
        except KeyError:
            # as of docker 1.9, the stats object returns bytes that must be decoded
            # and the network key is now 'networks' with multiple subkeys.
            print("got a stats obj: {}".format(timeit.default_timer()))
            if type(stats) == bytes:
                stats = json.loads(stats.decode("utf-8"))
            result['cpu'] += stats['cpu_stats']['cpu_usage']['total_usage']
            # even running docker 1.9, there seems to be a race condition where the 'networks' key
            # doesn't always get populated.
            try:
                result['io'] += stats['networks']['eth0']['rx_bytes']
            except KeyError:
                pass
            print("Recorded a stats obj: {}".format(timeit.default_timer()))
        if running:
            try:
                print("waiting on cli.wait: {}".format(timeit.default_timer()))
                cli.wait(container=container.get('Id'), timeout=1)
                print("container finished: {}".format(timeit.default_timer()))
                running = False
            except ReadTimeout:
                print("cli.wait just timed out: {}".format(timeit.default_timer()))
                # the wait timed out so check if we are beyond the max_run_time
                runtime = timeit.default_timer() - start
                if max_run_time > 0 and max_run_time < runtime:
                    print("hit runtime limit: {}".format(timeit.default_timer()))
                    cli.stop(container.get('Id'))
                    running = False
                    print("container stopped: {}".format(timeit.default_timer()))
    stop = timeit.default_timer()
    # get logs from container
    logs = cli.logs(container.get('Id'))
    # remove container, ignore errors
    try:
        cli.remove_container(container=container)
        print("Container removed.")
    except Exception as e:
        print("Exception trying to remove actor: {}".format(e))
    result['runtime'] = int(stop - start)
    return result, logs
def process_worker_ch(tenant, worker_ch, actor_id, worker_id, actor_ch, ag_client):
    """
    Target for a thread to listen on the worker channel for a message to stop processing.
    :param worker_ch:
    :return:
    """
    logger.info("Worker subscribing to worker channel...")
    while globals.keep_running:
        msg, msg_obj = worker_ch.get_one()
        # receiving the message is enough to ack it - resiliency is currently handled in the calling code.
        msg_obj.ack()
        logger.debug("Received message in worker channel: {}".format(msg))
        logger.debug("Type(msg)={}".format(type(msg)))
        if type(msg) == dict:
            value = msg.get('value', '')
            if value == 'status':
                # this is a health check; return 'ok' to the reply_to channel.
                logger.debug("received health check. returning 'ok'.")
                ch = msg['reply_to']
                ch.put('ok')
                # @TODO -
                # delete the anonymous channel from this thread but sleep first to avoid the race condition.
                time.sleep(1.5)
                ch.delete()
                # NOT doing this for now -- deleting entire anon channel instead (see above)
                # clean up the event queue on this anonymous channel. this should be fixed in channelpy.
                # ch._queue._event_queue
        elif msg == 'force_quit':
            logger.info("Worker with worker_id: {} (actor_id: {}) received a force_quit message, "
                        "forcing the execution to halt...".format(worker_id, actor_id))
            globals.force_quit = True
        elif msg == 'stop' or msg == 'stop-no-delete':
            logger.info("Worker with worker_id: {} (actor_id: {}) received stop message, "
                        "stopping worker...".format(worker_id, actor_id))
            # set the worker status to SHUTTING_DOWN:
            try:
                Worker.update_worker_status(actor_id, worker_id, SHUTTING_DOWN)
            except Exception as e:
                logger.error(f"worker got exception trying to update status to SHUTTING_DOWN. actor_id: {actor_id}; "
                             f"worker_id: {worker_id}; exception: {e}")
            globals.keep_running = False
            # when an actor's image is updated, old workers are deleted while new workers are
            # created. Deleting the actor msg channel in this case leads to race conditions
            delete_actor_ch = True
            if msg == 'stop-no-delete':
                logger.info("Got stop-no-delete; will not delete actor_ch.")
                delete_actor_ch = False
            # if a `stop` was sent, the actor is being deleted, and so we want to immediately shutdown processing.
            else:
                globals.force_quit = True
            # first, delete an associated client.
            # it's possible this worker was not passed a client,
            # but if so, we need to delete it before shutting down.
            if ag_client:
                logger.info("Requesting client {} be deleted.".format(ag_client.api_key))
                secret = os.environ.get('_abaco_secret')
                clients_ch = ClientsChannel()
                msg = clients_ch.request_delete_client(tenant=tenant,
                                                       actor_id=actor_id,
                                                       worker_id=worker_id,
                                                       client_id=ag_client.api_key,
                                                       secret=secret)
                if msg['status'] == 'ok':
                    logger.info("Client delete request completed successfully for "
                                "worker_id: {}, client_id: {}.".format(worker_id, ag_client.api_key))
                else:
                    logger.error("Error deleting client for "
                                 "worker_id: {}, client_id: {}. Message: {}".format(worker_id,
                                                                                    ag_client.api_key,
                                                                                    msg['message']))
                clients_ch.close()
            else:
                logger.info("Did not receive client. Not issuing delete. Exiting.")
            try:
                Worker.delete_worker(actor_id, worker_id)
            except WorkerException as e:
                logger.info("Got WorkerException from delete_worker(). "
                            "worker_id: {}; "
                            "Exception: {}".format(worker_id, e))
            # delete associated channels:
            # it is possible the actor channel was already deleted, in which case we just keep processing
            if delete_actor_ch:
                try:
                    actor_ch.delete()
                    logger.info("ActorChannel deleted for actor: {} worker_id: {}".format(actor_id, worker_id))
                except Exception as e:
                    logger.info("Got exception deleting ActorChannel for actor: {} "
                                "worker_id: {}; exception: {}".format(actor_id, worker_id, e))
            try:
                worker_ch.delete()
                logger.info("WorkerChannel deleted for actor: {} worker_id: {}".format(actor_id, worker_id))
            except Exception as e:
                logger.info("Got exception deleting WorkerChannel for actor: {} "
                            "worker_id: {}; exception: {}".format(actor_id, worker_id, e))
            logger.info("Worker with worker_id: {} is now exiting.".format(worker_id))
            _thread.interrupt_main()
            logger.info("main thread interrupted, issuing os._exit()...")
            os._exit(0)
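# Hedged sketch of the shutdown mechanism above: a background thread calls
# _thread.interrupt_main(), which delivers a KeyboardInterrupt to the main
# thread, unwinding its loop. This simplified demo omits the os._exit(0) the
# real worker issues afterwards; the loop and timings are illustrative.
import _thread
import threading
import time

def watcher(stop_after):
    time.sleep(stop_after)
    _thread.interrupt_main()   # raises KeyboardInterrupt in the main thread

threading.Thread(target=watcher, args=(0.1,), daemon=True).start()
try:
    while True:                # stands in for the worker's main subscribe loop
        time.sleep(0.01)
except KeyboardInterrupt:
    print("main thread interrupted; exiting cleanly in this demo")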
def subscribe(tenant,
              actor_id,
              image,
              worker_id,
              api_server,
              client_id,
              client_secret,
              access_token,
              refresh_token,
              worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when messages arrive. Also launches a separate thread which ultimately subscribes to the
    worker channel for future communications.
    :return:
    """
    logger.debug("Top of subscribe(). worker_id: {}".format(worker_id))
    actor_ch = ActorMsgChannel(actor_id)
    # establish configs for this worker -------
    try:
        leave_containers = Config.get('workers', 'leave_containers')
    except configparser.NoOptionError:
        logger.debug("No leave_containers value configured.")
        leave_containers = False
    if hasattr(leave_containers, 'lower'):
        leave_containers = leave_containers.lower() == "true"
    logger.debug("leave_containers: {}".format(leave_containers))

    try:
        mem_limit = Config.get('workers', 'mem_limit')
    except configparser.NoOptionError:
        logger.debug("No mem_limit value configured.")
        mem_limit = "-1"
    mem_limit = str(mem_limit)
    try:
        max_cpus = Config.get('workers', 'max_cpus')
    except configparser.NoOptionError:
        logger.debug("No max_cpus value configured.")
        max_cpus = "-1"
    logger.debug("max_cpus: {}".format(max_cpus))

    # instantiate an OAuth client python object if credentials were passed -----
    ag = None
    if api_server and client_id and client_secret and access_token and refresh_token:
        logger.info("Creating agave client.")
        verify = get_tenant_verify(tenant)
        ag = Agave(api_server=api_server,
                   token=access_token,
                   refresh_token=refresh_token,
                   api_key=client_id,
                   api_secret=client_secret,
                   verify=verify)
    else:
        logger.info("Not creating agave client.")

    # start a separate thread for handling messages sent to the worker channel ----
    logger.info("Starting the process worker channel thread.")
    t = threading.Thread(target=process_worker_ch, args=(tenant, worker_ch, actor_id, worker_id, actor_ch, ag))
    t.start()

    # subscribe to the actor message queue -----
    logger.info("Worker subscribing to actor channel. worker_id: {}".format(worker_id))
    # keep track of whether we need to update the worker's status back to READY; otherwise, we
    # will hit redis with an UPDATE every time the subscription loop times out (i.e., every 2s)
    update_worker_status = True

    # global tracks whether this worker should keep running.
    globals.keep_running = True

    # consecutive_errors tracks the number of consecutive times a worker has gotten an error trying to process a
    # message. Even though the message will be requeued, we do not want the worker to continue processing
    # indefinitely when a compute node is unhealthy.
    consecutive_errors = 0

    # main subscription loop -- processing messages from actor's mailbox
    while globals.keep_running:
        logger.debug("top of keep_running; worker id: {}".format(worker_id))
        if update_worker_status:
            Worker.update_worker_status(actor_id, worker_id, READY)
            logger.debug("updated worker status to READY in SUBSCRIBE; worker id: {}".format(worker_id))
            update_worker_status = False
        try:
            msg, msg_obj = actor_ch.get_one()
        except channelpy.ChannelClosedException:
            logger.info("Channel closed, worker exiting. worker id: {}".format(worker_id))
            globals.keep_running = False
            sys.exit()
        logger.info("worker {} processing new msg.".format(worker_id))
        try:
            Worker.update_worker_status(actor_id, worker_id, BUSY)
        except Exception as e:
            logger.error("unexpected exception from call to update_worker_status. Nacking message. "
                         "actor_id: {}; worker_id: {}; status: {}; exception: {}".format(actor_id, worker_id, BUSY, e))
            logger.info("worker exiting. worker_id: {}".format(worker_id))
            msg_obj.nack(requeue=True)
            raise e
        update_worker_status = True
        logger.info("Received message {}. Starting actor container. worker id: {}".format(msg, worker_id))
        # the msg object is a dictionary with an entry called message and an arbitrary
        # set of k:v pairs coming in from the query parameters.
        message = msg.pop('message', '')
        try:
            actor = Actor.from_db(actors_store[actor_id])
            execution_id = msg['_abaco_execution_id']
            content_type = msg['_abaco_Content_Type']
            mounts = actor.mounts
            logger.debug("actor mounts: {}".format(mounts))
        except Exception as e:
            logger.error("unexpected exception retrieving actor, execution, content-type, mounts. Nacking message. "
                         "actor_id: {}; worker_id: {}; status: {}; exception: {}".format(actor_id, worker_id, BUSY, e))
            msg_obj.nack(requeue=True)
            logger.info("worker exiting. worker_id: {}".format(worker_id))
            raise e
        # for results, create a socket in the configured directory.
        try:
            socket_host_path_dir = Config.get('workers', 'socket_host_path_dir')
        except (configparser.NoSectionError, configparser.NoOptionError) as e:
            logger.error("No socket_host_path configured. Cannot manage results data. Nacking message.")
            Actor.set_status(actor_id, ERROR, status_message="Abaco instance not configured for results data.")
            msg_obj.nack(requeue=True)
            logger.info("worker exiting. worker_id: {}".format(worker_id))
            raise e
        socket_host_path = '{}.sock'.format(os.path.join(socket_host_path_dir, worker_id, execution_id))
        logger.info("Create socket at path: {}".format(socket_host_path))
        # add the socket as a mount:
        mounts.append({'host_path': socket_host_path,
                       'container_path': '/_abaco_results.sock',
                       'format': 'ro'})
        # for binary data, create a fifo in the configured directory. The configured
        # fifo_host_path_dir is equal to the fifo path in the worker container:
        fifo_host_path = None
        if content_type == 'application/octet-stream':
            try:
                fifo_host_path_dir = Config.get('workers', 'fifo_host_path_dir')
            except (configparser.NoSectionError, configparser.NoOptionError) as e:
                logger.error("No fifo_host_path configured. Cannot manage binary data. Nacking message.")
                Actor.set_status(actor_id, ERROR, status_message="Abaco instance not configured for binary data.")
                msg_obj.nack(requeue=True)
                logger.info("worker exiting. worker_id: {}".format(worker_id))
                raise e
            fifo_host_path = os.path.join(fifo_host_path_dir, worker_id, execution_id)
            try:
                os.mkfifo(fifo_host_path)
                logger.info("Created fifo at path: {}".format(fifo_host_path))
            except Exception as e:
                logger.error("Could not create fifo_path. Nacking message. Exception: {}".format(e))
                msg_obj.nack(requeue=True)
                logger.info("worker exiting. worker_id: {}".format(worker_id))
                raise e
            # add the fifo as a mount:
            mounts.append({'host_path': fifo_host_path,
                           'container_path': '/_abaco_binary_data',
                           'format': 'ro'})
        # the execution object was created by the controller, but we need to add the worker id to it now that we
        # know which worker will be working on the execution.
        logger.debug("Adding worker_id to execution. worker_id: {}".format(worker_id))
        try:
            Execution.add_worker_id(actor_id, execution_id, worker_id)
        except Exception as e:
            logger.error("Unexpected exception adding worker_id to the Execution. Nacking message. "
                         "Exception: {}".format(e))
            msg_obj.nack(requeue=True)
            logger.info("worker exiting. worker_id: {}".format(worker_id))
            raise e
        # privileged dictates whether the actor container runs in privileged mode and if docker daemon is mounted.
        privileged = False
        if type(actor['privileged']) == bool and actor['privileged']:
            privileged = True
        logger.debug("privileged: {}; worker_id: {}".format(privileged, worker_id))
        # overlay resource limits if set on actor:
        if actor.mem_limit:
            mem_limit = actor.mem_limit
        if actor.max_cpus:
            max_cpus = actor.max_cpus
        # retrieve the default environment registered with the actor.
        environment = actor['default_environment']
        logger.debug("Actor default environment: {}".format(environment))
        # construct the user field from the actor's uid and gid:
        user = get_container_user(actor)
        logger.debug("Final user value: {}".format(user))
        # overlay the default_environment registered for the actor with the msg
        # dictionary
        environment.update(msg)
        environment['_abaco_access_token'] = ''
        environment['_abaco_actor_dbid'] = actor_id
        environment['_abaco_actor_id'] = actor.id
        environment['_abaco_worker_id'] = worker_id
        environment['_abaco_container_repo'] = actor.image
        environment['_abaco_actor_state'] = actor.state
        environment['_abaco_actor_name'] = actor.name or 'None'
        logger.debug("Overlayed environment: {}; worker_id: {}".format(environment, worker_id))
        # if we have an agave client, get a fresh set of tokens:
        if ag:
            try:
                ag.token.refresh()
                token = ag.token.token_info['access_token']
                environment['_abaco_access_token'] = token
                logger.info("Refreshed the tokens. Passed {} to the environment.".format(token))
            except Exception as e:
                logger.error("Got an exception trying to get an access token. Stopping worker and nacking message. "
                             "Exception: {}".format(e))
                msg_obj.nack(requeue=True)
                logger.info("worker exiting. worker_id: {}".format(worker_id))
                raise e
        else:
            logger.info("Agave client `ag` is None -- not passing access token; worker_id: {}".format(worker_id))
        logger.info("Passing updated environment: {}".format(environment))
        logger.info("About to execute actor; worker_id: {}".format(worker_id))
        try:
            stats, logs, final_state, exit_code, start_time = execute_actor(actor_id,
                                                                            worker_id,
                                                                            execution_id,
                                                                            image,
                                                                            message,
                                                                            user,
                                                                            environment,
                                                                            privileged,
                                                                            mounts,
                                                                            leave_containers,
                                                                            fifo_host_path,
                                                                            socket_host_path,
                                                                            mem_limit,
                                                                            max_cpus)
        except DockerStartContainerError as e:
            logger.error("Worker {} got DockerStartContainerError: {} trying to start actor for execution {}. "
                         "Placing message back on queue.".format(worker_id, e, execution_id))
            # if we failed to start the actor container, we leave the worker up and re-queue the original message
            msg_obj.nack(requeue=True)
            logger.debug('message requeued.')
            consecutive_errors += 1
            if consecutive_errors > MAX_WORKER_CONSECUTIVE_ERRORS:
                logger.error("Worker {} failed to successfully start actor for execution {} {} consecutive times; "
                             "Exception: {}. Putting the actor in error status and shutting "
                             "down workers.".format(worker_id, execution_id, MAX_WORKER_CONSECUTIVE_ERRORS, e))
                Actor.set_status(actor_id, ERROR, "Error executing container: {}".format(e))
                shutdown_workers(actor_id, delete_actor_ch=False)
                # wait for worker to be shutdown..
                time.sleep(60)
                break
            else:
                # sleep five seconds before getting a message again to give time for the compute
                # node and/or docker health to recover
                time.sleep(5)
                continue
        except DockerStopContainerError as e:
            logger.error("Worker {} was not able to stop actor for execution: {}; Exception: {}. "
                         "Putting the actor in error status and shutting down workers.".format(
                             worker_id, execution_id, e))
            Actor.set_status(actor_id, ERROR, "Error executing container: {}".format(e))
            # since the error was with stopping the actor, we will consider this message "processed"; this choice
            # could be reconsidered/changed
            msg_obj.ack()
            shutdown_workers(actor_id, delete_actor_ch=False)
            # wait for worker to be shutdown..
            time.sleep(60)
            break
        except Exception as e:
            logger.error("Worker {} got an unexpected exception trying to run actor for execution: {}. "
                         "Putting the actor in error status and shutting down workers. "
                         "Exception: {}; type: {}".format(worker_id, execution_id, e, type(e)))
            Actor.set_status(actor_id, ERROR, "Error executing container: {}".format(e))
            # the execute_actor function raises a DockerStartContainerError if it met an exception before starting the
            # actor container; if the container was started, then another exception should be raised. Therefore,
            # we can assume here that the container was at least started and we can ack the message.
            msg_obj.ack()
            shutdown_workers(actor_id, delete_actor_ch=False)
            # wait for worker to be shutdown..
            time.sleep(60)
            break
        # ack the message
        msg_obj.ack()
        logger.debug("container finished successfully; worker_id: {}".format(worker_id))
        # Add the completed stats to the execution
        logger.info("Actor container finished successfully. Got stats object: {}".format(str(stats)))
        Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats, final_state, exit_code, start_time)
        logger.info("Added execution: {}; worker_id: {}".format(execution_id, worker_id))
        # Add the logs to the execution
        try:
            Execution.set_logs(execution_id, logs)
            logger.debug("Successfully added execution logs.")
        except Exception as e:
            msg = "Got exception trying to set logs for execution {}; " \
                  "Exception: {}; worker_id: {}".format(execution_id, e, worker_id)
            logger.error(msg)
        # Update the worker's last updated and last execution fields:
        try:
            Worker.update_worker_execution_time(actor_id, worker_id)
            logger.debug("worker execution time updated. worker_id: {}".format(worker_id))
        except KeyError:
            # it is possible that this worker was sent a graceful shutdown command in the other thread
            # and that spawner has already removed this worker from the store.
            logger.info("worker {} got unexpected key error trying to update its execution time. "
                        "Worker better be shutting down! keep_running: {}".format(worker_id, globals.keep_running))
            if globals.keep_running:
                logger.error("worker couldn't update its execution time but keep_running is still true!")
        # we completed an execution successfully; reset the consecutive_errors counter
        consecutive_errors = 0
        logger.info("worker time stamps updated; worker_id: {}".format(worker_id))
    logger.info("globals.keep_running no longer true. worker is now exited. worker id: {}".format(worker_id))
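# Hedged sketch of the consecutive-error guard used in the subscribe loop above:
# transient failures requeue the message and bump a counter; a success resets it;
# crossing the threshold trips the permanent-failure path. Names and the
# threshold value are illustrative, not taken from Abaco's config.
MAX_CONSECUTIVE_ERRORS = 3

def run_loop(outcomes):
    """outcomes: list of booleans, True = execution succeeded."""
    consecutive_errors = 0
    for ok in outcomes:
        if not ok:
            consecutive_errors += 1
            if consecutive_errors > MAX_CONSECUTIVE_ERRORS:
                return "shutdown"          # analogous to Actor.set_status(ERROR) + shutdown_workers
            continue                       # analogous to nack(requeue=True) + sleep + retry
        consecutive_errors = 0             # a successful execution resets the counter
    return "healthy"

assert run_loop([False, False, True, False]) == "healthy"
assert run_loop([False] * 4) == "shutdown"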
def subscribe(tenant, actor_id, worker_id, api_server, client_id, client_secret, access_token,
              refresh_token, worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when messages arrive. Also subscribes to the worker channel for future communications.
    :return:
    """
    logger.debug("Top of subscribe().")
    actor_ch = ActorMsgChannel(actor_id)
    ag = None
    if api_server and client_id and client_secret and access_token and refresh_token:
        logger.info("Creating agave client.")
        ag = Agave(api_server=api_server,
                   token=access_token,
                   refresh_token=refresh_token,
                   api_key=client_id,
                   api_secret=client_secret)
    else:
        logger.info("Not creating agave client.")
    logger.info("Starting the process worker channel thread.")
    t = threading.Thread(target=process_worker_ch, args=(tenant, worker_ch, actor_id, worker_id, actor_ch, ag))
    t.start()
    logger.info("Worker subscribing to actor channel.")
    update_worker_status = True
    global keep_running
    while keep_running:
        if update_worker_status:
            Worker.update_worker_status(actor_id, worker_id, READY)
            update_worker_status = False
        try:
            msg = actor_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        except channelpy.ChannelClosedException:
            logger.info("Channel closed, worker exiting...")
            keep_running = False
            sys.exit()
        update_worker_status = True
        logger.info("Received message {}. Starting actor container...".format(msg))
        # the msg object is a dictionary with an entry called message and an arbitrary
        # set of k:v pairs coming in from the query parameters.
        message = msg.pop('message', '')
        actor = Actor.from_db(actors_store[actor_id])
        execution_id = msg['_abaco_execution_id']
        # the execution object was created by the controller, but we need to add the worker id to it now that we
        # know which worker will be working on the execution.
        logger.debug("Adding worker_id to execution.")
        Execution.add_worker_id(actor_id, execution_id, worker_id)
        # privileged dictates whether the actor container runs in privileged mode and if docker daemon is mounted.
        privileged = False
        if actor['privileged'] == 'TRUE':
            privileged = True
        logger.debug("privileged: {}".format(privileged))
        # retrieve the default environment registered with the actor.
        environment = actor['default_environment']
        logger.debug("Actor default environment: {}".format(environment))
        # overlay the default_environment registered for the actor with the msg
        # dictionary
        environment.update(msg)
        environment['_abaco_access_token'] = ''
        environment['_abaco_actor_dbid'] = actor_id
        environment['_abaco_actor_id'] = actor.id
        environment['_abaco_actor_state'] = actor.state
        logger.debug("Overlayed environment: {}".format(environment))
        # if we have an agave client, get a fresh set of tokens:
        if ag:
            try:
                ag.token.refresh()
                token = ag.token.token_info['access_token']
                environment['_abaco_access_token'] = token
                logger.info("Refreshed the tokens. Passed {} to the environment.".format(token))
            except Exception as e:
                logger.error("Got an exception trying to get an access token: {}".format(e))
        else:
            logger.info("Agave client `ag` is None -- not passing access token.")
        logger.info("Passing updated environment: {}".format(environment))
        try:
            stats, logs, final_state, exit_code = execute_actor(actor_id, worker_id, worker_ch, image,
                                                                message, environment, privileged)
        except DockerStartContainerError as e:
            logger.error("Got DockerStartContainerError: {}".format(str(e)))
            Actor.set_status(actor_id, ERROR)
            continue
        # Add the completed stats to the execution
        logger.info("Actor container finished successfully. Got stats object: {}".format(str(stats)))
        Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats, final_state, exit_code)
        logger.info("Added execution: {}".format(execution_id))
        # Add the logs to the execution
        Execution.set_logs(execution_id, logs)
        logger.info("Added execution logs.")
        # Update the worker's last updated and last execution fields:
        Worker.update_worker_execution_time(actor_id, worker_id)
        logger.info("worker time stamps updated.")
def subscribe(tenant, actor_id, worker_id, api_server, client_id, client_secret, access_token,
              refresh_token, worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when messages arrive. Also subscribes to the worker channel for future communications.
    :return:
    """
    actor_ch = ActorMsgChannel(actor_id)
    ag = None
    if api_server and client_id and client_secret and access_token and refresh_token:
        ag = Agave(api_server=api_server,
                   token=access_token,
                   refresh_token=refresh_token,
                   api_key=client_id,
                   api_secret=client_secret)
    else:
        print("Not creating agave client.")
    t = threading.Thread(target=process_worker_ch, args=(tenant, worker_ch, actor_id, worker_id, actor_ch, ag))
    t.start()
    print("Worker subscribing to actor channel...")
    global keep_running
    while keep_running:
        Worker.update_worker_status(actor_id, worker_id, READY)
        try:
            msg = actor_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        except channelpy.ChannelClosedException:
            print("Channel closed, worker exiting...")
            keep_running = False
            sys.exit()
        print("Received message {}. Starting actor container...".format(str(msg)))
        # the msg object is a dictionary with an entry called message and an arbitrary
        # set of k:v pairs coming in from the query parameters.
        message = msg.pop('message', '')
        actor = Actor.from_db(actors_store[actor_id])
        execution_id = msg['_abaco_execution_id']
        Execution.add_worker_id(actor_id, execution_id, worker_id)
        privileged = False
        if actor['privileged'] == 'TRUE':
            privileged = True
        environment = actor['default_environment']
        print("Actor default environment: {}".format(environment))
        print("Actor privileged: {}".format(privileged))
        # overlay the default_environment registered for the actor with the msg
        # dictionary
        environment.update(msg)
        environment['_abaco_access_token'] = ''
        environment['_abaco_actor_dbid'] = actor_id
        environment['_abaco_actor_id'] = actor.id
        environment['_abaco_actor_state'] = actor.state
        # if we have an agave client, get a fresh set of tokens:
        if ag:
            try:
                ag.token.refresh()
                token = ag.token.token_info['access_token']
                environment['_abaco_access_token'] = token
                print("Refreshed the tokens. Passed {} to the environment.".format(token))
            except Exception as e:
                print("Got an exception trying to get an access token: {}".format(e))
        else:
            print("Agave client `ag` is None -- not passing access token.")
        print("Passing updated environment: {}".format(environment))
        try:
            stats, logs, final_state, exit_code = execute_actor(actor_id, worker_id, worker_ch, image,
                                                                message, environment, privileged)
        except DockerStartContainerError as e:
            print("Got DockerStartContainerError: {}".format(str(e)))
            Actor.set_status(actor_id, ERROR)
            continue
        # add the execution to the actor store
        print("Actor container finished successfully. Got stats object: {}".format(str(stats)))
        Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats, final_state, exit_code)
        print("Added execution: {}".format(execution_id))
        Execution.set_logs(execution_id, logs)
        Worker.update_worker_execution_time(actor_id, worker_id)