Exemple #1
0
 def get_agave(self, tenant, actor_owner):
     """
     Build an agavepy client that acts on behalf of the user owning an actor.

     The service credentials stored under the upper-cased `tenant` key of `self.credentials`
     are used to authenticate, and `actor_owner` (the username of the actor's owner) is passed
     to the client as `token_username`.

     Returns a tuple `(api_server, client)`.
     Raises ClientException when either service credential is the empty string or when the
     Agave object cannot be instantiated.
     """
     # credentials of the abaco service account; this account should have the abaco and
     # impersonator roles.
     service_creds = self.credentials[tenant.upper()]
     username = service_creds['username']
     password = service_creds['password']
     credentials_missing = username == '' or password == ''
     if credentials_missing:
         msg = 'Client service credentials not defined for tenant {}'.format(tenant)
         logger.error(msg)
         raise ClientException(msg)

     api_server = get_api_server(tenant)
     verify = get_tenant_verify(tenant)
     # the client authenticates as the service account but represents the actor owner.
     logger.info("Attempting to generate an agave client.")
     try:
         client = Agave(api_server=api_server,
                        username=username,
                        password=password,
                        token_username=actor_owner,
                        verify=verify)
     except Exception as e:
         # any failure while building the client is reported as a ClientException.
         msg = "Got exception trying to instantiate Agave object; exception: {}".format(e)
         logger.error(msg)
         raise ClientException(msg)
     return api_server, client
Exemple #2
0
def subscribe(tenant,
              actor_id,
              worker_id,
              api_server,
              client_id,
              client_secret,
              access_token,
              refresh_token,
              worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when messages arrive. Also starts a thread running `process_worker_ch` to handle
    future communications on the worker channel.

    :param tenant: tenant of the actor; used to look up the `verify` setting for the agave client
        and passed through to the worker channel thread.
    :param actor_id: id of the actor; used for the actor message channel, the `actors_store`
        lookup and all status/execution updates.
    :param worker_id: id of this worker; used in status updates and in the socket/fifo paths.
    :param api_server: with `client_id`, `client_secret`, `access_token` and `refresh_token`,
        used to build an Agave client. If any of the five is falsy no client is created and an
        empty access token is passed to the actor container.
    :param client_id: passed to the Agave client as `api_key`.
    :param client_secret: passed to the Agave client as `api_secret`.
    :param access_token: passed to the Agave client as `token`.
    :param refresh_token: passed to the Agave client as `refresh_token`.
    :param worker_ch: worker channel object handed to the `process_worker_ch` thread.
    :return: None. The loop runs while the module-level `keep_running` flag is true;
        `sys.exit()` is called if the actor channel is closed.
    """
    logger.debug("Top of subscribe().")
    actor_ch = ActorMsgChannel(actor_id)
    try:
        leave_containers = Config.get('workers', 'leave_containers')
    except configparser.NoOptionError:
        logger.info("No leave_containers value confiured.")
        leave_containers = False
    # the config value may come back as a string; normalize it to a bool.
    if hasattr(leave_containers, 'lower'):
        leave_containers = leave_containers.lower() == "true"
    logger.info("leave_containers: {}".format(leave_containers))
    ag = None
    if api_server and client_id and client_secret and access_token and refresh_token:
        logger.info("Creating agave client.")
        verify = get_tenant_verify(tenant)
        ag = Agave(api_server=api_server,
                   token=access_token,
                   refresh_token=refresh_token,
                   api_key=client_id,
                   api_secret=client_secret,
                   verify=verify)
    else:
        logger.info("Not creating agave client.")
    logger.info("Starting the process worker channel thread.")
    t = threading.Thread(target=process_worker_ch, args=(tenant, worker_ch, actor_id, worker_id, actor_ch, ag))
    t.start()
    logger.info("Worker subscribing to actor channel.")

    # keep track of whether we need to update the worker's status back to READY; otherwise, we
    # will hit redis with an UPDATE every time the subscription loop times out (i.e., every 2s)
    update_worker_status = True

    # shared global tracking whether this worker should keep running; shared between this thread and
    # the "worker channel processing" thread.
    global keep_running

    # main subscription loop -- processing messages from actor's mailbox
    while keep_running:
        if update_worker_status:
            Worker.update_worker_status(actor_id, worker_id, READY)
            update_worker_status = False
        try:
            msg = actor_ch.get_one()
        except channelpy.ChannelClosedException:
            logger.info("Channel closed, worker exiting...")
            keep_running = False
            sys.exit()
        logger.info("worker {} processing new msg.".format(worker_id))
        try:
            Worker.update_worker_status(actor_id, worker_id, BUSY)
        except Exception as e:
            logger.error("unexpected exception from call to update_worker_status."
                         "actor_id: {}; worker_id: {}; status: {}; exception: {}".format(actor_id,
                                                                                         worker_id,
                                                                                         BUSY,
                                                                                         e))
            raise e
        # the worker is BUSY now, so the next pass through the loop must set it back to READY.
        update_worker_status = True
        logger.info("Received message {}. Starting actor container...".format(msg))
        # the msg object is a dictionary with an entry called message and an arbitrary
        # set of k:v pairs coming in from the query parameters.
        message = msg.pop('message', '')
        # the actor record is re-read for every message.
        actor = Actor.from_db(actors_store[actor_id])
        execution_id = msg['_abaco_execution_id']
        content_type = msg['_abaco_Content_Type']
        mounts = actor.mounts
        logger.debug("actor mounts: {}".format(mounts))
        # for results, create a socket in the configured directory.
        try:
            socket_host_path_dir = Config.get('workers', 'socket_host_path_dir')
        except (configparser.NoSectionError, configparser.NoOptionError):
            logger.error("No socket_host_path configured. Cannot manage results data.")
            Actor.set_status(actor_id, ERROR, msg="Abaco instance not configured for results data.")
            continue
        socket_host_path = '{}.sock'.format(os.path.join(socket_host_path_dir, worker_id, execution_id))
        logger.info("Create socket at path: {}".format(socket_host_path))
        # add the socket as a mount:
        mounts.append({'host_path': socket_host_path,
                       'container_path': '/_abaco_results.sock',
                       'format': 'ro'})
        # for binary data, create a fifo in the configured directory. The configured
        # fifo_host_path_dir is equal to the fifo path in the worker container:
        fifo_host_path = None
        if content_type == 'application/octet-stream':
            try:
                fifo_host_path_dir = Config.get('workers', 'fifo_host_path_dir')
            except (configparser.NoSectionError, configparser.NoOptionError):
                logger.error("No fifo_host_path configured. Cannot manage binary data.")
                Actor.set_status(actor_id, ERROR, msg="Abaco instance not configured for binary data.")
                continue
            fifo_host_path = os.path.join(fifo_host_path_dir, worker_id, execution_id)
            try:
                os.mkfifo(fifo_host_path)
                logger.info("Created fifo at path: {}".format(fifo_host_path))
            except Exception as e:
                logger.error("Could not create fifo_path. Exception: {}".format(e))
                raise e
            # add the fifo as a mount:
            mounts.append({'host_path': fifo_host_path,
                           'container_path': '/_abaco_binary_data',
                           'format': 'ro'})

        # the execution object was created by the controller, but we need to add the worker id to it now that we
        # know which worker will be working on the execution.
        logger.debug("Adding worker_id to execution.")
        Execution.add_worker_id(actor_id, execution_id, worker_id)

        # privileged dictates whether the actor container runs in privileged mode and if docker daemon is mounted.
        # only a genuine boolean True enables it; any other stored value leaves it off.
        privileged = False
        if type(actor['privileged']) == bool and actor['privileged']:
            privileged = True
        logger.debug("privileged: {}".format(privileged))

        # retrieve the default environment registered with the actor.
        environment = actor['default_environment']
        logger.debug("Actor default environment: {}".format(environment))

        # construct the user field from the actor's uid and gid:
        user = get_container_user(actor)
        logger.debug("Final user valiue: {}".format(user))
        # overlay the default_environment registered for the actor with the msg
        # dictionary
        environment.update(msg)
        # the token entry is reset after the overlay, so a value supplied in the message is dropped.
        environment['_abaco_access_token'] = ''
        environment['_abaco_actor_dbid'] = actor_id
        environment['_abaco_actor_id'] = actor.id
        environment['_abaco_actor_state'] = actor.state
        logger.debug("Overlayed environment: {}".format(environment))

        # if we have an agave client, get a fresh set of tokens:
        if ag:
            try:
                ag.token.refresh()
                token = ag.token.token_info['access_token']
                environment['_abaco_access_token'] = token
                # the token is a credential: record that it was passed, never its value.
                logger.info("Refreshed the tokens. Passed the new access token to the environment.")
            except Exception as e:
                # best effort: on failure the container still runs, with an empty access token.
                logger.error("Got an exception trying to get an access token: {}".format(e))
        else:
            logger.info("Agave client `ag` is None -- not passing access token.")
        # log a copy of the environment with the access token masked so it never reaches the logs.
        loggable_environment = dict(environment)
        if loggable_environment.get('_abaco_access_token'):
            loggable_environment['_abaco_access_token'] = '<redacted>'
        logger.info("Passing update environment: {}".format(loggable_environment))
        try:
            # NOTE(review): `image` is neither a parameter nor assigned in this function, so it must
            # resolve to a module-level name -- confirm it exists, otherwise this call raises NameError.
            stats, logs, final_state, exit_code, start_time = execute_actor(actor_id,
                                                                            worker_id,
                                                                            execution_id,
                                                                            image,
                                                                            message,
                                                                            user,
                                                                            environment,
                                                                            privileged,
                                                                            mounts,
                                                                            leave_containers,
                                                                            fifo_host_path,
                                                                            socket_host_path)
        except DockerStartContainerError as e:
            logger.error("Got DockerStartContainerError: {}".format(e))
            Actor.set_status(actor_id, ERROR, "Error executing container: {}".format(e))
            continue
        # Add the completed stats to the execution
        logger.info("Actor container finished successfully. Got stats object:{}".format(str(stats)))
        Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats, final_state, exit_code, start_time)
        logger.info("Added execution: {}".format(execution_id))

        # Add the logs to the execution
        Execution.set_logs(execution_id, logs)
        logger.info("Added execution logs.")

        # Update the worker's last updated and last execution fields:
        try:
            Worker.update_worker_execution_time(actor_id, worker_id)
        except KeyError:
            # it is possible that this worker was sent a gracful shutdown command in the other thread
            # and that spawner has already removed this worker from the store.
            logger.info("worker {} got unexpected key error trying to update its execution time. "
                        "Worker better be shutting down! keep_running: {}".format(worker_id, keep_running))
            if keep_running:
                logger.error("worker couldn't update's its execution time but keep_running is still true!")

        logger.info("worker time stamps updated.")
Exemple #3
0
def clean_up_apim_clients(tenant):
    """Delete APIM clients whose worker no longer exists or is in ERROR status.

    The service account credentials are read from the `_abaco_<tenant>_username` and
    `_abaco_<tenant>_password` environment variables. If either is missing, or if the list of
    clients cannot be retrieved, the function logs the problem and returns without deleting
    anything. Only clients whose name is an abaco hash id (i.e., a worker id) are considered.

    Always returns None.
    """
    def _report_missing_credential(msg):
        # a missing credential is logged as an error only for the tenants listed here;
        # for every other tenant it is logged at info level.
        if tenant in ['SD2E', 'TACC-PROD']:
            logger.error(msg)
        else:
            logger.info(msg)

    username = os.environ.get('_abaco_{}_username'.format(tenant), '')
    password = os.environ.get('_abaco_{}_password'.format(tenant), '')
    if not username:
        _report_missing_credential(
            "Health process did not get a username for tenant {}; "
            "returning from clean_up_apim_clients".format(tenant))
        return None
    if not password:
        _report_missing_credential(
            "Health process did not get a password for tenant {}; "
            "returning from clean_up_apim_clients".format(tenant))
        return None

    ag = Agave(api_server=get_api_server(tenant),
               username=username,
               password=password,
               verify=get_tenant_verify(tenant))
    logger.debug("health process created an ag for tenant: {}".format(tenant))
    try:
        clients = ag.clients.list().json()['result']
    except Exception as e:
        msg = "Health process got an exception trying to retrieve clients; exception: {}".format(e)
        logger.error(msg)
        return None

    for client in clients:
        # clients not named after an abaco hash (i.e., a worker id) were not created for a
        # worker, so they are ignored from the beginning.
        name = client.get('name')
        if not is_hashid(name):
            logger.debug("client {} is not an abaco hash id; skipping.".format(name))
            continue
        # the client came from a worker; if that worker still exists its id is the client name.
        worker = get_worker(name)
        if worker:
            # the worker exists, so its client is only removed when the worker is in ERROR.
            status = worker.get('status')
            if status != codes.ERROR:
                logger.debug("worker {} still active; not deleting client.".format(worker))
                continue
            logger.info("worker {} was in ERROR status so deleting client; worker: {}."
                        .format(name, worker))
        else:
            logger.info("no worker associated with id: {}; deleting client.".format(name))
        delete_client(ag, name)
        logger.info("client {} deleted by health process.".format(name))
Exemple #4
0
def subscribe(tenant, actor_id, image, worker_id, api_server, client_id,
              client_secret, access_token, refresh_token, worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when messages arrive. Also launches a separate thread which ultimately subscribes to the worker channel
    for future communications.

    Message handling, as implemented below:
      * a failure while preparing an execution (status update, actor lookup, socket/fifo setup,
        adding the worker id, token refresh) nacks the message with requeue and re-raises;
      * DockerStartContainerError nacks the message with requeue and keeps the worker up, until
        more than MAX_WORKER_CONSECUTIVE_ERRORS consecutive failures have occurred, at which point
        the actor is put in ERROR status and its workers are shut down;
      * DockerStopContainerError or any other exception from `execute_actor` acks the message,
        puts the actor in ERROR status and shuts its workers down;
      * a successful execution acks the message and records stats and logs.

    :param tenant: tenant of the actor; used to look up the `verify` setting for the agave client
        and passed through to the worker channel thread.
    :param actor_id: id of the actor; used for the actor message channel, the `actors_store`
        lookup and all status/execution updates.
    :param image: passed unchanged to `execute_actor`.
    :param worker_id: id of this worker; used in status updates, log messages and in the
        socket/fifo paths.
    :param api_server: with `client_id`, `client_secret`, `access_token` and `refresh_token`,
        used to build an Agave client. If any of the five is falsy no client is created and an
        empty access token is passed to the actor container.
    :param client_id: passed to the Agave client as `api_key`.
    :param client_secret: passed to the Agave client as `api_secret`.
    :param access_token: passed to the Agave client as `token`.
    :param refresh_token: passed to the Agave client as `refresh_token`.
    :param worker_ch: worker channel object handed to the `process_worker_ch` thread.
    :return: None.
    """
    logger.debug("Top of subscribe(). worker_id: {}".format(worker_id))
    actor_ch = ActorMsgChannel(actor_id)
    # establish configs for this worker -------
    try:
        leave_containers = Config.get('workers', 'leave_containers')
    except configparser.NoOptionError:
        logger.debug("No leave_containers value configured.")
        leave_containers = False
    # the config value may come back as a string; normalize it to a bool.
    if hasattr(leave_containers, 'lower'):
        leave_containers = leave_containers.lower() == "true"
    logger.debug("leave_containers: {}".format(leave_containers))

    try:
        mem_limit = Config.get('workers', 'mem_limit')
    except configparser.NoOptionError:
        logger.debug("No mem_limit value configured.")
        mem_limit = "-1"
    mem_limit = str(mem_limit)

    try:
        max_cpus = Config.get('workers', 'max_cpus')
    except configparser.NoOptionError:
        logger.debug("No max_cpus value configured.")
        max_cpus = "-1"

    logger.debug("max_cpus: {}".format(max_cpus))

    # instantiate an OAuth client python object if credentials were passed -----
    ag = None
    if api_server and client_id and client_secret and access_token and refresh_token:
        logger.info("Creating agave client.")
        verify = get_tenant_verify(tenant)
        ag = Agave(api_server=api_server,
                   token=access_token,
                   refresh_token=refresh_token,
                   api_key=client_id,
                   api_secret=client_secret,
                   verify=verify)
    else:
        logger.info("Not creating agave client.")

    # start a separate thread for handling messages sent to the worker channel ----
    logger.info("Starting the process worker channel thread.")
    t = threading.Thread(target=process_worker_ch,
                         args=(tenant, worker_ch, actor_id, worker_id,
                               actor_ch, ag))
    t.start()

    # subscribe to the actor message queue -----
    logger.info(
        "Worker subscribing to actor channel. worker_id: {}".format(worker_id))
    # keep track of whether we need to update the worker's status back to READY; otherwise, we
    # will hit redis with an UPDATE every time the subscription loop times out (i.e., every 2s)
    update_worker_status = True

    # global tracks whether this worker should keep running.
    globals.keep_running = True

    # consecutive_errors tracks the number of consecutive times a worker has gotten an error trying to process a
    # message. Even though the message will be requeued, we do not want the worker to continue processing
    # indefinitely when a compute node is unhealthy.
    consecutive_errors = 0

    # main subscription loop -- processing messages from actor's mailbox
    while globals.keep_running:
        logger.debug("top of keep_running; worker id: {}".format(worker_id))
        if update_worker_status:
            Worker.update_worker_status(actor_id, worker_id, READY)
            logger.debug(
                "updated worker status to READY in SUBSCRIBE; worker id: {}".
                format(worker_id))
            update_worker_status = False
        try:
            # msg is the message payload; msg_obj is used below to ack/nack the message.
            msg, msg_obj = actor_ch.get_one()
        except channelpy.ChannelClosedException:
            logger.info("Channel closed, worker exiting. worker id: {}".format(
                worker_id))
            globals.keep_running = False
            sys.exit()
        logger.info("worker {} processing new msg.".format(worker_id))

        try:
            Worker.update_worker_status(actor_id, worker_id, BUSY)
        except Exception as e:
            logger.error(
                "unexpected exception from call to update_worker_status. Nacking message."
                "actor_id: {}; worker_id: {}; status: {}; exception: {}".
                format(actor_id, worker_id, BUSY, e))
            logger.info("worker exiting. worker_id: {}".format(worker_id))
            msg_obj.nack(requeue=True)
            raise e
        # the worker is BUSY now, so the next pass through the loop must set it back to READY.
        update_worker_status = True
        logger.info(
            "Received message {}. Starting actor container. worker id: {}".
            format(msg, worker_id))
        # the msg object is a dictionary with an entry called message and an arbitrary
        # set of k:v pairs coming in from the query parameters.
        message = msg.pop('message', '')
        try:
            # the actor record is re-read for every message.
            actor = Actor.from_db(actors_store[actor_id])
            execution_id = msg['_abaco_execution_id']
            content_type = msg['_abaco_Content_Type']
            mounts = actor.mounts
            logger.debug("actor mounts: {}".format(mounts))
        except Exception as e:
            logger.error(
                "unexpected exception retrieving actor, execution, content-type, mounts. Nacking message."
                "actor_id: {}; worker_id: {}; status: {}; exception: {}".
                format(actor_id, worker_id, BUSY, e))
            msg_obj.nack(requeue=True)
            logger.info("worker exiting. worker_id: {}".format(worker_id))
            raise e

        # for results, create a socket in the configured directory.
        try:
            socket_host_path_dir = Config.get('workers',
                                              'socket_host_path_dir')
        except (configparser.NoSectionError, configparser.NoOptionError) as e:
            logger.error(
                "No socket_host_path configured. Cannot manage results data. Nacking message"
            )
            Actor.set_status(
                actor_id,
                ERROR,
                status_message="Abaco instance not configured for results data."
            )
            msg_obj.nack(requeue=True)
            logger.info("worker exiting. worker_id: {}".format(worker_id))
            raise e
        socket_host_path = '{}.sock'.format(
            os.path.join(socket_host_path_dir, worker_id, execution_id))
        logger.info("Create socket at path: {}".format(socket_host_path))
        # add the socket as a mount:
        mounts.append({
            'host_path': socket_host_path,
            'container_path': '/_abaco_results.sock',
            'format': 'ro'
        })
        # for binary data, create a fifo in the configured directory. The configured
        # fifo_host_path_dir is equal to the fifo path in the worker container:
        fifo_host_path = None
        if content_type == 'application/octet-stream':
            try:
                fifo_host_path_dir = Config.get('workers',
                                                'fifo_host_path_dir')
            except (configparser.NoSectionError,
                    configparser.NoOptionError) as e:
                logger.error(
                    "No fifo_host_path configured. Cannot manage binary data.")
                Actor.set_status(
                    actor_id,
                    ERROR,
                    status_message=
                    "Abaco instance not configured for binary data. Nacking message."
                )
                msg_obj.nack(requeue=True)
                logger.info("worker exiting. worker_id: {}".format(worker_id))
                raise e
            fifo_host_path = os.path.join(fifo_host_path_dir, worker_id,
                                          execution_id)
            try:
                os.mkfifo(fifo_host_path)
                logger.info("Created fifo at path: {}".format(fifo_host_path))
            except Exception as e:
                logger.error(
                    "Could not create fifo_path. Nacking message. Exception: {}"
                    .format(e))
                msg_obj.nack(requeue=True)
                logger.info("worker exiting. worker_id: {}".format(worker_id))
                raise e
            # add the fifo as a mount:
            mounts.append({
                'host_path': fifo_host_path,
                'container_path': '/_abaco_binary_data',
                'format': 'ro'
            })

        # the execution object was created by the controller, but we need to add the worker id to it now that we
        # know which worker will be working on the execution.
        logger.debug(
            "Adding worker_id to execution. woker_id: {}".format(worker_id))
        try:
            Execution.add_worker_id(actor_id, execution_id, worker_id)
        except Exception as e:
            logger.error(
                "Unexpected exception adding working_id to the Execution. Nacking message. Exception: {}"
                .format(e))
            msg_obj.nack(requeue=True)
            logger.info("worker exiting. worker_id: {}".format(worker_id))
            raise e

        # privileged dictates whether the actor container runs in privileged mode and if docker daemon is mounted.
        # only a genuine boolean True enables it; any other stored value leaves it off.
        privileged = False
        if type(actor['privileged']) == bool and actor['privileged']:
            privileged = True
        logger.debug("privileged: {}; worker_id: {}".format(
            privileged, worker_id))

        # overlay resource limits if set on actor. per-message names are used so that a limit taken
        # from the actor for one execution never replaces the worker-level default for later ones.
        execution_mem_limit = actor.mem_limit or mem_limit
        execution_max_cpus = actor.max_cpus or max_cpus

        # retrieve the default environment registered with the actor.
        environment = actor['default_environment']
        logger.debug("Actor default environment: {}".format(environment))

        # construct the user field from the actor's uid and gid:
        user = get_container_user(actor)
        logger.debug("Final user valiue: {}".format(user))
        # overlay the default_environment registered for the actor with the msg
        # dictionary
        environment.update(msg)
        # the token entry is reset after the overlay, so a value supplied in the message is dropped.
        environment['_abaco_access_token'] = ''
        environment['_abaco_actor_dbid'] = actor_id
        environment['_abaco_actor_id'] = actor.id
        environment['_abaco_worker_id'] = worker_id
        environment['_abaco_container_repo'] = actor.image
        environment['_abaco_actor_state'] = actor.state
        environment['_abaco_actor_name'] = actor.name or 'None'
        logger.debug("Overlayed environment: {}; worker_id: {}".format(
            environment, worker_id))

        # if we have an agave client, get a fresh set of tokens:
        if ag:
            try:
                ag.token.refresh()
                token = ag.token.token_info['access_token']
                environment['_abaco_access_token'] = token
                # the token is a credential: record that it was passed, never its value.
                logger.info(
                    "Refreshed the tokens. Passed the new access token to the environment.")
            except Exception as e:
                logger.error(
                    "Got an exception trying to get an access token. Stoping worker and nacking message. "
                    "Exception: {}".format(e))
                msg_obj.nack(requeue=True)
                logger.info("worker exiting. worker_id: {}".format(worker_id))
                raise e
        else:
            logger.info(
                "Agave client `ag` is None -- not passing access token; worker_id: {}"
                .format(worker_id))
        # log a copy of the environment with the access token masked so it never reaches the logs.
        loggable_environment = dict(environment)
        if loggable_environment.get('_abaco_access_token'):
            loggable_environment['_abaco_access_token'] = '<redacted>'
        logger.info("Passing update environment: {}".format(loggable_environment))
        logger.info("About to execute actor; worker_id: {}".format(worker_id))
        try:
            stats, logs, final_state, exit_code, start_time = execute_actor(
                actor_id, worker_id, execution_id, image, message, user,
                environment, privileged, mounts, leave_containers,
                fifo_host_path, socket_host_path, execution_mem_limit,
                execution_max_cpus)
        except DockerStartContainerError as e:
            logger.error(
                "Worker {} got DockerStartContainerError: {} trying to start actor for execution {}."
                "Placing message back on queue.".format(
                    worker_id, e, execution_id))
            # if we failed to start the actor container, we leave the worker up and re-queue the original message
            msg_obj.nack(requeue=True)
            logger.debug('message requeued.')
            consecutive_errors += 1
            if consecutive_errors > MAX_WORKER_CONSECUTIVE_ERRORS:
                logger.error(
                    "Worker {} failed to successfully start actor for execution {} {} consecutive times; "
                    "Exception: {}. Putting the actor in error status and shutting "
                    "down workers.".format(worker_id, execution_id,
                                           MAX_WORKER_CONSECUTIVE_ERRORS, e))
                Actor.set_status(actor_id, ERROR,
                                 "Error executing container: {}; w".format(e))
                shutdown_workers(actor_id, delete_actor_ch=False)
                # wait for worker to be shutdown..
                time.sleep(60)
                break
            else:
                # sleep five seconds before getting a message again to give time for the compute
                # node and/or docker health to recover
                time.sleep(5)
                continue
        except DockerStopContainerError as e:
            logger.error(
                "Worker {} was not able to stop actor for execution: {}; Exception: {}. "
                "Putting the actor in error status and shutting down workers.".
                format(worker_id, execution_id, e))
            Actor.set_status(actor_id, ERROR,
                             "Error executing container: {}".format(e))
            # since the error was with stopping the actor, we will consider this message "processed"; this choice
            # could be reconsidered/changed
            msg_obj.ack()
            shutdown_workers(actor_id, delete_actor_ch=False)
            # wait for worker to be shutdown..
            time.sleep(60)
            break
        except Exception as e:
            logger.error(
                "Worker {} got an unexpected exception trying to run actor for execution: {}."
                "Putting the actor in error status and shutting down workers. "
                "Exception: {}; type: {}".format(worker_id, execution_id, e,
                                                 type(e)))
            Actor.set_status(actor_id, ERROR,
                             "Error executing container: {}".format(e))
            # the execute_actor function raises a DockerStartContainerError if it met an exception before starting the
            # actor container; if the container was started, then another exception should be raised. Therefore,
            # we can assume here that the container was at least started and we can ack the message.
            msg_obj.ack()
            shutdown_workers(actor_id, delete_actor_ch=False)
            # wait for worker to be shutdown..
            time.sleep(60)
            break
        # ack the message
        msg_obj.ack()
        logger.debug(
            "container finished successfully; worker_id: {}".format(worker_id))
        # Add the completed stats to the execution
        logger.info(
            "Actor container finished successfully. Got stats object:{}".
            format(str(stats)))
        Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats,
                                     final_state, exit_code, start_time)
        logger.info("Added execution: {}; worker_id: {}".format(
            execution_id, worker_id))

        # Add the logs to the execution; a failure here is logged but does not stop the worker.
        try:
            Execution.set_logs(execution_id, logs)
            logger.debug("Successfully added execution logs.")
        except Exception as e:
            msg = "Got exception trying to set logs for exception {}; " \
                  "Exception: {}; worker_id: {}".format(execution_id, e, worker_id)
            logger.error(msg)

        # Update the worker's last updated and last execution fields:
        try:
            Worker.update_worker_execution_time(actor_id, worker_id)
            logger.debug("worker execution time updated. worker_id: {}".format(
                worker_id))
        except KeyError:
            # it is possible that this worker was sent a gracful shutdown command in the other thread
            # and that spawner has already removed this worker from the store.
            logger.info(
                "worker {} got unexpected key error trying to update its execution time. "
                "Worker better be shutting down! keep_running: {}".format(
                    worker_id, globals.keep_running))
            if globals.keep_running:
                logger.error(
                    "worker couldn't update's its execution time but keep_running is still true!"
                )

        # we completed an execution successfully; reset the consecutive_errors counter
        consecutive_errors = 0
        logger.info(
            "worker time stamps updated; worker_id: {}".format(worker_id))
    logger.info(
        "global.keep_running no longer true. worker is now exited. worker id: {}"
        .format(worker_id))