Example #1
0
 def start_worker(self, image, tenant, worker_id):
     """Start a worker container and wait for its first message.

     Creates a new WorkerChannel, launches the worker via ``run_worker`` and
     blocks on the channel until the worker responds.

     Returns:
         A tuple ``(ch, reply_to, worker)``: the channel, the ``reply_to``
         entry of the worker's message, and the Worker built from the
         ``run_worker`` result.

     Raises:
         SpawnerException: if the worker reports an error or sends a message
             whose status is not ``'ok'``.
     """
     ch = WorkerChannel()
     # start an actor executor container and wait for a confirmation that image was pulled.
     worker_dict = run_worker(image, ch.name, worker_id)
     worker = Worker(tenant=tenant, **worker_dict)
     print(
         "worker started successfully, waiting on ack that image was pulled..."
     )
     result = ch.get()
     if result.get('status') == 'error':
         # there was a problem pulling the image; put the actor in an error state.
         # NOTE: this must be `.format(...)`; a comma here builds a tuple
         # instead of interpolating the result into the message.
         msg = "got an error back from the worker. Message: {}".format(
             result)
         print(msg)
         if 'msg' in result:
             raise SpawnerException(message=result['msg'])
         raise SpawnerException(
             message="Internal error starting worker process.")
     elif result['value']['status'] == 'ok':
         print("received ack from worker.")
         return ch, result['reply_to'], worker
     else:
         msg = "Got an error status from worker: {}. Raising an exception.".format(
             str(result))
         print(msg)
         raise SpawnerException(msg)
Example #2
0
def shutdown_worker(ch_name):
    """Ask a single worker to shut down gracefully.

    Opens the worker channel named ``ch_name`` and puts a ``"stop"`` message
    on it, logging before and after.
    """
    logger.debug("shutdown_worker called for ch_name: {}".format(ch_name))
    worker_channel = WorkerChannel(name=ch_name)
    worker_channel.put("stop")
    sent_note = "A 'stop' message was sent to worker channel: {}".format(ch_name)
    logger.info(sent_note)
Example #3
0
 def delete(self, actor_id, ch_name):
     """Schedule the worker identified by (actor_id, ch_name) to be stopped.

     Looks the worker up first; a WorkerException from the lookup is turned
     into an APIException with code 404. Otherwise a "stop" message is put on
     the worker's channel and an ``ok`` response carrying the worker is
     returned.
     """
     try:
         target_worker = get_worker(actor_id, ch_name)
     except WorkerException as e:
         raise APIException(e.message, 404)
     WorkerChannel(name=ch_name).put("stop")
     return ok(result=target_worker, msg="Worker scheduled to be stopped.")
Example #4
0
 def start_worker(self, image):
     """Launch a worker for ``image`` and wait for its acknowledgement.

     Returns ``(channel, reply_to, worker)`` when the worker's message has
     status ``'ok'``; otherwise raises SpawnerException.
     """
     channel = WorkerChannel()
     # Launch the executor container, then block until it reports back.
     worker = run_worker(image, channel._name)
     print("worker started successfully, waiting on ack that image was pulled...")
     reply = channel.get()
     if reply['value']['status'] != 'ok':
         print("Got an error status from worker: {}. Raising an exception.".format(str(reply)))
         raise SpawnerException()
     print("received ack from worker.")
     return channel, reply['reply_to'], worker
Example #5
0
 def start_worker(self, image, tenant):
     """Launch a worker for ``image`` under ``tenant`` and wait for its ack.

     Returns ``(channel, reply_to, worker)`` when the worker's message has
     status ``"ok"``; otherwise raises SpawnerException.
     """
     channel = WorkerChannel()
     # Launch the executor container and wrap its description in a Worker.
     worker = Worker(tenant=tenant, **run_worker(image, channel.name))
     print("worker started successfully, waiting on ack that image was pulled...")
     reply = channel.get()
     if reply["value"]["status"] != "ok":
         print("Got an error status from worker: {}. Raising an exception.".format(str(reply)))
         raise SpawnerException()
     print("received ack from worker.")
     return channel, reply["reply_to"], worker
Example #6
0
 def start_worker(self, image, tenant, worker_id):
     """Start a worker container and wait for its first message.

     Opens the SpawnerWorkerChannel for ``worker_id``, launches the worker via
     ``run_worker`` and blocks on the channel until the worker responds.

     Returns:
         A tuple ``(ch, reply_to, worker)``: the spawner-worker channel, the
         ``reply_to`` entry of the worker's message, and the Worker built from
         the ``run_worker`` result.

     Raises:
         SpawnerException: if the worker reports an error or sends a message
             whose status is not ``'ok'``.
     """
     ch = SpawnerWorkerChannel(worker_id=worker_id)
     # start an actor executor container and wait for a confirmation that image was pulled.
     worker_dict = run_worker(image, worker_id)
     worker_dict['ch_name'] = WorkerChannel.get_name(worker_id)
     worker = Worker(tenant=tenant, **worker_dict)
     logger.info(
         "worker started successfully, waiting on ack that image was pulled..."
     )
     result = ch.get()
     logger.debug(
         "Got response back from worker. Response: {}".format(result))
     if result.get('status') == 'error':
         # there was a problem pulling the image; put the actor in an error state.
         # NOTE: this must be `.format(...)`; a comma here builds a tuple
         # instead of interpolating the result into the message.
         msg = "Got an error back from the worker. Message: {}".format(
             result)
         logger.info(msg)
         if 'msg' in result:
             raise SpawnerException(message=result['msg'])
         logger.error(
             "Spawner received invalid message from worker. 'msg' field missing. Message: {}"
             .format(result))
         raise SpawnerException(
             message="Internal error starting worker process.")
     elif result['value']['status'] == 'ok':
         logger.debug("received ack from worker.")
         return ch, result['reply_to'], worker
     else:
         msg = "Got an error status from worker: {}. Raising an exception.".format(
             str(result))
         # the placeholder is required, otherwise the offending message is
         # silently dropped from the log line.
         logger.error(
             "Spawner received an invalid message from worker. Message: {}".
             format(result))
         raise SpawnerException(msg)
Example #7
0
def shutdown_worker(worker_id, delete_actor_ch=True):
    """Ask a single worker to shut down gracefully.

    Sends "stop" on the worker's channel when ``delete_actor_ch`` is truthy,
    and "stop-no-delete" otherwise, then closes the channel.
    """
    logger.debug("top of shutdown_worker for worker_id: {}".format(worker_id))
    stop_command = "stop" if delete_actor_ch else "stop-no-delete"
    worker_channel = WorkerChannel(worker_id=worker_id)
    worker_channel.put(stop_command)
    logger.info("A 'stop' message was sent to worker: {}".format(worker_id))
    worker_channel.close()
Example #8
0
File: worker.py Project: TACC/abaco
def main(worker_ch_name, image):
    """Worker entry point: pull the image, handshake with the spawner, subscribe.

    A DockerError while pulling is reported on the worker channel and then
    re-raised. An error reply from the spawner raises WorkerException.
    """
    worker_ch = WorkerChannel(name=worker_ch_name)

    # Step 1: pull the image; tell the spawner if that fails.
    try:
        print("Worker pulling image {}...".format(image))
        pull_image(image)
    except DockerError as e:
        worker_ch.put({'status': 'error', 'msg': str(e)})
        raise e
    print("Image pulled successfully")

    # Step 2: report success and block until the spawner replies.
    print("Worker waiting on message from spawner...")
    result = worker_ch.put_sync({'status': 'ok'})
    if result['status'] == 'error':
        print("Worker received error message from spawner: {}. Quiting...".format(str(result)))
        raise WorkerException(str(result))

    actor_id = result.get('actor_id')
    tenant = result.get('tenant')
    print("Worker received ok from spawner. Message: {}, actor_id:{}".format(result, actor_id))

    # Step 3: collect client settings only when the spawner says there is a client.
    client_keys = ('api_server', 'client_id', 'client_secret', 'access_token', 'refresh_token')
    if result.get('client') == 'yes':
        client_args = [result.get(key) for key in client_keys]
    else:
        client_args = [None] * len(client_keys)
        print("Did not get client:yes, got client:{}".format(result.get('client')))

    # Step 4: mark the actor READY and start consuming messages.
    Actor.set_status(actor_id, READY)
    subscribe(tenant, actor_id, *client_args, worker_ch)
Example #9
0
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor.

    For every worker of ``actor_id`` that lives on this spawner's host:

    1. send a 'status' request on the worker channel (5 second timeout); if
       the worker times out or answers anything but 'ok', remove its container
       and delete the worker record;
    2. for healthy workers, shut the worker down when it is READY and has been
       idle longer than ``ttl`` seconds. A negative ``ttl`` means infinite
       life.
    """
    print("Checking health for actors: {}".format(actor_id))
    workers = Worker.get_workers(actor_id)
    print("workers: {}".format(workers))
    for _, worker in workers.items():
        # if the worker has only been requested, it will not have a host_id.
        if 'host_id' not in worker:
            # @todo- we will skip for now, but we need something more robust in case the worker is never claimed.
            continue
        # ignore workers on different hosts
        if not Config.get('spawner', 'host_id') == worker['host_id']:
            continue
        # first check if worker is responsive; if not, will need to manually kill
        print("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(name=worker['ch_name'])
        try:
            print("Issuing status check to channel: {}".format(worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            print("Worker did not respond, removing container and deleting worker.")
            try:
                rm_container(worker['cid'])
            except DockerError:
                # best effort: the worker record is deleted regardless.
                pass
            Worker.delete_worker(actor_id, worker['ch_name'])
            continue
        if not result == 'ok':
            print("Worker responded unexpectedly: {}, deleting worker.".format(result))
            rm_container(worker['cid'])
            Worker.delete_worker(actor_id, worker['ch_name'])
            # the worker no longer exists; do not run the ttl check (and a
            # possible shutdown request) against it.
            continue
        print("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life. Move on to the next worker rather
            # than returning, so the remaining workers are still health-checked.
            print("Infinite ttl configured; leaving worker")
            continue
        if worker['status'] == codes.READY and \
            worker['last_execution'] + ttl < time.time():
            # shutdown worker
            print("Shutting down worker beyond ttl.")
            shutdown_worker(worker['ch_name'])
        else:
            print("Worker still has life.")
Example #10
0
    def stop_workers(self, actor_id):
        """Stop existing workers; used when updating an actor's image.

        Does nothing when the workers store has no workers for ``actor_id``.
        """
        try:
            existing = workers_store[actor_id]
        except KeyError:
            existing = {}

        if len(existing.items()) == 0:
            return

        # Close the actor message channel first so the old workers cannot pull
        # any new messages.
        ActorMsgChannel(actor_id).close()

        # Then ask each worker to shut down gracefully.
        for _, worker in existing.items():
            WorkerChannel(name=worker['ch_name']).put('stop')
Example #11
0
def main(worker_ch_name, worker_id, image):
    """Worker entry point: pull the image, handshake with the spawner, subscribe.

    A DockerError while pulling is reported on the worker channel and then
    re-raised. An error reply from the spawner raises WorkerException.
    """
    worker_ch = WorkerChannel(name=worker_ch_name)

    # Step 1: pull the image; tell the spawner if that fails.
    try:
        print("Worker pulling image {}...".format(image))
        pull_image(image)
    except DockerError as e:
        worker_ch.put({'status': 'error', 'msg': str(e)})
        raise e
    print("Image pulled successfully")

    # Step 2: report success and block until the spawner replies.
    print("Worker waiting on message from spawner...")
    result = worker_ch.put_sync({'status': 'ok'})
    if result['status'] == 'error':
        print("Worker received error message from spawner: {}. Quiting...".format(str(result)))
        raise WorkerException(str(result))

    actor_id = result.get('actor_id')
    tenant = result.get('tenant')
    print("Worker received ok from spawner. Message: {}, actor_id:{}".format(result, actor_id))

    # Step 3: collect client settings only when the spawner says there is a client.
    client_keys = ('api_server', 'client_id', 'client_secret', 'access_token', 'refresh_token')
    if result.get('client') == 'yes':
        client_args = [result.get(key) for key in client_keys]
    else:
        client_args = [None] * len(client_keys)
        print("Did not get client:yes, got client:{}".format(result.get('client')))

    # Step 4: mark the actor READY and start consuming messages.
    Actor.set_status(actor_id, READY)
    subscribe(tenant, actor_id, worker_id, *client_args, worker_ch)
Example #12
0
File: health.py Project: TACC/abaco
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor.

    For every worker of ``actor_id`` that lives on this spawner's host:

    1. send a 'status' request on the worker channel (5 second timeout); if
       the worker times out or answers anything but 'ok', remove its container
       and delete the worker record;
    2. for healthy workers, shut the worker down when it is READY and has been
       idle longer than ``ttl`` seconds. A negative ``ttl`` means infinite
       life.
    """
    print("Checking health for actors: {}".format(actor_id))
    workers = Worker.get_workers(actor_id)
    print("workers: {}".format(workers))
    for _, worker in workers.items():
        # a worker record without a host_id cannot be matched to this host;
        # skip it instead of failing the whole check with a KeyError.
        if 'host_id' not in worker:
            continue
        # ignore workers on different hosts
        if not Config.get('spawner', 'host_id') == worker['host_id']:
            continue
        # first check if worker is responsive; if not, will need to manually kill
        print("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(name=worker['ch_name'])
        try:
            print("Issuing status check to channel: {}".format(worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            print("Worker did not respond, removing container and deleting worker.")
            try:
                rm_container(worker['cid'])
            except DockerError:
                # best effort: the worker record is deleted regardless.
                pass
            Worker.delete_worker(actor_id, worker['ch_name'])
            continue
        if not result == 'ok':
            print("Worker responded unexpectedly: {}, deleting worker.".format(result))
            rm_container(worker['cid'])
            Worker.delete_worker(actor_id, worker['ch_name'])
            # the worker no longer exists; do not run the ttl check (and a
            # possible shutdown request) against it.
            continue
        print("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life. Move on to the next worker rather
            # than returning, so the remaining workers are still health-checked.
            print("Infinite ttl configured; leaving worker")
            continue
        if worker['status'] == codes.READY and \
            worker['last_execution'] + ttl < time.time():
            # shutdown worker
            print("Shutting down worker beyond ttl.")
            shutdown_worker(worker['ch_name'])
        else:
            print("Worker still has life.")
Example #13
0
    def stop_workers(self, actor_id, worker_ids):
        """Stop existing workers; used when updating an actor's image.

        Workers whose id appears in ``worker_ids`` are left running. Does
        nothing when the workers store has no workers for ``actor_id``.
        """
        try:
            existing = workers_store[actor_id]
        except KeyError:
            existing = {}

        if len(existing.items()) == 0:
            return

        # Close the actor message channel first so the old workers cannot pull
        # any new messages.
        ActorMsgChannel(actor_id).close()

        # Then ask every worker that is not one of the new ones to shut down.
        for _, worker in existing.items():
            if worker['id'] in worker_ids:
                continue
            WorkerChannel(name=worker['ch_name']).put('stop')
Example #14
0
def main():
    """
    Main function for the worker process.

    Reads the worker configuration (worker_id, image, actor_id, revision and
    the optional client settings) from environment variables, waits for one
    message from the spawner on the SpawnerWorkerChannel, deletes that channel
    and then calls subscribe() with a WorkerChannel for this worker.

    Exits the process via sys.exit() when the ``revision`` environment
    variable is missing or is not an integer.
    """
    worker_id = os.environ.get('worker_id')
    image = os.environ.get('image')
    actor_id = os.environ.get('actor_id')
    revision = os.environ.get('revision')
    try:
        revision = int(revision)
    except (TypeError, ValueError):
        # int(None) raises TypeError when the variable is unset; a
        # non-numeric string raises ValueError. Both mean "no usable revision".
        logger.error(f"worker did not get an integer revision number; got: {revision}; "
                     f"worker {actor_id}+{worker_id} exiting.")
        sys.exit()
    client_id = os.environ.get('client_id', None)
    client_access_token = os.environ.get('client_access_token', None)
    client_refresh_token = os.environ.get('client_refresh_token', None)
    tenant = os.environ.get('tenant', None)
    api_server = os.environ.get('api_server', None)
    client_secret = os.environ.get('client_secret', None)

    logger.info(f"Top of main() for worker: {worker_id}, image: {image}; revision: {revision}; "
                f"actor_id: {actor_id}; client_id:{client_id}; tenant: {tenant}; api_server: {api_server}")
    spawner_worker_ch = SpawnerWorkerChannel(worker_id=worker_id)

    logger.debug("Worker waiting on message from spawner...")
    result = spawner_worker_ch.get_one()
    logger.debug("Worker received reply from spawner. result: {}.".format(result))

    # should be OK to delete the spawner_worker_ch on the worker side since spawner was first client
    # to open it.
    spawner_worker_ch.delete()
    logger.debug('spawner_worker_ch closed.')
    if not client_id:
        logger.info("Did not get client id.")
    else:
        logger.info(f"Got a client; client_id: {client_id}")

    logger.info(f"Actor {actor_id} status set to READY. subscribing to inbox.")
    worker_ch = WorkerChannel(worker_id=worker_id)
    subscribe(tenant,
              actor_id,
              image,
              revision,
              worker_id,
              api_server,
              client_id,
              client_secret,
              client_access_token,
              client_refresh_token,
              worker_ch)
Example #15
0
    def stop_workers(self, actor_id, worker_ids):
        """Stop existing workers; used when updating an actor's image.

        Closes the actor message channel and sends 'stop-no-delete' to every
        worker of ``actor_id`` whose id is not in ``worker_ids``. Workers
        listed in ``worker_ids`` (the newly started ones) are left running.
        Does nothing beyond logging when the actor has no workers.
        """
        logger.debug("Top of stop_workers() for actor: {}.".format(actor_id))
        try:
            workers_dict = workers_store[actor_id]
        except KeyError:
            logger.debug("workers_store had no workers for actor: {}".format(actor_id))
            workers_dict = {}

        # if there are existing workers, we need to close the actor message channel and
        # gracefully shutdown the existing worker processes.
        if len(workers_dict.items()) > 0:
            logger.info("Found {} workers to stop.".format(len(workers_dict.items())))
            # first, close the actor msg channel to prevent any new messages from being pulled
            # by the old workers.
            actor_ch = ActorMsgChannel(actor_id)
            actor_ch.close()
            logger.info("Actor channel closed for actor: {}".format(actor_id))
            # now, send messages to workers for a graceful shutdown:
            for _, worker in workers_dict.items():
                # don't stop the new workers:
                if worker['id'] not in worker_ids:
                    ch = WorkerChannel(worker_id=worker['id'])
                    # since this is an update, there are new workers being started, so
                    # don't delete the actor msg channel:
                    ch.put('stop-no-delete')
                    logger.info("Sent 'stop-no-delete' message to worker_id: {}".format(worker['id']))
                    ch.close()
                else:
                    # this branch runs when the worker IS one of the new workers.
                    logger.debug("skipping worker {} as it is in worker_ids.".format(worker))
        else:
            logger.info("No workers to stop.")
Example #16
0
def shutdown_worker(worker_id):
    """Ask a single worker to shut down gracefully.

    Puts a "stop" message on the worker's channel and then closes the channel.
    """
    logger.debug("top of shutdown_worker for worker_id: {}".format(worker_id))
    worker_channel = WorkerChannel(worker_id=worker_id)
    worker_channel.put("stop")
    sent_note = "A 'stop' message was sent to worker: {}".format(worker_id)
    logger.info(sent_note)
    worker_channel.close()
Example #17
0
    def stop_workers(self, actor_id):
        """Stop existing workers; used when updating an actor's image.

        The workers for ``actor_id`` are read as JSON from the workers store.
        Does nothing further when there are none.
        """
        try:
            workers = json.loads(workers_store[actor_id])
        except KeyError:
            print("No existing workers.")
            workers = {}
        else:
            print("Found existing workers: {}".format(str(workers)))

        if len(workers) == 0:
            return

        # Close the actor message channel first so the old workers cannot pull
        # any new messages.
        ActorMsgChannel(actor_id).close()

        # Then ask each worker to shut down gracefully.
        for worker in workers:
            WorkerChannel(name=worker['ch_name']).put('stop')
Example #18
0
    def start_worker(self, image, tenant, actor_id, worker_id):
        """Start a worker container and wait for its first message.

        ``run_worker`` is retried (with a 5 second pause) while it raises a
        DockerError whose message contains 'read timeout'. Any other
        DockerError kills the worker and aborts.

        Returns:
            A tuple ``(ch, reply_to, worker)``: the spawner-worker channel,
            the ``reply_to`` entry of the worker's message, and the Worker
            built from the ``run_worker`` result.

        Raises:
            SpawnerException: if the container could not be started, if the
                worker reports an error, or if it sends a message whose
                status is not ``'ok'``.
        """
        ch = SpawnerWorkerChannel(worker_id=worker_id)
        # start an actor executor container and wait for a confirmation that image was pulled.
        attempts = 0
        while True:
            try:
                worker_dict = run_worker(image, actor_id, worker_id)
            except DockerError as e:
                logger.error("Spawner got a docker exception from run_worker; Exception: {}".format(e))
                if 'read timeout' in e.message:
                    logger.info("Exception was a read timeout; trying run_worker again..")
                    time.sleep(5)
                    attempts = attempts + 1
                    if attempts > 20:
                        msg = "Spawner continued to get DockerError for 20 attempts. Exception: {}".format(e)
                        logger.critical(msg)
                        raise SpawnerException(msg)
                    continue
                else:
                    logger.info("Exception was NOT a read timeout; quiting on this worker.")
                    # delete this worker from the workers store:
                    try:
                        self.kill_worker(actor_id, worker_id)
                    except WorkerException as kill_exc:
                        # use a distinct name: `except ... as e` would unbind the
                        # outer DockerError `e` when this handler exits, breaking
                        # the raise below.
                        logger.info("Got WorkerException from delete_worker(). "
                                    "worker_id: {}"
                                    "Exception: {}".format(worker_id, kill_exc))

                    raise SpawnerException(message="Unable to start worker; error: {}".format(e))
            break
        worker_dict['ch_name'] = WorkerChannel.get_name(worker_id)
        worker = Worker(tenant=tenant, **worker_dict)
        logger.info("worker started successfully, waiting on ack that image was pulled...")
        result = ch.get()
        logger.debug("Got response back from worker. Response: {}".format(result))
        if result.get('status') == 'error':
            # there was a problem pulling the image; put the actor in an error state.
            # NOTE: this must be `.format(...)`; a comma here builds a tuple
            # instead of interpolating the result into the message.
            msg = "Got an error back from the worker. Message: {}".format(result)
            logger.info(msg)
            if 'msg' in result:
                raise SpawnerException(message=result['msg'])
            logger.error("Spawner received invalid message from worker. 'msg' field missing. Message: {}".format(result))
            raise SpawnerException(message="Internal error starting worker process.")
        elif result['value']['status'] == 'ok':
            logger.debug("received ack from worker.")
            return ch, result['reply_to'], worker
        else:
            msg = "Got an error status from worker: {}. Raising an exception.".format(str(result))
            # the placeholder is required, otherwise the offending message is
            # silently dropped from the log line.
            logger.error("Spawner received an invalid message from worker. Message: {}".format(result))
            raise SpawnerException(msg)
Example #19
0
def main(worker_ch_name, image):
    """Worker entry point: pull the image, handshake with the spawner, subscribe.

    A DockerError while pulling is reported on the worker channel and then
    re-raised. An error reply from the spawner raises WorkerException.
    """
    channel = WorkerChannel(name=worker_ch_name)

    # Step 1: pull the image; tell the spawner if that fails.
    try:
        print("Worker pulling image {}...".format(image))
        pull_image(image)
    except DockerError as e:
        channel.put({"status": "error", "msg": str(e)})
        raise e
    print("Image pulled successfully")

    # Step 2: report success and block until the spawner replies.
    print("Worker waiting on message from spawner...")
    reply = channel.put_sync({"status": "ok"})
    if reply["status"] == "error":
        print("Worker received error message from spawner: {}. Quiting...".format(str(reply)))
        raise WorkerException(str(reply))

    # Step 3: mark the actor READY and start consuming messages.
    actor_id = reply.get("actor_id")
    print("Worker received ok from spawner. Message: {}, actor_id:{}".format(reply, actor_id))
    Actor.set_status(actor_id, READY)
    subscribe(actor_id, channel)
Example #20
0
def shutdown_worker(actor_id, worker_id, delete_actor_ch=True):
    """Ask a single worker to shut down gracefully.

    First tries to set the worker's status to SHUTDOWN_REQUESTED; a failure
    there is logged and otherwise ignored. Then sends "stop" on the worker's
    channel when ``delete_actor_ch`` is truthy, "stop-no-delete" otherwise,
    and closes the channel.

    actor_id (str) - the dbid of the associated actor.
    """
    logger.debug("top of shutdown_worker for worker_id: {}".format(worker_id))
    # Record the shutdown request on the worker before messaging it.
    try:
        Worker.update_worker_status(actor_id, worker_id, SHUTDOWN_REQUESTED)
    except Exception as e:
        logger.error(f"worker got exception trying to update status to SHUTODWN_REQUESTED. actor_id: {actor_id};"
                     f"worker_id: {worker_id}; exception: {e}")
    stop_command = "stop" if delete_actor_ch else "stop-no-delete"
    worker_channel = WorkerChannel(worker_id=worker_id)
    worker_channel.put(stop_command)
    logger.info("A 'stop' message was sent to worker: {}".format(worker_id))
    worker_channel.close()
Example #21
0
    def start_worker(self,
                     image,
                     tenant,
                     actor_id,
                     worker_id,
                     client_id,
                     client_access_token,
                     client_refresh_token,
                     ch,
                     api_server,
                     client_secret):
        """Pull the image, start the worker container and register the worker.

        Steps, in order:

        1. set the worker status to PULLING_IMAGE and pull ``image``;
        2. set the worker status to CREATING_CONTAINER and call ``run_worker``,
           retrying (with a 5 second pause) while it raises a DockerError whose
           message contains 'read timeout';
        3. set the actor status to READY if it is not READY already;
        4. add the Worker to the actor and put 'READY' on ``ch``.

        Raises:
            DockerError: re-raised unchanged when pulling the image fails.
            SpawnerException: when the worker container could not be started.
        """
        attempts = 0
        Worker.update_worker_status(actor_id, worker_id, PULLING_IMAGE)
        try:
            logger.debug("Worker pulling image {}...".format(image))
            pull_image(image)
        except DockerError as e:
            # re-raise so the caller can deal with the failed pull.
            # this is not necessarily an error state: the user simply could have provided an
            # image name that does not exist in the registry. This is the first time we would
            # find that out.
            logger.info("worker got a DockerError trying to pull image. Error: {}.".format(e))
            raise e
        logger.info("Image {} pulled successfully.".format(image))
        # Done pulling image
        # Run Worker Container
        while True:
            try:
                Worker.update_worker_status(actor_id, worker_id, CREATING_CONTAINER)
                logger.debug('spawner creating worker container')
                worker_dict = run_worker(
                    image,
                    actor_id,
                    worker_id,
                    client_id,
                    client_access_token,
                    client_refresh_token,
                    tenant,
                    api_server,
                    client_secret
                )
                logger.debug(f'finished run worker; worker dict: {worker_dict}')
            except DockerError as e:
                logger.error("Spawner got a docker exception from run_worker; Exception: {}".format(e))
                if 'read timeout' in e.message:
                    logger.info("Exception was a read timeout; trying run_worker again..")
                    time.sleep(5)
                    attempts = attempts + 1
                    if attempts > 20:
                        msg = "Spawner continued to get DockerError for 20 attempts. Exception: {}".format(e)
                        logger.critical(msg)
                        # todo - should we be calling kill_worker here? (it is called in the exception block of the else below)
                        raise SpawnerException(msg)
                    continue
                else:
                    logger.info("Exception was NOT a read timeout; quiting on this worker.")
                    # delete this worker from the workers store:
                    try:
                        self.kill_worker(actor_id, worker_id)
                    except WorkerException as kill_exc:
                        # use a distinct name: `except ... as e` would unbind the
                        # outer DockerError `e` when this handler exits, breaking
                        # the raise below.
                        logger.info("Got WorkerException from delete_worker(). "
                                    "worker_id: {}"
                                    "Exception: {}".format(worker_id, kill_exc))

                    raise SpawnerException(message="Unable to start worker; error: {}".format(e))
            break
        logger.debug('finished loop')
        worker_dict['ch_name'] = WorkerChannel.get_name(worker_id)
        # if the actor is not already in READY status, set actor status to READY before worker status has been
        # set to READY.
        # it is possible the actor status is already READY because this request is the autoscaler starting a new worker
        # for an existing actor.
        actor = Actor.from_db(actors_store[actor_id])
        if not actor.status == READY:
            try:
                Actor.set_status(actor_id, READY, status_message=" ")
            except KeyError:
                # it is possible the actor was already deleted during worker start up; if
                # so, the worker should have a stop message waiting for it. starting subscribe
                # as usual should allow this process to work as expected.
                pass
        # finalize worker with READY status
        worker = Worker(tenant=tenant, **worker_dict)
        logger.info("calling add_worker for worker: {}.".format(worker))
        Worker.add_worker(actor_id, worker)

        ch.put('READY')  # step 4
        logger.info('sent message through channel')
Example #22
0
def main(worker_id, image):
    """
    Main function for the worker process.

    Pulls ``image``, exchanges a status message with the spawner over the
    SpawnerWorkerChannel, sets the actor status to READY and then calls
    subscribe() with a WorkerChannel for this worker. A DockerError while
    pulling is reported to the spawner and re-raised; an error reply from the
    spawner raises WorkerException.
    """
    logger.info("Entering main() for worker: {}, image: {}".format(worker_id, image))
    spawner_worker_ch = SpawnerWorkerChannel(worker_id=worker_id)

    # Step 1: pull the image. A failure is not necessarily an error state --
    # the user may simply have named an image that is not in the registry, and
    # this is the first point at which that can be detected.
    try:
        logger.info("Worker pulling image {}...".format(image))
        pull_image(image)
    except DockerError as e:
        logger.info("worker got a DockerError trying to pull image. Error: {}.".format(e))
        spawner_worker_ch.put({'status': 'error', 'msg': str(e)})
        raise e
    logger.info("Image {} pulled successfully.".format(image))

    # Step 2: report the successful pull and, in the same call, wait for the
    # spawner's reply.
    logger.debug("Worker waiting on message from spawner...")
    result = spawner_worker_ch.put_sync({'status': 'ok'})
    logger.info("Worker received reply from spawner. result: {}.".format(result))

    # Closing on the worker side should be fine because the spawner was the
    # first client to open this channel.
    spawner_worker_ch.close()

    if result['status'] == 'error':
        # An error reply is not expected at this point and needs investigation.
        logger.error("Worker received error message from spawner: {}. Quiting...".format(str(result)))
        raise WorkerException(str(result))

    actor_id = result.get('actor_id')
    tenant = result.get('tenant')
    logger.info("Worker received ok from spawner. Message: {}, actor_id:{}".format(result, actor_id))

    # Step 3: collect client settings only when the spawner says there is a client.
    client_keys = ('api_server', 'client_id', 'client_secret', 'access_token', 'refresh_token')
    if result.get('client') == 'yes':
        logger.info("Got client: yes, result: {}".format(result))
        client_args = [result.get(key) for key in client_keys]
    else:
        client_args = [None] * len(client_keys)
        logger.info("Did not get client:yes, got result:{}".format(result))

    # Step 4: mark the actor READY. The actor may have been deleted while the
    # worker was starting; in that case a stop message should already be
    # waiting, so subscribing as usual lets the normal flow handle it.
    try:
        Actor.set_status(actor_id, READY, status_message=" ")
    except KeyError:
        pass
    logger.info("Actor status set to READY. subscribing to inbox.")

    worker_ch = WorkerChannel(worker_id=worker_id)
    subscribe(tenant, actor_id, worker_id, *client_args, worker_ch)
Example #23
0
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor.

    For every worker returned by ``Worker.get_workers(actor_id)`` this:

    1. Hard-deletes workers that never received a ``host_id`` and either have
       no ``create_time`` or were created more than 5 minutes ago.
    2. Skips workers assigned to a different host than this health agent.
    3. Hard-deletes workers stuck in SHUTDOWN_REQUESTED / SHUTTING_DOWN for
       more than 5 minutes, and workers with no health check in 60 minutes.
    4. Sends a 'status' message to the remaining workers.
    5. Unless ``ttl`` is negative, sends a shutdown to non-BUSY workers idle
       longer than ``ttl`` seconds and to workers in ERROR status.

    A worker that has been hard-deleted is not processed any further.

    :param actor_id: id of the actor whose workers are checked.
    :param ttl: maximum idle time in seconds; a negative value means workers
        are never shut down for being idle.
    :return: None.
    """
    logger.info("Checking health for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception trying to retrieve workers: {}".format(e))
        return None
    logger.debug("workers: {}".format(workers))
    host_id = os.environ.get('SPAWNER_HOST_ID',
                             Config.get('spawner', 'host_id'))
    logger.debug("host_id: {}".format(host_id))
    for worker in workers:
        worker_id = worker['id']
        worker_status = worker.get('status')
        # if the worker has only been requested, it will not have a host_id. it is possible
        # the worker will ultimately get scheduled on a different host; however, if there is
        # some issue and the worker is "stuck" in the early phases, we should remove it..
        if 'host_id' not in worker:
            # check for an old create time
            worker_create_t = worker.get('create_time')
            # in versions prior to 1.9, worker create_time was not set until after it was READY
            if not worker_create_t:
                hard_delete_worker(
                    actor_id,
                    worker_id,
                    reason_str=
                    'Worker did not have a host_id or create_time field.')
            # if still no host after 5 minutes, delete it
            elif worker_create_t < get_current_utc_time() - datetime.timedelta(
                    minutes=5):
                hard_delete_worker(
                    actor_id,
                    worker_id,
                    reason_str='Worker did not have a host_id and had '
                    'old create_time field.')
            # without a host_id there is nothing else this agent can do for the worker;
            # falling through would raise KeyError on worker['host_id'] below.
            continue

        # ignore workers on different hosts because this health agent cannot interact with the
        # docker daemon responsible for the worker container..
        if not host_id == worker['host_id']:
            continue

        # we need to delete any worker that is in SHUTDOWN REQUESTED or SHUTTING down for too long
        if worker_status == codes.SHUTDOWN_REQUESTED or worker_status == codes.SHUTTING_DOWN:
            # fall back to the create time when the worker never health checked
            shutdown_reference_time = (worker.get('last_health_check_time')
                                       or worker.get('create_time'))
            if not shutdown_reference_time:
                hard_delete_worker(
                    actor_id,
                    worker_id,
                    reason_str='Worker in SHUTDOWN and no health checks.')
                continue
            if shutdown_reference_time < get_current_utc_time(
            ) - datetime.timedelta(minutes=5):
                hard_delete_worker(
                    actor_id,
                    worker_id,
                    reason_str='Worker in SHUTDOWN for too long.')
                continue

        # check if the worker has not responded to a health check recently; we use a relatively long period
        # (60 minutes) of idle health checks in case there is an issue with sending health checks through rabbitmq.
        # this needs to be watched closely though...
        worker_last_health_check_time = worker.get('last_health_check_time')
        if not worker_last_health_check_time or \
                (worker_last_health_check_time < get_current_utc_time() - datetime.timedelta(minutes=60)):
            hard_delete_worker(
                actor_id,
                worker_id,
                reason_str='Worker has not health checked for too long.')
            # the worker is gone; do not message it or evaluate its ttl
            continue

        # first send worker a health check
        logger.info(f"sending worker {worker_id} a health check")
        ch = WorkerChannel(worker_id=worker_id)
        try:
            logger.debug("Issuing status check to channel: {}".format(
                worker['ch_name']))
            ch.put('status')
        except Exception as e:
            # best effort: a failed health check message must not abort the loop
            logger.error(
                f"Got exception of type {type(e)} trying to send worker {worker_id} a "
                f"health check. e: {e}")
        finally:
            try:
                ch.close()
            except Exception as e:
                logger.error(
                    "Got an error trying to close the worker channel for dead worker. Exception: {}"
                    .format(e))

        # now check if the worker has been idle beyond the max worker_ttl configured for this abaco:
        if ttl < 0:
            # ttl < 0 means infinite life
            logger.info("Infinite ttl configured; leaving worker")
            continue
        # we don't shut down workers that are currently running:
        if not worker_status == codes.BUSY:
            last_execution = worker.get('last_execution_time', 0)
            # if worker has made zero executions, use the create_time
            if last_execution == 0:
                last_execution = worker.get('create_time',
                                            datetime.datetime.min)
            logger.debug("using last_execution: {}".format(last_execution))
            # explicit check instead of `assert`, which is stripped under `python -O`
            if not isinstance(last_execution, datetime.datetime):
                logger.error(
                    "Time received for TTL measurements is not of type datetime."
                )
                last_execution = datetime.datetime.min
            # NOTE(review): this compares against datetime.utcnow() while the checks above use
            # get_current_utc_time(); presumably both are naive UTC -- confirm.
            if last_execution + datetime.timedelta(
                    seconds=ttl) < datetime.datetime.utcnow():
                # shutdown worker
                logger.info("Shutting down worker beyond ttl.")
                shutdown_worker(actor_id, worker_id)
            else:
                logger.info("Still time left for this worker.")

        if worker_status == codes.ERROR:
            # shutdown worker
            logger.info("Shutting down worker in error status.")
            shutdown_worker(actor_id, worker_id)
Example #24
0
File: worker.py Project: TACC/abaco
def shutdown_worker(ch_name):
    """Gracefully shutdown a single worker.

    Opens the worker channel named ``ch_name``, puts a "stop" message on it
    and closes the channel again so the connection is not leaked.

    :param ch_name: name of the worker's channel.
    """
    ch = WorkerChannel(name=ch_name)
    try:
        ch.put("stop")
    finally:
        # release the channel even when the put fails
        ch.close()
Example #25
0
def shutdown_worker(ch_name):
    """Gracefully shutdown a single worker.

    Sends a "stop" message over the worker channel identified by ``ch_name``
    and then closes this client's handle on the channel.

    :param ch_name: name of the worker's channel.
    """
    ch = WorkerChannel(name=ch_name)
    try:
        ch.put("stop")
    finally:
        # always close so a failed put does not leak the channel
        ch.close()
Example #26
0
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor.

    Workers without a ``host_id`` or assigned to another host are skipped.
    Each remaining worker is sent a synchronous 'status' message (5 second
    timeout):

    * on timeout the worker's container is removed, the worker is deleted
      from the store and its channel is deleted;
    * on a truthy reply other than 'ok' the container is removed and the
      worker is deleted;
    * otherwise the worker's health check time is updated.

    Workers removed by either of the first two cases are not processed
    further. For surviving workers, unless ``ttl`` is negative, a shutdown is
    sent to non-BUSY workers idle for more than ``ttl`` seconds and to
    workers in ERROR status.

    :param actor_id: id of the actor whose workers are checked.
    :param ttl: maximum idle time in seconds; a negative value means workers
        are never shut down for being idle.
    :return: None.
    """
    logger.info("Checking health for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception trying to retrieve workers: {}".format(e))
        return None
    logger.debug("workers: {}".format(workers))
    host_id = os.environ.get('SPAWNER_HOST_ID',
                             Config.get('spawner', 'host_id'))
    logger.debug("host_id: {}".format(host_id))
    for worker in workers:
        # if the worker has only been requested, it will not have a host_id.
        if 'host_id' not in worker:
            # @todo- we will skip for now, but we need something more robust in case the worker is never claimed.
            continue
        # ignore workers on different hosts
        if not host_id == worker['host_id']:
            continue
        # first check if worker is responsive; if not, will need to manually kill
        logger.info("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(worker_id=worker['id'])
        worker_id = worker.get('id')
        result = None
        worker_removed = False
        try:
            logger.debug("Issuing status check to channel: {}".format(
                worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            logger.info(
                "Worker did not respond, removing container and deleting worker."
            )
            worker_removed = True
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            try:
                Worker.delete_worker(actor_id, worker_id)
                logger.info("worker {} deleted from store".format(worker_id))
            except Exception as e:
                logger.error(
                    "Got exception trying to delete worker: {}".format(e))
            # if the put_sync timed out and we removed the worker, we also need to delete the channel
            # otherwise the un-acked message will remain.
            try:
                ch.delete()
            except Exception as e:
                logger.error(
                    "Got exception: {} while trying to delete worker channel for worker: {}"
                    .format(e, worker_id))
        finally:
            try:
                ch.close()
            except Exception as e:
                logger.error(
                    "Got an error trying to close the worker channel for dead worker. Exception: {}"
                    .format(e))
        if worker_removed:
            # an unresponsive worker was just removed; it must not be marked healthy
            # or be sent a shutdown message below.
            continue
        if result and not result == 'ok':
            logger.error(
                "Worker responded unexpectedly: {}, deleting worker.".format(
                    result))
            try:
                rm_container(worker['cid'])
                Worker.delete_worker(actor_id, worker_id)
            except Exception as e:
                logger.error(
                    "Got error removing/deleting worker: {}".format(e))
            # nothing further to do for a worker we just tried to delete
            continue

        # worker is healthy so update last health check:
        Worker.update_worker_health_time(actor_id, worker_id)
        logger.info("Worker ok.")

        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life; move on to the next worker rather than
            # returning, so the remaining workers still get health checked.
            logger.info("Infinite ttl configured; leaving worker")
            continue
        # we don't shut down workers that are currently running:
        if not worker['status'] == codes.BUSY:
            last_execution = worker.get('last_execution_time', 0)
            # if worker has made zero executions, use the create_time
            if last_execution == 0:
                last_execution = worker.get('create_time',
                                            datetime.datetime.min)
            logger.debug("using last_execution: {}".format(last_execution))
            # explicit check instead of `assert`, which is stripped under `python -O`
            if not isinstance(last_execution, datetime.datetime):
                logger.error(
                    "Time received for TTL measurements is not of type datetime."
                )
                last_execution = datetime.datetime.min
            if last_execution + datetime.timedelta(
                    seconds=ttl) < datetime.datetime.utcnow():
                # shutdown worker
                logger.info("Shutting down worker beyond ttl.")
                shutdown_worker(actor_id, worker['id'])
            else:
                logger.info("Still time left for this worker.")

        if worker['status'] == codes.ERROR:
            # shutdown worker
            logger.info("Shutting down worker in error status.")
            shutdown_worker(actor_id, worker['id'])
Example #27
0
    # call the main() function:
    try:
        main()
    except Exception as e:
        # main() failed: try to tell this worker, over its own channel, to stop without
        # deleting the actor message channel.
        # bind worker_id up front so the error message below can always be rendered.
        worker_id = ''
        try:
            worker_id = os.environ.get('worker_id')
        except Exception:
            logger.error(
                f"worker main thread got exception trying to get worker id from environment."
                f"not able to send stop-no-delete message to itself."
                f"worker_id: {worker_id}.")
            worker_id = ''
        if worker_id:
            try:
                ch = WorkerChannel(worker_id=worker_id)
                try:
                    # since this is an exception, we don't know that the actor has been deleted
                    # don't delete the actor msg channel:
                    ch.put('stop-no-delete')
                    logger.info(
                        f"Worker main loop sent 'stop-no-delete' message to itself; worker_id: {worker_id}."
                    )
                finally:
                    # close the channel even if the put failed
                    ch.close()
                msg = "worker caught exception from main loop. worker exiting. " \
                      "Exception: {} worker_id: {}".format(e, worker_id)
                logger.info(msg)
            except Exception as send_error:
                # distinct name so the original main-loop exception `e` is not shadowed
                logger.error(
                    f"worker main thread got exception trying to send stop-no-delete message to itself;"
                    f"worker_id: {worker_id}. e: {send_error}")
    keep_running = False
Example #28
0
def shutdown_workers(actor_id):
    """Graceful shutdown of all workers for an actor.

    Looks up the actor's workers with ``get_workers`` and puts a "stop"
    message on each worker's channel, closing every channel after use.

    :param actor_id: id of the actor whose workers should be stopped.
    """
    workers = get_workers(actor_id)
    for worker in workers:
        ch = WorkerChannel(name=worker["ch_name"])
        try:
            ch.put("stop")
        finally:
            # one channel is opened per worker; close it so none are leaked
            ch.close()
Example #29
0
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor.

    Workers without a ``host_id`` or assigned to a host other than the
    configured spawner host are skipped. Each remaining worker is sent a
    synchronous 'status' message (5 second timeout):

    * on timeout the worker's container is removed and the worker deleted;
    * on any reply other than 'ok' the container is removed and the worker
      deleted;
    * otherwise the worker's health check time is updated.

    Workers removed by either of the first two cases are not processed
    further. For surviving workers, unless ``ttl`` is negative: workers in
    ERROR status are shut down, BUSY workers are left alone, and all other
    workers are shut down once idle for more than ``ttl`` seconds.

    :param actor_id: id of the actor whose workers are checked.
    :param ttl: maximum idle time in seconds; a negative value means workers
        are never shut down for being idle.
    :return: None.
    """
    logger.info("Checking health for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception trying to retrieve workers: {}".format(e))
        return None
    logger.debug("workers: {}".format(workers))
    for worker in workers.values():
        # if the worker has only been requested, it will not have a host_id.
        if 'host_id' not in worker:
            # @todo- we will skip for now, but we need something more robust in case the worker is never claimed.
            continue
        # ignore workers on different hosts
        if not Config.get('spawner', 'host_id') == worker['host_id']:
            continue
        # first check if worker is responsive; if not, will need to manually kill
        logger.info("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(worker_id=worker['id'])
        worker_id = worker.get('id')
        # pre-bind so a timeout cannot leave `result` undefined
        result = None
        worker_removed = False
        try:
            logger.debug("Issuing status check to channel: {}".format(
                worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            logger.info(
                "Worker did not respond, removing container and deleting worker."
            )
            worker_removed = True
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            try:
                Worker.delete_worker(actor_id, worker_id)
            except Exception as e:
                logger.error(
                    "Got exception trying to delete worker: {}".format(e))
        finally:
            try:
                ch.close()
            except Exception as e:
                logger.error(
                    "Got an error trying to close the worker channel for dead worker. Exception: {}"
                    .format(e))
        if worker_removed:
            # the unresponsive worker was already removed above
            continue
        if not result == 'ok':
            logger.error(
                "Worker responded unexpectedly: {}, deleting worker.".format(
                    result))
            try:
                rm_container(worker['cid'])
                Worker.delete_worker(actor_id, worker_id)
            except Exception as e:
                logger.error(
                    "Got error removing/deleting worker: {}".format(e))
            # nothing further to do for a worker we just tried to delete
            continue

        # worker is healthy so update last health check:
        Worker.update_worker_health_time(actor_id, worker_id)
        logger.info("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life; keep checking the remaining workers
            logger.info("Infinite ttl configured; leaving worker")
            continue
        status = worker['status']
        if status == codes.ERROR:
            # tested first: as an `elif` after the not-BUSY test this branch was unreachable
            logger.info("Shutting down worker in error status.")
            shutdown_worker(worker['id'])
        elif status == codes.BUSY:
            # we don't shut down workers that are currently running
            logger.debug("Worker not in READY status, will postpone.")
        else:
            last_execution = int(float(worker.get('last_execution_time', 0)))
            if last_execution + ttl < time.time():
                # shutdown worker
                logger.info("Shutting down worker beyond ttl.")
                shutdown_worker(worker['id'])
            else:
                logger.info("Still time left for this worker.")