Example 1
 def start_worker(self, image, tenant, worker_id):
     ch = SpawnerWorkerChannel(worker_id=worker_id)
     # start an actor executor container and wait for a confirmation that image was pulled.
     worker_dict = run_worker(image, worker_id)
     worker_dict['ch_name'] = WorkerChannel.get_name(worker_id)
     worker = Worker(tenant=tenant, **worker_dict)
     logger.info(
         "worker started successfully, waiting on ack that image was pulled..."
     )
     result = ch.get()
     logger.debug(
         "Got response back from worker. Response: {}".format(result))
     if result.get('status') == 'error':
         # there was a problem pulling the image; put the actor in an error state:
         msg = "Got an error back from the worker. Message: {}", format(
             result)
         logger.info(msg)
         if 'msg' in result:
             raise SpawnerException(message=result['msg'])
         else:
             logger.error(
                 "Spawner received invalid message from worker. 'msg' field missing. Message: {}"
                 .format(result))
             raise SpawnerException(
                 message="Internal error starting worker process.")
     elif result['value']['status'] == 'ok':
         logger.debug("received ack from worker.")
         return ch, result['reply_to'], worker
     else:
         msg = "Got an error status from worker: {}. Raising an exception.".format(
             str(result))
         logger.error(
             "Spawner received an invalid message from worker. Message: {}".format(result))
         raise SpawnerException(msg)
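
The tail of Example 1 is essentially a three-way decision on the ack message the worker sends back. Below is a minimal, self-contained sketch of that decision; AckError and check_ack are illustrative stand-ins, not project code.

    class AckError(Exception):
        pass

    def check_ack(result):
        # error status: surface the worker's message if it sent one
        if result.get('status') == 'error':
            raise AckError(result.get('msg', "Internal error starting worker process."))
        # ok status: the worker pulled the image and is ready
        if result.get('value', {}).get('status') == 'ok':
            return result['reply_to']
        # anything else is unexpected
        raise AckError("unexpected message from worker: {}".format(result))

    # e.g. check_ack({'value': {'status': 'ok'}, 'reply_to': 'abc'}) returns 'abc'
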
Example 2
    def start_worker(self, image, tenant, actor_id, worker_id):
        ch = SpawnerWorkerChannel(worker_id=worker_id)
        # start an actor executor container and wait for a confirmation that image was pulled.
        attempts = 0
        while True:
            try:
                worker_dict = run_worker(image, actor_id, worker_id)
            except DockerError as e:
                logger.error("Spawner got a docker exception from run_worker; Exception: {}".format(e))
                if 'read timeout' in e.message:
                    logger.info("Exception was a read timeout; trying run_worker again..")
                    time.sleep(5)
                    attempts = attempts + 1
                    if attempts > 20:
                        msg = "Spawner continued to get DockerError for 20 attempts. Exception: {}".format(e)
                        logger.critical(msg)
                        raise SpawnerException(msg)
                    continue
                else:
                    logger.info("Exception was NOT a read timeout; quiting on this worker.")
                    # delete this worker from the workers store:
                    try:
                        self.kill_worker(actor_id, worker_id)
                    except WorkerException as e:
                        logger.info("Got WorkerException from delete_worker(). "
                                    "worker_id: {}"
                                    "Exception: {}".format(worker_id, e))

                    raise SpawnerException(message="Unable to start worker; error: {}".format(e))
            break
        worker_dict['ch_name'] = WorkerChannel.get_name(worker_id)
        worker = Worker(tenant=tenant, **worker_dict)
        logger.info("worker started successfully, waiting on ack that image was pulled...")
        result = ch.get()
        logger.debug("Got response back from worker. Response: {}".format(result))
        if result.get('status') == 'error':
            # there was a problem pulling the image; put the actor in an error state:
            msg = "Got an error back from the worker. Message: {}",format(result)
            logger.info(msg)
            if 'msg' in result:
                raise SpawnerException(message=result['msg'])
            else:
                logger.error("Spawner received invalid message from worker. 'msg' field missing. Message: {}".format(result))
                raise SpawnerException(message="Internal error starting worker process.")
        elif result['value']['status'] == 'ok':
            logger.debug("received ack from worker.")
            return ch, result['reply_to'], worker
        else:
            msg = "Got an error status from worker: {}. Raising an exception.".format(str(result))
            logger.error("Spawner received an invalid message from worker. Message: ".format(result))
            raise SpawnerException(msg)
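
Example 2 wraps run_worker in a bounded retry for Docker read timeouts: sleep five seconds, try again, and give up after 20 attempts. A minimal, self-contained sketch of the same pattern, with illustrative stand-in names (flaky_call, GiveUpError) rather than project code:

    import time

    class GiveUpError(Exception):
        pass

    def call_with_retries(flaky_call, max_attempts=20, delay=5):
        attempts = 0
        while True:
            try:
                return flaky_call()
            except IOError as e:
                if 'read timeout' not in str(e):
                    raise                  # not a timeout; fail immediately
                attempts += 1
                if attempts > max_attempts:
                    raise GiveUpError("still timing out after {} attempts: {}".format(max_attempts, e))
                time.sleep(delay)          # back off briefly, then retry
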
Example 3
    def start_worker(self,
                     image,
                     tenant,
                     actor_id,
                     worker_id,
                     client_id,
                     client_access_token,
                     client_refresh_token,
                     ch,
                     api_server,
                     client_secret):

        # start an actor executor container and wait for a confirmation that image was pulled.
        attempts = 0
        # worker = get_worker(worker_id)
        # worker['status'] = PULLING_IMAGE
        Worker.update_worker_status(actor_id, worker_id, PULLING_IMAGE)
        try:
            logger.debug("Worker pulling image {}...".format(image))
            pull_image(image)
        except DockerError as e:
            # return a message to the spawner that there was an error pulling image and abort
            # this is not necessarily an error state: the user simply could have provided an
            # image name that does not exist in the registry. This is the first time we would
            # find that out.
            logger.info("worker got a DockerError trying to pull image. Error: {}.".format(e))
            raise e
        logger.info("Image {} pulled successfully.".format(image))
        # Done pulling image
        # Run Worker Container
        while True:
            try:
                Worker.update_worker_status(actor_id, worker_id, CREATING_CONTAINER)
                logger.debug('spawner creating worker container')
                worker_dict = run_worker(
                    image,
                    actor_id,
                    worker_id,
                    client_id,
                    client_access_token,
                    client_refresh_token,
                    tenant,
                    api_server,
                    client_secret
                )
                logger.debug(f'finished run worker; worker dict: {worker_dict}')
            except DockerError as e:
                logger.error("Spawner got a docker exception from run_worker; Exception: {}".format(e))
                if 'read timeout' in e.message:
                    logger.info("Exception was a read timeout; trying run_worker again..")
                    time.sleep(5)
                    attempts = attempts + 1
                    if attempts > 20:
                        msg = "Spawner continued to get DockerError for 20 attempts. Exception: {}".format(e)
                        logger.critical(msg)
                        # todo - should we be calling kill_worker here? (it is called in the exception block of the else below)
                        raise SpawnerException(msg)
                    continue
                else:
                    logger.info("Exception was NOT a read timeout; quiting on this worker.")
                    # delete this worker from the workers store:
                    try:
                        self.kill_worker(actor_id, worker_id)
                    except WorkerException as e:
                        logger.info("Got WorkerException from delete_worker(). "
                                    "worker_id: {}"
                                    "Exception: {}".format(worker_id, e))

                    raise SpawnerException(message="Unable to start worker; error: {}".format(e))
            break
        logger.debug('finished loop')
        worker_dict['ch_name'] = WorkerChannel.get_name(worker_id)
        # if the actor is not already in READY status, set actor status to READY before worker status has been
        # set to READY.
        # it is possible the actor status is already READY because this request is the autoscaler starting a new worker
        # for an existing actor.
        actor = Actor.from_db(actors_store[actor_id])
        if not actor.status == READY:
            try:
                Actor.set_status(actor_id, READY, status_message=" ")
            except KeyError:
                # it is possible the actor was already deleted during worker start up; if
                # so, the worker should have a stop message waiting for it. starting subscribe
                # as usual should allow this process to work as expected.
                pass
        # finalize worker with READY status
        worker = Worker(tenant=tenant, **worker_dict)
        logger.info("calling add_worker for worker: {}.".format(worker))
        Worker.add_worker(actor_id, worker)

        ch.put('READY')  # step 4
        logger.info('sent message through channel')
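
The ch.put('READY') at the end of Example 3 is one half of a simple handshake with whichever process created the channel and is blocked on a matching get(). A minimal, self-contained sketch of that idea, using a plain queue as a stand-in for the real channel object (this is not the SpawnerWorkerChannel API):

    import queue
    import threading

    ch = queue.Queue()                 # stand-in for the channel passed into start_worker

    def worker_startup():
        # ... pull the image, start the container, register the worker ...
        ch.put('READY')                # step 4: signal the waiting side

    threading.Thread(target=worker_startup).start()
    msg = ch.get()                     # blocks until the worker side signals
    assert msg == 'READY'
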