def main(worker_ch_name, image):
    """
    Worker entry point: pull the image, handshake with the spawner, then subscribe.

    :param worker_ch_name: name used to open the WorkerChannel shared with the spawner.
    :param image: image reference handed to pull_image().
    :raises DockerError: re-raised when pull_image() fails, after an error message
        has been put on the worker channel.
    :raises WorkerException: when the spawner's reply carries status 'error'.
    """
    channel = WorkerChannel(name=worker_ch_name)

    # Step 1: pull the image; on failure tell the spawner and abort.
    print("Worker pulling image {}...".format(image))
    try:
        pull_image(image)
    except DockerError as e:
        channel.put({'status': 'error', 'msg': str(e)})
        raise e
    print("Image pulled successfully")

    # Step 2: report success and wait for the spawner's reply.
    print("Worker waiting on message from spawner...")
    reply = channel.put_sync({'status': 'ok'})
    if reply['status'] == 'error':
        print("Worker received error message from spawner: {}. Quiting...".format(str(reply)))
        raise WorkerException(str(reply))

    actor_id = reply.get('actor_id')
    tenant = reply.get('tenant')
    print("Worker received ok from spawner. Message: {}, actor_id:{}".format(reply, actor_id))

    # Step 3: collect client credentials; they stay None unless the reply says client == 'yes'.
    credential_keys = ('api_server', 'client_id', 'client_secret', 'access_token', 'refresh_token')
    if reply.get('client') == 'yes':
        credentials = [reply.get(key) for key in credential_keys]
    else:
        credentials = [None] * len(credential_keys)
        print("Did not get client:yes, got client:{}".format(reply.get('client')))
    api_server, client_id, client_secret, access_token, refresh_token = credentials

    # Step 4: mark the actor READY and hand control to subscribe().
    Actor.set_status(actor_id, READY)
    subscribe(tenant,
              actor_id,
              api_server,
              client_id,
              client_secret,
              access_token,
              refresh_token,
              channel)
def main(worker_ch_name, worker_id, image):
    """
    Worker entry point: pull the image, handshake with the spawner, then subscribe.

    :param worker_ch_name: name used to open the WorkerChannel shared with the spawner.
    :param worker_id: id of this worker; forwarded unchanged to subscribe().
    :param image: image reference handed to pull_image().
    :raises DockerError: re-raised when pull_image() fails, after an error message
        has been put on the worker channel.
    :raises WorkerException: when the spawner's reply carries status 'error'.
    """
    channel = WorkerChannel(name=worker_ch_name)

    # Pull the image first; a failure is reported to the spawner before aborting.
    print("Worker pulling image {}...".format(image))
    try:
        pull_image(image)
    except DockerError as e:
        channel.put({'status': 'error', 'msg': str(e)})
        raise e
    print("Image pulled successfully")

    # Report success and block until the spawner answers.
    print("Worker waiting on message from spawner...")
    reply = channel.put_sync({'status': 'ok'})
    if reply['status'] == 'error':
        print("Worker received error message from spawner: {}. Quiting...".format(str(reply)))
        raise WorkerException(str(reply))

    actor_id = reply.get('actor_id')
    tenant = reply.get('tenant')
    print("Worker received ok from spawner. Message: {}, actor_id:{}".format(reply, actor_id))

    # Every credential defaults to None and is only filled in when client == 'yes'.
    creds = dict.fromkeys(
        ('api_server', 'client_id', 'client_secret', 'access_token', 'refresh_token'))
    if reply.get('client') == 'yes':
        for key in creds:
            creds[key] = reply.get(key)
    else:
        print("Did not get client:yes, got client:{}".format(reply.get('client')))

    Actor.set_status(actor_id, READY)
    subscribe(tenant,
              actor_id,
              worker_id,
              creds['api_server'],
              creds['client_id'],
              creds['client_secret'],
              creds['access_token'],
              creds['refresh_token'],
              channel)
def main(worker_ch_name, image):
    """
    Worker entry point: pull the image, handshake with the spawner, then subscribe.

    :param worker_ch_name: name used to open the WorkerChannel shared with the spawner.
    :param image: image reference handed to pull_image().
    :raises DockerError: re-raised when pull_image() fails, after an error message
        has been put on the worker channel.
    :raises WorkerException: when the spawner's reply carries status "error".
    """
    channel = WorkerChannel(name=worker_ch_name)

    # Pull the image; if that fails, notify the spawner and abort.
    print("Worker pulling image {}...".format(image))
    try:
        pull_image(image)
    except DockerError as pull_error:
        channel.put({"status": "error", "msg": str(pull_error)})
        raise pull_error
    print("Image pulled successfully")

    # Tell the spawner the pull succeeded and wait for its go-ahead.
    print("Worker waiting on message from spawner...")
    reply = channel.put_sync({"status": "ok"})
    spawner_reported_error = reply["status"] == "error"
    if spawner_reported_error:
        print("Worker received error message from spawner: {}. Quiting...".format(str(reply)))
        raise WorkerException(str(reply))

    actor_id = reply.get("actor_id")
    print("Worker received ok from spawner. Message: {}, actor_id:{}".format(reply, actor_id))

    Actor.set_status(actor_id, READY)
    subscribe(actor_id, channel)
def start_worker(self,
                 image,
                 tenant,
                 actor_id,
                 worker_id,
                 client_id,
                 client_access_token,
                 client_refresh_token,
                 ch,
                 api_server,
                 client_secret):
    """
    Pull the actor image, start a worker container for it and register the worker.

    The worker status is moved through PULLING_IMAGE and CREATING_CONTAINER while the
    work is in progress. Once run_worker() succeeds, the actor is set to READY (if it
    is not already), the worker is stored via Worker.add_worker() and 'READY' is put
    on ``ch``.

    :param image: image reference passed to pull_image() and run_worker().
    :param tenant: tenant passed to run_worker() and stored on the Worker.
    :param actor_id: id of the actor the worker belongs to.
    :param worker_id: id of the worker being started.
    :param client_id: forwarded to run_worker().
    :param client_access_token: forwarded to run_worker().
    :param client_refresh_token: forwarded to run_worker().
    :param ch: channel on which 'READY' is put once the worker has been added.
    :param api_server: forwarded to run_worker().
    :param client_secret: forwarded to run_worker().
    :raises DockerError: re-raised when pull_image() fails.
    :raises SpawnerException: when run_worker() keeps failing with a read timeout for
        more than 20 retries, or fails with any other DockerError.
    """
    # start an actor executor container and wait for a confirmation that image was pulled.
    attempts = 0
    Worker.update_worker_status(actor_id, worker_id, PULLING_IMAGE)
    try:
        logger.debug("Worker pulling image {}...".format(image))
        pull_image(image)
    except DockerError as e:
        # return a message to the spawner that there was an error pulling image and abort
        # this is not necessarily an error state: the user simply could have provided an
        # image name that does not exist in the registry. This is the first time we would
        # find that out.
        logger.info("worker got a DockerError trying to pull image. Error: {}.".format(e))
        raise e
    logger.info("Image {} pulled successfully.".format(image))
    # Done pulling image

    # Run Worker Container
    while True:
        try:
            Worker.update_worker_status(actor_id, worker_id, CREATING_CONTAINER)
            logger.debug('spawner creating worker container')
            worker_dict = run_worker(
                image,
                actor_id,
                worker_id,
                client_id,
                client_access_token,
                client_refresh_token,
                tenant,
                api_server,
                client_secret
            )
            logger.debug(f'finished run worker; worker dict: {worker_dict}')
        except DockerError as e:
            logger.error("Spawner got a docker exception from run_worker; Exception: {}".format(e))
            if 'read timeout' in e.message:
                logger.info("Exception was a read timeout; trying run_worker again..")
                time.sleep(5)
                attempts = attempts + 1
                if attempts > 20:
                    msg = "Spawner continued to get DockerError for 20 attempts. Exception: {}".format(e)
                    logger.critical(msg)
                    # todo - should we be calling kill_worker here? (it is called in the
                    # exception block of the else below)
                    raise SpawnerException(msg)
                continue
            else:
                logger.info("Exception was NOT a read timeout; quiting on this worker.")
                # delete this worker from the workers store:
                try:
                    self.kill_worker(actor_id, worker_id)
                except WorkerException as kill_error:
                    # Bound to its own name on purpose: Python 3 deletes the `as`
                    # target when the handler exits, so reusing `e` here would leave
                    # the DockerError unbound for the SpawnerException message below.
                    logger.info("Got WorkerException from delete_worker(). "
                                "worker_id: {}"
                                "Exception: {}".format(worker_id, kill_error))
                raise SpawnerException(message="Unable to start worker; error: {}".format(e))
        break
    logger.debug('finished loop')
    worker_dict['ch_name'] = WorkerChannel.get_name(worker_id)

    # if the actor is not already in READY status, set actor status to READY before worker status has been
    # set to READY.
    # it is possible the actor status is already READY because this request is the autoscaler starting a new worker
    # for an existing actor.
    actor = Actor.from_db(actors_store[actor_id])
    if not actor.status == READY:
        try:
            Actor.set_status(actor_id, READY, status_message=" ")
        except KeyError:
            # it is possible the actor was already deleted during worker start up; if
            # so, the worker should have a stop message waiting for it. starting subscribe
            # as usual should allow this process to work as expected.
            pass

    # finalize worker with READY status
    worker = Worker(tenant=tenant, **worker_dict)
    logger.info("calling add_worker for worker: {}.".format(worker))
    Worker.add_worker(actor_id, worker)

    ch.put('READY')  # step 4
    logger.info('sent message through channel')
def main(worker_id, image):
    """
    Main function for the worker process.

    Pulls ``image``, reports the outcome to the spawner over a SpawnerWorkerChannel,
    waits for the spawner's reply, sets the actor status to READY and finally calls
    subscribe() with a WorkerChannel opened for this worker.

    :param worker_id: id used to open both the spawner-worker channel and the worker channel.
    :param image: image reference handed to pull_image().
    :raises DockerError: re-raised when pull_image() fails, after an error message
        has been put on the spawner-worker channel.
    :raises WorkerException: when the spawner's reply carries status 'error'.
    """
    logger.info("Entering main() for worker: {}, image: {}".format(worker_id, image))
    handshake_ch = SpawnerWorkerChannel(worker_id=worker_id)

    # first, attempt to pull image from docker hub:
    logger.info("Worker pulling image {}...".format(image))
    try:
        pull_image(image)
    except DockerError as e:
        # A failed pull is not necessarily an error state: the user may simply have
        # supplied an image name that does not exist in the registry, and this is
        # the first point at which that is discovered. Tell the spawner and abort.
        logger.info("worker got a DockerError trying to pull image. Error: {}.".format(e))
        handshake_ch.put({'status': 'error', 'msg': str(e)})
        raise e
    logger.info("Image {} pulled successfully.".format(image))

    # Inform the spawner that the pull succeeded and, in the same call, wait for its
    # message that it is time to subscribe to the actor channel.
    logger.debug("Worker waiting on message from spawner...")
    reply = handshake_ch.put_sync({'status': 'ok'})
    logger.info("Worker received reply from spawner. result: {}.".format(reply))

    # should be OK to close the channel on the worker side since the spawner was the
    # first client to open it.
    handshake_ch.close()

    if reply['status'] == 'error':
        # an error response is not expected at this point; this needs investigation
        logger.error("Worker received error message from spawner: {}. Quiting...".format(str(reply)))
        raise WorkerException(str(reply))

    actor_id = reply.get('actor_id')
    tenant = reply.get('tenant')
    logger.info("Worker received ok from spawner. Message: {}, actor_id:{}".format(reply, actor_id))

    # Client credentials are only read from the reply when it says client == 'yes'.
    if reply.get('client') == 'yes':
        logger.info("Got client: yes, result: {}".format(reply))
        api_server = reply.get('api_server')
        client_id = reply.get('client_id')
        client_secret = reply.get('client_secret')
        access_token = reply.get('access_token')
        refresh_token = reply.get('refresh_token')
    else:
        api_server = client_id = client_secret = access_token = refresh_token = None
        logger.info("Did not get client:yes, got result:{}".format(reply))

    try:
        Actor.set_status(actor_id, READY, status_message=" ")
    except KeyError:
        # it is possible the actor was already deleted during worker start up; if
        # so, the worker should have a stop message waiting for it. starting subscribe
        # as usual should allow this process to work as expected.
        pass
    logger.info("Actor status set to READY. subscribing to inbox.")

    inbox_ch = WorkerChannel(worker_id=worker_id)
    subscribe(tenant,
              actor_id,
              worker_id,
              api_server,
              client_id,
              client_secret,
              access_token,
              refresh_token,
              inbox_ch)