def check_new_params(self, cmd):
    """Run the extra validation needed for a new client request.

    On top of ``self.check_common(cmd)``, the actor referenced by
    ``cmd['actor_id']`` must be present in ``actors_store`` and the worker
    referenced by ``cmd['worker_id']`` must be retrievable for that actor.

    Returns a 3-tuple ``(valid, msg, owner)``:

    * ``(False, <error message>, None)`` when the actor or the worker
      cannot be looked up (the message is also logged at error level);
    * otherwise ``valid`` and ``msg`` exactly as returned by
      ``check_common`` plus the actor's owner, prefixed with
      ``"<prefix>/"`` when ``get_tenant_userstore_prefix`` returns a
      truthy prefix for the actor's tenant.
    """
    valid, msg = self.check_common(cmd)
    actor_id = cmd.get('actor_id')

    # The actor has to exist in the store.
    try:
        actor = Actor.from_db(actors_store[actor_id])
    except KeyError:
        failure = "Unable to look up actor with id: {}".format(actor_id)
        logger.error(failure)
        return False, failure, None

    # The worker has to exist for that actor.
    try:
        Worker.get_worker(actor_id=actor_id, worker_id=cmd.get('worker_id'))
    except WorkerException as exc:
        failure = "Unable to look up worker: {}".format(exc.msg)
        logger.error(failure)
        return False, failure, None

    logger.debug("new params were valid.")

    # Qualify the owner with the tenant's userstore prefix, if there is one.
    owner_prefix = get_tenant_userstore_prefix(actor.tenant)
    logger.debug(
        f"using owner prefix: {owner_prefix} for tenant: {actor.tenant}")
    owner = f"{owner_prefix}/{actor.owner}" if owner_prefix else actor.owner
    logger.debug(f"using owner: {owner}")
    return valid, msg, owner
def check_new_params(self, cmd):
    """Validate a new client request beyond the common checks.

    The actor named by ``cmd['actor_id']`` must be in ``actors_store`` and
    the worker named by ``cmd['worker_id']`` must be retrievable through
    ``Worker.get_worker`` (where it is passed as ``ch_name``).

    Returns ``(False, <error message>, None)`` when either lookup fails,
    otherwise ``(valid, msg, actor.owner)`` with ``valid`` and ``msg``
    taken from ``self.check_common(cmd)``.
    """
    valid, msg = self.check_common(cmd)
    actor_id = cmd.get('actor_id')

    # Actor lookup.
    try:
        actor = Actor.from_db(actors_store[actor_id])
    except KeyError:
        actor_error = "Unable to look up actor with id: {}".format(actor_id)
        return False, actor_error, None

    # Worker lookup; this variant of get_worker takes the id as ch_name.
    try:
        Worker.get_worker(actor_id=actor_id, ch_name=cmd.get('worker_id'))
    except WorkerException as exc:
        worker_error = "Unable to look up worker: {}".format(exc.msg)
        return False, worker_error, None

    return valid, msg, actor.owner
def check_new_params(self, cmd):
    """Validate a new client request beyond the common checks.

    Both the actor (``cmd['actor_id']``) and the worker
    (``cmd['worker_id']``) referenced by the command must be resolvable.

    Returns ``(False, <error message>, None)`` when either lookup fails,
    otherwise ``(valid, msg, actor.owner)`` with ``valid`` and ``msg``
    taken from ``self.check_common(cmd)``.
    """
    valid, msg = self.check_common(cmd)
    actor_id = cmd.get('actor_id')
    worker_id_of = cmd.get  # evaluated lazily, only once the actor exists

    # Step 1: the actor must be in the store.
    try:
        actor = Actor.from_db(actors_store[actor_id])
    except KeyError:
        return (
            False,
            "Unable to look up actor with id: {}".format(actor_id),
            None,
        )

    # Step 2: the worker must be retrievable for that actor.
    try:
        Worker.get_worker(actor_id=actor_id,
                          worker_id=worker_id_of('worker_id'))
    except WorkerException as exc:
        return False, "Unable to look up worker: {}".format(exc.msg), None

    return valid, msg, actor.owner
def error_out_actor(self, actor_id, worker_id, message):
    """Put the actor into ERROR status and clean up the given worker.

    Steps, in order:

    1. If the actor is still present in ``actors_store``, set its status
       to ERROR with ``message`` as the status message (the actor may
       already have been deleted).
    2. Read the worker's current status. If that fails for any reason,
       log it and return without touching the worker.
    3. If the status is UPDATING_STORE, READY or BUSY, call
       ``self.stop_workers`` and return on success. On failure, fall
       through to step 4 as if the status were ERROR.
    4. If the status is REQUESTED, SPAWNER_SETUP, PULLING_IMAGE or ERROR,
       call ``self.kill_worker``. A ``DockerError`` from it is logged and
       swallowed, since this is already exception processing.

    Returns None.
    """
    logger.debug("top of error_out_actor for worker: {}_{}".format(
        actor_id, worker_id))

    # Only flag the actor when it still exists in the store.
    if actors_store.get(actor_id):
        Actor.set_status(actor_id, ERROR, status_message=message)

    # Find out how far the spawner got with setting up the worker.
    try:
        worker = Worker.get_worker(actor_id, worker_id)
        worker_status = worker.get('status')
        logger.debug(
            f"got worker status for {actor_id}_{worker_id}; status: {worker_status}"
        )
    except Exception as e:
        logger.debug(
            f"got exception in error_out_actor trying to determine worker status for {actor_id}_{worker_id}; "
            f"e:{e};")
        # Without a status there is nothing more that can be done.
        return

    if worker_status in (UPDATING_STORE, READY, BUSY):
        logger.debug(
            f"worker status was: {worker_status}; trying to stop_worker")
        # Graceful path first.
        # NOTE(review): worker_ids is passed as an empty list; presumably
        # it lists workers to leave running - confirm against stop_workers.
        try:
            self.stop_workers(actor_id, worker_ids=[])
            logger.info(
                "Spawner just stopped worker {}_{} in error_out_actor".format(
                    actor_id, worker_id))
            return
        except Exception as stop_error:
            logger.error(
                "spawner got exception trying to run stop_workers. Exception: {}"
                .format(stop_error))
            logger.info(
                "setting worker_status to ERROR so that kill_worker will run."
            )
            worker_status = ERROR

    # Anything else is outside both status groups and is left alone.
    if worker_status not in (REQUESTED, SPAWNER_SETUP, PULLING_IMAGE, ERROR):
        return

    logger.debug(
        f"worker status was: {worker_status}; trying to kill_worker")
    try:
        self.kill_worker(actor_id, worker_id)
        logger.info(
            "Spawner just killed worker {}_{} in error_out_actor".format(
                actor_id, worker_id))
    except DockerError as docker_error:
        logger.info(
            "Received DockerError trying to kill worker: {}. Exception: {}"
            .format(worker_id, docker_error))
        logger.info(
            "Spawner will continue on since this is exception processing."
        )
def delete(self, actor_id, ch_name):
    """Schedule the worker identified by ``ch_name`` to be stopped.

    The actor id is first translated to its database id for the current
    tenant (``g.tenant``). The worker is then looked up purely to confirm
    that it exists; its record is not otherwise used.

    Raises ``APIException`` (constructed with 404) when the worker lookup
    raises ``WorkerException``. Returns the result of ``ok(...)``.
    """
    db_id = Actor.get_dbid(g.tenant, actor_id)

    # Existence check only - the returned record is not needed.
    try:
        Worker.get_worker(db_id, ch_name)
    except WorkerException as exc:
        raise APIException(exc.msg, 404)

    shutdown_worker(ch_name)
    return ok(result=None, msg="Worker scheduled to be stopped.")
def delete(self, actor_id, worker_id):
    """Schedule the worker ``worker_id`` of actor ``actor_id`` to be stopped.

    The actor id is translated to its database id for the current tenant
    (``g.tenant``), the worker record is fetched, and
    ``shutdown_worker`` is called with the record's ``'ch_name'`` entry.

    Raises ``ResourceError`` (constructed with 404) when the worker lookup
    raises ``WorkerException``. Returns the result of ``ok(...)``.
    """
    db_id = Actor.get_dbid(g.tenant, actor_id)

    try:
        worker_record = Worker.get_worker(db_id, worker_id)
    except WorkerException as exc:
        raise ResourceError(exc.msg, 404)

    # Shutdown is addressed by the worker's channel name, not its id.
    channel_name = worker_record['ch_name']
    shutdown_worker(channel_name)
    return ok(result=None, msg="Worker scheduled to be stopped.")
def get(self, actor_id, ch_name):
    """Return the worker identified by ``ch_name`` for actor ``actor_id``.

    The actor id is translated to its database id for the current tenant
    (``g.tenant``). Both the actor and the worker must exist.

    Raises ``APIException`` (constructed with 404) when the actor is not in
    ``actors_store`` or when the worker lookup raises ``WorkerException``.
    Returns the result of ``ok(...)`` carrying the worker record.
    """
    db_id = Actor.get_dbid(g.tenant, actor_id)

    # A missing actor is a "not found" condition just like a missing worker,
    # so report it with the same exception type and 404 code. This used to
    # raise WorkerException with no status code and a stray quote in the
    # message.
    try:
        Actor.from_db(actors_store[db_id])
    except KeyError:
        raise APIException("actor not found: {}".format(actor_id), 404)

    try:
        worker = Worker.get_worker(db_id, ch_name)
    except WorkerException as e:
        raise APIException(e.msg, 404)

    return ok(result=worker, msg="Worker retrieved successfully.")
def get(self, actor_id, worker_id):
    """Return the worker ``worker_id`` of actor ``actor_id``.

    The actor id is translated to its database id for the current tenant
    (``g.tenant``). Both the actor and the worker must exist.

    Raises ``ResourceError`` (constructed with 404) when the actor is not
    in ``actors_store`` or when the worker lookup raises
    ``WorkerException``. Returns the result of ``ok(...)`` carrying the
    worker record.
    """
    db_id = Actor.get_dbid(g.tenant, actor_id)

    # A missing actor is a "not found" condition just like a missing worker,
    # so report it with the same exception type and 404 code. This used to
    # raise WorkerException with no status code and a stray quote in the
    # message.
    try:
        Actor.from_db(actors_store[db_id])
    except KeyError:
        raise ResourceError("actor not found: {}".format(actor_id), 404)

    try:
        worker = Worker.get_worker(db_id, worker_id)
    except WorkerException as e:
        raise ResourceError(e.msg, 404)

    return ok(result=worker, msg="Worker retrieved successfully.")
def delete(self, actor_id, worker_id):
    """Schedule the worker ``worker_id`` of actor ``actor_id`` to be stopped.

    The actor id is translated to its database id for the current tenant
    (``g.tenant``), the worker record is fetched, and ``shutdown_worker``
    is called with the record's ``'id'`` entry. Progress is logged before
    and after the shutdown call.

    Raises ``ResourceError`` (constructed with 404) when the worker lookup
    raises ``WorkerException``. Returns the result of ``ok(...)``.
    """
    logger.debug("top of DELETE /actors/{}/workers/{}.".format(actor_id, worker_id))
    db_id = Actor.get_dbid(g.tenant, actor_id)

    try:
        worker_record = Worker.get_worker(db_id, worker_id)
    except WorkerException as exc:
        logger.debug("Did not find worker: {}. actor: {}.".format(worker_id, actor_id))
        raise ResourceError(exc.msg, 404)

    # Both log lines carry the same (worker, actor) pair.
    log_args = (worker_id, actor_id)
    logger.info("calling shutdown_worker(). worker: {}. actor: {}.".format(*log_args))
    shutdown_worker(worker_record['id'])
    logger.info("shutdown_worker() called for worker: {}. actor: {}.".format(*log_args))

    return ok(result=None, msg="Worker scheduled to be stopped.")
def check_new_params(self, cmd):
    """Run the extra validation needed for a new client request.

    After ``self.check_common(cmd)``, the actor named by
    ``cmd['actor_id']`` must be in ``actors_store`` and the worker named
    by ``cmd['worker_id']`` must be retrievable for that actor.

    Returns ``(False, <error message>, None)`` when either lookup fails
    (the message is also logged at error level), otherwise
    ``(valid, msg, actor.owner)`` with ``valid`` and ``msg`` taken from
    ``check_common``.
    """
    valid, msg = self.check_common(cmd)
    actor_id = cmd.get('actor_id')

    # Actor lookup.
    try:
        actor = Actor.from_db(actors_store[actor_id])
    except KeyError:
        problem = "Unable to look up actor with id: {}".format(actor_id)
        logger.error(problem)
        return False, problem, None

    # Worker lookup.
    try:
        Worker.get_worker(actor_id=actor_id, worker_id=cmd.get('worker_id'))
    except WorkerException as exc:
        problem = "Unable to look up worker: {}".format(exc.msg)
        logger.error(problem)
        return False, problem, None

    logger.debug("new params were valid.")
    return valid, msg, actor.owner
def get(self, actor_id, worker_id):
    """Return the worker ``worker_id`` of actor ``actor_id``.

    The actor id is translated to its database id for the current tenant
    (``g.tenant``). Both the actor and the worker must exist; each failed
    lookup is logged at debug level before the error is raised.

    Raises ``ResourceError`` (constructed with 404) when the actor is not
    in ``actors_store`` or when the worker lookup raises
    ``WorkerException``. Returns the result of ``ok(...)`` carrying the
    worker record.
    """
    logger.debug("top of GET /actors/{}/workers/{}.".format(actor_id, worker_id))
    db_id = Actor.get_dbid(g.tenant, actor_id)

    # The actor object itself is not needed; this only proves it exists.
    try:
        Actor.from_db(actors_store[db_id])
    except KeyError:
        logger.debug("Did not find actor: {}.".format(actor_id))
        not_found = "No actor found with id: {}.".format(actor_id)
        raise ResourceError(not_found, 404)

    try:
        worker_record = Worker.get_worker(db_id, worker_id)
    except WorkerException as exc:
        logger.debug("Did not find worker: {}. actor: {}.".format(worker_id, actor_id))
        raise ResourceError(exc.msg, 404)

    return ok(result=worker_record, msg="Worker retrieved successfully.")
def get(self, actor_id, worker_id):
    """Return the display form of worker ``worker_id`` of actor ``actor_id``.

    The actor id is translated to its database id for the current tenant
    (``g.tenant``). Both the actor and the worker must exist; each failed
    lookup is logged at debug level before the error is raised. The
    fetched worker record is given an ``'id'`` entry, turned into a
    ``Worker`` object, and that object's ``display()`` output is returned.

    Raises ``ResourceError`` (constructed with 404) when the actor is not
    in ``actors_store`` or when the worker lookup raises
    ``WorkerException``. Returns the result of ``ok(...)``.
    """
    logger.debug("top of GET /actors/{}/workers/{}.".format(actor_id, worker_id))
    db_id = Actor.get_dbid(g.tenant, actor_id)

    # The actor object itself is not needed; this only proves it exists.
    try:
        Actor.from_db(actors_store[db_id])
    except KeyError:
        logger.debug("Did not find actor: {}.".format(actor_id))
        not_found = "No actor found with id: {}.".format(actor_id)
        raise ResourceError(not_found, 404)

    try:
        worker_record = Worker.get_worker(db_id, worker_id)
    except WorkerException as exc:
        logger.debug("Did not find worker: {}. actor: {}.".format(worker_id, actor_id))
        raise ResourceError(exc.msg, 404)

    # The record is used as keyword arguments for Worker, so the worker's
    # id is written into it before the object is built.
    worker_record.update({'id': worker_id})
    displayable = Worker(**worker_record).display()
    return ok(result=displayable, msg="Worker retrieved successfully.")
def process(self, cmd):
    """Main spawner method for processing a command from the CommandChannel.

    ``cmd`` must provide ``'actor_id'``, ``'worker_id'``, ``'image'`` and
    ``'tenant'``; ``'stop_existing'`` is optional and defaults to True.

    Flow:

    1. Load the actor from ``actors_store``; abort (log and return) on any
       exception.
    2. Load the worker and inspect its status. Only a worker in REQUESTED
       status is processed. A worker in SHUTDOWN_REQUESTED, SHUTTING_DOWN
       or ERROR status is deleted; any other status is logged as
       unexpected. In all non-REQUESTED cases the method returns.
    3. Move the worker to SPAWNER_SETUP.
    4. If client generation is enabled in the config (tenant-specific
       setting first, global setting as fallback) and ``actor.token`` is
       truthy, generate an OAuth client for the worker.
    5. Start the worker. On failure, error out the actor, delete the
       generated client (if any) and return.
    6. Close the spawner/worker channel and, when ``stop_existing`` is
       truthy, call ``stop_workers`` with the new worker's id.

    Returns None on every path.
    """
    logger.info("top of process; cmd: {}".format(cmd))
    actor_id = cmd['actor_id']
    try:
        actor = Actor.from_db(actors_store[actor_id])
    except Exception as e:
        msg = f"Exception in spawner trying to retrieve actor object from store. Aborting. Exception: {e}"
        logger.error(msg)
        return

    worker_id = cmd['worker_id']
    image = cmd['image']
    tenant = cmd['tenant']
    stop_existing = cmd.get('stop_existing', True)
    num_workers = 1
    # A separating space was missing between "tenant: {}" and "stop_existing".
    logger.debug("spawner command params: actor_id: {} worker_id: {} image: {} tenant: {} "
                 "stop_existing: {} num_workers: {}".format(actor_id, worker_id, image, tenant,
                                                            stop_existing, num_workers))

    # if the worker was sent a delete request before spawner received this message to create the worker,
    # the status will be SHUTDOWN_REQUESTED, not REQUESTED. in that case, we simply abort and remove the
    # worker from the collection.
    try:
        logger.debug("spawner checking worker's status for SHUTDOWN_REQUESTED")
        worker = Worker.get_worker(actor_id, worker_id)
        logger.debug(f"spawner got worker; worker: {worker}")
    except Exception as e:
        logger.error(f"spawner got exception trying to retrieve worker. "
                     f"actor_id: {actor_id}; worker_id: {worker_id}; e: {e}")
        return

    status = worker.get('status')
    if status != REQUESTED:
        logger.debug(f"worker was NOT in REQUESTED status. status: {status}")
        if status in (SHUTDOWN_REQUESTED, SHUTTING_DOWN, ERROR):
            logger.debug(f"worker status was {status}; spawner deleting worker and returning..")
            try:
                Worker.delete_worker(actor_id, worker_id)
                logger.debug("spawner deleted worker because it was SHUTDOWN_REQUESTED.")
            except Exception as e:
                # A separating space was missing after "status.".
                logger.error(f"spawner got exception trying to delete a worker in SHUTDOWN_REQUESTED status. "
                             f"actor_id: {actor_id}; worker_id: {worker_id}; e: {e}")
        else:
            logger.error(f"spawner found worker in unexpected status: {status}. Not processing command and returning.")
        # Nothing is spawned unless the worker was in REQUESTED status.
        return

    # worker status was REQUESTED; moving on to SPAWNER_SETUP ----
    Worker.update_worker_status(actor_id, worker_id, SPAWNER_SETUP)
    logger.debug("spawner has updated worker status to SPAWNER_SETUP; worker_id: {}".format(worker_id))

    # Client credentials stay None unless a client is generated below.
    client_id = None
    client_access_token = None
    client_refresh_token = None
    api_server = None
    client_secret = None

    # ---- Oauth client generation for the worker -------
    # check if tenant and instance configured for client generation -
    try:
        generate_clients = Config.get('workers', f'{tenant}_generate_clients').lower()
    except Exception:
        # Any ordinary failure of the tenant-specific lookup falls back to
        # the global setting. A bare except would also have swallowed
        # SystemExit and KeyboardInterrupt.
        logger.debug(f"Did not find a {tenant}_generate_clients config. Looking for a global config.")
        generate_clients = Config.get('workers', 'generate_clients').lower()
    logger.debug(f"final generate_clients: {generate_clients}")

    if generate_clients == "true":
        logger.debug("client generation was configured to be available; now checking the actor's token attr.")
        # updated 1.3.0-- check whether the actor requires a token:
        if actor.token:
            logger.debug("spawner starting client generation")
            client_id, \
            client_access_token, \
            client_refresh_token, \
            api_server, \
            client_secret = self.client_generation(actor_id, worker_id, tenant)
        else:
            logger.debug("actor's token attribute was False. Not generating client.")

    ch = SpawnerWorkerChannel(worker_id=worker_id)
    logger.debug("spawner attempting to start worker; worker_id: {}".format(worker_id))
    try:
        worker = self.start_worker(
            image,
            tenant,
            actor_id,
            worker_id,
            client_id,
            client_access_token,
            client_refresh_token,
            ch,
            api_server,
            client_secret
        )
    except Exception as e:
        # NOTE(review): ch is not closed on this path. Whether start_worker
        # or error_out_actor closes it, and whether close() is safe to call
        # twice, cannot be determined from this block - confirm.
        msg = "Spawner got an exception from call to start_worker. Exception:{}".format(e)
        logger.error(msg)
        self.error_out_actor(actor_id, worker_id, msg)
        if client_id:
            self.delete_client(tenant, actor_id, worker_id, client_id, client_secret)
        return

    logger.debug("Returned from start_worker; Created new worker: {}".format(worker))
    ch.close()
    logger.debug("Client channel closed")

    if stop_existing:
        logger.info("Stopping existing workers: {}".format(worker_id))
        # TODO - update status to stop_requested
        self.stop_workers(actor_id, [worker_id])