def put(self, actor_id):
    """Update an existing actor; schedules a worker/image update when the image changed.

    Raises ResourceError (404) if no actor with the given id exists for the tenant.
    """
    dbid = Actor.get_dbid(g.tenant, actor_id)
    try:
        actor = Actor.from_db(actors_store[dbid])
    except KeyError:
        # fixed: removed stray trailing apostrophe from the error message
        raise ResourceError("actor not found: {}".format(actor_id), 404)
    previous_image = actor.image
    args = self.validate_put(actor)
    args['tenant'] = g.tenant
    update_image = False
    if args['image'] == previous_image:
        # same image -- keep the current status; no rebuild needed
        args['status'] = actor.status
    else:
        update_image = True
        args['status'] = SUBMITTED
    args['api_server'] = g.api_server
    args['owner'] = g.user
    actor = Actor(**args)
    actors_store[actor.db_id] = actor.to_db()
    worker_ids = Worker.request_worker(actor.db_id)
    if update_image:
        ch = CommandChannel()
        ch.put_cmd(actor_id=actor.db_id,
                   worker_ids=worker_ids,
                   image=actor.image,
                   tenant=args['tenant'])
    return ok(result=actor.display(), msg="Actor updated successfully.")
def post(self, actor_id):
    """Ensure a certain number of workers are running for an actor.

    Raises ResourceError (404) if no actor with the given id exists for the tenant.
    """
    # fixed: the original computed the db id twice (as `id`, shadowing the builtin,
    # and again as `dbid`); compute it once.
    dbid = Actor.get_dbid(g.tenant, actor_id)
    try:
        actor = Actor.from_db(actors_store[dbid])
    except KeyError:
        # fixed: removed stray trailing apostrophe from the error message
        raise ResourceError("actor not found: {}".format(actor_id), 404)
    args = self.validate_post()
    # default to a single worker when num is absent or 0 (0 is falsy, so one check suffices)
    num = args.get('num') or 1
    workers = Worker.get_workers(dbid)
    current = len(workers.items())
    if current < num:
        worker_ids = []
        num_to_add = int(num) - current
        for idx in range(num_to_add):
            # NOTE(review): request_worker is called with the api-level actor_id here but
            # with the db id elsewhere in this file -- confirm which is intended.
            worker_ids.append(Worker.request_worker(actor_id))
        ch = CommandChannel()
        ch.put_cmd(actor_id=actor.db_id,
                   worker_ids=worker_ids,
                   image=actor.image,
                   tenant=g.tenant,
                   num=num_to_add,
                   stop_existing=False)
        # fixed: message previously ended mid-sentence ("There were only")
        return ok(result=None,
                  msg="Scheduled {} new worker(s) to start. There were only {} worker(s) running.".format(
                      num_to_add, current))
    else:
        return ok(result=None,
                  msg="Actor {} already had {} worker(s).".format(actor_id, num))
def put(self, actor_id):
    """Replace an actor's definition; queues an image-update command when the image changed."""
    logger.debug("top of PUT /actors/{}".format(actor_id))
    dbid = Actor.get_dbid(g.tenant, actor_id)
    try:
        actor = Actor.from_db(actors_store[dbid])
    except KeyError:
        logger.debug("did not find actor {} in store.".format(dbid))
        raise ResourceError(
            "No actor found with id: {}.".format(actor_id), 404)
    previous_image = actor.image
    args = self.validate_put(actor)
    logger.debug("PUT args validated successfully.")
    args['tenant'] = g.tenant
    # a changed image drives both the new status and whether we rebuild
    image_changed = args['image'] != previous_image
    if image_changed:
        args['status'] = SUBMITTED
        logger.debug("new image is different. updating actor.")
    else:
        args['status'] = actor.status
        logger.debug("new image is the same. not updating actor.")
    args['api_server'] = g.api_server
    args['owner'] = g.user
    actor = Actor(**args)
    actors_store[actor.db_id] = actor.to_db()
    logger.info("updated actor {} stored in db.".format(actor_id))
    worker_ids = Worker.request_worker(actor.db_id)
    if image_changed:
        command_channel = CommandChannel()
        command_channel.put_cmd(actor_id=actor.db_id,
                                worker_ids=worker_ids,
                                image=actor.image,
                                tenant=args['tenant'])
        logger.debug("put new command on command channel to update actor.")
    return ok(result=actor.display(), msg="Actor updated successfully.")
def put(self, actor_id):
    """Update an actor's definition; queues an image-update command when the image changed.

    Raises APIException (404) if no actor with the given id exists for the tenant.
    """
    dbid = Actor.get_dbid(g.tenant, actor_id)
    try:
        actor = Actor.from_db(actors_store[dbid])
    except KeyError:
        # fixed: removed stray trailing apostrophe from the error message
        raise APIException(
            "actor not found: {}".format(actor_id), 404)
    previous_image = actor.image
    args = self.validate_put(actor)
    args['tenant'] = g.tenant
    update_image = False
    if args['image'] == previous_image:
        # same image -- keep the current status; no rebuild needed
        args['status'] = actor.status
    else:
        update_image = True
        args['status'] = SUBMITTED
    args['api_server'] = g.api_server
    args['owner'] = g.user
    actor = Actor(**args)
    actors_store[actor.db_id] = actor.to_db()
    if update_image:
        ch = CommandChannel()
        ch.put_cmd(actor_id=actor.db_id, image=actor.image, tenant=args['tenant'])
    return ok(result=actor.display(), msg="Actor updated successfully.")
def __init__(self):
    """Load spawner configuration and connect to the command channel."""
    init_count = Config.get('workers', 'init_count')
    self.num_workers = int(init_count)
    self.secret = os.environ.get('_abaco_secret')
    self.cmd_ch = CommandChannel()
    self.tot_workers = 0
    try:
        host_id = Config.get('spawner', 'host_id')
    except Exception as e:
        # without a host_id this spawner cannot identify its own workers -- abort
        logger.critical("Spawner not configured with a host_id! Aborting! Exception: {}".format(e))
        raise e
    self.host_id = host_id
def post(self):
    """Register a new actor and queue a command to pull/build its image."""
    args = self.validate_post()
    # seed the bookkeeping fields for a brand-new actor
    args.update({'executions': {},
                 'state': '',
                 'subscriptions': [],
                 'status': SUBMITTED})
    actor = Actor(args)
    actors_store[actor.id] = actor.to_db()
    command_channel = CommandChannel()
    command_channel.put_cmd(actor_id=actor.id, image=actor.image)
    return ok(result=actor, msg="Actor created successfully.")
def post(self):
    """Create a new actor owned by the requesting user and queue its image for deployment."""
    args = self.validate_post()
    # stamp the request context onto the actor definition
    args['tenant'] = g.tenant
    args['api_server'] = g.api_server
    args['owner'] = g.user
    new_actor = Actor(**args)
    actors_store[new_actor.db_id] = new_actor.to_db()
    command_channel = CommandChannel()
    command_channel.put_cmd(actor_id=new_actor.db_id,
                            image=new_actor.image,
                            tenant=args['tenant'])
    # the creating user gets update rights on the new actor
    add_permission(g.user, new_actor.db_id, 'UPDATE')
    return ok(result=new_actor.display(),
              msg="Actor created successfully.",
              request=request)
def post(self, actor_id):
    """Start new workers for an actor.

    Raises APIException (404) if the actor does not exist.
    """
    try:
        actor = Actor.from_db(actors_store[actor_id])
    except KeyError:
        # fixed: removed stray trailing apostrophe from the error message
        raise APIException("actor not found: {}".format(actor_id), 404)
    args = self.validate_post()
    # default to one worker when num is absent or 0 (0 is falsy, so one check suffices)
    num = args.get("num") or 1
    ch = CommandChannel()
    ch.put_cmd(actor_id=actor.id, image=actor.image, num=num, stop_existing=False)
    return ok(result=None,
              msg="Scheduled {} new worker(s) to start.".format(str(num)))
def ensure_one_worker(self):
    """Check the workers store for the actor and request a new worker if none exist."""
    new_worker_id = Worker.ensure_one_worker(self.db_id)
    if not new_worker_id:
        # a worker already exists; nothing to schedule
        return None
    ids = [new_worker_id]
    command_channel = CommandChannel()
    command_channel.put_cmd(actor_id=self.db_id,
                            worker_ids=ids,
                            image=self.image,
                            tenant=self.tenant,
                            num=1,
                            stop_existing=False)
    return ids
def post(self, actor_id):
    """Ensure a certain number of workers are running for an actor"""
    logger.debug("top of POST /actors/{}/workers.".format(actor_id))
    # NOTE(review): `id` shadows the builtin and is recomputed below as `dbid`.
    id = Actor.get_dbid(g.tenant, actor_id)
    try:
        actor = Actor.from_db(actors_store[id])
    except KeyError:
        logger.debug("did not find actor: {}.".format(actor_id))
        raise ResourceError("No actor found with id: {}.".format(actor_id), 404)
    args = self.validate_post()
    logger.debug("workers POST params validated. actor: {}.".format(actor_id))
    num = args.get('num')
    # default to a single worker when no (or zero) num was supplied
    if not num or num == 0:
        logger.debug("did not get a num: {}.".format(actor_id))
        num = 1
    logger.debug("ensuring at least {} workers. actor: {}.".format(num, actor_id))
    dbid = Actor.get_dbid(g.tenant, actor_id)
    try:
        workers = Worker.get_workers(dbid)
    except WorkerException as e:
        logger.debug("did not find workers for actor: {}.".format(actor_id))
        raise ResourceError(e.msg, 404)
    current_number_workers = len(workers.items())
    # only scale up; if enough workers already exist we report and return
    if current_number_workers < num:
        logger.debug("There were only {} workers for actor: {} so we're adding more.".format(current_number_workers, actor_id))
        worker_ids = []
        num_to_add = int(num) - len(workers.items())
        logger.info("adding {} more workers for actor {}".format(num_to_add, actor_id))
        for idx in range(num_to_add):
            # NOTE(review): request_worker is called with the api-level actor_id here but
            # the command below uses actor.db_id -- confirm which id it expects.
            worker_ids.append(Worker.request_worker(actor_id))
        logger.info("New worker ids: {}".format(worker_ids))
        ch = CommandChannel()
        ch.put_cmd(actor_id=actor.db_id, worker_ids=worker_ids, image=actor.image, tenant=g.tenant, num=num_to_add, stop_existing=False)
        logger.info("Message put on command channel for new worker ids: {}".format(worker_ids))
        # NOTE(review): this message appears truncated ("There were only" ends mid-sentence).
        return ok(result=None, msg="Scheduled {} new worker(s) to start. There were only".format(num_to_add))
    else:
        return ok(result=None, msg="Actor {} already had {} worker(s).".format(actor_id, num))
def ensure_one_worker(self):
    """Check the workers store for the actor and request a new worker if none exist."""
    logger.debug("top of Actor.ensure_one_worker().")
    new_worker_id = Worker.ensure_one_worker(self.db_id)
    logger.debug("Worker.ensure_one_worker returned worker_id: {}".format(new_worker_id))
    if not new_worker_id:
        # a worker already exists; nothing to schedule
        logger.debug("Actor.ensure_one_worker() returning None.")
        return None
    ids = [new_worker_id]
    logger.info("Actor.ensure_one_worker() putting message on command channel for worker_id: {}".format(
        new_worker_id))
    command_channel = CommandChannel()
    command_channel.put_cmd(actor_id=self.db_id,
                            worker_ids=ids,
                            image=self.image,
                            tenant=self.tenant,
                            num=1,
                            stop_existing=False)
    return ids
def scale_up(actor_id):
    """Request one additional worker for the actor; return the command channel name used, or None on failure."""
    tenant, aid = actor_id.decode('utf8').split('_')
    logger.debug(
        'METRICS Attempting to create a new worker for {}'.format(actor_id))
    try:
        # create a worker & add to this actor
        actor = Actor.from_db(actors_store[actor_id])
        worker_id = Worker.request_worker(tenant=tenant, actor_id=actor_id)
        logger.info("New worker id: {}".format(worker_id))
        # actors may have a dedicated queue; otherwise use the default channel
        channel_name = actor.queue or 'default'
        command_channel = CommandChannel(name=channel_name)
        command_channel.put_cmd(actor_id=actor.db_id,
                                worker_id=worker_id,
                                image=actor.image,
                                tenant=tenant,
                                stop_existing=False)
        command_channel.close()
        logger.debug(
            'METRICS Added worker successfully for {}'.format(actor_id))
        return channel_name
    except Exception as e:
        logger.debug("METRICS - SOMETHING BROKE: {} - {} - {}".format(
            type(e), e, e.args))
        return None
def put(self, actor_id):
    """Update an actor while preserving its identity fields; queues a rebuild when the image changed.

    Raises APIException (404) if the actor does not exist.
    """
    try:
        actor = Actor.from_db(actors_store[actor_id])
    except KeyError:
        # fixed: removed stray trailing apostrophe from the error message
        raise APIException(
            "actor not found: {}".format(actor_id), 404)
    args = self.validate_put()
    update_image = False
    # carry identity/bookkeeping fields over from the stored actor
    args['name'] = actor['name']
    args['id'] = actor['id']
    args['executions'] = actor['executions']
    args['state'] = actor['state']
    if args['image'] == actor.image:
        # same image -- keep the current status; no rebuild needed
        args['status'] = actor.status
    else:
        update_image = True
        args['status'] = SUBMITTED
    actor = Actor(args)
    actors_store[actor.id] = actor.to_db()
    if update_image:
        ch = CommandChannel()
        ch.put_cmd(actor_id=actor.id, image=actor.image)
    return ok(result=actor, msg="Actor updated successfully.")
def check_metrics(self, actor_ids):
    """Poll Prometheus for each actor's message count and scale workers up or down accordingly.

    For every actor id: query the actor's message-count metric, remember it in the
    module-level `last_metric` cache, start one new worker when at least one message
    is pending, and attempt to shut down READY workers when the queue is empty.
    NOTE(review): the bare `except:` clauses and broad `except Exception` blocks here
    intentionally swallow errors so one actor cannot break the metrics loop.
    """
    for actor_id in actor_ids:
        logger.debug("TOP OF CHECK METRICS")
        # Prometheus instant-query for this actor's message-count gauge;
        # dashes are not valid in metric names, hence the replace.
        query = {
            'query': 'message_count_for_actor_{}'.format(actor_id.decode("utf-8").replace('-', '_')),
            'time': datetime.datetime.utcnow().isoformat() + "Z"
        }
        r = requests.get(PROMETHEUS_URL + '/api/v1/query', params=query)
        data = json.loads(r.text)['data']['result']
        change_rate = 0
        try:
            previous_data = last_metric[actor_id]
            try:
                # delta between the current and previous sampled counts
                change_rate = int(data[0]['value'][1]) - int(previous_data[0]['value'][1])
            except:
                logger.debug("Could not calculate change rate.")
        except:
            logger.info("No previous data yet for new actor {}".format(actor_id))
        # cache this sample for the next pass
        last_metric.update({actor_id: data})
        # Add a worker if message count reaches a given number
        try:
            logger.debug("METRICS current message count: {}".format(data[0]['value'][1]))
            if int(data[0]['value'][1]) >= 1:
                tenant, aid = actor_id.decode('utf8').split('_')
                logger.debug('METRICS Attempting to create a new worker for {}'.format(actor_id))
                try:
                    # create a worker & add to this actor
                    actor = Actor.from_db(actors_store[actor_id])
                    worker_ids = [Worker.request_worker(tenant=tenant, actor_id=aid)]
                    logger.info("New worker id: {}".format(worker_ids[0]))
                    ch = CommandChannel()
                    ch.put_cmd(actor_id=actor.db_id, worker_ids=worker_ids, image=actor.image, tenant=tenant, num=1, stop_existing=False)
                    ch.close()
                    logger.debug('METRICS Added worker successfully for {}'.format(actor_id))
                except Exception as e:
                    logger.debug("METRICS - SOMETHING BROKE: {} - {} - {}".format(type(e), e, e.args))
            elif int(data[0]['value'][1]) <= 1:
                # only reached when the count is 0: the >= 1 branch above wins at exactly 1
                logger.debug("METRICS made it to scale down block")
                # Check the number of workers for this actor before deciding to scale down
                workers = Worker.get_workers(actor_id)
                logger.debug('METRICS NUMBER OF WORKERS: {}'.format(len(workers)))
                try:
                    if len(workers) == 1:
                        # always keep at least one worker alive
                        logger.debug("METRICS only one worker, won't scale down")
                    else:
                        while len(workers) > 0:
                            logger.debug('METRICS made it STATUS check')
                            worker = workers.popitem()[1]
                            logger.debug('METRICS SCALE DOWN current worker: {}'.format(worker['status']))
                            # check status of the worker is ready
                            if worker['status'] == 'READY':
                                logger.debug("METRICS I MADE IT")
                                # scale down
                                try:
                                    shutdown_worker(worker['id'])
                                    continue
                                except Exception as e:
                                    logger.debug('METRICS ERROR shutting down worker: {} - {} - {}'.format(type(e), e, e.args))
                                logger.debug('METRICS shut down worker {}'.format(worker['id']))
                except IndexError:
                    logger.debug('METRICS only one worker found for actor {}. '
                                 'Will not scale down'.format(actor_id))
                except Exception as e:
                    logger.debug("METRICS SCALE UP FAILED: {}".format(e))
        except Exception as e:
            logger.debug("METRICS - ANOTHER ERROR: {} - {} - {}".format(type(e), e, e.args))
def put(self, actor_id):
    """Replace an actor's definition; optionally resolves TAS uid/gid data and queues an image update.

    The owner is never changed by a PUT; a non-owner issuing the PUT is granted
    UPDATE permission at the end. Raises ResourceError (404) if the actor does not exist.
    """
    logger.debug("top of PUT /actors/{}".format(actor_id))
    dbid = Actor.get_dbid(g.tenant, actor_id)
    try:
        actor = Actor.from_db(actors_store[dbid])
    except KeyError:
        logger.debug("did not find actor {} in store.".format(dbid))
        raise ResourceError(
            "No actor found with id: {}.".format(actor_id), 404)
    # remember prior state so a no-op image change keeps status and ownership stable
    previous_image = actor.image
    previous_status = actor.status
    previous_owner = actor.owner
    args = self.validate_put(actor)
    logger.debug("PUT args validated successfully.")
    args['tenant'] = g.tenant
    # user can force an update by setting the force param:
    update_image = args.get('force')
    if not update_image and args['image'] == previous_image:
        logger.debug("new image is the same and force was false. not updating actor.")
        logger.debug("Setting status to the actor's previous status which is: {}".format(previous_status))
        args['status'] = previous_status
    else:
        update_image = True
        args['status'] = SUBMITTED
        logger.debug("new image is different. updating actor.")
    args['api_server'] = g.api_server
    # we do not allow a PUT to override the owner in case the PUT is issued by another user
    args['owner'] = previous_owner
    # the parameter name depends on the configured API casing convention
    use_container_uid = args.get('use_container_uid')
    if Config.get('web', 'case') == 'camel':
        use_container_uid = args.get('useContainerUid')
    try:
        use_tas = Config.get('workers', 'use_tas_uid')
    except configparser.NoOptionError:
        logger.debug("no use_tas_uid config.")
        use_tas = False
    # config values come back as strings; coerce "true"/"false" to bool
    if hasattr(use_tas, 'lower'):
        use_tas = use_tas.lower() == 'true'
    else:
        logger.error("use_tas_uid configured but not as a string. use_tas_uid: {}".format(use_tas))
    logger.debug("use_tas={}. user_container_uid={}".format(use_tas, use_container_uid))
    # TAS-provided uid/gid only apply when the container's own uid is not requested
    if use_tas and not use_container_uid:
        uid, gid, tasdir = get_tas_data(g.user, g.tenant)
        if uid and gid:
            args['uid'] = uid
            args['gid'] = gid
        if tasdir:
            args['tasdir'] = tasdir
    args['mounts'] = get_all_mounts(args)
    args['last_update_time'] = get_current_utc_time()
    logger.debug("update args: {}".format(args))
    actor = Actor(**args)
    actors_store[actor.db_id] = actor.to_db()
    logger.info("updated actor {} stored in db.".format(actor_id))
    if update_image:
        worker_ids = [Worker.request_worker(tenant=g.tenant, actor_id=actor.db_id)]
        ch = CommandChannel()
        ch.put_cmd(actor_id=actor.db_id, worker_ids=worker_ids, image=actor.image, tenant=args['tenant'])
        ch.close()
        logger.debug("put new command on command channel to update actor.")
    # the PUT could have been issued by a user other than the owner; make sure
    # that user ends up with UPDATE permission on the actor.
    if not previous_owner == g.user:
        set_permission(g.user, actor.db_id, UPDATE)
    return ok(result=actor.display(), msg="Actor updated successfully.")
def __init__(self):
    """Read the configured initial worker count and open the command channel."""
    worker_count = Config.get('workers', 'init_count')
    self.num_workers = int(worker_count)
    self.cmd_ch = CommandChannel()
class Spawner(object):
    """Consumes commands from the CommandChannel and starts/stops worker containers for actors."""

    def __init__(self):
        # initial number of workers to start per actor, from config
        self.num_workers = int(Config.get('workers', 'init_count'))
        self.cmd_ch = CommandChannel()

    def run(self):
        """Main loop: block on the command channel and process each command."""
        while True:
            cmd = self.cmd_ch.get()
            self.process(cmd)

    def stop_workers(self, actor_id):
        """Stop existing workers; used when updating an actor's image."""
        try:
            workers = json.loads(workers_store[actor_id])
            print("Found existing workers: {}".format(str(workers)))
        except KeyError:
            print("No existing workers.")
            workers = {}
        # if there are existing workers, we need to close the actor message channel and
        # gracefully shutdown the existing worker processes.
        if len(workers) > 0 :
            # first, close the actor msg channel to prevent any new messages from being pulled
            # by the old workers.
            actor_ch = ActorMsgChannel(actor_id)
            actor_ch.close()
            # now, send messages to workers for a graceful shutdown:
            # NOTE(review): if the JSON decodes to a dict this iterates keys, and
            # worker['ch_name'] would fail -- presumably it is a list of dicts; verify.
            for worker in workers:
                ch = WorkerChannel(name=worker['ch_name'])
                ch.put('stop')

    def process(self, cmd):
        """Handle one command: start workers, optionally stop old ones, and persist the worker list."""
        print("Processing cmd:{}".format(str(cmd)))
        actor_id = cmd['actor_id']
        image = cmd['image']
        # by default an update replaces (stops) any existing workers
        stop_existing = cmd.get('stop_existing', True)
        num_workers = cmd.get('num', self.num_workers)
        print("Actor id:{}".format(actor_id))
        try:
            new_channels, anon_channels, new_workers = self.start_workers(actor_id, image, num_workers)
        except SpawnerException as e:
            # for now, start_workers will do clean up for a SpawnerException, so we just need
            # to return back to the run loop.
            return
        print("Created new workers: {}".format(str(new_workers)))
        # stop any existing workers:
        if stop_existing:
            self.stop_workers(actor_id)
        # tell new workers to subscribe to the actor channel.
        for channel in anon_channels:
            channel.put({'status': 'ok', 'actor_id': actor_id})
        # persist the worker list: extend it when keeping old workers, replace it otherwise
        if not stop_existing:
            workers = json.loads(workers_store[actor_id])
            workers.extend(new_workers)
            workers_store[actor_id] = json.dumps(workers)
        else:
            workers_store[actor_id] = json.dumps(new_workers)

    def start_workers(self, actor_id, image, num_workers):
        """Start num_workers workers; on failure set the actor to ERROR, kill started workers, and re-raise."""
        # NOTE(review): this log prints self.num_workers but the loop below uses the
        # num_workers argument -- the printed count can be wrong; confirm intent.
        print("starting {} workers. actor_id: {} image: {}".format(str(self.num_workers), actor_id, image))
        channels = []
        anon_channels = []
        workers = []
        try:
            for i in range(num_workers):
                print("starting worker {}".format(str(i)))
                ch, anon_ch, worker = self.start_worker(image)
                print("channel for worker {} is: {}".format(str(i), ch._name))
                channels.append(ch)
                anon_channels.append(anon_ch)
                workers.append(worker)
        except SpawnerException as e:
            print("Caught SpawnerException:{}".format(str(e)))
            # in case of an error, put the actor in error state and kill all workers
            Actor.set_status(actor_id, ERROR)
            for worker in workers:
                try:
                    self.kill_worker(worker)
                except DockerError as e:
                    print("Received DockerError trying to kill worker: {}".format(str(e)))
            raise SpawnerException()
        return channels, anon_channels, workers

    def start_worker(self, image):
        """Start a single worker container; returns (worker channel, reply-to channel, worker)."""
        ch = WorkerChannel()
        # start an actor executor container and wait for a confirmation that image was pulled.
        worker = run_worker(image, ch._name)
        print("worker started successfully, waiting on ack that image was pulled...")
        result = ch.get()
        if result['value']['status'] == 'ok':
            print("received ack from worker.")
            return ch, result['reply_to'], worker
        else:
            print("Got an error status from worker: {}. Raising an exception.".format(str(result)))
            raise SpawnerException()

    def kill_worker(self, worker):
        # no-op placeholder; actual teardown is handled elsewhere
        pass
def __init__(self):
    """Configure the spawner: initial worker count, shared secret, and command channel."""
    configured_count = Config.get('workers', 'init_count')
    self.num_workers = int(configured_count)
    self.secret = os.environ.get('_abaco_secret')
    self.cmd_ch = CommandChannel()
def __init__(self):
    """Read the initial worker count from config and open the command channel."""
    initial = Config.get("workers", "init_count")
    self.num_workers = int(initial)
    self.cmd_ch = CommandChannel()
def create_gauges(actor_ids):
    """
    Creates a Prometheus gauge for each actor id. The gauge is used to track the number of pending messages
    in the actor's queue.
    :param actor_ids: list of actors that should be processed. Does not include stateful actors or
                      actors in a shutting down state.
    :return: (actor_ids, inbox_lengths, cmd_length) -- the actor ids, a dict mapping actor id to its
             message-queue length, and the length of the command channel.
    """
    logger.debug("top of create_gauges; actor_ids: {}".format(actor_ids))
    # dictionary mapping actor_ids to their message queue lengths
    inbox_lengths = {}
    for actor_id in actor_ids:
        logger.debug("top of for loop for actor_id: {}".format(actor_id))
        # first, make sure the actor still exists in the actor store
        try:
            actor = actors_store[actor_id]
        except KeyError:
            logger.error(
                f"actor {actor_id} does not exist in store; continuing to next actor."
            )
            continue
        # If the actor doesn't have a gauge, add one
        # (metric names cannot contain dashes, hence the replace)
        if actor_id not in message_gauges.keys():
            try:
                g = Gauge(
                    'message_count_for_actor_{}'.format(
                        actor_id.replace('-', '_')),
                    'Number of messages for actor {}'.format(
                        actor_id.replace('-', '_')))
                message_gauges.update({actor_id: g})
                logger.debug('Created gauge {}'.format(g))
            except Exception as e:
                logger.error(
                    "got exception trying to create/instantiate the gauge; "
                    "actor {}; exception: {}".format(actor_id, e))
                g = None
        else:
            # Otherwise, get this actor's existing gauge
            try:
                g = message_gauges[actor_id]
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate an existing gauge; "
                    "actor: {}: exception:{}".format(actor_id, e))
                g = None
        # Update this actor's gauge to its current # of messages
        try:
            ch = ActorMsgChannel(actor_id=actor_id)
            # reaches into the channel's underlying queue for its length
            msg_length = len(ch._queue._queue)
        except Exception as e:
            logger.error(
                "Exception connecting to ActorMsgChannel: {}".format(e))
            raise e
        ch.close()
        result = {'messages': msg_length}
        # add the actor's current message queue length to the inbox_lengths in-memory variable
        inbox_lengths[actor_id] = msg_length
        # if we were able to create the gauge, set it to the current message:
        if g:
            try:
                g.set(result['messages'])
            except Exception as e:
                logger.error(
                    f"Got exception trying to set the messages on the gauge for actor: {actor_id}; "
                    f"exception: {e}")
        logger.debug("METRICS: {} messages found for actor: {}.".format(
            result['messages'], actor_id))
        # add a worker gauge for this actor if one does not exist
        # NOTE(review): "worker_gaueges" is a (misspelled) module-level dict defined elsewhere;
        # renaming it here would break the other references.
        if actor_id not in worker_gaueges.keys():
            try:
                g = Gauge(
                    'worker_count_for_actor_{}'.format(
                        actor_id.replace('-', '_')),
                    'Number of workers for actor {}'.format(
                        actor_id.replace('-', '_')))
                worker_gaueges.update({actor_id: g})
                logger.debug('Created worker gauge {}'.format(g))
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate the Worker Gauge: {}".
                    format(e))
        else:
            # Otherwise, get the worker gauge that already exists
            g = worker_gaueges[actor_id]
        # Update this actor's worker IDs
        workers = Worker.get_workers(actor_id)
        result = {'workers': len(workers)}
        try:
            g.set(result['workers'])
        except Exception as e:
            logger.error(
                f"got exception trying to set the worker gauge for actor {actor_id}; exception: {e}"
            )
        logger.debug(
            f"METRICS: {result['workers']} workers found for actor: {actor_id}."
        )
        # Update this actor's command channel metric
        # channel_name = actor.get("queue")
        #
        # queues_list = Config.get('spawner', 'host_queues').replace(' ', '')
        # valid_queues = queues_list.split(',')
        #
        # if not channel_name or channel_name not in valid_queues:
        #     channel_name = 'default'
        #
        # if not channel_name:
        #     # TODO -- this must be changed. there is no way returning no arguments will result in
        #     # anythng but an exception. The calling function is expecting 3 arguments...
        #     # if we really want to blow up right here we should just raise an appropriate exception.
        #     return
    # TODO -- this code needs to be fixed. What follows is only a partial fix; what I think we want
    # to do is set the length of all of the different command channels once at the end of this loop.
    # What was happening instead was that it was only setting one of the command channel's lengths --
    # whatever command channel happened to belong to the last actor in the loop.
    # NOTE(review): the collapsed source makes the indentation of this block ambiguous; it is
    # reconstructed here outside the per-actor loop, matching the TODO's stated intent -- confirm.
    channel_name = 'default'
    ch = CommandChannel(name=channel_name)
    cmd_length = len(ch._queue._queue)
    command_gauge.labels(channel_name).set(cmd_length)
    logger.debug(
        f"METRICS COMMAND CHANNEL {channel_name} size: {command_gauge}")
    ch.close()
    # Return actor_ids so we don't have to query for them again later
    return actor_ids, inbox_lengths, cmd_length
class Spawner(object):
    """Consumes commands from the CommandChannel and starts/stops worker containers for actors,
    optionally generating OAuth clients for the new workers."""

    def __init__(self):
        # initial number of workers to start per actor, from config
        self.num_workers = int(Config.get('workers', 'init_count'))
        # shared secret used when requesting clients for workers
        self.secret = os.environ.get('_abaco_secret')
        self.cmd_ch = CommandChannel()

    def run(self):
        """Main loop: block on the command channel and process each command."""
        while True:
            cmd = self.cmd_ch.get()
            self.process(cmd)

    def stop_workers(self, actor_id, worker_ids):
        """Stop existing workers; used when updating an actor's image.

        :param actor_id: the actor whose workers should be stopped.
        :param worker_ids: ids of the NEW workers, which must not be stopped.
        """
        try:
            workers_dict = workers_store[actor_id]
        except KeyError:
            workers_dict = {}
        # if there are existing workers, we need to close the actor message channel and
        # gracefully shutdown the existing worker processes.
        if len(workers_dict.items()) > 0:
            # first, close the actor msg channel to prevent any new messages from being pulled
            # by the old workers.
            actor_ch = ActorMsgChannel(actor_id)
            actor_ch.close()
            # now, send messages to workers for a graceful shutdown:
            for _, worker in workers_dict.items():
                # don't stop the new workers:
                if worker['id'] not in worker_ids:
                    ch = WorkerChannel(name=worker['ch_name'])
                    ch.put('stop')

    def process(self, cmd):
        """Handle one command: start workers, stop/retain old ones, and distribute clients."""
        print("Processing cmd:{}".format(str(cmd)))
        actor_id = cmd['actor_id']
        worker_ids = cmd['worker_ids']
        image = cmd['image']
        tenant = cmd['tenant']
        # by default an update replaces (stops) any existing workers
        stop_existing = cmd.get('stop_existing', True)
        num_workers = cmd.get('num', self.num_workers)
        print("Actor id:{}".format(actor_id))
        try:
            new_channels, anon_channels, new_workers = self.start_workers(
                actor_id, worker_ids, image, tenant, num_workers)
        except SpawnerException:
            # for now, start_workers will do clean up for a SpawnerException, so we just need
            # to return back to the run loop.
            return
        print("Created new workers: {}".format(str(new_workers)))
        # stop any existing workers:
        if stop_existing:
            self.stop_workers(actor_id, worker_ids)
        # add workers to store first so that the records will be there when the workers go
        # to update their status
        if not stop_existing:
            for _, worker in new_workers.items():
                Worker.add_worker(actor_id, worker)
        else:
            workers_store[actor_id] = new_workers
        # Tell new worker to subscribe to the actor channel.
        # If abaco is configured to generate clients for the workers, generate them now
        # and send new workers their clients.
        generate_clients = Config.get('workers', 'generate_clients').lower()
        for idx, channel in enumerate(anon_channels):
            if generate_clients == 'true':
                print("Getting client for worker {}".format(idx))
                client_ch = ClientsChannel()
                client_msg = client_ch.request_client(
                    tenant=tenant,
                    actor_id=actor_id,
                    # new_workers is a dictionary of dictionaries; list(d) creates a
                    # list of keys for a dictionary d. hence, the idx^th entry
                    # of list(new_workers) should be the key.
                    worker_id=new_workers[list(new_workers)[idx]]['id'],
                    secret=self.secret)
                # we need to ignore errors when generating clients because it's possible it is not set up
                # for a specific tenant. we log it instead.
                if client_msg.get('status') == 'error':
                    print("Error generating client: {}".format(
                        client_msg.get('message')))
                    channel.put({
                        'status': 'ok',
                        'actor_id': actor_id,
                        'tenant': tenant,
                        'client': 'no'
                    })
                # else, client was generated successfully:
                else:
                    print("Got a client: {}, {}, {}".format(
                        client_msg['client_id'], client_msg['access_token'],
                        client_msg['refresh_token']))
                    channel.put({
                        'status': 'ok',
                        'actor_id': actor_id,
                        'tenant': tenant,
                        'client': 'yes',
                        'client_id': client_msg['client_id'],
                        'client_secret': client_msg['client_secret'],
                        'access_token': client_msg['access_token'],
                        'refresh_token': client_msg['refresh_token'],
                        'api_server': client_msg['api_server'],
                    })
            else:
                print("Not generating clients. Config value was: {}".format(
                    generate_clients))
                channel.put({
                    'status': 'ok',
                    'actor_id': actor_id,
                    'tenant': tenant,
                    'client': 'no'
                })
        print("Done processing command.")

    def start_workers(self, actor_id, worker_ids, image, tenant, num_workers):
        """Start num_workers workers; on failure set the actor to ERROR, kill started workers, and re-raise."""
        print("starting {} workers. actor_id: {} image: {}".format(
            str(self.num_workers), actor_id, image))
        channels = []
        anon_channels = []
        workers = {}
        try:
            for i in range(num_workers):
                worker_id = worker_ids[i]
                print("starting worker {} with id: {}".format(i, worker_id))
                ch, anon_ch, worker = self.start_worker(
                    image, tenant, worker_id)
                print("channel for worker {} is: {}".format(str(i), ch.name))
                channels.append(ch)
                anon_channels.append(anon_ch)
                workers[worker_id] = worker
        except SpawnerException as e:
            print("Caught SpawnerException:{}".format(str(e)))
            # fixed: capture the message now -- Python 3 deletes the `except ... as e`
            # binding at the end of the handler, and the inner DockerError handler below
            # used to rebind (and then unbind) `e`, making `e.message` a NameError here.
            spawner_err_msg = e.message
            # in case of an error, put the actor in error state and kill all workers
            Actor.set_status(actor_id, ERROR, status_message=spawner_err_msg)
            for worker in workers:
                try:
                    self.kill_worker(worker)
                except DockerError as docker_err:
                    print("Received DockerError trying to kill worker: {}".
                          format(str(docker_err)))
            raise SpawnerException(message=spawner_err_msg)
        return channels, anon_channels, workers

    def start_worker(self, image, tenant, worker_id):
        """Start a single worker container; returns (worker channel, reply-to channel, Worker)."""
        ch = WorkerChannel()
        # start an actor executor container and wait for a confirmation that image was pulled.
        worker_dict = run_worker(image, ch.name, worker_id)
        worker = Worker(tenant=tenant, **worker_dict)
        print(
            "worker started successfully, waiting on ack that image was pulled..."
        )
        result = ch.get()
        if result.get('status') == 'error':
            # there was a problem pulling the image; put the actor in an error state:
            # fixed: this previously used a comma instead of calling .format(), which
            # silently built a tuple ("...Message: {}", <formatted result>) instead of a string.
            msg = "got an error back from the worker. Message: {}".format(result)
            print(msg)
            if 'msg' in result:
                raise SpawnerException(message=result['msg'])
            else:
                raise SpawnerException(
                    message="Internal error starting worker process.")
        elif result['value']['status'] == 'ok':
            print("received ack from worker.")
            return ch, result['reply_to'], worker
        else:
            msg = "Got an error status from worker: {}. Raising an exception.".format(
                str(result))
            print(msg)
            # made consistent with the other raises, which pass message= explicitly
            raise SpawnerException(message=msg)

    def kill_worker(self, worker):
        # no-op placeholder; actual teardown is handled elsewhere
        pass
class Spawner(object):
    """Consumes the CommandChannel and starts/stops workers for actors.

    This (worker_ids-based) version acks command messages immediately and
    handles failures downstream by putting the actor in ERROR state.
    """

    def __init__(self):
        self.num_workers = int(Config.get('workers', 'init_count'))
        self.secret = os.environ.get('_abaco_secret')
        self.cmd_ch = CommandChannel()
        self.tot_workers = 0
        try:
            self.host_id = Config.get('spawner', 'host_id')
        except Exception as e:
            logger.critical("Spawner not configured with a host_id! Aborting! Exception: {}".format(e))
            raise e

    def run(self):
        """Main loop: wait while overloaded, then pull and process one command."""
        while True:
            # check resource threshold before subscribing
            while True:
                if self.overloaded():
                    logger.critical("METRICS - SPAWNER FOR HOST {} OVERLOADED!!!".format(self.host_id))
                    # self.update_status to OVERLOADED
                    time.sleep(5)
                else:
                    break
            cmd, msg_obj = self.cmd_ch.get_one()
            # directly ack the messages from the command channel; problems generated from starting workers are
            # handled downstream; e.g., by setting the actor in an ERROR state; command messages should not be re-queued
            msg_obj.ack()
            self.process(cmd)

    def get_tot_workers(self):
        """Count workers across all actors whose record matches this host_id."""
        logger.debug("top of get_tot_workers")
        self.tot_workers = 0
        logger.debug('spawner host_id: {}'.format(self.host_id))
        for k, v in workers_store.items():
            for wid, worker in v.items():
                if worker.get('host_id') == self.host_id:
                    self.tot_workers += 1
        logger.debug("returning total workers: {}".format(self.tot_workers))
        return self.tot_workers

    def overloaded(self):
        """Return True when this host is at or above MAX_WORKERS (else None)."""
        logger.debug("top of overloaded")
        self.get_tot_workers()
        logger.info("total workers for this host: {}".format(self.tot_workers))
        if self.tot_workers >= MAX_WORKERS:
            return True

    def stop_workers(self, actor_id, worker_ids):
        """Stop existing workers; used when updating an actor's image.

        Workers whose id appears in `worker_ids` (the new workers) are left alone.
        """
        logger.debug("Top of stop_workers() for actor: {}.".format(actor_id))
        try:
            workers_dict = workers_store[actor_id]
        except KeyError:
            logger.debug("workers_store had no workers for actor: {}".format(actor_id))
            workers_dict = {}
        # if there are existing workers, we need to close the actor message channel and
        # gracefully shutdown the existing worker processes.
        if len(workers_dict.items()) > 0:
            logger.info("Found {} workers to stop.".format(len(workers_dict.items())))
            # first, close the actor msg channel to prevent any new messages from being pulled
            # by the old workers.
            actor_ch = ActorMsgChannel(actor_id)
            actor_ch.close()
            logger.info("Actor channel closed for actor: {}".format(actor_id))
            # now, send messages to workers for a graceful shutdown:
            for _, worker in workers_dict.items():
                # don't stop the new workers:
                if worker['id'] not in worker_ids:
                    ch = WorkerChannel(worker_id=worker['id'])
                    # since this is an update, there are new workers being started, so
                    # don't delete the actor msg channel:
                    ch.put('stop-no-delete')
                    logger.info("Sent 'stop-no-delete' message to worker_id: {}".format(worker['id']))
                    ch.close()
        else:
            logger.info("No workers to stop.")

    def process(self, cmd):
        """Main spawner method for processing a command from the CommandChannel."""
        logger.info("Spawner processing new command:{}".format(cmd))
        actor_id = cmd['actor_id']
        worker_ids = cmd['worker_ids']
        image = cmd['image']
        tenant = cmd['tenant']
        stop_existing = cmd.get('stop_existing', True)
        num_workers = cmd.get('num', self.num_workers)
        # BUG FIX: the original format string had five placeholders for six
        # arguments (tenant was passed but never rendered) and misspelled
        # "num_workers" as "mum_workers".
        logger.info("command params: actor_id: {} worker_ids: {} image: {} tenant: {} "
                    "stop_existing: {} num_workers: {}".format(
                        actor_id, worker_ids, image, tenant, stop_existing, num_workers))
        try:
            new_channels, anon_channels, new_workers = self.start_workers(
                actor_id, worker_ids, image, tenant, num_workers)
        except SpawnerException as e:
            # for now, start_workers will do clean up for a SpawnerException, so we just need
            # to return back to the run loop.
            logger.info("Spawner returning to main run loop.")
            return
        logger.info("Created new workers: {}".format(new_workers))
        # stop any existing workers:
        if stop_existing:
            logger.info("Stopping existing workers: {}".format(worker_ids))
            self.stop_workers(actor_id, worker_ids)
        # add workers to store first so that the records will be there when the workers go
        # to update their status
        if not stop_existing:
            # if we're not stopping the existing workers, we need to add each worker to the
            # actor's collection.
            for _, worker in new_workers.items():
                logger.info("calling add_worker for worker: {}.".format(worker))
                Worker.add_worker(actor_id, worker)
        else:
            # since we're stopping the existing workers, the actor's collection should just
            # be equal to the new_workers.
            workers_store[actor_id] = new_workers
            logger.info("workers_store set to new_workers: {}.".format(new_workers))
        # Tell new worker to subscribe to the actor channel.
        # If abaco is configured to generate clients for the workers, generate them now
        # and send new workers their clients.
        generate_clients = Config.get('workers', 'generate_clients').lower()
        logger.info("Sending messages to new workers over anonymous channels to subscribe to inbox.")
        for idx, channel in enumerate(anon_channels):
            if generate_clients == 'true':
                # new_workers is a dictionary of dictionaries; list(d) creates a
                # list of keys for a dictionary d. hence, the idx^th entry
                # of list(new_workers) is the corresponding key.
                worker_id = new_workers[list(new_workers)[idx]]['id']
                logger.info("Getting client for worker number {}, id: {}".format(idx, worker_id))
                client_ch = ClientsChannel()
                try:
                    client_msg = client_ch.request_client(tenant=tenant,
                                                          actor_id=actor_id,
                                                          worker_id=worker_id,
                                                          secret=self.secret)
                except ChannelTimeoutException as e:
                    logger.error("Got a ChannelTimeoutException trying to generate a client for "
                                 "actor_id: {}; worker_id: {}; exception: {}".format(actor_id, worker_id, e))
                    # put actor in an error state and return
                    self.error_out_actor(actor_id, worker_id,
                                         "Abaco was unable to generate an OAuth client for a new "
                                         "worker for this actor. System administrators have been notified.")
                    client_ch.close()
                    return
                client_ch.close()
                # we need to ignore errors when generating clients because it's possible it is not set up for a specific
                # tenant. we log it instead.
                if client_msg.get('status') == 'error':
                    logger.error("Error generating client: {}".format(client_msg.get('message')))
                    channel.put({'status': 'ok',
                                 'actor_id': actor_id,
                                 'tenant': tenant,
                                 'client': 'no'})
                    logger.debug("Sent OK message over anonymous worker channel.")
                # else, client was generated successfully:
                else:
                    logger.info("Got a client: {}, {}, {}".format(client_msg['client_id'],
                                                                  client_msg['access_token'],
                                                                  client_msg['refresh_token']))
                    channel.put({'status': 'ok',
                                 'actor_id': actor_id,
                                 'tenant': tenant,
                                 'client': 'yes',
                                 'client_id': client_msg['client_id'],
                                 'client_secret': client_msg['client_secret'],
                                 'access_token': client_msg['access_token'],
                                 'refresh_token': client_msg['refresh_token'],
                                 'api_server': client_msg['api_server'],
                                 })
                    logger.debug("Sent OK message AND client over anonymous worker channel.")
            else:
                logger.info("Not generating clients. Config value was: {}".format(generate_clients))
                channel.put({'status': 'ok',
                             'actor_id': actor_id,
                             'tenant': tenant,
                             'client': 'no'})
                logger.debug("Sent OK message over anonymous worker channel.")
            # @TODO -
            # delete the anonymous channel from this thread but sleep first to avoid the race condition.
            time.sleep(1.5)
            channel.delete()
        # due to the race condition deleting channels (potentially before all workers have received all messages)
        # we put a sleep here.
        time.sleep(1)
        for ch in new_channels:
            try:
                # the new_channels are the spawnerworker channels so they can be deleted.
                ch.delete()
            except Exception as e:
                logger.error("Got exception trying to delete spawnerworker channel: {}".format(e))
        logger.info("Done processing command.")

    def start_workers(self, actor_id, worker_ids, image, tenant, num_workers):
        """Start `num_workers` workers; on failure error out the actor and re-raise."""
        # BUG FIX: the original logged self.num_workers (the configured default)
        # rather than num_workers (the count actually being started).
        logger.info("starting {} workers. actor_id: {} image: {}".format(
            str(num_workers), actor_id, image))
        channels = []
        anon_channels = []
        workers = {}
        try:
            for i in range(num_workers):
                worker_id = worker_ids[i]
                logger.info("starting worker {} with id: {}".format(i, worker_id))
                ch, anon_ch, worker = self.start_worker(image, tenant, actor_id, worker_id)
                logger.debug("channel for worker {} is: {}".format(str(i), ch.name))
                channels.append(ch)
                anon_channels.append(anon_ch)
                workers[worker_id] = worker
        except SpawnerException as e:
            logger.info("Caught SpawnerException:{}".format(str(e)))
            # in case of an error, put the actor in error state and kill all workers
            self.error_out_actor(actor_id, worker_id, e.message)
            raise SpawnerException(message=e.message)
        return channels, anon_channels, workers

    def start_worker(self, image, tenant, actor_id, worker_id):
        """Launch one worker container, retrying read-timeouts, and await its ack.

        Returns (spawner_worker_channel, reply_to_channel, Worker). Raises
        SpawnerException on repeated Docker errors or a bad worker response.
        """
        ch = SpawnerWorkerChannel(worker_id=worker_id)
        # start an actor executor container and wait for a confirmation that image was pulled.
        attempts = 0
        while True:
            try:
                worker_dict = run_worker(image, actor_id, worker_id)
            except DockerError as e:
                logger.error("Spawner got a docker exception from run_worker; Exception: {}".format(e))
                if 'read timeout' in e.message:
                    logger.info("Exception was a read timeout; trying run_worker again..")
                    time.sleep(5)
                    attempts = attempts + 1
                    if attempts > 20:
                        msg = "Spawner continued to get DockerError for 20 attempts. Exception: {}".format(e)
                        logger.critical(msg)
                        raise SpawnerException(msg)
                    continue
                else:
                    logger.info("Exception was NOT a read timeout; quiting on this worker.")
                    # delete this worker from the workers store:
                    try:
                        self.kill_worker(actor_id, worker_id)
                    except WorkerException as we:
                        # BUG FIX: this handler previously rebound `e`, which
                        # Python 3 deletes after the handler — making the raise
                        # below NameError. Use a distinct name so `e` (the
                        # DockerError) survives.
                        logger.info("Got WorkerException from delete_worker(). "
                                    "worker_id: {}"
                                    "Exception: {}".format(worker_id, we))
                    raise SpawnerException(message="Unable to start worker; error: {}".format(e))
            break
        worker_dict['ch_name'] = WorkerChannel.get_name(worker_id)
        worker = Worker(tenant=tenant, **worker_dict)
        logger.info("worker started successfully, waiting on ack that image was pulled...")
        result = ch.get()
        logger.debug("Got response back from worker. Response: {}".format(result))
        if result.get('status') == 'error':
            # there was a problem pulling the image; put the actor in an error state:
            # BUG FIX: the original used `",format(result)` (comma) which made
            # `msg` a tuple rather than a formatted string.
            msg = "Got an error back from the worker. Message: {}".format(result)
            logger.info(msg)
            if 'msg' in result:
                raise SpawnerException(message=result['msg'])
            else:
                logger.error("Spawner received invalid message from worker. "
                             "'msg' field missing. Message: {}".format(result))
                raise SpawnerException(message="Internal error starting worker process.")
        elif result['value']['status'] == 'ok':
            logger.debug("received ack from worker.")
            return ch, result['reply_to'], worker
        else:
            msg = "Got an error status from worker: {}. Raising an exception.".format(str(result))
            # BUG FIX: the original format string had no {} placeholder, so the
            # worker message was silently dropped from the log.
            logger.error("Spawner received an invalid message from worker. Message: {}".format(result))
            raise SpawnerException(msg)

    def error_out_actor(self, actor_id, worker_id, message):
        """In case of an error, put the actor in error state and kill all workers"""
        Actor.set_status(actor_id, ERROR, status_message=message)
        try:
            self.kill_worker(actor_id, worker_id)
        except DockerError as e:
            logger.info("Received DockerError trying to kill worker: {}. "
                        "Exception: {}".format(worker_id, e))
            logger.info("Spawner will continue on since this is exception processing.")

    def kill_worker(self, actor_id, worker_id):
        """Delete the worker record; logs (never raises) on expected failures."""
        try:
            Worker.delete_worker(actor_id, worker_id)
        except WorkerException as e:
            logger.info("Got WorkerException from delete_worker(). "
                        "worker_id: {}"
                        "Exception: {}".format(worker_id, e))
        except Exception as e:
            logger.error("Got an unexpected exception from delete_worker(). "
                         "worker_id: {}"
                         "Exception: {}".format(worker_id, e))
def create_gauges(actor_ids):
    """Create/update the Prometheus message- and worker-count gauges for each actor.

    For every actor id: ensure a message gauge and a worker gauge exist (cached
    in the module-level message_gauges / worker_gaueges dicts), set them from
    the actor's inbox length and worker count, and update the per-queue command
    channel gauge. Returns (actor_ids, inbox_lengths, cmd_length).

    NOTE(review): actor_ids appear to be bytes (they are .decode("utf-8")d
    throughout) — confirm against the caller.
    """
    logger.debug(
        "METRICS: Made it to create_gauges; actor_ids: {}".format(actor_ids))
    inbox_lengths = {}
    for actor_id in actor_ids:
        logger.debug("top of for loop for actor_id: {}".format(actor_id))
        try:
            actor = actors_store[actor_id]
        except KeyError:
            logger.error("actor {} does not exist.".format(actor_id))
            continue
        # If the actor doesn't have a gauge, add one
        if actor_id not in message_gauges.keys():
            try:
                g = Gauge(
                    'message_count_for_actor_{}'.format(
                        actor_id.decode("utf-8").replace('-', '_')),
                    'Number of messages for actor {}'.format(
                        actor_id.decode("utf-8").replace('-', '_')))
                message_gauges.update({actor_id: g})
                logger.debug('Created gauge {}'.format(g))
            except Exception as e:
                # NOTE(review): if creation fails here, `g` may be unbound (or
                # stale from a prior iteration) when g.set() is called below —
                # pre-existing behavior, preserved.
                logger.error(
                    "got exception trying to create/instantiate the gauge; "
                    "actor {}; exception: {}".format(actor_id, e))
        else:
            # Otherwise, get this actor's existing gauge
            try:
                g = message_gauges[actor_id]
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate an existing gauge; "
                    "actor: {}: exception:{}".format(actor_id, e))
        # Update this actor's command channel metric
        channel_name = actor.get("queue")
        queues_list = Config.get('spawner', 'host_queues').replace(' ', '')
        valid_queues = queues_list.split(',')
        if not channel_name or channel_name not in valid_queues:
            channel_name = 'default'
        # Update this actor's gauge to its current # of messages
        try:
            ch = ActorMsgChannel(actor_id=actor_id.decode("utf-8"))
        except Exception as e:
            logger.error(
                "Exception connecting to ActorMsgChannel: {}".format(e))
            raise e
        result = {'messages': len(ch._queue._queue)}
        inbox_lengths[actor_id.decode("utf-8")] = len(ch._queue._queue)
        ch.close()
        g.set(result['messages'])
        logger.debug("METRICS: {} messages found for actor: {}.".format(
            result['messages'], actor_id))
        # add a worker gauge for this actor if one does not exist
        if actor_id not in worker_gaueges.keys():
            try:
                g = Gauge(
                    'worker_count_for_actor_{}'.format(
                        actor_id.decode("utf-8").replace('-', '_')),
                    'Number of workers for actor {}'.format(
                        actor_id.decode("utf-8").replace('-', '_')))
                worker_gaueges.update({actor_id: g})
                logger.debug('Created worker gauge {}'.format(g))
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate the Worker Gauge: {}".
                    format(e))
        else:
            # Otherwise, get the worker gauge that already exists
            g = worker_gaueges[actor_id]
        # Update this actor's worker IDs
        workers = Worker.get_workers(actor_id)
        result = {'workers': len(workers)}
        g.set(result['workers'])
        ch = CommandChannel(name=channel_name)
        cmd_length = len(ch._queue._queue)
        command_gauge.labels(channel_name).set(cmd_length)
        # BUG FIX: the original logged the `command_gauge` metric object here
        # instead of the channel size that was just computed.
        logger.debug("METRICS COMMAND CHANNEL {} size: {}".format(
            channel_name, cmd_length))
        ch.close()
    # Return actor_ids so we don't have to query for them again later.
    # NOTE(review): cmd_length is only bound inside the loop; with an empty
    # actor_ids this raises NameError — pre-existing behavior, preserved.
    return actor_ids, inbox_lengths, cmd_length
class Spawner(object):
    """Legacy (print-based) spawner: consumes the CommandChannel and starts
    workers for actors; workers are keyed by channel name rather than id."""

    def __init__(self):
        self.num_workers = int(Config.get('workers', 'init_count'))
        self.secret = os.environ.get('_abaco_secret')
        self.cmd_ch = CommandChannel()

    def run(self):
        """Main loop: block on the command channel and process each command."""
        while True:
            cmd = self.cmd_ch.get()
            self.process(cmd)

    def stop_workers(self, actor_id):
        """Stop existing workers; used when updating an actor's image."""
        try:
            workers_dict = workers_store[actor_id]
        except KeyError:
            workers_dict = {}
        # if there are existing workers, we need to close the actor message channel and
        # gracefully shutdown the existing worker processes.
        if len(workers_dict.items()) > 0:
            # first, close the actor msg channel to prevent any new messages from being pulled
            # by the old workers.
            actor_ch = ActorMsgChannel(actor_id)
            actor_ch.close()
            # now, send messages to workers for a graceful shutdown:
            for _, worker in workers_dict.items():
                ch = WorkerChannel(name=worker['ch_name'])
                ch.put('stop')

    def process(self, cmd):
        """Process one command: start workers, swap/extend the actor's worker
        set, then hand each new worker its client (if any) over its channel."""
        print("Processing cmd:{}".format(str(cmd)))
        actor_id = cmd['actor_id']
        image = cmd['image']
        tenant = cmd['tenant']
        stop_existing = cmd.get('stop_existing', True)
        num_workers = cmd.get('num', self.num_workers)
        print("Actor id:{}".format(actor_id))
        try:
            new_channels, anon_channels, new_workers = self.start_workers(
                actor_id, image, tenant, num_workers)
        except SpawnerException as e:
            # for now, start_workers will do clean up for a SpawnerException, so we just need
            # to return back to the run loop.
            return
        print("Created new workers: {}".format(str(new_workers)))
        # stop any existing workers:
        if stop_existing:
            self.stop_workers(actor_id)
        # add workers to store first so that the records will be there when the workers go
        # to update their status
        if not stop_existing:
            for _, worker in new_workers.items():
                Worker.add_worker(actor_id, worker)
        else:
            workers_store[actor_id] = new_workers
        # send new workers their clients and tell them to subscribe to the actor channel.
        for idx, channel in enumerate(anon_channels):
            print("Getting client for worker {}".format(idx))
            client_ch = ClientsChannel()
            # new_workers is a dictionary of dictionaries; list(d) creates a
            # list of keys for a dictionary d. hence, the idx^th entry
            # of list(new_workers) is the corresponding key.
            client_msg = client_ch.request_client(
                tenant=tenant,
                actor_id=actor_id,
                worker_id=new_workers[list(new_workers)[idx]]['ch_name'],
                secret=self.secret)
            # we need to ignore errors when generating clients because it's possible it is not set up for a specific
            # tenant. we log it instead.
            if client_msg.get('status') == 'error':
                print("Error generating client: {}".format(client_msg.get('message')))
                channel.put({'status': 'ok',
                             'actor_id': actor_id,
                             'tenant': tenant,
                             'client': 'no'})
            # else, client was generated successfully:
            else:
                print("Got a client: {}, {}, {}".format(client_msg['client_id'],
                                                        client_msg['access_token'],
                                                        client_msg['refresh_token']))
                channel.put({'status': 'ok',
                             'actor_id': actor_id,
                             'tenant': tenant,
                             'client': 'yes',
                             'client_id': client_msg['client_id'],
                             'client_secret': client_msg['client_secret'],
                             'access_token': client_msg['access_token'],
                             'refresh_token': client_msg['refresh_token'],
                             'api_server': client_msg['api_server'],
                             })
        print("Done processing command.")

    def start_workers(self, actor_id, image, tenant, num_workers):
        """Start `num_workers` workers; on failure error out the actor, kill
        any already-started workers, and re-raise SpawnerException."""
        # BUG FIX: the original printed self.num_workers (the configured
        # default) even though num_workers workers are actually started.
        print("starting {} workers. actor_id: {} image: {}".format(
            str(num_workers), actor_id, image))
        channels = []
        anon_channels = []
        workers = {}
        try:
            for i in range(num_workers):
                print("starting worker {}".format(str(i)))
                ch, anon_ch, worker = self.start_worker(image, tenant)
                print("channel for worker {} is: {}".format(str(i), ch.name))
                channels.append(ch)
                anon_channels.append(anon_ch)
                workers[worker['ch_name']] = worker
        except SpawnerException as e:
            print("Caught SpawnerException:{}".format(str(e)))
            # in case of an error, put the actor in error state and kill all workers
            Actor.set_status(actor_id, ERROR)
            for worker in workers:
                try:
                    self.kill_worker(worker)
                except DockerError as e:
                    print("Received DockerError trying to kill worker: {}".format(str(e)))
            raise SpawnerException()
        return channels, anon_channels, workers

    def start_worker(self, image, tenant):
        """Launch one worker container and wait for its image-pull ack.

        Returns (worker_channel, reply_to_channel, Worker); raises
        SpawnerException on a non-ok response.
        """
        ch = WorkerChannel()
        # start an actor executor container and wait for a confirmation that image was pulled.
        worker_dict = run_worker(image, ch.name)
        worker = Worker(tenant=tenant, **worker_dict)
        print("worker started successfully, waiting on ack that image was pulled...")
        result = ch.get()
        if result['value']['status'] == 'ok':
            print("received ack from worker.")
            return ch, result['reply_to'], worker
        else:
            print("Got an error status from worker: {}. Raising an exception.".format(str(result)))
            raise SpawnerException()

    def kill_worker(self, worker):
        # no-op in this legacy version.
        pass
class Spawner(object):
    """Queue-aware spawner: consumes one named CommandChannel, starting a
    single worker per command, generating OAuth clients when configured."""

    def __init__(self):
        self.num_workers = int(Config.get('workers', 'init_count'))
        self.secret = os.environ.get('_abaco_secret')
        self.queue = os.environ.get('queue', 'default')
        self.cmd_ch = CommandChannel(name=self.queue)
        self.tot_workers = 0
        try:
            self.host_id = Config.get('spawner', 'host_id')
        except Exception as e:
            logger.critical("Spawner not configured with a host_id! Aborting! Exception: {}".format(e))
            raise e

    def run(self):
        """Main loop: wait while overloaded, then pull, ack and process one command."""
        while True:
            # check resource threshold before subscribing
            while True:
                if self.overloaded():
                    logger.critical("METRICS - SPAWNER FOR HOST {} OVERLOADED!!!".format(self.host_id))
                    # self.update_status to OVERLOADED
                    time.sleep(5)
                else:
                    break
            cmd, msg_obj = self.cmd_ch.get_one()
            # directly ack the messages from the command channel; problems generated from starting workers are
            # handled downstream; e.g., by setting the actor in an ERROR state; command messages should not be re-queued
            msg_obj.ack()
            try:
                self.process(cmd)
            except Exception as e:
                logger.error("spawner got an exception trying to process cmd: {}. "
                             "Exception type: {}. Exception: {}".format(cmd, type(e), e))

    def get_tot_workers(self):
        """Count workers across all actors whose record matches this host_id."""
        logger.debug("top of get_tot_workers")
        self.tot_workers = 0
        logger.debug('spawner host_id: {}'.format(self.host_id))
        for k, v in workers_store.items():
            for wid, worker in v.items():
                if worker.get('host_id') == self.host_id:
                    self.tot_workers += 1
        logger.debug("returning total workers: {}".format(self.tot_workers))
        return self.tot_workers

    def overloaded(self):
        """Return True when this host is at or above MAX_WORKERS (else None)."""
        logger.debug("top of overloaded")
        self.get_tot_workers()
        logger.info("total workers for this host: {}".format(self.tot_workers))
        if self.tot_workers >= MAX_WORKERS:
            return True

    def stop_workers(self, actor_id, worker_ids):
        """Stop existing workers; used when updating an actor's image.

        Workers whose id appears in `worker_ids` (the new workers) are skipped.
        """
        logger.debug("Top of stop_workers() for actor: {}.".format(actor_id))
        try:
            workers_dict = workers_store[actor_id]
        except KeyError:
            logger.debug("workers_store had no workers for actor: {}".format(actor_id))
            workers_dict = {}
        # if there are existing workers, we need to close the actor message channel and
        # gracefully shutdown the existing worker processes.
        if len(workers_dict.items()) > 0:
            logger.info("Found {} workers to stop.".format(len(workers_dict.items())))
            # first, close the actor msg channel to prevent any new messages from being pulled
            # by the old workers.
            actor_ch = ActorMsgChannel(actor_id)
            actor_ch.close()
            logger.info("Actor channel closed for actor: {}".format(actor_id))
            # now, send messages to workers for a graceful shutdown:
            for _, worker in workers_dict.items():
                # don't stop the new workers:
                if worker['id'] not in worker_ids:
                    ch = WorkerChannel(worker_id=worker['id'])
                    # since this is an update, there are new workers being started, so
                    # don't delete the actor msg channel:
                    ch.put('stop-no-delete')
                    logger.info("Sent 'stop-no-delete' message to worker_id: {}".format(worker['id']))
                    ch.close()
                else:
                    logger.debug("skipping worker {} as it it not in worker_ids.".format(worker))
        else:
            logger.info("No workers to stop.")

    def process(self, cmd):
        """Main spawner method for processing a command from the CommandChannel."""
        logger.info("top of process; cmd: {}".format(cmd))
        actor_id = cmd['actor_id']
        try:
            actor = Actor.from_db(actors_store[actor_id])
        except Exception as e:
            msg = f"Exception in spawner trying to retrieve actor object from store. Aborting. Exception: {e}"
            logger.error(msg)
            return
        worker_id = cmd['worker_id']
        image = cmd['image']
        tenant = cmd['tenant']
        stop_existing = cmd.get('stop_existing', True)
        num_workers = 1
        # BUG FIX: the original's adjacent string literals ran "{}" and
        # "stop_existing" together with no separating space.
        logger.debug("spawner command params: actor_id: {} worker_id: {} image: {} tenant: {} "
                     "stop_existing: {} num_workers: {}".format(actor_id, worker_id, image, tenant,
                                                                stop_existing, num_workers))
        # if the worker was sent a delete request before spawner received this message to create the worker,
        # the status will be SHUTDOWN_REQUESTED, not REQUESTED. in that case, we simply abort and remove the
        # worker from the collection.
        try:
            logger.debug("spawner checking worker's status for SHUTDOWN_REQUESTED")
            worker = Worker.get_worker(actor_id, worker_id)
            logger.debug(f"spawner got worker; worker: {worker}")
        except Exception as e:
            logger.error(f"spawner got exception trying to retrieve worker. "
                         f"actor_id: {actor_id}; worker_id: {worker_id}; e: {e}")
            return
        status = worker.get('status')
        if not status == REQUESTED:
            logger.debug(f"worker was NOT in REQUESTED status. status: {status}")
            if status == SHUTDOWN_REQUESTED or status == SHUTTING_DOWN or status == ERROR:
                logger.debug(f"worker status was {status}; spawner deleting worker and returning..")
                try:
                    Worker.delete_worker(actor_id, worker_id)
                    logger.debug("spawner deleted worker because it was SHUTDOWN_REQUESTED.")
                    return
                except Exception as e:
                    logger.error(f"spawner got exception trying to delete a worker in SHUTDOWN_REQUESTED status."
                                 f"actor_id: {actor_id}; worker_id: {worker_id}; e: {e}")
                    return
            else:
                logger.error(f"spawner found worker in unexpected status: {status}. Not processing command and returning.")
                return
        # worker status was REQUESTED; moving on to SPAWNER_SETUP ----
        Worker.update_worker_status(actor_id, worker_id, SPAWNER_SETUP)
        logger.debug("spawner has updated worker status to SPAWNER_SETUP; worker_id: {}".format(worker_id))
        client_id = None
        client_secret = None
        client_access_token = None
        client_refresh_token = None
        api_server = None
        # (removed a duplicate `client_secret = None` assignment present in the original)
        # ---- Oauth client generation for the worker -------
        # check if tenant and instance configured for client generation -
        try:
            generate_clients = Config.get('workers', f'{tenant}_generate_clients').lower()
        except Exception:
            # narrowed from a bare `except:`; the fallback to the global
            # setting is preserved.
            logger.debug(f"Did not find a {tenant}_generate_clients config. Looking for a global config.")
            generate_clients = Config.get('workers', 'generate_clients').lower()
        logger.debug(f"final generate_clients: {generate_clients}")
        if generate_clients == "true":
            logger.debug("client generation was configured to be available; now checking the actor's token attr.")
            # updated 1.3.0-- check whether the actor requires a token:
            if actor.token:
                logger.debug("spawner starting client generation")
                client_id, \
                client_access_token, \
                client_refresh_token, \
                api_server, \
                client_secret = self.client_generation(actor_id, worker_id, tenant)
            else:
                logger.debug("actor's token attribute was False. Not generating client.")
        ch = SpawnerWorkerChannel(worker_id=worker_id)
        logger.debug("spawner attempting to start worker; worker_id: {}".format(worker_id))
        try:
            worker = self.start_worker(
                image,
                tenant,
                actor_id,
                worker_id,
                client_id,
                client_access_token,
                client_refresh_token,
                ch,
                api_server,
                client_secret
            )
        except Exception as e:
            msg = "Spawner got an exception from call to start_worker. Exception:{}".format(e)
            logger.error(msg)
            self.error_out_actor(actor_id, worker_id, msg)
            if client_id:
                self.delete_client(tenant, actor_id, worker_id, client_id, client_secret)
            return
        logger.debug("Returned from start_worker; Created new worker: {}".format(worker))
        ch.close()
        logger.debug("Client channel closed")
        if stop_existing:
            logger.info("Stopping existing workers: {}".format(worker_id))
            # TODO - update status to stop_requested
            self.stop_workers(actor_id, [worker_id])

    def client_generation(self, actor_id, worker_id, tenant):
        """Request an OAuth client for a new worker over the ClientsChannel.

        Returns (client_id, access_token, refresh_token, api_server,
        client_secret); on failure errors out the actor, marks the worker
        ERROR and raises.
        """
        client_ch = ClientsChannel()
        try:
            client_msg = client_ch.request_client(
                tenant=tenant,
                actor_id=actor_id,
                worker_id=worker_id,
                secret=self.secret
            )
        except Exception as e:
            logger.error("Got a ChannelTimeoutException trying to generate a client for "
                         "actor_id: {}; worker_id: {}; exception: {}".format(actor_id, worker_id, e))
            # put worker in an error state and return
            self.error_out_actor(actor_id, worker_id,
                                 "Abaco was unable to generate an OAuth client for a new "
                                 "worker for this actor. System administrators have been notified.")
            client_ch.close()
            Worker.update_worker_status(actor_id, worker_id, ERROR)
            logger.critical("Client generation FAILED.")
            raise e
        client_ch.close()
        if client_msg.get('status') == 'error':
            logger.error("Error generating client: {}".format(client_msg.get('message')))
            self.error_out_actor(actor_id, worker_id,
                                 "Abaco was unable to generate an OAuth client for a new "
                                 "worker for this actor. System administrators have been notified.")
            Worker.update_worker_status(actor_id, worker_id, ERROR)
            raise SpawnerException("Error generating client")  # TODO - clean up error message
        # else, client was generated successfully:
        else:
            logger.info("Got a client: {}, {}, {}".format(client_msg['client_id'],
                                                          client_msg['access_token'],
                                                          client_msg['refresh_token']))
            return client_msg['client_id'], \
                   client_msg['access_token'], \
                   client_msg['refresh_token'], \
                   client_msg['api_server'], \
                   client_msg['client_secret']

    def delete_client(self, tenant, actor_id, worker_id, client_id, secret):
        """Ask the clients agent to delete a worker's OAuth client; log the outcome."""
        clients_ch = ClientsChannel()
        msg = clients_ch.request_delete_client(tenant=tenant,
                                               actor_id=actor_id,
                                               worker_id=worker_id,
                                               client_id=client_id,
                                               secret=secret)
        if msg['status'] == 'ok':
            logger.info("Client delete request completed successfully for "
                        "worker_id: {}, client_id: {}.".format(worker_id, client_id))
        else:
            # BUG FIX: the original passed four arguments, in the wrong order,
            # to three placeholders (msg['message'] where client_id belonged),
            # garbling the log line.
            logger.error("Error deleting client for "
                         "worker_id: {}, client_id: {}. Message: {}".format(
                             worker_id, client_id, msg['message']))
        clients_ch.close()

    def start_worker(self, image, tenant, actor_id, worker_id, client_id, client_access_token,
                     client_refresh_token, ch, api_server, client_secret):
        """Pull the actor image, run the worker container (retrying read
        timeouts), register the Worker, mark the actor READY and signal the
        worker over `ch`. Returns the Worker record."""
        # start an actor executor container and wait for a confirmation that image was pulled.
        attempts = 0
        # worker = get_worker(worker_id)
        # worker['status'] = PULLING_IMAGE
        Worker.update_worker_status(actor_id, worker_id, PULLING_IMAGE)
        try:
            logger.debug("Worker pulling image {}...".format(image))
            pull_image(image)
        except DockerError as e:
            # return a message to the spawner that there was an error pulling image and abort
            # this is not necessarily an error state: the user simply could have provided an
            # image name that does not exist in the registry. This is the first time we would
            # find that out.
            logger.info("worker got a DockerError trying to pull image. Error: {}.".format(e))
            raise e
        logger.info("Image {} pulled successfully.".format(image))
        # Done pulling image
        # Run Worker Container
        while True:
            try:
                Worker.update_worker_status(actor_id, worker_id, CREATING_CONTAINER)
                logger.debug('spawner creating worker container')
                worker_dict = run_worker(
                    image,
                    actor_id,
                    worker_id,
                    client_id,
                    client_access_token,
                    client_refresh_token,
                    tenant,
                    api_server,
                    client_secret
                )
                logger.debug(f'finished run worker; worker dict: {worker_dict}')
            except DockerError as e:
                logger.error("Spawner got a docker exception from run_worker; Exception: {}".format(e))
                if 'read timeout' in e.message:
                    logger.info("Exception was a read timeout; trying run_worker again..")
                    time.sleep(5)
                    attempts = attempts + 1
                    if attempts > 20:
                        msg = "Spawner continued to get DockerError for 20 attempts. Exception: {}".format(e)
                        logger.critical(msg)
                        # todo - should we be calling kill_worker here? (it is called in the exception block of the else below)
                        raise SpawnerException(msg)
                    continue
                else:
                    logger.info("Exception was NOT a read timeout; quiting on this worker.")
                    # delete this worker from the workers store:
                    try:
                        self.kill_worker(actor_id, worker_id)
                    except WorkerException as we:
                        logger.info("Got WorkerException from delete_worker(). "
                                    "worker_id: {}"
                                    "Exception: {}".format(worker_id, we))
                    raise SpawnerException(message="Unable to start worker; error: {}".format(e))
            break
        logger.debug('finished loop')
        worker_dict['ch_name'] = WorkerChannel.get_name(worker_id)
        # if the actor is not already in READY status, set actor status to READY before worker status has been
        # set to READY.
        # it is possible the actor status is already READY because this request is the autoscaler starting a new worker
        # for an existing actor.
        actor = Actor.from_db(actors_store[actor_id])
        if not actor.status == READY:
            try:
                Actor.set_status(actor_id, READY, status_message=" ")
            except KeyError:
                # it is possible the actor was already deleted during worker start up; if
                # so, the worker should have a stop message waiting for it. starting subscribe
                # as usual should allow this process to work as expected.
                pass
        # finalize worker with READY status
        worker = Worker(tenant=tenant, **worker_dict)
        logger.info("calling add_worker for worker: {}.".format(worker))
        Worker.add_worker(actor_id, worker)
        ch.put('READY')  # step 4
        logger.info('sent message through channel')
        # BUG FIX: the original returned None implicitly, yet process() assigns
        # this method's result and logs it as the "new worker"; return the
        # Worker so that log is meaningful.
        return worker

    def error_out_actor(self, actor_id, worker_id, message):
        """In case of an error, put the actor in error state and kill all workers"""
        logger.debug("top of error_out_actor for worker: {}_{}".format(actor_id, worker_id))
        Actor.set_status(actor_id, ERROR, status_message=message)
        # first we try to stop workers using the "graceful" approach -
        try:
            self.stop_workers(actor_id, worker_ids=[])
            logger.info("Spawner just stopped worker {}_{} in error_out_actor".format(actor_id, worker_id))
            return
        except Exception as e:
            logger.error("spawner got exception trying to run stop_workers. Exception: {}".format(e))
        # graceful stop failed; fall back to deleting the worker record directly:
        try:
            self.kill_worker(actor_id, worker_id)
            logger.info("Spawner just killed worker {}_{} in error_out_actor".format(actor_id, worker_id))
        except DockerError as e:
            logger.info("Received DockerError trying to kill worker: {}. Exception: {}".format(worker_id, e))
            logger.info("Spawner will continue on since this is exception processing.")

    def kill_worker(self, actor_id, worker_id):
        """Delete the worker record; logs (never raises) on expected failures."""
        try:
            Worker.delete_worker(actor_id, worker_id)
        except WorkerException as e:
            logger.info("Got WorkerException from delete_worker(). "
                        "worker_id: {}"
                        "Exception: {}".format(worker_id, e))
        except Exception as e:
            logger.error("Got an unexpected exception from delete_worker(). "
                         "worker_id: {}"
                         "Exception: {}".format(worker_id, e))
def __init__(self):
    """Set up the spawner: worker count from config, shared secret from the
    environment, and the command channel it will consume."""
    init_count = Config.get("workers", "init_count")
    self.num_workers = int(init_count)
    self.secret = os.environ.get("_abaco_secret")
    self.cmd_ch = CommandChannel()