def stop_workers(self, actor_id, worker_ids):
    """Stop an actor's existing workers; used when updating an actor's image.

    The actor's workers are looked up in ``workers_store``. When any exist,
    the actor message channel is closed and every worker whose id is NOT in
    ``worker_ids`` is sent a ``'stop-no-delete'`` message on its worker
    channel.

    :param actor_id: key used to look up the workers in ``workers_store`` and
        to open the actor message channel.
    :param worker_ids: ids of the workers that must be left running (the new
        workers); only membership tests are performed on it.
    :return: None
    """
    logger.debug("Top of stop_workers() for actor: {}.".format(actor_id))
    try:
        workers_dict = workers_store[actor_id]
    except KeyError:
        logger.debug("workers_store had no workers for actor: {}".format(actor_id))
        workers_dict = {}

    worker_count = len(workers_dict.items())
    if worker_count == 0:
        logger.info("No workers to stop.")
        return

    # There are existing workers: close the actor message channel and
    # gracefully shut down the existing worker processes.
    logger.info("Found {} workers to stop.".format(worker_count))
    # first, close the actor msg channel to prevent any new messages from
    # being pulled by the old workers.
    actor_ch = ActorMsgChannel(actor_id)
    actor_ch.close()
    logger.info("Actor channel closed for actor: {}".format(actor_id))

    # now, send messages to workers for a graceful shutdown:
    for _, worker in workers_dict.items():
        # don't stop the new workers:
        if worker['id'] in worker_ids:
            logger.debug("skipping worker {} as it is in worker_ids.".format(worker))
            continue
        ch = WorkerChannel(worker_id=worker['id'])
        try:
            # since this is an update, there are new workers being started,
            # so don't delete the actor msg channel:
            ch.put('stop-no-delete')
            logger.info("Sent 'stop-no-delete' message to worker_id: {}".format(worker['id']))
        finally:
            # release the worker channel even when the put fails
            ch.close()
def get(self, actor_id):
    """Handle GET /actors/{actor_id}/messages.

    Reports the current length of the actor's message queue together with
    hypermedia links.

    :param actor_id: actor identifier from the request; combined with
        ``g.tenant`` through ``Actor.get_dbid`` to build the store key.
    :return: the value of ``ok(result)`` where ``result`` holds the keys
        ``messages`` and ``_links``.
    :raises ResourceError: (constructed with 404) when looking the actor up
        in ``actors_store`` raises ``KeyError``.
    """
    def get_hypermedia(actor):
        return {
            '_links': {
                'self': '{}/actors/v2/{}/messages'.format(actor.api_server, actor.id),
                'owner': '{}/profiles/v2/{}'.format(actor.api_server, actor.owner),
            },
        }

    logger.debug("top of GET /actors/{}/messages".format(actor_id))
    # check that actor exists
    dbid = Actor.get_dbid(g.tenant, actor_id)
    try:
        actor = Actor.from_db(actors_store[dbid])
    except KeyError:
        logger.debug("did not find actor: {}.".format(actor_id))
        raise ResourceError("No actor found with id: {}.".format(actor_id), 404)

    ch = ActorMsgChannel(actor_id=dbid)
    try:
        result = {'messages': len(ch._queue._queue)}
    finally:
        # always release the channel, even if reading the queue length fails
        ch.close()
    logger.debug("messages found for actor: {}.".format(actor_id))
    result.update(get_hypermedia(actor))
    return ok(result)
def create_gauges(actor_ids):
    """Refresh the message-count and worker-count gauges of each actor.

    For every id in ``actor_ids`` a message gauge and a worker gauge are
    looked up in (or created and registered into) ``message_gauges`` and
    ``worker_gaueges``. The message gauge is set to the current length of the
    actor's message queue and the worker gauge to the number of workers
    returned by ``Worker.get_workers``.

    A gauge that cannot be created is logged and skipped; it is never
    replaced by a gauge left over from an earlier step.

    :param actor_ids: iterable of actor ids; each id is decoded with
        ``.decode("utf-8")``, so bytes-like ids are expected.
    :return: ``actor_ids``, unchanged.
    :raises Exception: whatever ``ActorMsgChannel`` raises on construction is
        logged and re-raised.
    """
    logger.debug("METRICS: Made it to create_gauges")
    for actor_id in actor_ids:
        # --- message gauge -------------------------------------------------
        message_gauge = None
        if actor_id not in message_gauges.keys():
            try:
                message_gauge = Gauge(
                    'message_count_for_actor_{}'.format(
                        actor_id.decode("utf-8").replace('-', '_')),
                    'Number of messages for actor {}'.format(
                        actor_id.decode("utf-8").replace('-', '_')))
                message_gauges.update({actor_id: message_gauge})
                logger.debug('Created gauge {}'.format(message_gauge))
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate the Gauge: {}".format(e))
                message_gauge = None
        else:
            message_gauge = message_gauges[actor_id]

        try:
            ch = ActorMsgChannel(actor_id=actor_id.decode("utf-8"))
        except Exception as e:
            logger.error(
                "Exception connecting to ActorMsgChannel: {}".format(e))
            raise e
        try:
            message_count = len(ch._queue._queue)
        finally:
            # release the channel even if the length read fails
            ch.close()
        if message_gauge is not None:
            message_gauge.set(message_count)
        logger.debug("METRICS: {} messages found for actor: {}.".format(
            message_count, actor_id))

        # --- worker gauge --------------------------------------------------
        worker_gauge = None
        if actor_id not in worker_gaueges.keys():
            try:
                worker_gauge = Gauge(
                    'worker_count_for_actor_{}'.format(
                        actor_id.decode("utf-8").replace('-', '_')),
                    'Number of workers for actor {}'.format(
                        actor_id.decode("utf-8").replace('-', '_')))
                worker_gaueges.update({actor_id: worker_gauge})
                logger.debug('Created worker gauge {}'.format(worker_gauge))
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate the Worker Gauge: {}".
                    format(e))
                worker_gauge = None
        else:
            worker_gauge = worker_gaueges[actor_id]

        workers = Worker.get_workers(actor_id)
        if worker_gauge is not None:
            worker_gauge.set(len(workers))
    return actor_ids
def get(self):
    """Handle GET /admin/actors.

    Builds one ``Actor`` per entry of ``actors_store`` and attaches to it its
    workers, the first worker (when there is one), the current message queue
    length, and the execution totals taken from ``ExecutionsSummary``.

    :return: the value of ``ok(...)`` called with the list of actors.
    """
    logger.debug("top of GET /admin/actors")
    actors = []
    for _, stored_actor in actors_store.items():
        actor = Actor.from_db(stored_actor)
        actor.workers = Worker.get_workers(actor.db_id)

        # expose the first worker, if any, as ``actor.worker``
        first_item = next(iter(actor.workers.items()), None)
        if first_item is not None:
            actor.worker = first_item[1]

        msg_channel = ActorMsgChannel(actor_id=actor.db_id)
        actor.messages = len(msg_channel._queue._queue)
        msg_channel.close()

        execution_summary = ExecutionsSummary(db_id=actor.db_id)
        actor.executions = execution_summary.total_executions
        actor.runtime = execution_summary.total_runtime
        actors.append(actor)
    logger.info("actors retrieved.")
    return ok(result=actors, msg="Actors retrieved successfully.")
def stop_workers(self, actor_id):
    """Stop an actor's existing workers; used when updating an actor's image.

    :param actor_id: key used to look up the workers in ``workers_store`` and
        to open the actor message channel.
    :return: None
    """
    try:
        existing_workers = workers_store[actor_id]
    except KeyError:
        existing_workers = {}

    # no existing workers: nothing to close and nobody to notify
    if len(existing_workers.items()) == 0:
        return

    # Close the actor msg channel first, to prevent any new messages from
    # being pulled by the old workers.
    ActorMsgChannel(actor_id).close()

    # Then ask each worker for a graceful shutdown.
    for _, worker in existing_workers.items():
        WorkerChannel(name=worker['ch_name']).put('stop')
def stop_workers(self, actor_id, worker_ids):
    """Stop an actor's existing workers; used when updating an actor's image.

    Workers whose id appears in ``worker_ids`` are left alone.

    :param actor_id: key used to look up the workers in ``workers_store`` and
        to open the actor message channel.
    :param worker_ids: ids of the workers that must not be stopped (the new
        workers); only membership tests are performed on it.
    :return: None
    """
    try:
        existing_workers = workers_store[actor_id]
    except KeyError:
        existing_workers = {}

    # no existing workers: nothing to close and nobody to notify
    if len(existing_workers.items()) == 0:
        return

    # Close the actor msg channel first, to prevent any new messages from
    # being pulled by the old workers.
    ActorMsgChannel(actor_id).close()

    # Then ask each old worker for a graceful shutdown.
    for _, worker in existing_workers.items():
        if worker['id'] in worker_ids:
            # one of the new workers: keep it running
            continue
        WorkerChannel(name=worker['ch_name']).put('stop')
def stop_workers(self, actor_id):
    """Stop an actor's existing workers; used when updating an actor's image.

    The entry stored in ``workers_store`` for the actor is parsed as JSON.

    :param actor_id: key used to look up the workers in ``workers_store`` and
        to open the actor message channel.
    :return: None
    """
    try:
        existing_workers = json.loads(workers_store[actor_id])
        print("Found existing workers: {}".format(str(existing_workers)))
    except KeyError:
        print("No existing workers.")
        existing_workers = {}

    # no existing workers: nothing to close and nobody to notify
    if len(existing_workers) == 0:
        return

    # Close the actor msg channel first, to prevent any new messages from
    # being pulled by the old workers.
    ActorMsgChannel(actor_id).close()

    # Then ask each worker for a graceful shutdown.
    # NOTE(review): each item is indexed with 'ch_name', so the parsed JSON is
    # presumably a list of worker objects -- confirm against the writer.
    for worker in existing_workers:
        WorkerChannel(name=worker['ch_name']).put('stop')
def get_metrics(self):
    """Set the message-count gauge of every actor found in ``actors_store``.

    For each actor id a gauge is looked up in (or created and registered
    into) ``message_gauges`` and set to the current length of the actor's
    message queue. A gauge that cannot be created is logged and skipped; it
    is never replaced by the gauge of a previously processed actor.

    :return: the list of actor ids read from ``actors_store`` (empty when the
        store is empty); an empty list when any exception escapes the
        processing loop.
    """
    logger.debug("top of get in MetricResource")
    actor_ids = [db_id for db_id, _ in actors_store.items()]
    logger.debug("ACTOR IDS: {}".format(actor_ids))

    try:
        for actor_id in actor_ids:
            gauge = None
            if actor_id not in message_gauges.keys():
                try:
                    gauge = Gauge(
                        'message_count_for_actor_{}'.format(
                            actor_id.decode("utf-8").replace('-', '_')),
                        'Number of messages for actor {}'.format(
                            actor_id.decode("utf-8").replace('-', '_'))
                    )
                    message_gauges.update({actor_id: gauge})
                except Exception as e:
                    logger.info("got exception trying to instantiate the Gauge: {}".format(e))
                    gauge = None
            else:
                gauge = message_gauges[actor_id]

            try:
                ch = ActorMsgChannel(actor_id=actor_id.decode("utf-8"))
            except Exception as e:
                logger.error("Exception connecting to ActorMsgChannel: {}".format(e))
                raise e
            try:
                message_count = len(ch._queue._queue)
            finally:
                # release the channel even if the length read fails
                ch.close()

            if gauge is not None:
                gauge.set(message_count)
            logger.debug("METRICS: {} messages found for actor: {}.".format(message_count, actor_id))
        return actor_ids
    except Exception as e:
        # best effort: metrics collection must not propagate failures
        logger.info("Got exception in get_metrics: {}".format(e))
        return []
def process_link(link, msg, d):
    """
    Process an event with a link: create an execution for the linked actor
    and put the message on that actor's message channel.

    :param link: key of the linked actor in ``actors_store``; per the note
        below it is always the dbid of the linked actor.
    :param msg: message forwarded to the linked actor.
    :param d: message dictionary; it is mutated -- the new execution id is
        stored under ``'_abaco_execution_id'``.
    :return: None
    :raises KeyError: when ``link`` is not found in ``actors_store``.
    """
    # ensure that the linked actor still exists; the link attribute is
    # *always* the dbid of the linked actor
    logger.debug("top of process_link")
    try:
        actors_store[link]
    except KeyError as e:
        logger.error(
            "Processing event message for actor {} that does not exist. Quiting".format(link))
        raise e

    # create an execution for the linked actor with message
    execution_fields = {
        'cpu': 0,
        'io': 0,
        'runtime': 0,
        'status': SUBMITTED,
        'executor': 'Abaco Event',
    }
    execution_id = Execution.add_execution(link, execution_fields)
    logger.info(
        "Events processor agent added execution {} for actor {}".format(execution_id, link))
    d['_abaco_execution_id'] = execution_id

    logger.debug(
        "sending message to actor. Final message {} and message dictionary: {}".format(msg, d))
    actor_channel = ActorMsgChannel(actor_id=link)
    actor_channel.put_msg(message=msg, d=d)
    actor_channel.close()
    logger.info("link processed.")
def post(self, actor_id):
    """Handle POST /actors/{actor_id}/messages: queue a message for the actor.

    An execution is created for the actor, the message (plus a dictionary
    built from the query parameters and selected attributes of ``g``) is put
    on the actor's message channel, and ``ensure_one_worker()`` is called on
    the actor.

    :param actor_id: actor identifier from the request; combined with
        ``g.tenant`` through ``Actor.get_dbid`` to build the store key.
    :return: the value of ``ok(...)`` for a dict with ``execution_id``,
        ``msg`` and ``_links``; the dict is passed through ``dict_to_camel``
        when the ``web``/``case`` config value is ``'camel'``.
    :raises ResourceError: (constructed with 404) when looking the actor up
        in ``actors_store`` raises ``KeyError``.
    """
    def get_hypermedia(actor, exc):
        links = {
            'self': '{}/actors/v2/{}/executions/{}'.format(actor.api_server, actor.id, exc),
            'owner': '{}/profiles/v2/{}'.format(actor.api_server, actor.owner),
            'messages': '{}/actors/v2/{}/messages'.format(actor.api_server, actor.id),
        }
        return {'_links': links}

    logger.debug("top of POST /actors/{}/messages.".format(actor_id))
    dbid = Actor.get_dbid(g.tenant, actor_id)
    try:
        Actor.from_db(actors_store[dbid])
    except KeyError:
        logger.debug("did not find actor: {}.".format(actor_id))
        raise ResourceError("No actor found with id: {}.".format(actor_id), 404)

    args = self.validate_post()
    logger.debug("POST body validated. actor: {}.".format(actor_id))

    # Build a dictionary of k:v pairs from the query parameters; the single
    # additional object 'message' comes from the post payload instead. Note
    # that 'message' need not be JSON data.
    d = {key: value for key, value in request.args.items() if key != 'message'}
    logger.debug("extra fields added to message from query parameters: {}.".format(d))

    # Attributes of ``g`` copied into the message when present:
    # (attribute on g, key in the message dictionary, debug log template)
    # Forwarding of g.jwt / g.jwt_server is currently disabled:
    # if hasattr(g, 'jwt'):
    #     d['_abaco_jwt'] = g.jwt
    # if hasattr(g, 'jwt_server'):
    #     d['_abaco_jwt_server'] = g.jwt_server
    forwarded_attributes = (
        ('user', '_abaco_username', "_abaco_username: {} added to message."),
        ('api_server', '_abaco_api_server', "_abaco_api_server: {} added to message."),
        ('jwt_header_name', '_abaco_jwt_header_name',
         "abaco_jwt_header_name: {} added to message."),
    )
    for attribute, message_key, log_template in forwarded_attributes:
        if hasattr(g, attribute):
            d[message_key] = getattr(g, attribute)
            logger.debug(log_template.format(getattr(g, attribute)))

    # create an execution
    execution_fields = {
        'cpu': 0,
        'io': 0,
        'runtime': 0,
        'status': SUBMITTED,
        'executor': g.user,
    }
    execution_id = Execution.add_execution(dbid, execution_fields)
    logger.info("Execution {} added for actor {}".format(execution_id, actor_id))
    d['_abaco_execution_id'] = execution_id
    d['_abaco_Content_Type'] = args.get('_abaco_Content_Type', '')
    logger.debug("Final message dictionary: {}".format(d))

    inbox = ActorMsgChannel(actor_id=dbid)
    inbox.put_msg(message=args['message'], d=d)
    inbox.close()
    logger.debug("Message added to actor inbox. id: {}.".format(actor_id))

    # make sure at least one worker is available
    actor = Actor.from_db(actors_store[dbid])
    actor.ensure_one_worker()
    logger.debug("ensure_one_worker() called. id: {}.".format(actor_id))

    # binary payloads are not echoed back in the response
    is_binary = args.get('_abaco_Content_Type') == 'application/octet-stream'
    echoed_message = 'binary - omitted' if is_binary else args['message']
    result = {'execution_id': execution_id, 'msg': echoed_message}
    result.update(get_hypermedia(actor, execution_id))

    case = Config.get('web', 'case')
    if case == 'camel':
        return ok(dict_to_camel(result))
    return ok(result)
def create_gauges(actor_ids):
    """Refresh the per-actor gauges and the command channel gauge.

    For every id in ``actor_ids`` that still exists in ``actors_store``:

    * the message gauge (from ``message_gauges``, created on first use) is
      set to the current length of the actor's message queue, which is also
      recorded in the returned ``inbox_lengths``;
    * the worker gauge (from ``worker_gaueges``, created on first use) is set
      to the number of workers returned by ``Worker.get_workers``.

    A gauge that cannot be created or fetched is logged and skipped; it is
    never replaced by a gauge left over from an earlier step.

    After the loop the length of ONE command channel is read and published
    on ``command_gauge``: the channel of the last actor processed, or
    ``'default'`` when no actor was processed.

    :param actor_ids: iterable of actor ids; each id is decoded with
        ``.decode("utf-8")``, so bytes-like ids are expected.
    :return: tuple ``(actor_ids, inbox_lengths, cmd_length)`` where
        ``inbox_lengths`` maps decoded actor ids to message queue lengths
        and ``cmd_length`` is the length of the command channel read.
    :raises Exception: whatever ``ActorMsgChannel`` raises on construction is
        logged and re-raised.
    """
    logger.debug(
        "METRICS: Made it to create_gauges; actor_ids: {}".format(actor_ids))
    inbox_lengths = {}
    # Used after the loop; must be defined even when no actor is processed.
    channel_name = 'default'

    for actor_id in actor_ids:
        logger.debug("top of for loop for actor_id: {}".format(actor_id))
        try:
            actor = actors_store[actor_id]
        except KeyError:
            logger.error("actor {} does not exist.".format(actor_id))
            continue

        # If the actor doesn't have a message gauge, add one
        message_gauge = None
        if actor_id not in message_gauges.keys():
            try:
                message_gauge = Gauge(
                    'message_count_for_actor_{}'.format(
                        actor_id.decode("utf-8").replace('-', '_')),
                    'Number of messages for actor {}'.format(
                        actor_id.decode("utf-8").replace('-', '_')))
                message_gauges.update({actor_id: message_gauge})
                logger.debug('Created gauge {}'.format(message_gauge))
            except Exception as e:
                logger.error(
                    "got exception trying to create/instantiate the gauge; "
                    "actor {}; exception: {}".format(actor_id, e))
                message_gauge = None
        else:
            # Otherwise, get this actor's existing gauge
            try:
                message_gauge = message_gauges[actor_id]
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate an existing gauge; "
                    "actor: {}: exception:{}".format(actor_id, e))
                message_gauge = None

        # Pick this actor's command channel, falling back to 'default' when
        # the actor's queue is unset or not one of the configured queues.
        channel_name = actor.get("queue")
        queues_list = Config.get('spawner', 'host_queues').replace(' ', '')
        valid_queues = queues_list.split(',')
        if not channel_name or channel_name not in valid_queues:
            channel_name = 'default'

        # Update this actor's gauge to its current # of messages
        try:
            ch = ActorMsgChannel(actor_id=actor_id.decode("utf-8"))
        except Exception as e:
            logger.error(
                "Exception connecting to ActorMsgChannel: {}".format(e))
            raise e
        try:
            message_count = len(ch._queue._queue)
        finally:
            # release the channel even if the length read fails
            ch.close()
        inbox_lengths[actor_id.decode("utf-8")] = message_count
        if message_gauge is not None:
            message_gauge.set(message_count)
        logger.debug("METRICS: {} messages found for actor: {}.".format(
            message_count, actor_id))

        # add a worker gauge for this actor if one does not exist
        worker_gauge = None
        if actor_id not in worker_gaueges.keys():
            try:
                worker_gauge = Gauge(
                    'worker_count_for_actor_{}'.format(
                        actor_id.decode("utf-8").replace('-', '_')),
                    'Number of workers for actor {}'.format(
                        actor_id.decode("utf-8").replace('-', '_')))
                worker_gaueges.update({actor_id: worker_gauge})
                logger.debug('Created worker gauge {}'.format(worker_gauge))
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate the Worker Gauge: {}".
                    format(e))
                worker_gauge = None
        else:
            # Otherwise, get the worker gauge that already exists
            worker_gauge = worker_gaueges[actor_id]

        # Update this actor's worker count
        workers = Worker.get_workers(actor_id)
        if worker_gauge is not None:
            worker_gauge.set(len(workers))

    # NOTE(review): only one command channel is measured per call (see the
    # docstring); measuring every configured channel would need a wider change.
    ch = CommandChannel(name=channel_name)
    try:
        cmd_length = len(ch._queue._queue)
        command_gauge.labels(channel_name).set(cmd_length)
        logger.debug("METRICS COMMAND CHANNEL {} size: {}".format(
            channel_name, cmd_length))
    finally:
        ch.close()

    # Return actor_ids so we don't have to query for them again later
    return actor_ids, inbox_lengths, cmd_length
def create_gauges(actor_ids):
    """
    Creates a Prometheus gauge for each actor id. The gauge is used to track
    the number of pending messages in the actor's queue. A second gauge per
    actor tracks its number of workers, and the length of the ``'default'``
    command channel is published on ``command_gauge``.

    A gauge that cannot be created or fetched is logged and skipped; the
    worker count is never written to the actor's message gauge.

    :param actor_ids: list of actors that should be processed. Does not
        include stateful actors or actors in a shutting down state.
    :return: tuple ``(actor_ids, inbox_lengths, cmd_length)`` where
        ``inbox_lengths`` maps actor ids to message queue lengths and
        ``cmd_length`` is the length of the ``'default'`` command channel.
    :raises Exception: whatever is raised while opening the actor message
        channel or reading its length is logged and re-raised.
    """
    logger.debug("top of create_gauges; actor_ids: {}".format(actor_ids))
    # dictionary mapping actor_ids to their message queue lengths
    inbox_lengths = {}
    for actor_id in actor_ids:
        logger.debug("top of for loop for actor_id: {}".format(actor_id))

        # first, make sure the actor still exists in the actor store
        try:
            actors_store[actor_id]
        except KeyError:
            logger.error(
                f"actor {actor_id} does not exist in store; continuing to next actor."
            )
            continue

        # If the actor doesn't have a message gauge, add one
        message_gauge = None
        if actor_id not in message_gauges.keys():
            try:
                message_gauge = Gauge(
                    'message_count_for_actor_{}'.format(
                        actor_id.replace('-', '_')),
                    'Number of messages for actor {}'.format(
                        actor_id.replace('-', '_')))
                message_gauges.update({actor_id: message_gauge})
                logger.debug('Created gauge {}'.format(message_gauge))
            except Exception as e:
                logger.error(
                    "got exception trying to create/instantiate the gauge; "
                    "actor {}; exception: {}".format(actor_id, e))
                message_gauge = None
        else:
            # Otherwise, get this actor's existing gauge
            try:
                message_gauge = message_gauges[actor_id]
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate an existing gauge; "
                    "actor: {}: exception:{}".format(actor_id, e))
                message_gauge = None

        # Read the actor's current # of messages
        ch = None
        try:
            ch = ActorMsgChannel(actor_id=actor_id)
            msg_length = len(ch._queue._queue)
        except Exception as e:
            logger.error(
                "Exception connecting to ActorMsgChannel: {}".format(e))
            raise e
        finally:
            # release the channel whenever it was opened, even on failure
            if ch is not None:
                ch.close()

        # add the actor's current message queue length to the inbox_lengths
        # in-memory variable
        inbox_lengths[actor_id] = msg_length

        # if we were able to get the gauge, set it to the current message count
        if message_gauge is not None:
            try:
                message_gauge.set(msg_length)
            except Exception as e:
                logger.error(
                    f"Got exception trying to set the messages on the gauge for actor: {actor_id}; "
                    f"exception: {e}")
        logger.debug("METRICS: {} messages found for actor: {}.".format(
            msg_length, actor_id))

        # add a worker gauge for this actor if one does not exist
        worker_gauge = None
        if actor_id not in worker_gaueges.keys():
            try:
                worker_gauge = Gauge(
                    'worker_count_for_actor_{}'.format(
                        actor_id.replace('-', '_')),
                    'Number of workers for actor {}'.format(
                        actor_id.replace('-', '_')))
                worker_gaueges.update({actor_id: worker_gauge})
                logger.debug('Created worker gauge {}'.format(worker_gauge))
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate the Worker Gauge: {}".
                    format(e))
                worker_gauge = None
        else:
            # Otherwise, get the worker gauge that already exists
            worker_gauge = worker_gaueges[actor_id]

        # Update this actor's worker count
        workers = Worker.get_workers(actor_id)
        worker_count = len(workers)
        if worker_gauge is not None:
            try:
                worker_gauge.set(worker_count)
            except Exception as e:
                logger.error(
                    f"got exception trying to set the worker gauge for actor {actor_id}; exception: {e}"
                )
        logger.debug(
            f"METRICS: {worker_count} workers found for actor: {actor_id}."
        )

    # TODO -- this code needs to be fixed. What follows is only a partial fix;
    # what we probably want is to set the length of all of the different
    # command channels once, here at the end of the loop. Previously the
    # channel was derived from each actor's "queue" attribute (validated
    # against the 'spawner'/'host_queues' config), so only one command
    # channel's length was set -- whatever command channel happened to belong
    # to the last actor in the loop. For now only 'default' is measured.
    channel_name = 'default'
    ch = CommandChannel(name=channel_name)
    try:
        cmd_length = len(ch._queue._queue)
        command_gauge.labels(channel_name).set(cmd_length)
        logger.debug(
            f"METRICS COMMAND CHANNEL {channel_name} size: {cmd_length}")
    finally:
        ch.close()

    # Return actor_ids so we don't have to query for them again later
    return actor_ids, inbox_lengths, cmd_length