def scale_down(actor_id):
    workers = Worker.get_workers(actor_id)
    logger.debug('METRICS NUMBER OF WORKERS: {}'.format(len(workers)))
    try:
        # if len(workers) == 1:
        #     logger.debug("METRICS only one worker, won't scale down")
        # else:
        while len(workers) > 0:
            logger.debug('METRICS made it STATUS check')
            worker = workers.popitem()[1]
            logger.debug('METRICS SCALE DOWN current worker: {}'.format(worker['status']))
            # check status of the worker is ready
            if worker['status'] == 'READY':
                # scale down
                try:
                    shutdown_worker(worker['id'], delete_actor_ch=False)
                    continue
                except Exception as e:
                    logger.debug('METRICS ERROR shutting down worker: {} - {} - {}'.format(type(e), e, e.args))
                logger.debug('METRICS shut down worker {}'.format(worker['id']))
    except IndexError:
        logger.debug('METRICS only one worker found for actor {}. '
                     'Will not scale down'.format(actor_id))
    except Exception as e:
        logger.debug("METRICS SCALE DOWN FAILED: {}".format(e))

def post(self, actor_id):
    """Ensure a certain number of workers are running for an actor"""
    id = Actor.get_dbid(g.tenant, actor_id)
    try:
        actor = Actor.from_db(actors_store[id])
    except KeyError:
        raise ResourceError("actor not found: {}".format(actor_id), 404)
    args = self.validate_post()
    num = args.get('num')
    if not num or num == 0:
        num = 1
    dbid = Actor.get_dbid(g.tenant, actor_id)
    workers = Worker.get_workers(dbid)
    if len(workers.items()) < num:
        worker_ids = []
        num_to_add = int(num) - len(workers.items())
        for idx in range(num_to_add):
            worker_ids.append(Worker.request_worker(actor_id))
        ch = CommandChannel()
        ch.put_cmd(actor_id=actor.db_id,
                   worker_ids=worker_ids,
                   image=actor.image,
                   tenant=g.tenant,
                   num=num_to_add,
                   stop_existing=False)
        return ok(result=None,
                  msg="Scheduled {} new worker(s) to start. There were only {} worker(s) running.".format(
                      num_to_add, len(workers.items())))
    else:
        return ok(result=None,
                  msg="Actor {} already had {} worker(s).".format(actor_id, num))

def get(self, actor_id):
    logger.debug("top of GET /actors/{}/workers for tenant {}.".format(actor_id, g.tenant))
    dbid = Actor.get_dbid(g.tenant, actor_id)
    try:
        Actor.from_db(actors_store[dbid])
    except KeyError:
        logger.debug("did not find actor: {}.".format(actor_id))
        raise ResourceError("No actor found with id: {}.".format(actor_id), 404)
    try:
        workers = Worker.get_workers(dbid)
    except WorkerException as e:
        logger.debug("did not find workers for actor: {}.".format(actor_id))
        raise ResourceError(e.msg, 404)
    result = []
    for id, worker in workers.items():
        worker.update({'id': id})
        try:
            w = Worker(**worker)
            result.append(w.display())
        except Exception as e:
            logger.error("Unable to instantiate worker in workers endpoint from description: {}.".format(worker))
    return ok(result=result, msg="Workers retrieved successfully.")

def post(self, actor_id):
    """Ensure a certain number of workers are running for an actor"""
    logger.debug("top of POST /actors/{}/workers.".format(actor_id))
    id = Actor.get_dbid(g.tenant, actor_id)
    try:
        actor = Actor.from_db(actors_store[id])
    except KeyError:
        logger.debug("did not find actor: {}.".format(actor_id))
        raise ResourceError("No actor found with id: {}.".format(actor_id), 404)
    args = self.validate_post()
    logger.debug("workers POST params validated. actor: {}.".format(actor_id))
    num = args.get('num')
    if not num or num == 0:
        logger.debug("did not get a num: {}.".format(actor_id))
        num = 1
    logger.debug("ensuring at least {} workers. actor: {}.".format(num, actor_id))
    dbid = Actor.get_dbid(g.tenant, actor_id)
    try:
        workers = Worker.get_workers(dbid)
    except WorkerException as e:
        logger.debug("did not find workers for actor: {}.".format(actor_id))
        raise ResourceError(e.msg, 404)
    current_number_workers = len(workers.items())
    if current_number_workers < num:
        logger.debug("There were only {} workers for actor: {} so we're adding more.".format(
            current_number_workers, actor_id))
        worker_ids = []
        num_to_add = int(num) - len(workers.items())
        logger.info("adding {} more workers for actor {}".format(num_to_add, actor_id))
        for idx in range(num_to_add):
            worker_ids.append(Worker.request_worker(tenant=g.tenant, actor_id=actor_id))
        logger.info("New worker ids: {}".format(worker_ids))
        ch = CommandChannel()
        ch.put_cmd(actor_id=actor.db_id,
                   worker_ids=worker_ids,
                   image=actor.image,
                   tenant=g.tenant,
                   num=num_to_add,
                   stop_existing=False)
        ch.close()
        logger.info("Message put on command channel for new worker ids: {}".format(worker_ids))
        return ok(result=None,
                  msg="Scheduled {} new worker(s) to start. There were only {} worker(s) running.".format(
                      num_to_add, current_number_workers))
    else:
        return ok(result=None,
                  msg="Actor {} already had {} worker(s).".format(actor_id, num))

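# Hedged usage sketch for the endpoint above: a client ensuring at least 3 workers
# for an actor. The base URL and Authorization header are illustrative assumptions;
# only the route shape and the "num" parameter come from the handler itself.
import requests

def ensure_workers(base_url, token, actor_id, num=3):
    # POST /actors/<actor_id>/workers with the desired minimum worker count
    resp = requests.post(
        '{}/actors/{}/workers'.format(base_url, actor_id),
        data={'num': num},
        headers={'Authorization': 'Bearer {}'.format(token)})  # auth scheme assumed
    resp.raise_for_status()
    return resp.json()
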
def manage_workers(actor_id):
    """Scale workers for an actor based on message queue size and policy."""
    logger.info("Entering manage_workers for {}".format(actor_id))
    try:
        actor = Actor.from_db(actors_store[actor_id])
    except KeyError:
        logger.info("Did not find actor; returning.")
        return
    workers = Worker.get_workers(actor_id)

def manage_workers(actor_id):
    """Scale workers for an actor based on message queue size and policy."""
    print("Entering manage_workers for {}".format(actor_id))
    try:
        actor = Actor.from_db(actors_store[actor_id])
    except KeyError:
        print("Did not find actor; returning.")
        return
    workers = Worker.get_workers(actor_id)

def shutdown_workers(actor_id):
    """Graceful shutdown of all workers for an actor. Pass db_id as the `actor_id` argument."""
    logger.debug("shutdown_workers() called for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception from get_workers: {}".format(e))
        # without a workers collection there is nothing to shut down
        return
    if workers == {}:
        logger.info("shutdown_workers did not receive any workers from Worker.get_workers for actor: {}".format(actor_id))
    # @TODO - this code is not thread safe. we need to update the workers state in a transaction:
    for _, worker in workers.items():
        shutdown_worker(worker['id'])

def create_gauges(actor_ids):
    logger.debug("METRICS: Made it to create_gauges")
    for actor_id in actor_ids:
        if actor_id not in message_gauges.keys():
            try:
                g = Gauge(
                    'message_count_for_actor_{}'.format(actor_id.decode("utf-8").replace('-', '_')),
                    'Number of messages for actor {}'.format(actor_id.decode("utf-8").replace('-', '_')))
                message_gauges.update({actor_id: g})
                logger.debug('Created gauge {}'.format(g))
            except Exception as e:
                logger.info("got exception trying to instantiate the Gauge: {}".format(e))
        else:
            g = message_gauges[actor_id]
        try:
            ch = ActorMsgChannel(actor_id=actor_id.decode("utf-8"))
        except Exception as e:
            logger.error("Exception connecting to ActorMsgChannel: {}".format(e))
            raise e
        result = {'messages': len(ch._queue._queue)}
        ch.close()
        g.set(result['messages'])
        logger.debug("METRICS: {} messages found for actor: {}.".format(result['messages'], actor_id))
        if actor_id not in worker_gaueges.keys():
            try:
                g = Gauge(
                    'worker_count_for_actor_{}'.format(actor_id.decode("utf-8").replace('-', '_')),
                    'Number of workers for actor {}'.format(actor_id.decode("utf-8").replace('-', '_')))
                worker_gaueges.update({actor_id: g})
                logger.debug('Created worker gauge {}'.format(g))
            except Exception as e:
                logger.info("got exception trying to instantiate the Worker Gauge: {}".format(e))
        else:
            g = worker_gaueges[actor_id]
        workers = Worker.get_workers(actor_id)
        result = {'workers': len(workers)}
        g.set(result['workers'])
    return actor_ids

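# For reference, a self-contained example of the prometheus_client Gauge pattern
# used throughout create_gauges (this is the standard prometheus_client API; the
# metric name, value, and port below are illustrative):
from prometheus_client import Gauge, start_http_server

demo_gauge = Gauge('demo_message_count', 'Number of messages for a demo actor')
demo_gauge.set(5)          # same pattern as g.set(result['messages']) above
start_http_server(8000)    # expose /metrics for Prometheus to scrape
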
def manage_workers(actor_id):
    """Scale workers for an actor based on message queue size and policy."""
    logger.info("Entering manage_workers for {}".format(actor_id))
    try:
        actor = Actor.from_db(actors_store[actor_id])
    except KeyError:
        logger.info("Did not find actor; returning.")
        return
    workers = Worker.get_workers(actor_id)
    for worker in workers:
        time_difference = time.time() - worker['create_time']
        if worker['status'] == 'PROCESSING' and time_difference > 1:
            logger.info("LOOK HERE - worker creation time {}".format(worker['create_time']))

def post(self, actor_id):
    def get_hypermedia(actor, exc):
        return {'_links': {'self': '{}/actors/v2/{}/executions/{}'.format(actor.api_server, actor.id, exc),
                           'owner': '{}/profiles/v2/{}'.format(actor.api_server, actor.owner),
                           'messages': '{}/actors/v2/{}/messages'.format(actor.api_server, actor.id)},}

    args = self.validate_post()
    d = {}
    # build a dictionary of k:v pairs from the query parameters, and pass a single
    # additional object 'message' from within the post payload. Note that 'message'
    # need not be JSON data.
    for k, v in request.args.items():
        if k == 'message':
            continue
        d[k] = v
    if hasattr(g, 'user'):
        d['_abaco_username'] = g.user
    if hasattr(g, 'api_server'):
        d['_abaco_api_server'] = g.api_server
    # if hasattr(g, 'jwt'):
    #     d['_abaco_jwt'] = g.jwt
    # if hasattr(g, 'jwt_server'):
    #     d['_abaco_jwt_server'] = g.jwt_server
    if hasattr(g, 'jwt_header_name'):
        d['_abaco_jwt_header_name'] = g.jwt_header_name
    dbid = Actor.get_dbid(g.tenant, actor_id)
    # create an execution
    exc = Execution.add_execution(dbid, {'cpu': 0,
                                         'io': 0,
                                         'runtime': 0,
                                         'status': SUBMITTED,
                                         'executor': g.user})
    d['_abaco_execution_id'] = exc
    d['_abaco_Content-Type'] = args.get('_abaco_Content-Type', '')
    ch = ActorMsgChannel(actor_id=dbid)
    ch.put_msg(message=args['message'], d=d)
    # make sure at least one worker is available
    workers = Worker.get_workers(dbid)
    actor = Actor.from_db(actors_store[dbid])
    if len(workers.items()) < 1:
        ch = CommandChannel()
        ch.put_cmd(actor_id=dbid, image=actor.image, tenant=g.tenant, num=1, stop_existing=False)
    result = {'execution_id': exc, 'msg': args['message']}
    result.update(get_hypermedia(actor, exc))
    case = Config.get('web', 'case')
    if not case == 'camel':
        return ok(result)
    else:
        return ok(dict_to_camel(result))

def get(self, actor_id):
    dbid = Actor.get_dbid(g.tenant, actor_id)
    try:
        Actor.from_db(actors_store[dbid])
    except KeyError:
        raise APIException("actor not found: {}".format(actor_id), 400)
    try:
        workers = Worker.get_workers(dbid)
    except WorkerException as e:
        raise APIException(e.msg, 404)
    result = []
    for id, worker in workers.items():
        worker.update({'id': id})
        result.append(worker)
    return ok(result=result, msg="Workers retrieved successfully.")

def get(self, actor_id):
    dbid = Actor.get_dbid(g.tenant, actor_id)
    try:
        Actor.from_db(actors_store[dbid])
    except KeyError:
        raise ResourceError("actor not found: {}".format(actor_id), 400)
    try:
        workers = Worker.get_workers(dbid)
    except WorkerException as e:
        raise ResourceError(e.msg, 404)
    result = []
    for id, worker in workers.items():
        worker.update({'id': id})
        result.append(worker)
    return ok(result=result, msg="Workers retrieved successfully.")

def shutdown_workers(actor_id, delete_actor_ch=True):
    """
    Graceful shutdown of all workers for an actor. Arguments:
    * actor_id (str) - the db_id of the actor
    * delete_actor_ch (bool) - whether the worker shutdown process should also delete the
      actor_ch. This should be true whenever the actor is being removed. This will also force quit
      any currently running executions.
    """
    logger.debug("shutdown_workers() called for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception from get_workers: {}".format(e))
        # without a workers list there is nothing to shut down
        return
    if not workers:
        logger.info("shutdown_workers did not receive any workers from Worker.get_workers for actor: {}".format(actor_id))
    # @TODO - this code is not thread safe. we need to update the workers state in a transaction:
    for worker in workers:
        shutdown_worker(actor_id, worker['id'], delete_actor_ch)

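# A minimal caller sketch for the version of shutdown_workers above, e.g. from an
# actor-deletion path. The helper name and the Actor.get_dbid usage here are
# assumptions for illustration; only shutdown_workers' signature comes from above.
def delete_actor_workers(tenant, actor_id):
    db_id = Actor.get_dbid(tenant, actor_id)
    # deleting the actor, so also remove the actor channel and force-quit executions
    shutdown_workers(db_id, delete_actor_ch=True)
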
def shutdown_workers(actor_id):
    """Graceful shutdown of all workers for an actor. Pass db_id as the `actor_id` argument."""
    workers = Worker.get_workers(actor_id)
    # @TODO - this code is not thread safe. we need to update the workers state in a transaction:
    for _, worker in workers.items():
        # if there is already a channel, tell the worker to shut itself down
        if 'ch_name' in worker:
            shutdown_worker(worker['ch_name'])
        else:
            # otherwise, just remove from db:
            try:
                worker_id = worker.get('id')
                Worker.delete_worker(actor_id, worker_id)
            except WorkerException as e:
                print("Got a WorkerException trying to delete worker with id: {}; exception: {}".format(
                    worker_id, e))

def check_workers(actor_id, ttl):
    """Check health of all workers for an actor."""
    print("Checking health for actor: {}".format(actor_id))
    workers = Worker.get_workers(actor_id)
    print("workers: {}".format(workers))
    for _, worker in workers.items():
        # if the worker has only been requested, it will not have a host_id.
        if 'host_id' not in worker:
            # @todo- we will skip for now, but we need something more robust in case the worker is never claimed.
            continue
        # ignore workers on different hosts
        if not Config.get('spawner', 'host_id') == worker['host_id']:
            continue
        # first check if worker is responsive; if not, will need to manually kill
        print("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(name=worker['ch_name'])
        try:
            print("Issuing status check to channel: {}".format(worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            print("Worker did not respond, removing container and deleting worker.")
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            Worker.delete_worker(actor_id, worker['ch_name'])
            continue
        if not result == 'ok':
            print("Worker responded unexpectedly: {}, deleting worker.".format(result))
            rm_container(worker['cid'])
            Worker.delete_worker(actor_id, worker['ch_name'])
        else:
            print("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life
            print("Infinite ttl configured; leaving worker")
            return
        if worker['status'] == codes.READY and \
           worker['last_execution'] + ttl < time.time():
            # shutdown worker
            print("Shutting down worker beyond ttl.")
            shutdown_worker(worker['ch_name'])
        else:
            print("Worker still has life.")

def get(self):
    logger.debug("top of GET /admin/actors")
    actors = []
    for k, v in actors_store.items():
        actor = Actor.from_db(v)
        actor.workers = Worker.get_workers(actor.db_id)
        for id, worker in actor.workers.items():
            actor.worker = worker
            break
        ch = ActorMsgChannel(actor_id=actor.db_id)
        actor.messages = len(ch._queue._queue)
        ch.close()
        summary = ExecutionsSummary(db_id=actor.db_id)
        actor.executions = summary.total_executions
        actor.runtime = summary.total_runtime
        actors.append(actor)
    logger.info("actors retrieved.")
    return ok(result=actors, msg="Actors retrieved successfully.")

def check_workers(actor_id, ttl):
    """Check health of all workers for an actor."""
    print("Checking health for actor: {}".format(actor_id))
    workers = Worker.get_workers(actor_id)
    print("workers: {}".format(workers))
    for _, worker in workers.items():
        # ignore workers on different hosts
        if not Config.get('spawner', 'host_id') == worker['host_id']:
            continue
        # first check if worker is responsive; if not, will need to manually kill
        print("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(name=worker['ch_name'])
        try:
            print("Issuing status check to channel: {}".format(worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            print("Worker did not respond, removing container and deleting worker.")
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            Worker.delete_worker(actor_id, worker['ch_name'])
            continue
        if not result == 'ok':
            print("Worker responded unexpectedly: {}, deleting worker.".format(result))
            rm_container(worker['cid'])
            Worker.delete_worker(actor_id, worker['ch_name'])
        else:
            print("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life
            print("Infinite ttl configured; leaving worker")
            return
        if worker['status'] == codes.READY and \
           worker['last_execution'] + ttl < time.time():
            # shutdown worker
            print("Shutting down worker beyond ttl.")
            shutdown_worker(worker['ch_name'])
        else:
            print("Worker still has life.")

def shutdown_workers(actor_id):
    """Graceful shutdown of all workers for an actor. Pass db_id as the `actor_id` argument."""
    logger.debug("shutdown_workers() called for actor: {}".format(actor_id))
    workers = Worker.get_workers(actor_id)
    # @TODO - this code is not thread safe. we need to update the workers state in a transaction:
    for _, worker in workers.items():
        # if there is already a channel, tell the worker to shut itself down
        if 'ch_name' in worker:
            logger.info("worker had channel: {}. calling shutdown_worker() for worker: {}".format(
                worker['ch_name'], worker.get('id')))
            shutdown_worker(worker['ch_name'])
        else:
            # otherwise, just remove from db:
            try:
                logger.info("worker: {} did not have a channel. Deleting worker.".format(worker.get('id')))
                worker_id = worker.get('id')
                Worker.delete_worker(actor_id, worker_id)
            except WorkerException as e:
                logger.error("Got a WorkerException trying to delete worker with id: {}; exception: {}".format(
                    worker_id, e))

def scale_down(actor_id, is_sync_actor=False):
    """
    This function determines whether an actor's worker pool should be scaled down and, if so,
    initiates the scaling down.
    :param actor_id: the actor_id
    :param is_sync_actor: whether or not the actor has the SYNC hint.
    :return:
    """
    logger.debug(f"top of scale_down for actor_id: {actor_id}")
    # we retrieve the current workers again as we will need the entire worker objects (not just the number).
    workers = Worker.get_workers(actor_id)
    logger.debug(f'scale_down number of workers: {len(workers)}')
    try:
        # iterate through all the actor's workers and determine if they should be shut down.
        while len(workers) > 0:
            # whether to check the TTL for this worker; we only check TTL for SYNC actors; for non-sync,
            # workers are immediately shut down when the actor has no messages.
            check_ttl = False
            sync_max_idle_time = 0
            if len(workers) == 1 and is_sync_actor:
                logger.debug("only one worker, on sync actor. checking worker idle time..")
                try:
                    sync_max_idle_time = int(Config.get('workers', 'sync_max_idle_time'))
                except Exception as e:
                    logger.error(f"Got exception trying to read sync_max_idle_time from config; e:{e}")
                    sync_max_idle_time = DEFAULT_SYNC_MAX_IDLE_TIME
                check_ttl = True
            worker = workers.pop()
            logger.debug(f"check_ttl: {check_ttl} for worker: {worker}")
            if check_ttl:
                try:
                    last_execution = int(float(worker.get('last_execution_time', 0)))
                except Exception as e:
                    logger.error(f"metrics got exception trying to compute last_execution! e: {e}")
                    last_execution = 0
                # if worker has made zero executions, use the create_time
                if last_execution == 0:
                    last_execution = worker.get('create_time', 0)
                logger.debug("using last_execution: {}".format(last_execution))
                try:
                    last_execution = int(float(last_execution))
                except:
                    logger.error("Could not cast last_execution {} to int(float())".format(last_execution))
                    last_execution = 0
                if last_execution + sync_max_idle_time < time.time():
                    # shutdown worker
                    logger.info("OK to shut down this worker -- beyond ttl.")
                    # continue onto additional checks below
                else:
                    logger.info("Autoscaler not shutting down this worker - still time left.")
                    continue
            logger.debug('based on TTL, worker could be scaled down.')
            # check status of the worker is ready
            if worker['status'] == 'READY':
                # scale down
                logger.debug('worker was in READY status; attempting shutdown.')
                try:
                    shutdown_worker(actor_id, worker['id'], delete_actor_ch=False)
                    logger.debug('sent worker shutdown message.')
                    continue
                except Exception as e:
                    logger.debug('METRICS ERROR shutting down worker: {} - {} - {}'.format(type(e), e, e.args))
                logger.debug('METRICS shut down worker {}'.format(worker['id']))
    except IndexError:
        logger.debug('METRICS only one worker found for actor {}. '
                     'Will not scale down'.format(actor_id))
    except Exception as e:
        logger.debug("METRICS SCALE DOWN FAILED: {}".format(e))

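# Hedged sketch of how an autoscaler loop might derive the is_sync_actor flag
# before calling scale_down(). The 'hints' field and 'sync' hint name are
# assumptions for illustration; only scale_down's signature comes from above.
def autoscale_one_actor(actor_id):
    try:
        actor = Actor.from_db(actors_store[actor_id])
    except KeyError:
        # actor was deleted between listing and processing; nothing to do
        return
    hints = actor.get('hints') or []  # assumed dict-like access on the Actor model
    scale_down(actor_id, is_sync_actor='sync' in hints)
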
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor."""
    logger.info("Checking health for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception trying to retrieve workers: {}".format(e))
        return None
    logger.debug("workers: {}".format(workers))
    host_id = os.environ.get('SPAWNER_HOST_ID', Config.get('spawner', 'host_id'))
    logger.debug("host_id: {}".format(host_id))
    for worker in workers:
        worker_id = worker['id']
        worker_status = worker.get('status')
        # if the worker has only been requested, it will not have a host_id. it is possible
        # the worker will ultimately get scheduled on a different host; however, if there is
        # some issue and the worker is "stuck" in the early phases, we should remove it.
        if 'host_id' not in worker:
            # check for an old create time
            worker_create_t = worker.get('create_time')
            # in versions prior to 1.9, worker create_time was not set until after it was READY
            if not worker_create_t:
                hard_delete_worker(actor_id, worker_id,
                                   reason_str='Worker did not have a host_id or create_time field.')
            # if still no host after 5 minutes, delete it
            elif worker_create_t < get_current_utc_time() - datetime.timedelta(minutes=5):
                hard_delete_worker(actor_id, worker_id,
                                   reason_str='Worker did not have a host_id and had '
                                              'old create_time field.')
            # without a host_id, the host-specific checks below do not apply
            continue
        # ignore workers on different hosts because this health agent cannot interact with the
        # docker daemon responsible for the worker container.
        if not host_id == worker['host_id']:
            continue
        # we need to delete any worker that is in SHUTDOWN_REQUESTED or SHUTTING_DOWN for too long
        if worker_status == codes.SHUTDOWN_REQUESTED or worker_status == codes.SHUTTING_DOWN:
            worker_last_health_check_time = worker.get('last_health_check_time')
            if not worker_last_health_check_time:
                worker_last_health_check_time = worker.get('create_time')
            if not worker_last_health_check_time:
                hard_delete_worker(actor_id, worker_id,
                                   reason_str='Worker in SHUTDOWN and no health checks.')
            elif worker_last_health_check_time < get_current_utc_time() - datetime.timedelta(minutes=5):
                hard_delete_worker(actor_id, worker_id,
                                   reason_str='Worker in SHUTDOWN for too long.')
        # check if the worker has not responded to a health check recently; we use a relatively long period
        # (60 minutes) of idle health checks in case there is an issue with sending health checks through rabbitmq.
        # this needs to be watched closely though...
        worker_last_health_check_time = worker.get('last_health_check_time')
        if not worker_last_health_check_time or \
           (worker_last_health_check_time < get_current_utc_time() - datetime.timedelta(minutes=60)):
            hard_delete_worker(actor_id, worker_id,
                               reason_str='Worker has not health checked for too long.')
        # first send worker a health check
        logger.info(f"sending worker {worker_id} a health check")
        ch = WorkerChannel(worker_id=worker_id)
        try:
            logger.debug("Issuing status check to channel: {}".format(worker['ch_name']))
            ch.put('status')
        except (channelpy.exceptions.ChannelTimeoutException, Exception) as e:
            logger.error(f"Got exception of type {type(e)} trying to send worker {worker_id} a "
                         f"health check. e: {e}")
        finally:
            try:
                ch.close()
            except Exception as e:
                logger.error("Got an error trying to close the worker channel for dead worker. "
                             "Exception: {}".format(e))
        # now check if the worker has been idle beyond the max worker_ttl configured for this abaco:
        if ttl < 0:
            # ttl < 0 means infinite life
            logger.info("Infinite ttl configured; leaving worker")
            continue
        # we don't shut down workers that are currently running:
        if not worker['status'] == codes.BUSY:
            last_execution = worker.get('last_execution_time', 0)
            # if worker has made zero executions, use the create_time
            if last_execution == 0:
                last_execution = worker.get('create_time', datetime.datetime.min)
            logger.debug("using last_execution: {}".format(last_execution))
            try:
                assert type(last_execution) == datetime.datetime
            except:
                logger.error("Time received for TTL measurements is not of type datetime.")
                last_execution = datetime.datetime.min
            if last_execution + datetime.timedelta(seconds=ttl) < datetime.datetime.utcnow():
                # shutdown worker
                logger.info("Shutting down worker beyond ttl.")
                shutdown_worker(actor_id, worker['id'])
            else:
                logger.info("Still time left for this worker.")
        if worker['status'] == codes.ERROR:
            # shutdown worker
            logger.info("Shutting down worker in error status.")
            shutdown_worker(actor_id, worker['id'])

def shutdown_workers(actor_id):
    """Graceful shutdown of all workers for an actor. Pass db_id as the `actor_id` argument."""
    workers = Worker.get_workers(actor_id)
    for _, worker in workers.items():
        shutdown_worker(worker['ch_name'])

def check_metrics(self, actor_ids):
    for actor_id in actor_ids:
        logger.debug("TOP OF CHECK METRICS")
        query = {
            'query': 'message_count_for_actor_{}'.format(actor_id.decode("utf-8").replace('-', '_')),
            'time': datetime.datetime.utcnow().isoformat() + "Z"
        }
        r = requests.get(PROMETHEUS_URL + '/api/v1/query', params=query)
        data = json.loads(r.text)['data']['result']
        change_rate = 0
        try:
            previous_data = last_metric[actor_id]
            try:
                change_rate = int(data[0]['value'][1]) - int(previous_data[0]['value'][1])
            except:
                logger.debug("Could not calculate change rate.")
        except:
            logger.info("No previous data yet for new actor {}".format(actor_id))
        last_metric.update({actor_id: data})
        # Add a worker if message count reaches a given number
        try:
            logger.debug("METRICS current message count: {}".format(data[0]['value'][1]))
            if int(data[0]['value'][1]) >= 1:
                tenant, aid = actor_id.decode('utf8').split('_')
                logger.debug('METRICS Attempting to create a new worker for {}'.format(actor_id))
                try:
                    # create a worker & add to this actor
                    actor = Actor.from_db(actors_store[actor_id])
                    worker_ids = [Worker.request_worker(tenant=tenant, actor_id=aid)]
                    logger.info("New worker id: {}".format(worker_ids[0]))
                    ch = CommandChannel()
                    ch.put_cmd(actor_id=actor.db_id,
                               worker_ids=worker_ids,
                               image=actor.image,
                               tenant=tenant,
                               num=1,
                               stop_existing=False)
                    ch.close()
                    logger.debug('METRICS Added worker successfully for {}'.format(actor_id))
                except Exception as e:
                    logger.debug("METRICS - SOMETHING BROKE: {} - {} - {}".format(type(e), e, e.args))
            elif int(data[0]['value'][1]) <= 1:
                logger.debug("METRICS made it to scale down block")
                # Check the number of workers for this actor before deciding to scale down
                workers = Worker.get_workers(actor_id)
                logger.debug('METRICS NUMBER OF WORKERS: {}'.format(len(workers)))
                try:
                    if len(workers) == 1:
                        logger.debug("METRICS only one worker, won't scale down")
                    else:
                        while len(workers) > 0:
                            logger.debug('METRICS made it STATUS check')
                            worker = workers.popitem()[1]
                            logger.debug('METRICS SCALE DOWN current worker: {}'.format(worker['status']))
                            # check status of the worker is ready
                            if worker['status'] == 'READY':
                                logger.debug("METRICS I MADE IT")
                                # scale down
                                try:
                                    shutdown_worker(worker['id'])
                                    continue
                                except Exception as e:
                                    logger.debug('METRICS ERROR shutting down worker: {} - {} - {}'.format(
                                        type(e), e, e.args))
                                logger.debug('METRICS shut down worker {}'.format(worker['id']))
                except IndexError:
                    logger.debug('METRICS only one worker found for actor {}. '
                                 'Will not scale down'.format(actor_id))
                except Exception as e:
                    logger.debug("METRICS SCALE DOWN FAILED: {}".format(e))
        except Exception as e:
            logger.debug("METRICS - ANOTHER ERROR: {} - {} - {}".format(type(e), e, e.args))

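# For context, a self-contained sketch of the Prometheus instant-query that
# check_metrics issues above (this uses the standard Prometheus HTTP API; the
# helper name is illustrative, and prometheus_url stands in for the module-level
# PROMETHEUS_URL):
import datetime
import json
import requests

def query_actor_message_count(prometheus_url, actor_db_id):
    query = {
        'query': 'message_count_for_actor_{}'.format(actor_db_id.replace('-', '_')),
        'time': datetime.datetime.utcnow().isoformat() + "Z",
    }
    r = requests.get(prometheus_url + '/api/v1/query', params=query)
    data = json.loads(r.text)['data']['result']
    # Prometheus returns each sample as [timestamp, "value-as-string"],
    # which is why the code above casts with int(data[0]['value'][1])
    return int(data[0]['value'][1]) if data else 0
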
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor."""
    logger.info("Checking health for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception trying to retrieve workers: {}".format(e))
        return None
    logger.debug("workers: {}".format(workers))
    host_id = os.environ.get('SPAWNER_HOST_ID', Config.get('spawner', 'host_id'))
    logger.debug("host_id: {}".format(host_id))
    for worker in workers:
        # if the worker has only been requested, it will not have a host_id.
        if 'host_id' not in worker:
            # @todo- we will skip for now, but we need something more robust in case the worker is never claimed.
            continue
        # ignore workers on different hosts
        if not host_id == worker['host_id']:
            continue
        # first check if worker is responsive; if not, will need to manually kill
        logger.info("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(worker_id=worker['id'])
        worker_id = worker.get('id')
        result = None
        try:
            logger.debug("Issuing status check to channel: {}".format(worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            logger.info("Worker did not respond, removing container and deleting worker.")
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            try:
                Worker.delete_worker(actor_id, worker_id)
                logger.info("worker {} deleted from store".format(worker_id))
            except Exception as e:
                logger.error("Got exception trying to delete worker: {}".format(e))
            # if the put_sync timed out and we removed the worker, we also need to delete the channel
            # otherwise the un-acked message will remain.
            try:
                ch.delete()
            except Exception as e:
                logger.error("Got exception: {} while trying to delete worker channel for worker: {}".format(
                    e, worker_id))
        finally:
            try:
                ch.close()
            except Exception as e:
                logger.error("Got an error trying to close the worker channel for dead worker. "
                             "Exception: {}".format(e))
        if result and not result == 'ok':
            logger.error("Worker responded unexpectedly: {}, deleting worker.".format(result))
            try:
                rm_container(worker['cid'])
                Worker.delete_worker(actor_id, worker_id)
            except Exception as e:
                logger.error("Got error removing/deleting worker: {}".format(e))
        else:
            # worker is healthy so update last health check:
            Worker.update_worker_health_time(actor_id, worker_id)
            logger.info("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life
            logger.info("Infinite ttl configured; leaving worker")
            return
        # we don't shut down workers that are currently running:
        if not worker['status'] == codes.BUSY:
            last_execution = worker.get('last_execution_time', 0)
            # if worker has made zero executions, use the create_time
            if last_execution == 0:
                last_execution = worker.get('create_time', datetime.datetime.min)
            logger.debug("using last_execution: {}".format(last_execution))
            try:
                assert type(last_execution) == datetime.datetime
            except:
                logger.error("Time received for TTL measurements is not of type datetime.")
                last_execution = datetime.datetime.min
            if last_execution + datetime.timedelta(seconds=ttl) < datetime.datetime.utcnow():
                # shutdown worker
                logger.info("Shutting down worker beyond ttl.")
                shutdown_worker(actor_id, worker['id'])
            else:
                logger.info("Still time left for this worker.")
        if worker['status'] == codes.ERROR:
            # shutdown worker
            logger.info("Shutting down worker in error status.")
            shutdown_worker(actor_id, worker['id'])

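# Minimal sketch of a health-agent loop driving check_workers() for every actor.
# The config section/option names for the worker TTL are assumptions; only
# check_workers' signature comes from above.
def run_health_checks():
    try:
        ttl = int(Config.get('workers', 'worker_ttl'))  # assumed config key
    except Exception:
        ttl = -1  # fall back to infinite ttl
    for actor_id, _ in actors_store.items():
        check_workers(actor_id, ttl)
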
def create_gauges(actor_ids):
    """
    Creates a Prometheus gauge for each actor id. The gauge is used to track the number of
    pending messages in the actor's queue.
    :param actor_ids: list of actors that should be processed. Does not include stateful actors or
    actors in a shutting down state.
    :return:
    """
    logger.debug("top of create_gauges; actor_ids: {}".format(actor_ids))
    # dictionary mapping actor_ids to their message queue lengths
    inbox_lengths = {}
    for actor_id in actor_ids:
        logger.debug("top of for loop for actor_id: {}".format(actor_id))
        # first, make sure the actor still exists in the actor store
        try:
            actor = actors_store[actor_id]
        except KeyError:
            logger.error(f"actor {actor_id} does not exist in store; continuing to next actor.")
            continue
        # If the actor doesn't have a gauge, add one
        if actor_id not in message_gauges.keys():
            try:
                g = Gauge(
                    'message_count_for_actor_{}'.format(actor_id.replace('-', '_')),
                    'Number of messages for actor {}'.format(actor_id.replace('-', '_')))
                message_gauges.update({actor_id: g})
                logger.debug('Created gauge {}'.format(g))
            except Exception as e:
                logger.error("got exception trying to create/instantiate the gauge; "
                             "actor {}; exception: {}".format(actor_id, e))
                g = None
        else:
            # Otherwise, get this actor's existing gauge
            try:
                g = message_gauges[actor_id]
            except Exception as e:
                logger.info("got exception trying to instantiate an existing gauge; "
                            "actor: {}: exception:{}".format(actor_id, e))
                g = None
        # Update this actor's gauge to its current # of messages
        try:
            ch = ActorMsgChannel(actor_id=actor_id)
            msg_length = len(ch._queue._queue)
        except Exception as e:
            logger.error("Exception connecting to ActorMsgChannel: {}".format(e))
            raise e
        ch.close()
        result = {'messages': msg_length}
        # add the actor's current message queue length to the inbox_lengths in-memory variable
        inbox_lengths[actor_id] = msg_length
        # if we were able to create the gauge, set it to the current message:
        if g:
            try:
                g.set(result['messages'])
            except Exception as e:
                logger.error(f"Got exception trying to set the messages on the gauge for actor: {actor_id}; "
                             f"exception: {e}")
        logger.debug("METRICS: {} messages found for actor: {}.".format(result['messages'], actor_id))
        # add a worker gauge for this actor if one does not exist
        if actor_id not in worker_gaueges.keys():
            try:
                g = Gauge(
                    'worker_count_for_actor_{}'.format(actor_id.replace('-', '_')),
                    'Number of workers for actor {}'.format(actor_id.replace('-', '_')))
                worker_gaueges.update({actor_id: g})
                logger.debug('Created worker gauge {}'.format(g))
            except Exception as e:
                logger.info("got exception trying to instantiate the Worker Gauge: {}".format(e))
        else:
            # Otherwise, get the worker gauge that already exists
            g = worker_gaueges[actor_id]
        # Update this actor's worker IDs
        workers = Worker.get_workers(actor_id)
        result = {'workers': len(workers)}
        try:
            g.set(result['workers'])
        except Exception as e:
            logger.error(f"got exception trying to set the worker gauge for actor {actor_id}; exception: {e}")
        logger.debug(f"METRICS: {result['workers']} workers found for actor: {actor_id}.")
        # Update this actor's command channel metric
        # channel_name = actor.get("queue")
        #
        # queues_list = Config.get('spawner', 'host_queues').replace(' ', '')
        # valid_queues = queues_list.split(',')
        #
        # if not channel_name or channel_name not in valid_queues:
        #     channel_name = 'default'
        #
        # if not channel_name:
        #     # TODO -- this must be changed. there is no way returning no arguments will result in
        #     # anything but an exception. The calling function is expecting 3 arguments...
        #     # if we really want to blow up right here we should just raise an appropriate exception.
        #     return
    # TODO -- this code needs to be fixed. What follows is only a partial fix; what I think we want to do
    # is set the length of all of the different command channels once at the end of this loop. What was
    # happening instead was that it was only setting one of the command channel's lengths -- whatever command
    # channel happened to belong to the last actor in the loop.
    channel_name = 'default'
    ch = CommandChannel(name=channel_name)
    cmd_length = len(ch._queue._queue)
    command_gauge.labels(channel_name).set(cmd_length)
    logger.debug(f"METRICS COMMAND CHANNEL {channel_name} size: {command_gauge}")
    ch.close()
    # Return actor_ids so we don't have to query for them again later
    return actor_ids, inbox_lengths, cmd_length

def check_workers(actor_id, ttl):
    """Check health of all workers for an actor."""
    logger.info("Checking health for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception trying to retrieve workers: {}".format(e))
        return None
    logger.debug("workers: {}".format(workers))
    for _, worker in workers.items():
        # if the worker has only been requested, it will not have a host_id.
        if 'host_id' not in worker:
            # @todo- we will skip for now, but we need something more robust in case the worker is never claimed.
            continue
        # ignore workers on different hosts
        if not Config.get('spawner', 'host_id') == worker['host_id']:
            continue
        # first check if worker is responsive; if not, will need to manually kill
        logger.info("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(worker_id=worker['id'])
        worker_id = worker.get('id')
        # initialize result so a timed-out health check is not treated as an unexpected response
        result = None
        try:
            logger.debug("Issuing status check to channel: {}".format(worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            logger.info("Worker did not respond, removing container and deleting worker.")
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            try:
                Worker.delete_worker(actor_id, worker_id)
            except Exception as e:
                logger.error("Got exception trying to delete worker: {}".format(e))
        finally:
            try:
                ch.close()
            except Exception as e:
                logger.error("Got an error trying to close the worker channel for dead worker. "
                             "Exception: {}".format(e))
        if result and not result == 'ok':
            logger.error("Worker responded unexpectedly: {}, deleting worker.".format(result))
            try:
                rm_container(worker['cid'])
                Worker.delete_worker(actor_id, worker_id)
            except Exception as e:
                logger.error("Got error removing/deleting worker: {}".format(e))
        else:
            # worker is healthy so update last health check:
            Worker.update_worker_health_time(actor_id, worker_id)
            logger.info("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life
            logger.info("Infinite ttl configured; leaving worker")
            return
        # we don't shut down workers that are currently running:
        if not worker['status'] == codes.BUSY:
            last_execution = int(float(worker.get('last_execution_time', 0)))
            if last_execution + ttl < time.time():
                # shutdown worker
                logger.info("Shutting down worker beyond ttl.")
                shutdown_worker(worker['id'])
            else:
                logger.info("Still time left for this worker.")
        elif worker['status'] == codes.ERROR:
            # shutdown worker
            logger.info("Shutting down worker in error status.")
            shutdown_worker(worker['id'])
        else:
            logger.debug("Worker not in READY status, will postpone.")

def scale_down(actor_id, is_sync_actor=False):
    logger.debug(f"top of scale_down for actor_id: {actor_id}")
    workers = Worker.get_workers(actor_id)
    logger.debug('METRICS NUMBER OF WORKERS: {}'.format(len(workers)))
    try:
        while len(workers) > 0:
            logger.debug('METRICS made it STATUS check')
            check_ttl = False
            sync_max_idle_time = 0
            if len(workers) == 1 and is_sync_actor:
                logger.debug("only one worker, on sync actor. checking worker idle time..")
                try:
                    sync_max_idle_time = int(Config.get('worker', 'sync_max_idle_time'))
                except Exception as e:
                    logger.error(f"Got exception trying to read sync_max_idle_time from config; e:{e}")
                    sync_max_idle_time = DEFAULT_SYNC_MAX_IDLE_TIME
                check_ttl = True
            worker = workers.popitem()[1]
            if check_ttl:
                try:
                    last_execution = int(float(worker.get('last_execution_time', 0)))
                except Exception as e:
                    logger.error(f"metrics got exception trying to compute last_execution! e: {e}")
                    last_execution = 0
                # if worker has made zero executions, use the create_time
                if last_execution == 0:
                    last_execution = worker.get('create_time', 0)
                logger.debug("using last_execution: {}".format(last_execution))
                try:
                    last_execution = int(float(last_execution))
                except:
                    logger.error("Could not cast last_execution {} to int(float())".format(last_execution))
                    last_execution = 0
                if last_execution + sync_max_idle_time < time.time():
                    # shutdown worker
                    logger.info("OK to shut down this worker -- beyond ttl.")
                    # continue onto additional checks below
                else:
                    logger.info("Autoscaler not shutting down this worker - still time left.")
                    break
            logger.debug('METRICS SCALE DOWN current worker: {}'.format(worker['status']))
            # check status of the worker is ready
            if worker['status'] == 'READY':
                # scale down
                try:
                    shutdown_worker(actor_id, worker['id'], delete_actor_ch=False)
                    continue
                except Exception as e:
                    logger.debug('METRICS ERROR shutting down worker: {} - {} - {}'.format(type(e), e, e.args))
                logger.debug('METRICS shut down worker {}'.format(worker['id']))
    except IndexError:
        logger.debug('METRICS only one worker found for actor {}. '
                     'Will not scale down'.format(actor_id))
    except Exception as e:
        logger.debug("METRICS SCALE DOWN FAILED: {}".format(e))

def create_gauges(actor_ids):
    logger.debug("METRICS: Made it to create_gauges; actor_ids: {}".format(actor_ids))
    inbox_lengths = {}
    for actor_id in actor_ids:
        logger.debug("top of for loop for actor_id: {}".format(actor_id))
        try:
            actor = actors_store[actor_id]
        except KeyError:
            logger.error("actor {} does not exist.".format(actor_id))
            continue
        # If the actor doesn't have a gauge, add one
        if actor_id not in message_gauges.keys():
            try:
                g = Gauge(
                    'message_count_for_actor_{}'.format(actor_id.decode("utf-8").replace('-', '_')),
                    'Number of messages for actor {}'.format(actor_id.decode("utf-8").replace('-', '_')))
                message_gauges.update({actor_id: g})
                logger.debug('Created gauge {}'.format(g))
            except Exception as e:
                logger.error("got exception trying to create/instantiate the gauge; "
                             "actor {}; exception: {}".format(actor_id, e))
        else:
            # Otherwise, get this actor's existing gauge
            try:
                g = message_gauges[actor_id]
            except Exception as e:
                logger.info("got exception trying to instantiate an existing gauge; "
                            "actor: {}: exception:{}".format(actor_id, e))
        # Update this actor's command channel metric
        channel_name = actor.get("queue")
        queues_list = Config.get('spawner', 'host_queues').replace(' ', '')
        valid_queues = queues_list.split(',')
        if not channel_name or channel_name not in valid_queues:
            channel_name = 'default'
        # Update this actor's gauge to its current # of messages
        try:
            ch = ActorMsgChannel(actor_id=actor_id.decode("utf-8"))
        except Exception as e:
            logger.error("Exception connecting to ActorMsgChannel: {}".format(e))
            raise e
        result = {'messages': len(ch._queue._queue)}
        inbox_lengths[actor_id.decode("utf-8")] = len(ch._queue._queue)
        ch.close()
        g.set(result['messages'])
        logger.debug("METRICS: {} messages found for actor: {}.".format(result['messages'], actor_id))
        # add a worker gauge for this actor if one does not exist
        if actor_id not in worker_gaueges.keys():
            try:
                g = Gauge(
                    'worker_count_for_actor_{}'.format(actor_id.decode("utf-8").replace('-', '_')),
                    'Number of workers for actor {}'.format(actor_id.decode("utf-8").replace('-', '_')))
                worker_gaueges.update({actor_id: g})
                logger.debug('Created worker gauge {}'.format(g))
            except Exception as e:
                logger.info("got exception trying to instantiate the Worker Gauge: {}".format(e))
        else:
            # Otherwise, get the worker gauge that already exists
            g = worker_gaueges[actor_id]
        # Update this actor's worker IDs
        workers = Worker.get_workers(actor_id)
        result = {'workers': len(workers)}
        g.set(result['workers'])
    # note: this sets only the channel length for the last actor's channel_name;
    # see the TODO in the later version of this function above.
    ch = CommandChannel(name=channel_name)
    cmd_length = len(ch._queue._queue)
    command_gauge.labels(channel_name).set(cmd_length)
    logger.debug("METRICS COMMAND CHANNEL {} size: {}".format(channel_name, command_gauge))
    ch.close()
    # Return actor_ids so we don't have to query for them again later
    return actor_ids, inbox_lengths, cmd_length