Example No. 1
def scale_down(actor_id):
    workers = Worker.get_workers(actor_id)
    logger.debug('METRICS NUMBER OF WORKERS: {}'.format(len(workers)))
    try:
        # if len(workers) == 1:
        #     logger.debug("METRICS only one worker, won't scale down")
        # else:
        while len(workers) > 0:
            logger.debug('METRICS made it STATUS check')
            worker = workers.popitem()[1]
            logger.debug('METRICS SCALE DOWN current worker: {}'.format(
                worker['status']))
            # check status of the worker is ready
            if worker['status'] == 'READY':
                # scale down
                try:
                    shutdown_worker(worker['id'], delete_actor_ch=False)
                    continue
                except Exception as e:
                    logger.debug(
                        'METRICS ERROR shutting down worker: {} - {} - {}'.
                        format(type(e), e, e.args))
                logger.debug('METRICS shut down worker {}'.format(
                    worker['id']))

    except IndexError:
        logger.debug('METRICS only one worker found for actor {}. '
                     'Will not scale down'.format(actor_id))
    except Exception as e:
        logger.debug("METRICS SCALE UP FAILED: {}".format(e))
Example No. 2
 def post(self, actor_id):
     """Ensure a certain number of workers are running for an actor"""
     id = Actor.get_dbid(g.tenant, actor_id)
     try:
         actor = Actor.from_db(actors_store[id])
     except KeyError:
         raise ResourceError("actor not found: {}".format(actor_id), 404)
     args = self.validate_post()
     num = args.get('num')
     if not num or num == 0:
         num = 1
     dbid = Actor.get_dbid(g.tenant, actor_id)
     workers = Worker.get_workers(dbid)
     if len(workers.items()) < num:
         worker_ids = []
         num_to_add = int(num) - len(workers.items())
         for idx in range(num_to_add):
             worker_ids.append(Worker.request_worker(actor_id))
         ch = CommandChannel()
         ch.put_cmd(actor_id=actor.db_id,
                    worker_ids=worker_ids,
                    image=actor.image,
                    tenant=g.tenant,
                    num=num_to_add,
                    stop_existing=False)
         return ok(
             result=None,
             msg="Scheduled {} new worker(s) to start. There were only".
             format(num_to_add))
     else:
         return ok(result=None,
                   msg="Actor {} already had {} worker(s).".format(
                       actor_id, num))
Example No. 3
 def get(self, actor_id):
     logger.debug("top of GET /actors/{}/workers for tenant {}.".format(
         actor_id, g.tenant))
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         Actor.from_db(actors_store[dbid])
     except KeyError:
         logger.debug("did not find actor: {}.".format(actor_id))
         raise ResourceError("No actor found with id: {}.".format(actor_id),
                             404)
     try:
         workers = Worker.get_workers(dbid)
     except WorkerException as e:
         logger.debug(
             "did not find workers for actor: {}.".format(actor_id))
         raise ResourceError(e.msg, 404)
     result = []
     for id, worker in workers.items():
         worker.update({'id': id})
         try:
             w = Worker(**worker)
             result.append(w.display())
         except Exception as e:
             logger.error(
                 "Unable to instantiate worker in workers endpoint from description: {}. "
                 .format(worker))
     return ok(result=result, msg="Workers retrieved successfully.")
Example No. 4
 def post(self, actor_id):
     """Ensure a certain number of workers are running for an actor"""
     logger.debug("top of POST /actors/{}/workers.".format(actor_id))
     id = Actor.get_dbid(g.tenant, actor_id)
     try:
         actor = Actor.from_db(actors_store[id])
     except KeyError:
         logger.debug("did not find actor: {}.".format(actor_id))
         raise ResourceError("No actor found with id: {}.".format(actor_id),
                             404)
     args = self.validate_post()
     logger.debug(
         "workers POST params validated. actor: {}.".format(actor_id))
     num = args.get('num')
     if not num or num == 0:
         logger.debug("did not get a num: {}.".format(actor_id))
         num = 1
     logger.debug("ensuring at least {} workers. actor: {}.".format(
         num, actor_id))
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         workers = Worker.get_workers(dbid)
     except WorkerException as e:
         logger.debug(
             "did not find workers for actor: {}.".format(actor_id))
         raise ResourceError(e.msg, 404)
     current_number_workers = len(workers.items())
     if current_number_workers < num:
         logger.debug(
             "There were only {} workers for actor: {} so we're adding more."
             .format(current_number_workers, actor_id))
         worker_ids = []
         num_to_add = int(num) - len(workers.items())
         logger.info("adding {} more workers for actor {}".format(
             num_to_add, actor_id))
         for idx in range(num_to_add):
             worker_ids.append(
                 Worker.request_worker(tenant=g.tenant, actor_id=actor_id))
         logger.info("New worker ids: {}".format(worker_ids))
         ch = CommandChannel()
         ch.put_cmd(actor_id=actor.db_id,
                    worker_ids=worker_ids,
                    image=actor.image,
                    tenant=g.tenant,
                    num=num_to_add,
                    stop_existing=False)
         ch.close()
         logger.info(
             "Message put on command channel for new worker ids: {}".format(
                 worker_ids))
         return ok(
             result=None,
             msg="Scheduled {} new worker(s) to start. There were only".
             format(num_to_add))
     else:
         return ok(result=None,
                   msg="Actor {} already had {} worker(s).".format(
                       actor_id, num))
Example No. 5
def manage_workers(actor_id):
    """Scale workers for an actor if based on message queue size and policy."""
    logger.info("Entering manage_workers for {}".format(actor_id))
    try:
        actor = Actor.from_db(actors_store[actor_id])
    except KeyError:
        logger.info("Did not find actor; returning.")
        return
    workers = Worker.get_workers(actor_id)
Example No. 6
File: health.py Project: TACC/abaco
def manage_workers(actor_id):
    """Scale workers for an actor if based on message queue size and policy."""
    print("Entering manage_workers for {}".format(actor_id))
    try:
        actor = Actor.from_db(actors_store[actor_id])
    except KeyError:
        print("Did not find actor; returning.")
        return
    workers = Worker.get_workers(actor_id)
Example No. 7
def shutdown_workers(actor_id):
    """Graceful shutdown of all workers for an actor. Pass db_id as the `actor_id` argument."""
    logger.debug("shutdown_workers() called for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception from get_workers: {}".format(e))
        workers = {}
    if workers == {}:
        logger.info("shutdown_workers did not receive any workers from Worker.get_worker for actor: {}".format(actor_id))
    # @TODO - this code is not thread safe. we need to update the workers state in a transaction:
    for _, worker in workers.items():
        shutdown_worker(worker['id'])
Example No. 8
def create_gauges(actor_ids):
    logger.debug("METRICS: Made it to create_gauges")
    for actor_id in actor_ids:
        if actor_id not in message_gauges.keys():
            try:
                g = Gauge(
                    'message_count_for_actor_{}'.format(
                        actor_id.decode("utf-8").replace('-', '_')),
                    'Number of messages for actor {}'.format(
                        actor_id.decode("utf-8").replace('-', '_')))
                message_gauges.update({actor_id: g})
                logger.debug('Created gauge {}'.format(g))
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate the Gauge: {}".format(
                        e))
        else:
            g = message_gauges[actor_id]

        try:
            ch = ActorMsgChannel(actor_id=actor_id.decode("utf-8"))
        except Exception as e:
            logger.error(
                "Exception connecting to ActorMsgChannel: {}".format(e))
            raise e
        result = {'messages': len(ch._queue._queue)}
        ch.close()
        g.set(result['messages'])
        logger.debug("METRICS: {} messages found for actor: {}.".format(
            result['messages'], actor_id))
        if actor_id not in worker_gaueges.keys():
            try:
                g = Gauge(
                    'worker_count_for_actor_{}'.format(
                        actor_id.decode("utf-8").replace('-', '_')),
                    'Number of workers for actor {}'.format(
                        actor_id.decode("utf-8").replace('-', '_')))
                worker_gaueges.update({actor_id: g})
                logger.debug('Created worker gauge {}'.format(g))
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate the Worker Gauge: {}".
                    format(e))
        else:
            g = worker_gaueges[actor_id]
        workers = Worker.get_workers(actor_id)
        result = {'workers': len(workers)}
        g.set(result['workers'])

    return actor_ids
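
The example above leans on the prometheus_client Gauge API: a gauge is created once with a name and a description string, then updated with set() on each polling pass. A minimal standalone sketch of that pattern (the gauge name, port, and value are illustrative, not taken from abaco):

# Sketch of the prometheus_client Gauge pattern (illustrative names and values).
from prometheus_client import Gauge, start_http_server

# create the gauge once at startup
messages_gauge = Gauge('message_count_for_actor_example',
                       'Number of messages for an example actor')
start_http_server(8000)  # expose the /metrics endpoint for Prometheus to scrape
# ...then update it whenever the queue length is sampled
messages_gauge.set(3)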
Example No. 9
def manage_workers(actor_id):
    """Scale workers for an actor if based on message queue size and policy."""
    logger.info("Entering manage_workers for {}".format(actor_id))
    try:
        actor = Actor.from_db(actors_store[actor_id])
    except KeyError:
        logger.info("Did not find actor; returning.")
        return
    workers = Worker.get_workers(actor_id)
    for worker in workers:
        time_difference = time.time() - worker['create_time']
        if worker['status'] == 'PROCESSING' and time_difference > 1:
            logger.info("LOOK HERE - worker creation time {}".format(
                worker['create_time']))
Example No. 10
    def post(self, actor_id):
        def get_hypermedia(actor, exc):
            return {'_links': {'self': '{}/actors/v2/{}/executions/{}'.format(actor.api_server, actor.id, exc),
                               'owner': '{}/profiles/v2/{}'.format(actor.api_server, actor.owner),
                               'messages': '{}/actors/v2/{}/messages'.format(actor.api_server, actor.id)},}

        args = self.validate_post()
        d = {}
        # build a dictionary of k:v pairs from the query parameters, and pass a single
        # additional object 'message' from within the post payload. Note that 'message'
        # need not be JSON data.
        for k, v in request.args.items():
            if k == 'message':
                continue
            d[k] = v
        if hasattr(g, 'user'):
            d['_abaco_username'] = g.user
        if hasattr(g, 'api_server'):
            d['_abaco_api_server'] = g.api_server
        # if hasattr(g, 'jwt'):
        #     d['_abaco_jwt'] = g.jwt
        # if hasattr(g, 'jwt_server'):
        #     d['_abaco_jwt_server'] = g.jwt_server
        if hasattr(g, 'jwt_header_name'):
            d['_abaco_jwt_header_name'] = g.jwt_header_name
        dbid = Actor.get_dbid(g.tenant, actor_id)
        # create an execution
        exc = Execution.add_execution(dbid, {'cpu': 0,
                                             'io': 0,
                                             'runtime': 0,
                                             'status': SUBMITTED,
                                             'executor': g.user})
        d['_abaco_execution_id'] = exc
        d['_abaco_Content-Type'] = args.get('_abaco_Content-Type', '')
        ch = ActorMsgChannel(actor_id=dbid)
        ch.put_msg(message=args['message'], d=d)
        # make sure at least one worker is available
        workers = Worker.get_workers(dbid)
        actor = Actor.from_db(actors_store[dbid])
        if len(workers.items()) < 1:
            ch = CommandChannel()
            ch.put_cmd(actor_id=dbid, image=actor.image, tenant=g.tenant, num=1, stop_existing=False)
        result = {'execution_id': exc, 'msg': args['message']}
        result.update(get_hypermedia(actor, exc))
        case = Config.get('web', 'case')
        if not case == 'camel':
            return ok(result)
        else:
            return ok(dict_to_camel(result))
Example No. 11
 def get(self, actor_id):
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         Actor.from_db(actors_store[dbid])
     except KeyError:
         raise APIException("actor not found: {}".format(actor_id), 400)
     try:
         workers = Worker.get_workers(dbid)
     except WorkerException as e:
         raise APIException(e.msg, 404)
     result = []
     for id, worker in workers.items():
         worker.update({'id': id})
         result.append(worker)
     return ok(result=result, msg="Workers retrieved successfully.")
Example No. 12
 def get(self, actor_id):
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         Actor.from_db(actors_store[dbid])
     except KeyError:
         raise ResourceError("actor not found: {}".format(actor_id), 400)
     try:
         workers = Worker.get_workers(dbid)
     except WorkerException as e:
         raise ResourceError(e.msg, 404)
     result = []
     for id, worker in workers.items():
         worker.update({'id': id})
         result.append(worker)
     return ok(result=result, msg="Workers retrieved successfully.")
Example No. 13
def shutdown_workers(actor_id, delete_actor_ch=True):
    """
    Graceful shutdown of all workers for an actor. Arguments:
    * actor_id (str) - the db_id of the actor
    * delete_actor_ch (bool) - whether the worker shutdown process should also delete the actor_ch. This should be true
      whenever the actor is being removed. This will also force quit any currently running executions.
    """
    logger.debug("shutdown_workers() called for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception from get_workers: {}".format(e))
        workers = []
    if not workers:
        logger.info("shutdown_workers did not receive any workers from Worker.get_workers for actor: {}".format(actor_id))
    # @TODO - this code is not thread safe. we need to update the workers state in a transaction:
    for worker in workers:
        shutdown_worker(actor_id, worker['id'], delete_actor_ch)
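
A hedged usage sketch of this variant: the docstring asks for the actor's db_id, and delete_actor_ch=True is passed when the actor itself is being removed (the variable names below are illustrative and assume the Flask g context used elsewhere in these examples):

# Illustrative call site for the shutdown_workers variant above.
db_id = Actor.get_dbid(g.tenant, actor_id)
shutdown_workers(db_id, delete_actor_ch=True)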
Example No. 14
def shutdown_workers(actor_id):
    """Graceful shutdown of all workers for an actor. Pass db_id as the `actor_id` argument."""
    workers = Worker.get_workers(actor_id)
    # @TODO - this code is not thread safe. we need to update the workers state in a transaction:
    for _, worker in workers.items():
        # if there is already a channel, tell the worker to shut itself down
        if 'ch_name' in worker:
            shutdown_worker(worker['ch_name'])
        else:
            # otherwise, just remove from db:
            try:
                worker_id = worker.get('id')
                Worker.delete_worker(actor_id, worker_id)
            except WorkerException as e:
                print(
                    "Got a WorkerException trying to delete worker with id: {}; exception: {}"
                    .format(worker_id, e))
Example No. 15
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor."""
    print("Checking health for actors: {}".format(actor_id))
    workers = Worker.get_workers(actor_id)
    print("workers: {}".format(workers))
    for _, worker in workers.items():
        # if the worker has only been requested, it will not have a host_id.
        if 'host_id' not in worker:
            # @todo- we will skip for now, but we need something more robust in case the worker is never claimed.
            continue
        # ignore workers on different hosts
        if not Config.get('spawner', 'host_id') == worker['host_id']:
            continue
        # first check if worker is responsive; if not, will need to manually kill
        print("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(name=worker['ch_name'])
        try:
            print("Issuing status check to channel: {}".format(worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            print("Worker did not respond, removing container and deleting worker.")
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            Worker.delete_worker(actor_id, worker['ch_name'])
            continue
        if not result == 'ok':
            print("Worker responded unexpectedly: {}, deleting worker.".format(result))
            rm_container(worker['cid'])
            Worker.delete_worker(actor_id, worker['ch_name'])
        else:
            print("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life
            print("Infinite ttl configured; leaving worker")
            return
        if worker['status'] == codes.READY and \
            worker['last_execution'] + ttl < time.time():
            # shutdown worker
            print("Shutting down worker beyond ttl.")
            shutdown_worker(worker['ch_name'])
        else:
            print("Worker still has life.")
Example No. 16
 def get(self):
     logger.debug("top of GET /admin/actors")
     actors = []
     for k, v in actors_store.items():
         actor = Actor.from_db(v)
         actor.workers = Worker.get_workers(actor.db_id)
         for id, worker in actor.workers.items():
             actor.worker = worker
             break
         ch = ActorMsgChannel(actor_id=actor.db_id)
         actor.messages = len(ch._queue._queue)
         ch.close()
         summary = ExecutionsSummary(db_id=actor.db_id)
         actor.executions = summary.total_executions
         actor.runtime = summary.total_runtime
         actors.append(actor)
     logger.info("actors retrieved.")
     return ok(result=actors, msg="Actors retrieved successfully.")
Example No. 17
File: health.py Project: TACC/abaco
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor."""
    print("Checking health for actors: {}".format(actor_id))
    workers = Worker.get_workers(actor_id)
    print("workers: {}".format(workers))
    for _, worker in workers.items():
        # ignore workers on different hosts
        if not Config.get('spawner', 'host_id') == worker['host_id']:
            continue
        # first check if worker is responsive; if not, will need to manually kill
        print("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(name=worker['ch_name'])
        try:
            print("Issuing status check to channel: {}".format(worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            print("Worker did not respond, removing container and deleting worker.")
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            Worker.delete_worker(actor_id, worker['ch_name'])
            continue
        if not result == 'ok':
            print("Worker responded unexpectedly: {}, deleting worker.".format(result))
            rm_container(worker['cid'])
            Worker.delete_worker(actor_id, worker['ch_name'])
        else:
            print("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life
            print("Infinite ttl configured; leaving worker")
            return
        if worker['status'] == codes.READY and \
            worker['last_execution'] + ttl < time.time():
            # shutdown worker
            print("Shutting down worker beyond ttl.")
            shutdown_worker(worker['ch_name'])
        else:
            print("Worker still has life.")
Example No. 18
def shutdown_workers(actor_id):
    """Graceful shutdown of all workers for an actor. Pass db_id as the `actor_id` argument."""
    logger.debug("shutdown_workers() called for actor: {}".format(actor_id))
    workers = Worker.get_workers(actor_id)
    # @TODO - this code is not thread safe. we need to update the workers state in a transaction:
    for _, worker in workers.items():
        # if there is already a channel, tell the worker to shut itself down
        if 'ch_name' in worker:
            logger.info(
                "worker had channel: {}. calling shutdown_worker() for worker: {}"
                .format(worker['ch_name'], worker.get('id')))
            shutdown_worker(worker['ch_name'])
        else:
            # otherwise, just remove from db:
            try:
                logger.info(
                    "worker: {} did not have a channel. Deleting worker.".
                    format(worker.get('id')))
                worker_id = worker.get('id')
                Worker.delete_worker(actor_id, worker_id)
            except WorkerException as e:
                logger.error(
                    "Got a WorkerException trying to delete worker with id: {}; exception: {}"
                    .format(worker_id, e))
Example No. 19
def scale_down(actor_id, is_sync_actor=False):
    """
    This function determines whether an actor's worker pool should be scaled down and if so,
    initiates the scaling down.
    :param actor_id: the actor_id
    :param is_sync_actor: whether or not the actor has the SYNC hint.
    :return:
    """
    logger.debug(f"top of scale_down for actor_id: {actor_id}")
    # we retrieve the current workers again as we will need the entire worker objects (not just the number).
    workers = Worker.get_workers(actor_id)
    logger.debug(f'scale_down number of workers: {len(workers)}')
    try:
        # iterate through all the actor's workers and determine if they should be shut down.
        while len(workers) > 0:
            # whether to check the TTL for this worker; we only check TTL for SYNC actors; for non-sync,
            # workers are immediately shut down when the actor has no messages.
            check_ttl = False
            sync_max_idle_time = 0
            if len(workers) == 1 and is_sync_actor:
                logger.debug(
                    "only one worker, on sync actor. checking worker idle time.."
                )
                try:
                    sync_max_idle_time = int(
                        Config.get('workers', 'sync_max_idle_time'))
                except Exception as e:
                    logger.error(
                        f"Got exception trying to read sync_max_idle_time from config; e:{e}"
                    )
                    sync_max_idle_time = DEFAULT_SYNC_MAX_IDLE_TIME
                check_ttl = True
            worker = workers.pop()
            logger.debug(f"check_ttl: {check_ttl} for worker: {worker}")
            if check_ttl:
                try:
                    last_execution = int(
                        float(worker.get('last_execution_time', 0)))
                except Exception as e:
                    logger.error(
                        f"metrics got exception trying to compute last_execution! e: {e}"
                    )
                    last_execution = 0
                # if worker has made zero executions, use the create_time
                if last_execution == 0:
                    last_execution = worker.get('create_time', 0)
                logger.debug("using last_execution: {}".format(last_execution))
                try:
                    last_execution = int(float(last_execution))
                except:
                    logger.error(
                        "Could not cast last_execution {} to int(float()".
                        format(last_execution))
                    last_execution = 0
                if last_execution + sync_max_idle_time < time.time():
                    # shutdown worker
                    logger.info("OK to shut down this worker -- beyond ttl.")
                    # continue onto additional checks below
                else:
                    logger.info(
                        "Autoscaler not shuting down this worker - still time left."
                    )
                    continue

            logger.debug('based on TTL, worker could be scaled down.')
            # check status of the worker is ready
            if worker['status'] == 'READY':
                # scale down
                logger.debug(
                    'worker was in READY status; attempting shutdown.')
                try:
                    shutdown_worker(actor_id,
                                    worker['id'],
                                    delete_actor_ch=False)
                    logger.debug('sent worker shutdown message.')
                    continue
                except Exception as e:
                    logger.debug(
                        'METRICS ERROR shutting down worker: {} - {} - {}'.
                        format(type(e), e, e.args))
                logger.debug('METRICS shut down worker {}'.format(
                    worker['id']))

    except IndexError:
        logger.debug('METRICS only one worker found for actor {}. '
                     'Will not scale down'.format(actor_id))
    except Exception as e:
        logger.debug("METRICS SCALE UP FAILED: {}".format(e))
Example No. 20
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor."""
    logger.info("Checking health for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception trying to retrieve workers: {}".format(e))
        return None
    logger.debug("workers: {}".format(workers))
    host_id = os.environ.get('SPAWNER_HOST_ID',
                             Config.get('spawner', 'host_id'))
    logger.debug("host_id: {}".format(host_id))
    for worker in workers:
        worker_id = worker['id']
        worker_status = worker.get('status')
        # if the worker has only been requested, it will not have a host_id. it is possible
        # the worker will ultimately get scheduled on a different host; however, if there is
        # some issue and the worker is "stuck" in the early phases, we should remove it..
        if 'host_id' not in worker:
            # check for an old create time
            worker_create_t = worker.get('create_time')
            # in versions prior to 1.9, worker create_time was not set until after it was READY
            if not worker_create_t:
                hard_delete_worker(
                    actor_id,
                    worker_id,
                    reason_str=
                    'Worker did not have a host_id or create_time field.')
                continue
            # if still no host after 5 minutes, delete it
            if worker_create_t < get_current_utc_time() - datetime.timedelta(
                    minutes=5):
                hard_delete_worker(
                    actor_id,
                    worker_id,
                    reason_str='Worker did not have a host_id and had '
                    'old create_time field.')
            # skip the remaining checks until the worker has a host_id
            continue

        # ignore workers on different hosts because this health agent cannot interact with the
        # docker daemon responsible for the worker container..
        if not host_id == worker['host_id']:
            continue

        # we need to delete any worker that is in SHUTDOWN REQUESTED or SHUTTING down for too long
        if worker_status == codes.SHUTDOWN_REQUESTED or worker_status == codes.SHUTTING_DOWN:
            worker_last_health_check_time = worker.get(
                'last_health_check_time')
            if not worker_last_health_check_time:
                worker_last_health_check_time = worker.get('create_time')
            if not worker_last_health_check_time:
                hard_delete_worker(
                    actor_id,
                    worker_id,
                    reason_str='Worker in SHUTDOWN and no health checks.')
            elif worker_last_health_check_time < get_current_utc_time(
            ) - datetime.timedelta(minutes=5):
                hard_delete_worker(
                    actor_id,
                    worker_id,
                    reason_str='Worker in SHUTDOWN for too long.')

        # check if the worker has not responded to a health check recently; we use a relatively long period
        # (60 minutes) of idle health checks in case there is an issue with sending health checks through rabbitmq.
        # this needs to be watched closely though...
        worker_last_health_check_time = worker.get('last_health_check_time')
        if not worker_last_health_check_time or \
                (worker_last_health_check_time < get_current_utc_time() - datetime.timedelta(minutes=60)):
            hard_delete_worker(
                actor_id,
                worker_id,
                reason_str='Worker has not health checked for too long.')

        # first send worker a health check
        logger.info(f"sending worker {worker_id} a health check")
        ch = WorkerChannel(worker_id=worker_id)
        try:
            logger.debug("Issuing status check to channel: {}".format(
                worker['ch_name']))
            ch.put('status')
        except (channelpy.exceptions.ChannelTimeoutException, Exception) as e:
            logger.error(
                f"Got exception of type {type(e)} trying to send worker {worker_id} a "
                f"health check. e: {e}")
        finally:
            try:
                ch.close()
            except Exception as e:
                logger.error(
                    "Got an error trying to close the worker channel for dead worker. Exception: {}"
                    .format(e))

        # now check if the worker has been idle beyond the max worker_ttl configured for this abaco:
        if ttl < 0:
            # ttl < 0 means infinite life
            logger.info("Infinite ttl configured; leaving worker")
            continue
        # we don't shut down workers that are currently running:
        if not worker['status'] == codes.BUSY:
            last_execution = worker.get('last_execution_time', 0)
            # if worker has made zero executions, use the create_time
            if last_execution == 0:
                last_execution = worker.get('create_time',
                                            datetime.datetime.min)
            logger.debug("using last_execution: {}".format(last_execution))
            try:
                assert type(last_execution) == datetime.datetime
            except:
                logger.error(
                    "Time received for TTL measurements is not of type datetime."
                )
                last_execution = datetime.datetime.min
            if last_execution + datetime.timedelta(
                    seconds=ttl) < datetime.datetime.utcnow():
                # shutdown worker
                logger.info("Shutting down worker beyond ttl.")
                shutdown_worker(actor_id, worker['id'])
            else:
                logger.info("Still time left for this worker.")

        if worker['status'] == codes.ERROR:
            # shutdown worker
            logger.info("Shutting down worker in error status.")
            shutdown_worker(actor_id, worker['id'])
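
The TTL comparison used above can be read on its own: a worker is expired once its last execution time plus ttl seconds lies in the past. A small sketch with made-up values:

# Standalone sketch of the datetime-based TTL check (values are illustrative).
import datetime

ttl = 3600  # seconds
last_execution = datetime.datetime.utcnow() - datetime.timedelta(hours=2)
if last_execution + datetime.timedelta(seconds=ttl) < datetime.datetime.utcnow():
    print("worker is beyond its ttl and would be shut down")
else:
    print("worker still has time left")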
Example No. 21
File: worker.py Project: TACC/abaco
def shutdown_workers(actor_id):
    """Graceful shutdown of all workers for an actor. Pass db_id as the `actor_id` argument."""
    workers = Worker.get_workers(actor_id)
    for _, worker in workers.items():
        shutdown_worker(worker['ch_name'])
Example No. 22
    def check_metrics(self, actor_ids):
        for actor_id in actor_ids:
            logger.debug("TOP OF CHECK METRICS")

            query = {
                'query': 'message_count_for_actor_{}'.format(actor_id.decode("utf-8").replace('-', '_')),
                'time': datetime.datetime.utcnow().isoformat() + "Z"
            }
            r = requests.get(PROMETHEUS_URL + '/api/v1/query', params=query)
            data = json.loads(r.text)['data']['result']

            change_rate = 0
            try:
                previous_data = last_metric[actor_id]
                try:
                    change_rate = int(data[0]['value'][1]) - int(previous_data[0]['value'][1])
                except:
                    logger.debug("Could not calculate change rate.")
            except:
                logger.info("No previous data yet for new actor {}".format(actor_id))

            last_metric.update({actor_id: data})
            # Add a worker if message count reaches a given number
            try:
                logger.debug("METRICS current message count: {}".format(data[0]['value'][1]))
                if int(data[0]['value'][1]) >= 1:
                    tenant, aid = actor_id.decode('utf8').split('_')
                    logger.debug('METRICS Attempting to create a new worker for {}'.format(actor_id))
                    try:
                        # create a worker & add to this actor
                        actor = Actor.from_db(actors_store[actor_id])
                        worker_ids = [Worker.request_worker(tenant=tenant, actor_id=aid)]
                        logger.info("New worker id: {}".format(worker_ids[0]))
                        ch = CommandChannel()
                        ch.put_cmd(actor_id=actor.db_id,
                                   worker_ids=worker_ids,
                                   image=actor.image,
                                   tenant=tenant,
                                   num=1,
                                   stop_existing=False)
                        ch.close()
                        logger.debug('METRICS Added worker successfully for {}'.format(actor_id))
                    except Exception as e:
                        logger.debug("METRICS - SOMETHING BROKE: {} - {} - {}".format(type(e), e, e.args))
                elif int(data[0]['value'][1]) <= 1:
                    logger.debug("METRICS made it to scale down block")
                    # Check the number of workers for this actor before deciding to scale down
                    workers = Worker.get_workers(actor_id)
                    logger.debug('METRICS NUMBER OF WORKERS: {}'.format(len(workers)))
                    try:
                        if len(workers) == 1:
                            logger.debug("METRICS only one worker, won't scale down")
                        else:
                            while len(workers) > 0:
                                logger.debug('METRICS made it STATUS check')
                                worker = workers.popitem()[1]
                                logger.debug('METRICS SCALE DOWN current worker: {}'.format(worker['status']))
                                # check status of the worker is ready
                                if worker['status'] == 'READY':
                                    logger.debug("METRICS I MADE IT")
                                    # scale down
                                    try:
                                        shutdown_worker(worker['id'])
                                        continue
                                    except Exception as e:
                                        logger.debug('METRICS ERROR shutting down worker: {} - {} - {}'.format(type(e), e, e.args))
                                    logger.debug('METRICS shut down worker {}'.format(worker['id']))

                    except IndexError:
                        logger.debug('METRICS only one worker found for actor {}. '
                                     'Will not scale down'.format(actor_id))
                    except Exception as e:
                        logger.debug("METRICS SCALE UP FAILED: {}".format(e))


            except Exception as e:
                logger.debug("METRICS - ANOTHER ERROR: {} - {} - {}".format(type(e), e, e.args))
Example No. 23
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor."""
    logger.info("Checking health for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception trying to retrieve workers: {}".format(e))
        return None
    logger.debug("workers: {}".format(workers))
    host_id = os.environ.get('SPAWNER_HOST_ID',
                             Config.get('spawner', 'host_id'))
    logger.debug("host_id: {}".format(host_id))
    for worker in workers:
        # if the worker has only been requested, it will not have a host_id.
        if 'host_id' not in worker:
            # @todo- we will skip for now, but we need something more robust in case the worker is never claimed.
            continue
        # ignore workers on different hosts
        if not host_id == worker['host_id']:
            continue
        # first check if worker is responsive; if not, will need to manually kill
        logger.info("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(worker_id=worker['id'])
        worker_id = worker.get('id')
        result = None
        try:
            logger.debug("Issuing status check to channel: {}".format(
                worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            logger.info(
                "Worker did not respond, removing container and deleting worker."
            )
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            try:
                Worker.delete_worker(actor_id, worker_id)
                logger.info("worker {} deleted from store".format(worker_id))
            except Exception as e:
                logger.error(
                    "Got exception trying to delete worker: {}".format(e))
            # if the put_sync timed out and we removed the worker, we also need to delete the channel
            # otherwise the un-acked message will remain.
            try:
                ch.delete()
            except Exception as e:
                logger.error(
                    "Got exception: {} while trying to delete worker channel for worker: {}"
                    .format(e, worker_id))
        finally:
            try:
                ch.close()
            except Exception as e:
                logger.error(
                    "Got an error trying to close the worker channel for dead worker. Exception: {}"
                    .format(e))
        if result and not result == 'ok':
            logger.error(
                "Worker responded unexpectedly: {}, deleting worker.".format(
                    result))
            try:
                rm_container(worker['cid'])
                Worker.delete_worker(actor_id, worker_id)
            except Exception as e:
                logger.error(
                    "Got error removing/deleting worker: {}".format(e))
        else:
            # worker is healthy so update last health check:
            Worker.update_worker_health_time(actor_id, worker_id)
            logger.info("Worker ok.")

        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life
            logger.info("Infinite ttl configured; leaving worker")
            return
        # we don't shut down workers that are currently running:
        if not worker['status'] == codes.BUSY:
            last_execution = worker.get('last_execution_time', 0)
            # if worker has made zero executions, use the create_time
            if last_execution == 0:
                last_execution = worker.get('create_time',
                                            datetime.datetime.min)
            logger.debug("using last_execution: {}".format(last_execution))
            try:
                assert type(last_execution) == datetime.datetime
            except:
                logger.error(
                    "Time received for TTL measurements is not of type datetime."
                )
                last_execution = datetime.datetime.min
            if last_execution + datetime.timedelta(
                    seconds=ttl) < datetime.datetime.utcnow():
                # shutdown worker
                logger.info("Shutting down worker beyond ttl.")
                shutdown_worker(actor_id, worker['id'])
            else:
                logger.info("Still time left for this worker.")

        if worker['status'] == codes.ERROR:
            # shutdown worker
            logger.info("Shutting down worker in error status.")
            shutdown_worker(actor_id, worker['id'])
Example No. 24
def create_gauges(actor_ids):
    """
    Creates a Prometheus gauge for each actor id. The gauge is used to track the number of
    pending messages in the actor's queue.
    :param actor_ids: list of actors that should be processed. Does not include stateful actors or
    actors in a shutting down state.
    :return:
    """
    logger.debug("top of create_gauges; actor_ids: {}".format(actor_ids))
    # dictionary mapping actor_ids to their message queue lengths
    inbox_lengths = {}
    for actor_id in actor_ids:
        logger.debug("top of for loop for actor_id: {}".format(actor_id))
        # first, make sure the actor still exists in the actor store
        try:
            actor = actors_store[actor_id]
        except KeyError:
            logger.error(
                f"actor {actor_id} does not exist in store; continuing to next actor."
            )
            continue
        # If the actor doesn't have a gauge, add one
        if actor_id not in message_gauges.keys():
            try:
                g = Gauge(
                    'message_count_for_actor_{}'.format(
                        actor_id.replace('-', '_')),
                    'Number of messages for actor {}'.format(
                        actor_id.replace('-', '_')))
                message_gauges.update({actor_id: g})
                logger.debug('Created gauge {}'.format(g))
            except Exception as e:
                logger.error(
                    "got exception trying to create/instantiate the gauge; "
                    "actor {}; exception: {}".format(actor_id, e))
                g = None
        else:
            # Otherwise, get this actor's existing gauge
            try:
                g = message_gauges[actor_id]
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate an existing gauge; "
                    "actor: {}: exception:{}".format(actor_id, e))
                g = None
        # Update this actor's gauge to its current # of messages
        try:
            ch = ActorMsgChannel(actor_id=actor_id)
            msg_length = len(ch._queue._queue)
        except Exception as e:
            logger.error(
                "Exception connecting to ActorMsgChannel: {}".format(e))
            raise e
        ch.close()
        result = {'messages': msg_length}
        # add the actor's current message queue length to the inbox_lengths in-memory variable
        inbox_lengths[actor_id] = msg_length
        # if we were able to create the gauge, set it to the current message:
        if g:
            try:
                g.set(result['messages'])
            except Exception as e:
                logger.error(
                    f"Got exception trying to set the messages on the gauge for actor: {actor_id}; "
                    f"exception: {e}")
        logger.debug("METRICS: {} messages found for actor: {}.".format(
            result['messages'], actor_id))

        # add a worker gauge for this actor if one does not exist
        if actor_id not in worker_gaueges.keys():
            try:
                g = Gauge(
                    'worker_count_for_actor_{}'.format(
                        actor_id.replace('-', '_')),
                    'Number of workers for actor {}'.format(
                        actor_id.replace('-', '_')))
                worker_gaueges.update({actor_id: g})
                logger.debug('Created worker gauge {}'.format(g))
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate the Worker Gauge: {}".
                    format(e))
        else:
            # Otherwise, get the worker gauge that already exists
            g = worker_gaueges[actor_id]

        # Update this actor's worker IDs
        workers = Worker.get_workers(actor_id)
        result = {'workers': len(workers)}
        try:
            g.set(result['workers'])
        except Exception as e:
            logger.error(
                f"got exception trying to set the worker gauge for actor {actor_id}; exception: {e}"
            )
        logger.debug(
            f"METRICS: {result['workers']} workers found for actor: {actor_id}."
        )

        # Update this actor's command channel metric
        # channel_name = actor.get("queue")
        #
        # queues_list = Config.get('spawner', 'host_queues').replace(' ', '')
        # valid_queues = queues_list.split(',')
        #
        # if not channel_name or channel_name not in valid_queues:
        #     channel_name = 'default'
        #
        # if not channel_name:
        #     # TODO -- this must be changed. there is no way returning no arguments will result in
        #     # anythng but an exception. The calling function is expecting 3 arguments...
        #     # if we really want to blow up right here we should just raise an appropriate exception.
        #     return

    # TODO -- this code needs to be fixed. What follows is only a partial fix; what I think we want to do
    # is set the length of all of the different command channels once at the end of this loop. What was
    # happening instead was that it was only setting one of the command channel's lengths -- whatever command
    # channel happened to belong to the last actor in the loop.
    channel_name = 'default'
    ch = CommandChannel(name=channel_name)
    cmd_length = len(ch._queue._queue)
    command_gauge.labels(channel_name).set(cmd_length)
    logger.debug(
        f"METRICS COMMAND CHANNEL {channel_name} size: {command_gauge}")
    ch.close()

    # Return actor_ids so we don't have to query for them again later
    return actor_ids, inbox_lengths, cmd_length
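
One way the trailing TODO could be addressed, sketched under the assumption that the set of command queue names is known up front (hard-coded here; Example No. 27 reads them from the spawner host_queues setting): set every command channel's length once, after the per-actor loop, rather than only the channel of whichever actor happened to come last.

# Hedged sketch of the TODO fix: record each command channel's queue length once,
# after the loop (queue names here are illustrative).
for channel_name in ('default', 'special'):
    ch = CommandChannel(name=channel_name)
    command_gauge.labels(channel_name).set(len(ch._queue._queue))
    ch.close()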
Example No. 25
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor."""
    logger.info("Checking health for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception trying to retrieve workers: {}".format(e))
        return None
    logger.debug("workers: {}".format(workers))
    for _, worker in workers.items():
        # if the worker has only been requested, it will not have a host_id.
        if 'host_id' not in worker:
            # @todo- we will skip for now, but we need something more robust in case the worker is never claimed.
            continue
        # ignore workers on different hosts
        if not Config.get('spawner', 'host_id') == worker['host_id']:
            continue
        # first check if worker is responsive; if not, will need to manually kill
        logger.info("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(worker_id=worker['id'])
        worker_id = worker.get('id')
        result = None
        try:
            logger.debug("Issuing status check to channel: {}".format(
                worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            logger.info(
                "Worker did not respond, removing container and deleting worker."
            )
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            try:
                Worker.delete_worker(actor_id, worker_id)
            except Exception as e:
                logger.error(
                    "Got exception trying to delete worker: {}".format(e))
        finally:
            try:
                ch.close()
            except Exception as e:
                logger.error(
                    "Got an error trying to close the worker channel for dead worker. Exception: {}"
                    .format(e))
        if result and not result == 'ok':
            logger.error(
                "Worker responded unexpectedly: {}, deleting worker.".format(
                    result))
            try:
                rm_container(worker['cid'])
                Worker.delete_worker(actor_id, worker_id)
            except Exception as e:
                logger.error(
                    "Got error removing/deleting worker: {}".format(e))
        else:
            # worker is healthy so update last health check:
            Worker.update_worker_health_time(actor_id, worker_id)
            logger.info("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life
            logger.info("Infinite ttl configured; leaving worker")
            return
        # we don't shut down workers that are currently running:
        if not worker['status'] == codes.BUSY:
            last_execution = int(float(worker.get('last_execution_time', 0)))
            if last_execution + ttl < time.time():
                # shutdown worker
                logger.info("Shutting down worker beyond ttl.")
                shutdown_worker(worker['id'])
            else:
                logger.info("Still time left for this worker.")
        elif worker['status'] == codes.ERROR:
            # shutdown worker
            logger.info("Shutting down worker in error status.")
            shutdown_worker(worker['id'])
        else:
            logger.debug("Worker not in READY status, will postpone.")
Example No. 26
def scale_down(actor_id, is_sync_actor=False):
    logger.debug(f"top of scale_down for actor_id: {actor_id}")
    workers = Worker.get_workers(actor_id)
    logger.debug('METRICS NUMBER OF WORKERS: {}'.format(len(workers)))
    try:
        while len(workers) > 0:
            logger.debug('METRICS made it STATUS check')
            check_ttl = False
            sync_max_idle_time = 0
            if len(workers) == 1 and is_sync_actor:
                logger.debug(
                    "only one worker, on sync actor. checking worker idle time.."
                )
                try:
                    sync_max_idle_time = int(
                        Config.get('worker', 'sync_max_idle_time'))
                except Exception as e:
                    logger.error(
                        f"Got exception trying to read sync_max_idle_time from config; e:{e}"
                    )
                    sync_max_idle_time = DEFAULT_SYNC_MAX_IDLE_TIME
                check_ttl = True
            worker = workers.popitem()[1]
            if check_ttl:
                try:
                    last_execution = int(
                        float(worker.get('last_execution_time', 0)))
                except Exception as e:
                    logger.error(
                        f"metrics got exception trying to compute last_execution! e: {e}"
                    )
                    last_execution = 0
                # if worker has made zero executions, use the create_time
                if last_execution == 0:
                    last_execution = worker.get('create_time', 0)
                logger.debug("using last_execution: {}".format(last_execution))
                try:
                    last_execution = int(float(last_execution))
                except:
                    logger.error(
                        "Could not cast last_execution {} to int(float()".
                        format(last_execution))
                    last_execution = 0
                if last_execution + sync_max_idle_time < time.time():
                    # shutdown worker
                    logger.info("OK to shut down this worker -- beyond ttl.")
                    # continue onto additional checks below
                else:
                    logger.info(
                        "Autoscaler not shuting down this worker - still time left."
                    )
                    break

            logger.debug('METRICS SCALE DOWN current worker: {}'.format(
                worker['status']))
            # check status of the worker is ready
            if worker['status'] == 'READY':
                # scale down
                try:
                    shutdown_worker(actor_id,
                                    worker['id'],
                                    delete_actor_ch=False)
                    continue
                except Exception as e:
                    logger.debug(
                        'METRICS ERROR shutting down worker: {} - {} - {}'.
                        format(type(e), e, e.args))
                logger.debug('METRICS shut down worker {}'.format(
                    worker['id']))

    except IndexError:
        logger.debug('METRICS only one worker found for actor {}. '
                     'Will not scale down'.format(actor_id))
    except Exception as e:
        logger.debug("METRICS SCALE UP FAILED: {}".format(e))
Example No. 27
def create_gauges(actor_ids):
    logger.debug(
        "METRICS: Made it to create_gauges; actor_ids: {}".format(actor_ids))
    inbox_lengths = {}
    for actor_id in actor_ids:
        logger.debug("top of for loop for actor_id: {}".format(actor_id))

        try:
            actor = actors_store[actor_id]
        except KeyError:
            logger.error("actor {} does not exist.".format(actor_id))
            continue

        # If the actor doesn't have a gauge, add one
        if actor_id not in message_gauges.keys():
            try:
                g = Gauge(
                    'message_count_for_actor_{}'.format(
                        actor_id.decode("utf-8").replace('-', '_')),
                    'Number of messages for actor {}'.format(
                        actor_id.decode("utf-8").replace('-', '_')))
                message_gauges.update({actor_id: g})
                logger.debug('Created gauge {}'.format(g))
            except Exception as e:
                logger.error(
                    "got exception trying to create/instantiate the gauge; "
                    "actor {}; exception: {}".format(actor_id, e))
        else:
            # Otherwise, get this actor's existing gauge
            try:
                g = message_gauges[actor_id]
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate an existing gauge; "
                    "actor: {}: exception:{}".format(actor_id, e))

            # Update this actor's command channel metric
            channel_name = actor.get("queue")

            queues_list = Config.get('spawner', 'host_queues').replace(' ', '')
            valid_queues = queues_list.split(',')

            if not channel_name or channel_name not in valid_queues:
                channel_name = 'default'

        # Update this actor's gauge to its current # of messages
        try:
            ch = ActorMsgChannel(actor_id=actor_id.decode("utf-8"))
        except Exception as e:
            logger.error(
                "Exception connecting to ActorMsgChannel: {}".format(e))
            raise e
        result = {'messages': len(ch._queue._queue)}
        inbox_lengths[actor_id.decode("utf-8")] = len(ch._queue._queue)
        ch.close()
        g.set(result['messages'])
        logger.debug("METRICS: {} messages found for actor: {}.".format(
            result['messages'], actor_id))

        # add a worker gauge for this actor if one does not exist
        if actor_id not in worker_gaueges.keys():
            try:
                g = Gauge(
                    'worker_count_for_actor_{}'.format(
                        actor_id.decode("utf-8").replace('-', '_')),
                    'Number of workers for actor {}'.format(
                        actor_id.decode("utf-8").replace('-', '_')))
                worker_gaueges.update({actor_id: g})
                logger.debug('Created worker gauge {}'.format(g))
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate the Worker Gauge: {}".
                    format(e))
        else:
            # Otherwise, get the worker gauge that already exists
            g = worker_gaueges[actor_id]

        # Update this actor's worker IDs
        workers = Worker.get_workers(actor_id)
        result = {'workers': len(workers)}
        g.set(result['workers'])

    ch = CommandChannel(name=channel_name)
    cmd_length = len(ch._queue._queue)
    command_gauge.labels(channel_name).set(cmd_length)
    logger.debug("METRICS COMMAND CHANNEL {} size: {}".format(
        channel_name, command_gauge))
    ch.close()

    # Return actor_ids so we don't have to query for them again later
    return actor_ids, inbox_lengths, cmd_length