def main():
    """
    The entrypoint for the zoe-observer script.

    Loads the configuration, configures logging and then polls Swarm in an
    endless loop: every container whose status contains ``Exited`` is reported
    through ``container_died`` and guest executions are inspected with
    ``check_guests``. The loop pauses ``loop_time`` (from the configuration)
    between iterations and stops on ``KeyboardInterrupt``.

    :return: None
    """
    load_configuration()
    args = get_conf()
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    logging.getLogger('kazoo').setLevel(logging.WARNING)
    logging.getLogger('requests').setLevel(logging.WARNING)
    logging.getLogger('docker').setLevel(logging.INFO)

    swarm = SwarmClient(args)

    # KeyboardInterrupt is not a subclass of Exception, so it passes through
    # the inner handler and ends the loop here, wherever it is raised.
    try:
        while True:
            try:
                zoe_containers = swarm.list('zoe.{}'.format(get_conf().container_name_prefix))
                for c in zoe_containers:
                    if 'Exited' in c['status']:
                        zoe_id = c['labels']['zoe.container_id']
                        container_died(zoe_id)

                check_guests(swarm)
            except Exception:
                # Best effort: log the failure and try again on the next pass.
                log.exception('Something bad happened')

            # The pause lives outside the inner try so that a failing
            # iteration still waits before retrying instead of spinning at
            # full speed and flooding the log.
            time.sleep(get_conf().loop_time)
    except KeyboardInterrupt:
        pass
def guest_check_thread(args):
    """
    Periodically look for exited Zoe containers and idle guest executions.

    Runs forever. On every pass it lists the Swarm containers labelled with
    this deployment's name, reports each container whose status contains
    ``Exited`` through ``container_died`` and then calls ``check_guests``.
    If ``container_died`` raises ``ZoeAPIException`` the container is
    terminated and deleted from Swarm.

    :param args: configuration object handed to ``SwarmClient``
    :return: never returns
    """
    swarm = SwarmClient(args)

    while True:
        try:
            zoe_containers = swarm.list({'zoe.deployment_name': get_conf().deployment_name})
            for c in zoe_containers:
                if 'Exited' in c['status']:
                    zoe_id = c['labels']['zoe.service.id']
                    try:
                        container_died(zoe_id)
                    except ZoeAPIException:
                        log.warning('Container ' + c['name'] + ' has died, but Zoe does not know anything about it, deleting')
                        swarm.terminate_container(c['id'], delete=True)

            check_guests(swarm)
        except Exception:
            # Best effort: log the failure and retry on the next pass.
            log.exception('Something bad happened')

        # Sleeping outside the try guarantees a pause even after a failed
        # pass; otherwise a persistent error would turn this into a busy loop.
        time.sleep(get_conf().loop_time)
def container_died(service_id):
    """
    Tell the Zoe master, through its REST API, that a service has died.

    A ``ZoeAPIException`` whose message is ``"No such service"`` is ignored
    silently; any other ``ZoeAPIException`` is logged and not re-raised.

    :param service_id: identifier of the service to report as dead
    :return: None
    """
    log.debug('A container died')

    conf = get_conf()
    service_api = ZoeServiceAPI(conf.master_url, 'zoeadmin', conf.zoeadmin_password)

    try:
        service_api.died(service_id)
    except ZoeAPIException as err:
        if err.message == "No such service":
            # The master has no record of this service: nothing to report.
            return
        log.exception('Error reporting a dead service')
def guest_check_thread(args):
    """
    Periodically look for exited Zoe containers and idle guest executions.

    Runs forever. On every pass it lists the Swarm containers labelled with
    this deployment's name, reports each container whose status contains
    ``Exited`` through ``container_died`` and then calls ``check_guests``.
    If ``container_died`` raises ``ZoeAPIException`` the container is
    terminated and deleted from Swarm.

    :param args: configuration object handed to ``SwarmClient``
    :return: never returns
    """
    swarm = SwarmClient(args)

    while True:
        try:
            zoe_containers = swarm.list({'zoe.deployment_name': get_conf().deployment_name})
            for c in zoe_containers:
                if 'Exited' in c['status']:
                    zoe_id = c['labels']['zoe.service.id']
                    try:
                        container_died(zoe_id)
                    except ZoeAPIException:
                        log.warning('Container ' + c['name'] + ' has died, but Zoe does not know anything about it, deleting')
                        swarm.terminate_container(c['id'], delete=True)

            check_guests(swarm)
        except Exception:
            # Best effort: log the failure and retry on the next pass.
            log.exception('Something bad happened')

        # Sleeping outside the try guarantees a pause even after a failed
        # pass; otherwise a persistent error would turn this into a busy loop.
        time.sleep(get_conf().loop_time)
def check_guests(swarm):
    """
    Terminate guest executions that ``check_spark_job`` reports as idle.

    For every user with the ``guest`` role, each of their executions that has
    at least one container is inspected: containers are fetched one by one and
    the first port named ``Spark application web interface`` for which
    ``check_spark_job`` returns a truthy value causes the execution to be
    terminated.

    :param swarm: Swarm client, passed through to ``check_spark_job``
    :return: None
    """
    conf = get_conf()
    query_api = ZoeQueryAPI(conf.scheduler_url, 'zoeadmin', conf.zoeadmin_password)
    exec_api = ZoeExecutionsAPI(conf.scheduler_url, 'zoeadmin', conf.zoeadmin_password)
    cont_api = ZoeContainerAPI(conf.scheduler_url, 'zoeadmin', conf.zoeadmin_password)

    def _should_terminate(execution, guest, seconds_running):
        # Walk containers and ports in order, stopping at the first hit.
        for container_id in execution['containers']:
            container = cont_api.get(container_id)
            for port in container['ports']:
                if port['name'] != 'Spark application web interface':
                    continue
                if check_spark_job(swarm, container['docker_id'], seconds_running):
                    log.info('Execution {} for user {} has been idle for too long, terminating...'.format(execution['name'], guest['name']))
                    return True
        return False

    guests = query_api.query('user', role='guest')
    execs = exec_api.list()

    for guest in guests:
        guest_execs = [candidate for candidate in execs if candidate['owner'] == guest['id']]
        for execution in guest_execs:
            if len(execution['containers']) == 0:
                continue

            now = datetime.datetime.now()
            started = dateutil.parser.parse(execution['time_started'])
            seconds_running = (now - started).total_seconds()

            if _should_terminate(execution, guest, seconds_running):
                exec_api.terminate(execution['id'])
def main_callback(event):
    """
    Handle one Swarm event, reporting dead containers of this deployment.

    Events that are not of type ``container``, that lack the
    ``zoe.deployment_name`` attribute or that belong to another deployment are
    ignored. For a ``die`` action the ``zoe.service.id`` attribute is passed
    to ``container_died``; a ``KeyError`` raised while doing so is swallowed.

    :param event: event dictionary with ``Type``, ``Action`` and ``Actor`` keys
    :return: None
    """
    if event['Type'] != 'container':
        return

    try:
        deployment = event['Actor']['Attributes']['zoe.deployment_name']
        foreign = deployment != get_conf().deployment_name
    except KeyError:
        return
    if foreign:
        return

    log.debug(event)

    if event['Action'] != "die":
        return

    try:
        container_died(event['Actor']['Attributes']['zoe.service.id'])
    except KeyError:
        return
def main_callback(event):
    """
    React to a single Swarm event.

    Only ``container`` events carrying this deployment's
    ``zoe.deployment_name`` attribute are considered. When the action is
    ``die`` the container's ``zoe.service.id`` attribute is handed to
    ``container_died``. Missing keys (``KeyError``) in either step make the
    function return without doing anything further.

    :param event: event dictionary with ``Type``, ``Action`` and ``Actor`` keys
    :return: None
    """
    if event['Type'] != 'container':
        return

    try:
        event_deployment = event['Actor']['Attributes']['zoe.deployment_name']
        is_other_deployment = event_deployment != get_conf().deployment_name
    except KeyError:
        return

    if is_other_deployment:
        return

    log.debug(event)

    died = event['Action'] == "die"
    if died:
        try:
            attributes = event['Actor']['Attributes']
            container_died(attributes['zoe.service.id'])
        except KeyError:
            return
def main():
    """
    The entrypoint for the zoe-observer script.

    Loads the configuration, sets up logging, starts ``guest_check_thread`` in
    a daemon thread named ``guest-check`` and then runs
    ``swarm_events_thread`` in the calling thread.

    :return: None
    """
    load_configuration()
    args = get_conf()

    root_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=root_level, format=LOG_FORMAT)

    # Quieten the chattier third-party loggers.
    library_levels = (
        ('kazoo', logging.WARNING),
        ('requests', logging.WARNING),
        ('docker', logging.INFO),
    )
    for logger_name, level in library_levels:
        logging.getLogger(logger_name).setLevel(level)

    checker = threading.Thread(
        target=guest_check_thread,
        name="guest-check",
        args=[args],
        daemon=True,
    )
    checker.start()

    swarm_events_thread(args)
def check_guests(swarm):
    """
    Terminate guest executions whose Spark application has been idle too long.

    For every user with the ``guest`` role, each of their executions that has
    at least one service is inspected: services are fetched one by one and,
    for each port named ``Spark application web interface``, the value
    returned by ``check_spark_job`` is passed to ``check_if_kill``. The first
    positive answer causes the execution to be terminated; otherwise the idle
    time is logged at debug level.

    :param swarm: Swarm client, passed through to ``check_spark_job``
    :return: None
    """
    conf = get_conf()
    query_api = ZoeQueryAPI(conf.master_url, 'zoeadmin', conf.zoeadmin_password)
    exec_api = ZoeExecutionsAPI(conf.master_url, 'zoeadmin', conf.zoeadmin_password)
    cont_api = ZoeServiceAPI(conf.master_url, 'zoeadmin', conf.zoeadmin_password)

    def _should_terminate(execution, guest, seconds_running):
        # Walk services and ports in order, stopping at the first hit.
        for service_id in execution['services']:
            service = cont_api.get(service_id)
            for port in service['ports']:
                if port['name'] != 'Spark application web interface':
                    continue
                idle_time = check_spark_job(swarm, service['docker_id'], seconds_running)
                if check_if_kill(idle_time):
                    log.info('Execution {} for user {} has been idle for too long, terminating...'.format(execution['name'], guest['name']))
                    return True
                log.debug('Execution {} for user {} has been idle for {} seconds'.format(execution['name'], guest['name'], idle_time))
        return False

    guests = query_api.query('user', role='guest')
    execs = exec_api.list()

    for guest in guests:
        guest_execs = [candidate for candidate in execs if candidate['owner'] == guest['name']]
        for execution in guest_execs:
            if len(execution['services']) == 0:
                continue

            now = datetime.datetime.now()
            started = dateutil.parser.parse(execution['time_started'])
            seconds_running = (now - started).total_seconds()

            if _should_terminate(execution, guest, seconds_running):
                exec_api.terminate(execution['id'])
def check_if_kill(idle_seconds):
    """
    Decide whether an idle time exceeds the configured Spark activity timeout.

    :param idle_seconds: idle time to compare against ``spark_activity_timeout``
    :return: True if ``idle_seconds`` is strictly greater than the timeout, False otherwise
    """
    return bool(idle_seconds > get_conf().spark_activity_timeout)
def container_died(zoe_id: int):
    """
    Tell the Zoe scheduler, through its REST API, that a container has died.

    Exceptions raised by the API call are not handled here and propagate to
    the caller.

    :param zoe_id: Zoe identifier of the dead container
    :return: None
    """
    log.debug('A container died')

    conf = get_conf()
    container_api = ZoeContainerAPI(conf.scheduler_url, 'zoeadmin', conf.zoeadmin_password)
    container_api.died(zoe_id)