def __init__(self):
    self.async_client = motor.motor_asyncio.AsyncIOMotorClient(
        username=config.DB_USERNAME,
        password=config.get_from_file(config.MONGO_KEY_PATH),
        host=config.MONGO_HOST,
        port=config.get_int("MONGO_PORT", 27016))
    self.reg_client = pymongo.MongoClient(
        username=config.DB_USERNAME,
        password=config.get_from_file(config.MONGO_KEY_PATH),
        host=config.MONGO_HOST,
        port=config.get_int("MONGO_PORT", 27016))
    self.init_db()
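
# --- Hedged sketch (not in the original source) ---
# `config.get_from_file` is referenced above but not defined in this section.
# A minimal implementation consistent with how it is used (reading a Docker-style
# secret such as the Mongo key from MONGO_KEY_PATH) might look like this; the
# fallback behavior is an assumption.
def get_from_file(path, default=None):
    """Read a secret value from a file, stripping trailing whitespace."""
    try:
        with open(path) as f:
            return f.read().strip()
    except FileNotFoundError:
        return default  # assumption: fall back to a default rather than raise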
@classmethod
async def init(cls, docker_client, redis, session, autoscale_worker, autoscale_app, autoheal_worker, autoheal_apps):
    self = cls(docker_client, redis, session, autoscale_worker, autoscale_app, autoheal_worker, autoheal_apps)
    # await redis.flushall()  # TODO: do a more targeted cleanup of redis
    self.app_repo = await AppRepo.create(config.APPS_PATH, session)
    self.running_apps = await self.get_running_apps()
    self.worker = await get_service(self.docker_client, static.WORKER_SERVICE)
    services = await self.docker_client.services.list()
    self.service_replicas = {s["Spec"]["Name"]: (await get_replicas(self.docker_client, s["ID"]))
                             for s in services}
    self.max_workers = config.get_int("MAX_WORKER_REPLICAS", 10)

    try:
        await self.redis.xgroup_create(static.REDIS_WORKFLOW_QUEUE, static.REDIS_WORKFLOW_GROUP, mkstream=True)
        logger.info(f"Created {static.REDIS_WORKFLOW_QUEUE} stream and {static.REDIS_WORKFLOW_GROUP} group.")
    except aioredis.errors.BusyGroupError:
        logger.info(f"{static.REDIS_WORKFLOW_QUEUE} stream already exists.")

    if len(self.app_repo.apps) < 1:
        logger.error("Walkoff must be loaded with at least one app. Please check that the applications dir exists.")
        exit(1)

    return self
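
# --- Hedged usage sketch (not in the original source) ---
# How the factory above might be driven from an entrypoint, assuming the
# enclosing class is named `Umpire` and the underlying clients are aiodocker,
# aioredis, and aiohttp (inferred from the calls above, not confirmed here).
# `config.REDIS_URI` is a hypothetical setting name.
import asyncio
import aiodocker
import aiohttp
import aioredis

async def main():
    docker_client = aiodocker.Docker()
    redis = await aioredis.create_redis_pool(config.REDIS_URI)  # hypothetical setting
    async with aiohttp.ClientSession() as session:
        umpire = await Umpire.init(docker_client, redis, session,
                                   autoscale_worker=True, autoscale_app=True,
                                   autoheal_worker=True, autoheal_apps=True)
        await umpire.monitor_queues()

asyncio.run(main())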
async def monitor_queues(self):
    # count = 0
    while True:
        services = await self.docker_client.services.list()
        self.service_replicas = {s["Spec"]["Name"]: (await get_replicas(self.docker_client, s["ID"]))
                                 for s in services}

        if self.autoscale_worker:
            await self.scale_worker()
        if self.autoscale_app:
            await self.scale_app()
        if self.autoheal_apps:
            await self.check_pending_actions()

        # Reload the app projects and apis every once in a while
        # if count * config.get_int("UMPIRE_HEARTBEAT", 1) >= config.get_int("APP_REFRESH", 60):
        #     count = 0
        #     logger.info("Refreshing apps.")
        #     # TODO: maybe do this a bit more intelligently? Presently it throws uniqueness errors for db
        #     await self.app_repo.load_apps_and_apis()
        #     await self.app_repo.delete_unused_apps_and_apis()

        # Sleep each iteration so the loop doesn't spin at full speed between heartbeats
        await asyncio.sleep(config.get_int("UMPIRE_HEARTBEAT", 1))
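
# --- Hedged sketch (not in the original source) ---
# The commented-out block above intends a counter-based periodic refresh: every
# APP_REFRESH seconds (measured in UMPIRE_HEARTBEAT-sized ticks) the app repo
# would be reloaded. A minimal standalone version of that pattern, with
# hypothetical callables standing in for the scaling and refresh work:
import asyncio

async def heartbeat_loop(do_work, refresh, heartbeat=1, refresh_every=60):
    """Run do_work() each heartbeat; run refresh() roughly every refresh_every seconds."""
    count = 0
    while True:
        await do_work()
        count += 1
        if count * heartbeat >= refresh_every:
            count = 0
            await refresh()
        await asyncio.sleep(heartbeat)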
def __init__(self):
    self.host = config.get_str("DB", "host")
    self.user = config.get_str("DB", "user")
    self.password = config.get_str("DB", "password")
    self.db = config.get_str("DB", "db")
    self.port = config.get_int("DB", "port")
    self.charset = config.get_str("DB", "charset")
    self.conn = None
    self.cur = None
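
# --- Hedged sketch (not in the original source) ---
# A connect() consistent with the fields above, assuming the driver is PyMySQL
# (suggested by the charset field and the conn/cur pair; not confirmed here).
import pymysql

def connect(self):
    """Open the connection and cursor described by the config fields above."""
    self.conn = pymysql.connect(host=self.host, user=self.user,
                                password=self.password, database=self.db,
                                port=self.port, charset=self.charset)
    self.cur = self.conn.cursor()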
async def scale_app(self):
    self.running_apps = await self.get_running_apps()
    logger.debug(f"Running apps: {[{s: self.service_replicas.get(s)['running']} for s in self.running_apps.keys()]}")

    streams = [key.split(':') for key in await self.redis.keys(pattern=UUID_GLOB + ":*:*", encoding="utf-8")]
    workloads = {f"{app_name}:{version}": {"total": 0, "queued": 0, "executing": 0}
                 for _, app_name, version in streams}

    if len(streams) > 0:
        for execution_id, app_name, version in streams:
            stream = f"{execution_id}:{app_name}:{version}"
            group = f"{app_name}:{version}"
            try:
                executing_work = (await self.redis.xpending(stream=stream, group_name=group))[0]
                total_work = await xlen(self.redis, stream)
            except aioredis.ReplyError:
                continue  # the group or stream got closed while we were checking other streams

            queued_work = total_work - executing_work
            workloads[group]["executing"] += executing_work
            workloads[group]["queued"] += queued_work
            workloads[group]["total"] += total_work

            service_name = f"{static.APP_PREFIX}_{app_name}"
            curr_replicas = self.service_replicas.get(service_name, {"running": 0, "desired": 0})["desired"]
            max_replicas = config.get_int("MAX_APP_REPLICAS", 10)
            replicas_needed = min(total_work, max_replicas)

            logger.debug(f"Total work: {total_work}")
            logger.debug(f"Queued work: {queued_work}")
            logger.debug(f"Needed replicas: {replicas_needed}")
            logger.debug(f"Current replicas: {curr_replicas}")

            if replicas_needed > curr_replicas:
                logger.info(f"Launching app {':'.join([service_name, version])}")
                if replicas_needed > curr_replicas > 0:
                    await self.launch_app(service_name, version, replicas_needed)
                elif replicas_needed > curr_replicas == 0:
                    # scale to 0 and restart
                    await self.launch_app(service_name, version, 0)
                    await self.launch_app(service_name, version, replicas_needed)

    for service_name, workload in workloads.items():
        logger.debug(f"Queued actions for {service_name}: {workload['queued']}")
        logger.debug(f"Executing actions for {service_name}: {workload['executing']}")
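
# --- Hedged sketch (not in the original source) ---
# `xlen` is called above but not defined in this section. With an aioredis v1
# client it is plausibly a thin wrapper issuing the XLEN command directly
# (useful if the pinned client version lacks a dedicated method); this
# reconstruction is an assumption.
async def xlen(redis, stream):
    """Return the total number of entries in a redis stream."""
    return await redis.execute('XLEN', stream)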
async def get_workflow(redis: aioredis.Redis):
    """ Continuously monitors the workflow queue for new work """
    while True:
        logger.info("Waiting for workflows...")

        # if static.CONTAINER_ID is None:
        #     logger.exception("Environment variable 'HOSTNAME' does not exist in worker container.")
        #     sys.exit(-1)

        try:
            message = await redis.xread_group(static.REDIS_WORKFLOW_GROUP, static.CONTAINER_ID,
                                              streams=[static.REDIS_WORKFLOW_QUEUE], latest_ids=['>'],
                                              timeout=config.get_int("WORKER_TIMEOUT", 30) * 1000, count=1)
        except aioredis.ReplyError as e:
            logger.error(f"Error reading from workflow queue: {e}.")
            sys.exit(-1)

        if len(message) < 1:  # We've timed out with no work. Guess we'll die now...
            sys.exit(1)

        execution_id_workflow, stream, id_ = deref_stream_message(message)
        execution_id, workflow = execution_id_workflow
        try:
            if not await redis.sismember(static.REDIS_ABORTING_WORKFLOWS, execution_id):
                await redis.sadd(static.REDIS_EXECUTING_WORKFLOWS, execution_id)
                yield workflow_loads(workflow)
        except Exception as e:
            logger.exception(e)
        finally:
            # Clean up workflow-queue
            await redis.xack(stream=stream, group_name=static.REDIS_WORKFLOW_GROUP, id=id_)
            await xdel(redis, stream=stream, id_=id_)
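
# --- Hedged sketch (not in the original source) ---
# `deref_stream_message` is used above but not defined here. aioredis v1's
# xread_group returns a list of (stream, id, fields) tuples; with count=1 the
# first entry's single field/value pair appears to carry
# (execution_id, serialized_workflow). This reconstruction is an assumption.
def deref_stream_message(message):
    """Unpack a single xread_group result into ((execution_id, workflow), stream, id)."""
    stream, id_, fields = message[0]
    execution_id, workflow = next(iter(fields.items()))
    return (execution_id, workflow), stream, id_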