def api_image_run(sid: uuid.UUID, port: int):
    user: User = xtoken_user(AuthAPI.getXToken())
    tag = randomString(15)
    # look up the image owned by the authenticated user
    db: wrappers.Collection = mongo.db.images
    image: Image = deserialize_json(
        Image, db.find_one({"uid": user.uuid, "uuid": str(sid)}))
    try:
        # run the image, register the container record and start it
        container_id = DockerImageAPI.run(image.tag, "", port)
        container_uuid = str(uuid.uuid4())
        container = Container(user.uuid, tag, "start", str(sid), port,
                              container_id, container_uuid)
        db: wrappers.Collection = mongo.db.containers
        db.insert_one(container.__dict__)
        DockerContainerAPI.start(container)
        docker_daemon.notify(container_id)
        return json_result(0, "image run")
    except Exception:
        return json_result(-1, "image not found")
def docker_run(sid: uuid.UUID):
    if request.method != "POST":
        return json_result(-1, "POST only")
    sid = str(sid)
    tag = request.form["tag"]
    sshport = int(request.form["sshport"])
    uid = session.get("uuid")
    # look up the image owned by the current session user
    db: wrappers.Collection = mongo.db.images
    image: Image = deserialize_json(Image,
                                    db.find_one({"uid": uid, "uuid": sid}))
    if not image:
        return json_result(-1, "Docker::Images::run failed: image not found")
    container_id = DockerImageAPI.run(tag, "", sshport)  # image.run(tag, port=sshport)
    container_uuid = str(uuid.uuid4())
    container = Container(uid, tag, "start", sid, sshport, container_id,
                          container_uuid)
    db: wrappers.Collection = mongo.db.containers
    db.insert_one(container.__dict__)
    DockerContainerAPI.start(container)
    docker_daemon.notify(container_id)
    return json_result(0, "Successfully run")
def configure():
    global status, models, containers, config
    logging.info("configuration started...")

    # read data
    data = request.get_json()
    config = ContainersManagerConfiguration(json_data=data)
    logging.info("configuration: %s", str(config.__dict__))

    # build models and containers list
    # models
    if config.models:
        logging.info("Found %d models from configuration", len(config.models))
        for model in config.models:
            m = Model(name=model["name"],
                      version=model["version"],
                      sla=model["sla"],
                      alpha=model["alpha"],
                      tfs_model_url=model["tfs_model_url"],
                      initial_replicas=model["initial_replicas"])
            if "profiled_rt" in model:
                m.profiled_rt = model["profiled_rt"]
            models.append(m)
        logging.info("+ %d models", len(models))

    # containers
    if config.containers:
        logging.info("Found %d containers from configuration",
                     len(config.containers))
        for container in config.containers:
            containers.append(
                Container(container["model"], container["version"],
                          container["active"], container["container"],
                          container["node"], container["port"],
                          container["device"], container["quota"]))
        logging.info(
            "+ %d CPU containers",
            len(list(filter(lambda m: m.device == Device.CPU, containers))))
        logging.info(
            "+ %d GPU containers",
            len(list(filter(lambda m: m.device == Device.GPU, containers))))
        logging.info([container.to_json() for container in containers])

    status = "configured"
    logging.info(status)
    return {"result": "ok"}, 200
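# Hedged sketch: posting a configuration of the shape configure() reads. The
# endpoint path and host are assumptions; the field names mirror the handler
# above, and the "device" encoding depends on the Container constructor.
import requests

payload = {
    "models": [{
        "name": "half_plus_two",                        # hypothetical model
        "version": 1,
        "sla": 0.4,
        "alpha": 0.5,
        "tfs_model_url": "http://example.com/half_plus_two.zip",
        "initial_replicas": 1,
        "profiled_rt": 0.05                             # optional
    }],
    "containers": [{
        "model": "half_plus_two",
        "version": 1,
        "active": True,
        "container": "nodemanager-rest-cpu-1",
        "node": "worker-1",
        "port": 8501,
        "device": "CPU",                                # encoding is an assumption
        "quota": None
    }]
}
print(requests.post("http://localhost:5001/configuration", json=payload).json())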
def start_requests_store():
    global status, config, models, containers

    # get models information
    models = [Model(json_data=json_model)
              for json_model in get_data(config.models_endpoint)]
    logging.info("Models: %s", [model.to_json() for model in models])

    # get containers information
    containers = [Container(json_data=json_container)
                  for json_container in get_data(config.containers_endpoint)]
    logging.info("Containers: %s",
                 [container.to_json() for container in containers])

    status = "active"
    logging.info(status)
    return {"result": "ok"}, 200
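# Hedged sketch of the get_data() helper that several snippets rely on. Only
# its call sites come from the source, so this thin wrapper around requests.get
# returning the decoded JSON list is an assumption.
import requests


def get_data(endpoint: str):
    """Fetch a JSON list (models or containers) from the given endpoint."""
    response = requests.get(endpoint)
    response.raise_for_status()
    return response.json()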
def init(self):
    # get the models
    self.models = {
        json_model["name"]: Model(json_data=json_model)
        for json_model in self.get_data(self.models_endpoint)
    }
    log_str = "Loaded " + str(len(self.models)) + " models: " + str(
        [model.name for model in self.models.values()])
    self.logs.append({
        "ts": time.time(),
        "date": str(datetime.datetime.now()),
        "msg": log_str
    })

    # get the containers
    self.containers = [
        Container(json_data=json_container)
        for json_container in self.get_data(self.containers_endpoint)
    ]

    # group containers by nodes
    self.nodes = set(map(lambda c: c.node, self.containers))
    self.containers_on_node = {}
    for node in self.nodes:
        self.containers_on_node[node] = list(
            filter(lambda c: c.node == node, self.containers))
    log_str = "Containers by node: " + str([{
        node: [c.to_json() for c in self.containers_on_node[node]]
    } for node in self.containers_on_node])
    self.logs.append({
        "ts": time.time(),
        "date": str(datetime.datetime.now()),
        "msg": log_str
    })

    # init controllers: one per active CPU container
    self.controllers = []
    t = time.time()
    for container in list(
            filter(lambda c: c.device == Device.CPU and c.active,
                   self.containers)):
        c = Controller(container)
        c.next_action = t
        self.controllers.append(c)
def admin_containers():
    containers = docker_api.container.getContainers()
    result = []
    for container in containers:
        # wrap the raw Docker data in a Container record
        rcontainer: Container = Container()
        rcontainer.uid = "system"
        rcontainer.uuid = "system"
        rcontainer.short_id = container["Id"]
        rcontainer.tag = container["Image"]
        rcontainer.status = container["State"]
        # enrich with the stored record if the container is known
        r: Container = docker_api.container.find_by_shortid(
            rcontainer.short_id)
        if r is not None:
            rcontainer.uid = r.uid
            rcontainer.uuid = r.uuid
            rcontainer.status = r.status
        result.append(rcontainer)
    return render_template("/admin/container.html", containers=result)
def read_config_file(config_file):
    """
    Read the configuration file and init the models and containers variables
    """
    with open(config_file, 'r') as file:
        data = file.read()
    config = yaml.load(data, Loader=yaml.FullLoader)

    # models
    if config["models"]:
        logging.info("Found %d models", len(config["models"]))
        for model in config["models"]:
            if "profiled_rt" in model:
                models.append(
                    Model(model["name"], model["version"], model["sla"],
                          model["alpha"], model["profiled_rt"]))
            else:
                models.append(
                    Model(model["name"], model["version"], model["sla"],
                          model["alpha"]))
        logging.info("+ %d models", len(models))

    # containers
    if config["containers"]:
        logging.info("Found %d containers", len(config["containers"]))
        for container in config["containers"]:
            containers.append(
                Container(container["model"], container["version"],
                          container["active"], container["container"],
                          container["node"], container["port"],
                          container["device"], container["quota"]))
        logging.info(
            "+ %d CPU containers",
            len(list(filter(lambda m: m.device == Device.CPU, containers))))
        logging.info(
            "+ %d GPU containers",
            len(list(filter(lambda m: m.device == Device.GPU, containers))))
        logging.info([container.to_json() for container in containers])
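# Hedged sketch: a configuration file of the shape read_config_file() parses.
# The file name and every concrete value here are assumptions; only the keys
# are taken from the accesses above, and "device" is passed straight to
# Container(), so its exact encoding depends on that class.
example_config = """\
models:
  - name: half_plus_two          # hypothetical model
    version: 1
    sla: 0.4
    alpha: 0.5
    profiled_rt: 0.05            # optional
containers:
  - model: half_plus_two
    version: 1
    active: true
    container: nodemanager-rest-cpu-1
    node: worker-1
    port: 8501
    device: CPU
    quota: null
"""

with open("config.yml", "w") as f:
    f.write(example_config)
read_config_file("config.yml")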
def docker_build():
    """
    GET
    :param uid: user uuid

    POST
    :param tag: docker tag
    :param dockfile: dockerfile uuid
    :param rootpass: root password for ssh
    :param sshport: ssh port forwarding

    build Dockerfile
    """
    if request.method != "POST":
        return json_result(-1, "POST only")

    uid = session.get("uuid")
    username = session.get("username")
    tag = request.form["tag"]
    dockfile = request.form["dockfile"]
    rootpass = request.form["rootpass"]
    sshport = int(request.form["sshport"])

    fn = "upload/{}/{}/Dockerfile".format(username, dockfile)
    with open(fn, "r") as f:
        df = f.read()

    # normalize the tag: random prefix + name, version defaults to "latest"
    name = tag.split(":")[0]
    ver = "latest"
    if len(tag.split(":")) == 2:
        ver = tag.split(":")[1]
    tag = randomString(20 - len(name)) + name + ":" + ver

    image_uuid = str(uuid.uuid4())
    container_uuid = str(uuid.uuid4())
    image = Image(uid, "", tag, "installing", sshport, "", image_uuid)
    db: wrappers.Collection = mongo.db.images
    db.insert_one(image.__dict__)

    # search Dockerfile
    df: wrappers.Collection = mongo.db.dockerfile
    result: Dockerfile = deserialize_json(Dockerfile,
                                          df.find_one({"uuid": dockfile}))
    if result is None:
        return json_result(-1, "Dockerfile does not exist")

    try:
        # image build
        image.status = "build"
        db.update({"uuid": image_uuid}, image.__dict__)
        result, imgs = DockerImageAPI.build(result.path, rootpass, tag)
        image.short_id = imgs[0]["Id"].split(":")[1]
        print(result)
        image.status = "done"
        db.update({"uuid": image_uuid}, image.__dict__)
    except Exception:
        image.status = "fail"
        db.update({"uuid": image_uuid}, image.__dict__)
        return json_result(-1, "Dockerfile::Image::build fail")

    # container start
    container_id = DockerImageAPI.run(tag, "", sshport)  # image.run(tag, port=sshport)
    container = Container(uid, tag, "start", image_uuid, sshport,
                          container_id, container_uuid)
    container.start(container_id)
    db: wrappers.Collection = mongo.db.containers
    db.insert_one(container.__dict__)
    docker_daemon.notify(container_id)

    # collect the build output stream for the response
    result_stream = []
    for item in result:
        try:
            result_stream += [item["stream"]]
        except Exception:
            continue
    return json_result(0, "".join(result_stream))
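# Hedged sketch: calling the build endpoint with the form fields docker_build()
# reads. The route path, host and all values are assumptions, and the request
# must carry an authenticated session so "uuid" and "username" resolve
# server-side.
import requests

s = requests.Session()
# ... log in first so the server-side session holds "uuid" and "username" ...
r = s.post("http://localhost:5000/docker/build", data={
    "tag": "myimage:latest",            # hypothetical tag
    "dockfile": "<dockerfile-uuid>",    # uuid of a previously uploaded Dockerfile
    "rootpass": "change-me",
    "sshport": "2222",
})
print(r.json())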
def configure():
    global status, active, reqs_queues, config

    # read the configuration from file if it was not already set
    if not config:
        logging.info("reading config from file")
        if not read_config_from_file():
            logging.error("configuration reading error")
            return False
        else:
            logging.info("configuration read from file")
    logging.info("configuration read: " + str(config.__dict__))

    logging.info("Getting models from: %s", config.models_endpoint)
    logging.info("Getting containers from: %s", config.containers_endpoint)

    # init models
    models = [
        Model(json_data=json_model)
        for json_model in get_data(config.models_endpoint)
    ]
    if len(models) > 0:
        logging.info("Models: %s", [model.to_json() for model in models])
    else:
        logging.warning("No models found")

    # init containers
    containers = [
        Container(json_data=json_container)
        for json_container in get_data(config.containers_endpoint)
    ]
    if len(containers) > 0:
        logging.info("Containers: %s",
                     [container.to_json() for container in containers])
    else:
        logging.warning("No containers found")
    logging.info("Found %d models and %d containers", len(models),
                 len(containers))

    # init requests queues
    reqs_queues = {model.name: queue.Queue() for model in models}
    responses_list = {model.name: [] for model in models}

    # init policy
    queues_policies = QueuesPolicies(reqs_queues, responses_list, models,
                                     logging)
    gpu_policy = queues_policies.policies.get(config.gpu_queues_policy)
    cpu_policy = queues_policies.policies.get(config.cpu_queues_policy)
    logging.info("Policy for GPUs: %s", config.gpu_queues_policy)
    logging.info("Policy for CPUs: %s", config.cpu_queues_policy)

    # disable logging if verbose == 0
    logging.info("Verbose: %d", config.verbose)
    if config.verbose == 0:
        app.logger.disabled = True
        logging.getLogger('werkzeug').setLevel(logging.WARNING)

    # init dispatchers
    status = "Init dispatchers"
    logging.info(status)
    dispatcher_gpu = Dispatcher(app.logger, models, containers,
                                DispatchingPolicy.ROUND_ROBIN, Device.GPU)
    dispatcher_cpu = Dispatcher(app.logger, models, containers,
                                DispatchingPolicy.ROUND_ROBIN, Device.CPU)

    # start the send requests thread
    status = "Start send reqs thread"
    logging.info(status)
    log_consumer_threads_pool = ThreadPoolExecutor(
        max_workers=config.max_log_consumers)
    for i in range(config.max_log_consumers):
        log_consumer_threads_pool.submit(log_consumer)

    # start the queues consumer threads
    status = "Start queues consumer threads"
    logging.info(status)
    if list(filter(lambda c: c.device == Device.GPU and c.active, containers)):
        # threads that poll the apps queues and dispatch to GPUs
        polling_gpu_threads_pool = ThreadPoolExecutor(
            max_workers=config.max_polling_threads)
        for i in range(config.max_polling_threads):
            polling_gpu_threads_pool.submit(queues_pooling, dispatcher_gpu,
                                            gpu_policy,
                                            config.max_consumers_gpu)
    if list(filter(lambda c: c.device == Device.CPU and c.active, containers)):
        # threads that poll the apps queues and dispatch to CPUs
        pooling_cpu_threads_pool = ThreadPoolExecutor(
            max_workers=config.max_polling_threads)
        for i in range(config.max_polling_threads):
            pooling_cpu_threads_pool.submit(queues_pooling, dispatcher_cpu,
                                            cpu_policy,
                                            config.max_consumers_cpu)

    status = "active"
    active = True
    logging.info(status)
    return True
def create_app(
        containers_manager="http://localhost:5001",
        requests_store="http://localhost:5002",
        verbose=1,
        gpu_queues_policy=QueuesPolicy.HEURISTIC_1,
        cpu_queues_policy=QueuesPolicy.ROUND_ROBIN,
        max_log_consumers=1,
        max_polling=1,  # the number of threads waiting for requests
        max_consumers_cpu=100,
        max_consumers_gpu=100):  # the number of concurrent request threads
    global reqs_queues, requests_store_host, status, gpu_policy, cpu_policy, responses_list
    requests_store_host = requests_store + "/requests"

    # init log
    coloredlogs.install(level='DEBUG', milliseconds=True)
    # log_format = "%(asctime)s:%(levelname)s:%(name)s: %(filename)s:%(lineno)d:%(message)s"
    # logging.basicConfig(level='DEBUG', format=log_format)

    # init models and containers
    status = "Init models and containers"
    logging.info(status)
    models_endpoint = containers_manager + "/models"
    containers_endpoint = containers_manager + "/containers"
    logging.info("Getting models from: %s", models_endpoint)
    logging.info("Getting containers from: %s", containers_endpoint)
    models = [
        Model(json_data=json_model)
        for json_model in get_data(models_endpoint)
    ]
    logging.info("Models: %s", [model.to_json() for model in models])
    containers = [
        Container(json_data=json_container)
        for json_container in get_data(containers_endpoint)
    ]
    logging.info("Containers: %s",
                 [container.to_json() for container in containers])
    logging.info("Found %d models and %d containers", len(models),
                 len(containers))

    # init reqs queues
    reqs_queues = {model.name: queue.Queue() for model in models}
    responses_list = {model.name: [] for model in models}

    # init policy
    queues_policies = QueuesPolicies(reqs_queues, responses_list, models,
                                     logging)
    gpu_policy = queues_policies.policies.get(gpu_queues_policy)
    cpu_policy = queues_policies.policies.get(cpu_queues_policy)
    logging.info("Policy for GPUs: %s", gpu_queues_policy)
    logging.info("Policy for CPUs: %s", cpu_queues_policy)

    # disable logging if verbose == 0
    logging.info("Verbose: %d", verbose)
    if verbose == 0:
        app.logger.disabled = True
        logging.getLogger('werkzeug').setLevel(logging.WARNING)

    # init dispatchers
    status = "Init dispatchers"
    logging.info(status)
    dispatcher_gpu = Dispatcher(app.logger, models, containers,
                                DispatchingPolicy.ROUND_ROBIN, Device.GPU)
    dispatcher_cpu = Dispatcher(app.logger, models, containers,
                                DispatchingPolicy.ROUND_ROBIN, Device.CPU)

    # start the send requests thread
    status = "Start send reqs thread"
    logging.info(status)
    log_consumer_threads_pool = ThreadPoolExecutor(
        max_workers=max_log_consumers)
    for i in range(max_log_consumers):
        log_consumer_threads_pool.submit(log_consumer)

    # start the queues consumer threads
    status = "Start queues consumer threads"
    logging.info(status)
    if list(filter(lambda c: c.device == Device.GPU and c.active, containers)):
        # threads that poll the apps queues and dispatch to GPUs
        polling_gpu_threads_pool = ThreadPoolExecutor(max_workers=max_polling)
        for i in range(max_polling):
            polling_gpu_threads_pool.submit(queues_pooling, dispatcher_gpu,
                                            gpu_policy, max_consumers_gpu)
    if list(filter(lambda c: c.device == Device.CPU and c.active, containers)):
        # threads that poll the apps queues and dispatch to CPUs
        pooling_cpu_threads_pool = ThreadPoolExecutor(max_workers=max_polling)
        for i in range(max_polling):
            pooling_cpu_threads_pool.submit(queues_pooling, dispatcher_cpu,
                                            cpu_policy, max_consumers_cpu)

    # start
    status = "Running"
    logging.info(status)
    return app
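# Hedged usage sketch: create_app() wires models, containers, queues and the
# dispatcher threads, then returns the Flask app. The host/port below and the
# containers_manager / requests_store addresses are assumptions.
if __name__ == "__main__":
    app = create_app(containers_manager="http://localhost:5001",
                     requests_store="http://localhost:5002",
                     verbose=1)
    app.run(host="0.0.0.0", port=5000)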
status = "running" parser = argparse.ArgumentParser() parser.add_argument('--containers_manager', type=str, required=True) args = parser.parse_args() # init log log_format = "%(asctime)s:%(levelname)s:%(name)s:" \ "%(filename)s:%(lineno)d:%(message)s" logging.basicConfig(level='DEBUG', format=log_format) # get models information models_endpoint = args.containers_manager + "/models" logging.info("Getting models from: %s", models_endpoint) models = [ Model(json_data=json_model) for json_model in get_data(models_endpoint) ] logging.info("Models: %s", [model.to_json() for model in models]) # get containers information containers_endpoint = args.containers_manager + "/containers" logging.info("Getting containers from: %s", containers_endpoint) containers = [ Container(json_data=json_container) for json_container in get_data(containers_endpoint) ] logging.info("Containers: %s", [container.to_json() for container in containers]) app.run(host='0.0.0.0', port=5002)
def k8s_deployment_generator(k8s_config: K8sConfiguration):
    # add containers
    containers = []
    k8s_containers = []

    # add actuator container
    k8s_container = client.V1Container(
        name="nodemanager-actuator",
        image=k8s_config.actuator_image,
        ports=[
            client.V1ContainerPort(container_port=k8s_config.actuator_port)
        ],
        volume_mounts=[
            client.V1VolumeMount(name="docker-sock", mount_path="/var/run")
        ],
        image_pull_policy=k8s_config.k8s_image_pull_policy)
    k8s_containers.append(k8s_container)

    # add CPU containers
    base_port = 8501
    for i, model in enumerate(
            ConfigurationsGenerator.model_list(k8s_config.models)):
        container_name = "nodemanager-rest-cpu-" + str(i + 1)
        k8s_container = client.V1Container(
            name=container_name,
            image=k8s_config.tfs_image,
            args=[
                "--model_config_file=" + k8s_config.tfs_config_file_name,
                "--rest_api_port=" + str(base_port)
            ],
            ports=[client.V1ContainerPort(container_port=base_port)],
            volume_mounts=[
                client.V1VolumeMount(name="shared-models",
                                     mount_path=k8s_config.tfs_models_path)
            ])
        k8s_containers.append(k8s_container)
        containers.append(
            Container(model=model.name,
                      version=model.version,
                      active=False,
                      container=container_name,
                      node=None,
                      port=base_port,
                      device=Device.CPU,
                      quota=None))
        base_port += 1

    # add GPU containers
    for gpu in range(k8s_config.available_gpus):
        container_name = "nodemanager-rest-gpu-" + str(gpu + 1)
        k8s_container = client.V1Container(
            name=container_name,
            image=k8s_config.tfs_image + "-gpu",
            args=[
                "--model_config_file=" + k8s_config.tfs_config_file_name,
                "--rest_api_port=" + str(base_port)
            ],
            ports=[client.V1ContainerPort(container_port=base_port)],
            volume_mounts=[
                client.V1VolumeMount(name="shared-models",
                                     mount_path=k8s_config.tfs_models_path)
            ],
            env=[
                client.V1EnvVar(name="NVIDIA_VISIBLE_DEVICES",
                                value=str(gpu + 1))
            ])
        k8s_containers.append(k8s_container)
        containers.append(
            Container(model="all",
                      version=1,
                      active=False,
                      container=container_name,
                      node=None,
                      port=base_port,
                      device=Device.GPU,
                      quota=None))
        base_port += 1

    # add volumes
    volumes = [
        client.V1Volume(
            name="docker-sock",
            host_path=client.V1HostPathVolumeSource(path="/var/run")),
        client.V1Volume(name="shared-models",
                        empty_dir=client.V1EmptyDirVolumeSource())
    ]

    # set pod anti-affinity so replicas are spread across nodes
    affinity = client.V1Affinity(pod_anti_affinity=client.V1PodAntiAffinity(
        required_during_scheduling_ignored_during_execution=[
            client.V1PodAffinityTerm(topology_key="kubernetes.io/hostname")
        ]))

    # init containers
    init_containers = []
    for i, model in enumerate(
            ConfigurationsGenerator.model_list(k8s_config.models)):
        container_name = "tfs-init-" + str(i + 1)
        init_containers.append(
            client.V1Container(
                name=container_name,
                image=k8s_config.tfs_init_image,
                args=[
                    "-f", "/home/models/", "-d", "/home/models/" + model.name,
                    "-c", k8s_config.tfs_config_endpoint, "-m",
                    model.tfs_model_url
                ],
                image_pull_policy=k8s_config.k8s_image_pull_policy,
                volume_mounts=[
                    client.V1VolumeMount(
                        name="shared-models",
                        mount_path=k8s_config.tfs_models_path)
                ]))

    # add pod spec
    pod_spec = client.V1PodSpec(containers=k8s_containers,
                                volumes=volumes,
                                affinity=affinity,
                                init_containers=init_containers,
                                host_network=k8s_config.k8s_host_network,
                                dns_policy="Default")

    # add pod template spec
    pod_template_spec = client.V1PodTemplateSpec(
        metadata=client.V1ObjectMeta(labels={"run": "nodemanager"}),
        spec=pod_spec)

    # add deployment spec
    deployment_spec = client.V1DeploymentSpec(
        selector=client.V1LabelSelector(match_labels={"run": "nodemanager"}),
        template=pod_template_spec,
        replicas=k8s_config.initial_replicas)

    # build deployment
    deployment = client.V1Deployment(api_version="apps/v1",
                                     kind="Deployment",
                                     metadata=client.V1ObjectMeta(
                                         name="nodemanager-deploy",
                                         labels={"run": "nodemanager"}),
                                     spec=deployment_spec)

    return containers, deployment
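# Hedged usage sketch: applying the generated Deployment with the official
# kubernetes Python client. The "default" namespace and the pre-built
# k8s_config object are assumptions; k8s_deployment_generator() is the
# function above.
from kubernetes import client, config

config.load_kube_config()  # or config.load_incluster_config() inside a pod
containers, deployment = k8s_deployment_generator(k8s_config)  # k8s_config: a K8sConfiguration built elsewhere
client.AppsV1Api().create_namespaced_deployment(namespace="default",
                                                body=deployment)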
def __init__(self, name: str, element_set: ElementSet):
    self._element_set = element_set
    self._name = name
    self._container = Container(element_set)
    self._fill()