def test_create_pool_existing(self):
    """Creating a pool under an existing name must not insert a new row."""
    existing = self.pools[0]
    result = pool_api.create_pool(
        name=existing.pool,
        slots=5,
        description='',
        session=self.session,
    )
    self.assertEqual(result.pool, existing.pool)
    self.assertEqual(result.slots, 5)
    self.assertEqual(result.description, '')
    # Row count stays at the two pools set up by the fixture.
    self.assertEqual(self.session.query(models.Pool).count(), 2)
def test_create_pool(self):
    """Creating a pool with a fresh name adds exactly one Pool row."""
    result = pool_api.create_pool(name='foo', slots=5, description='')
    self.assertEqual(result.pool, 'foo')
    self.assertEqual(result.slots, 5)
    self.assertEqual(result.description, '')
    with create_session() as session:
        # Fixture pools plus the one just created.
        self.assertEqual(session.query(models.Pool).count(), 3)
def test_create_pool_existing(self):
    """Re-creating an existing pool updates it; total pool count is unchanged."""
    existing = self.pools[0]
    result = pool_api.create_pool(
        name=existing.pool,
        slots=5,
        description='',
        session=self.session,
    )
    self.assertEqual(result.pool, existing.pool)
    self.assertEqual(result.slots, 5)
    self.assertEqual(result.description, '')
    # No extra row: count matches the class-level expected total.
    self.assertEqual(
        self.session.query(models.Pool).count(),
        self.TOTAL_POOL_COUNT,
    )
def execute(self, context):
    """Idempotently ensure the pool ``self.name`` exists.

    Looks the pool up first; if it is present, log and do nothing.
    Otherwise create it with the configured slots/description.

    :param context: task execution context (unused here)
    """
    try:
        pool = get_pool(name=self.name)
    except PoolNotFound:
        pool = None
    if pool:
        self.log(f'Pool exists: {pool}')
        return
    # BUG FIX: the original only created the pool inside the
    # ``except PoolNotFound`` handler. If get_pool returned a falsy
    # value instead of raising, control fell through and logged
    # "Created pool: None" without ever creating anything. Now both
    # "not found" signals (exception and falsy return) lead to creation.
    pool = create_pool(name=self.name, slots=self.slots, description=self.description)
    self.log(f'Created pool: {pool}')
def create_pool():
    """Create a pool.

    Reads JSON parameters from the request body and forwards them to
    ``pool_api.create_pool``. On ``AirflowException`` the error is logged
    and returned as a JSON error payload; otherwise the created pool is
    returned as JSON.
    """
    params = request.get_json(force=True)
    try:
        pool = pool_api.create_pool(**params)
    except AirflowException as err:
        log.error(err)
        response = jsonify(error="{}".format(err))
        # ROBUSTNESS FIX: the original accessed err.status_code directly,
        # which raises AttributeError for exceptions that do not carry a
        # status code. Fall back to 500, mirroring the sibling handler
        # that uses getattr with a default.
        response.status_code = getattr(err, 'status_code', 500)
        return response
    else:
        return jsonify(pool.to_json())
def create_pool():
    """Create a pool."""
    params = request.get_json(force=True)
    try:
        new_pool = pool_api.create_pool(**params)
    except AirflowException as exc:
        _log.error(exc)
        err_response = jsonify(error="{}".format(exc))
        # Use the exception's HTTP status when present, 500 otherwise.
        err_response.status_code = getattr(exc, 'status', 500)
        return err_response
    return jsonify(new_pool.to_json())
def init_pools():
    """Create the node-resource pools (MEMORY, CPU, GPU_MEM, GPU_COUNT).

    Skips all work if a MEMORY pool already exists; otherwise queries the
    node's resources and registers one pool per resource kind. Exits the
    process with status 1 if any pool creation fails.
    """
    existing_names = [entry.pool for entry in pool_api.get_pools()]
    if "MEMORY" in existing_names:
        return

    print()
    print("Get node resources...")
    print()
    node_memory = get_node_info(query=memory_query)
    node_cpu = get_node_info(query=cpu_core_query)
    node_gpu_mem = get_node_info(query=gpu_mem_available_query)
    node_gpu_count = get_node_info(query=gpu_query)
    try:
        print("+++++++++++++++++++++++++++++++++++++++++++++ CREATING MEMORY POOL")
        pool_api.create_pool(
            name="MEMORY",
            slots=abs(node_memory - 10000),
            description="Memory of the node in MB",
        )
        print("+++++++++++++++++++++++++++++++++++++++++++++ CREATING CPU POOL")
        pool_api.create_pool(
            name="CPU",
            slots=node_cpu,
            description="Count of CPU-cores of the node",
        )
        print("+++++++++++++++++++++++++++++++++++++++++++++ CREATING GPU POOL")
        pool_api.create_pool(
            name="GPU_MEM",
            slots=node_gpu_mem,
            description="Memory of all GPUs of the node in MB",
        )
        pool_api.create_pool(
            name="GPU_COUNT",
            slots=node_gpu_count,
            description="Count of GPUs of the node",
        )
    except Exception as exc:
        print("++++++++++++++++++++++++++++++++++++++ Error @ creating pools!")
        print(exc)
        exit(1)
def create_pool(self, name, slots, description):
    """Create a pool and return its (name, slots, description) triple."""
    created = pool.create_pool(name=name, slots=slots, description=description)
    return created.pool, created.slots, created.description
def compute_allocated_resources(logger=None):
    """Poll the Kubernetes API for per-node capacity/usage and publish it.

    Walks every node, sums container CPU/memory requests and limits across
    its non-terminated pods, converts the figures with pint quantities,
    stores the results on NodeUtil class attributes, keeps the GPU_COUNT
    pool in sync, and mirrors the headline numbers into Airflow Variables.

    :param logger: optional logger used for GPU-metric fetch warnings
    :return: True on success, False if anything raised
    """
    Q_ = NodeUtil.ureg.Quantity  # pint quantity constructor (parses "500m", "2Gi", ...)
    data = {}
    NodeUtil.last_update = datetime.now()
    try:
        for node in NodeUtil.core_v1.list_node().items:
            stats = {}
            node_name = node.metadata.name
            capacity = node.status.capacity
            allocatable = node.status.allocatable
            conditions = node.status.conditions
            # Default all pressure flags to False, then flip per condition.
            stats["memory_pressure"] = False
            stats["disk_pressure"] = False
            stats["pid_pressure"] = False
            for condition in conditions:
                if condition.type == "MemoryPressure":
                    stats[
                        "memory_pressure"] = True if condition.status == "True" else False
                elif condition.type == "DiskPressure":
                    stats[
                        "disk_pressure"] = True if condition.status == "True" else False
                elif condition.type == "PIDPressure":
                    stats[
                        "pid_pressure"] = True if condition.status == "True" else False
            # Page-size headroom so the pod listing below is not truncated.
            max_pods = int(int(allocatable["pods"]) * 1.5)
            # Only pods still scheduled on this node and not yet terminated.
            field_selector = (
                "status.phase!=Succeeded,status.phase!=Failed," +
                "spec.nodeName=" + node_name)
            stats["cpu_alloc"] = Q_(allocatable["cpu"])
            stats["mem_alloc"] = Q_(allocatable["memory"])
            stats["gpu_dev_count"] = Q_(
                capacity["nvidia.com/gpu"] if "nvidia.com/gpu" in capacity else 0)
            stats["gpu_dev_free"] = Q_(
                allocatable["nvidia.com/gpu"] if "nvidia.com/gpu" in allocatable else 0)
            pods = NodeUtil.core_v1.list_pod_for_all_namespaces(
                limit=max_pods, field_selector=field_selector).items
            # Sum the allocated resources over every container on the node.
            cpureqs, cpulmts, memreqs, memlmts = [], [], [], []
            for pod in pods:
                for container in pod.spec.containers:
                    res = container.resources
                    # defaultdict(0) so containers without requests/limits count as zero.
                    reqs = defaultdict(lambda: 0, res.requests or {})
                    lmts = defaultdict(lambda: 0, res.limits or {})
                    cpureqs.append(Q_(reqs["cpu"]))
                    memreqs.append(Q_(reqs["memory"]))
                    cpulmts.append(Q_(lmts["cpu"]))
                    memlmts.append(Q_(lmts["memory"]))
            stats["cpu_req"] = sum(cpureqs)
            stats["cpu_lmt"] = sum(cpulmts)
            stats["cpu_req_per"] = (stats["cpu_req"] / stats["cpu_alloc"] * 100)
            stats["cpu_lmt_per"] = (stats["cpu_lmt"] / stats["cpu_alloc"] * 100)
            stats["mem_req"] = sum(memreqs)
            stats["mem_lmt"] = sum(memlmts)
            stats["mem_req_per"] = (stats["mem_req"] / stats["mem_alloc"] * 100)
            stats["mem_lmt_per"] = (stats["mem_lmt"] / stats["mem_alloc"] * 100)
            data[node_name] = stats
        # NOTE(review): only the FIRST node's stats are published below —
        # presumably a single-node deployment; confirm before multi-node use.
        node_info = next(iter(data.values()))
        # CPU figures scaled by 1000 (cores -> millicores).
        NodeUtil.cpu_alloc = node_info["cpu_alloc"].to_base_units(
        ).magnitude * 1000
        NodeUtil.cpu_req = node_info["cpu_req"].to_base_units(
        ).magnitude * 1000
        NodeUtil.cpu_lmt = node_info["cpu_lmt"].to_base_units(
        ).magnitude * 1000
        # NOTE(review): multiplying a percentage by 1000 looks suspicious
        # (yields per-mille * 100); kept as-is — verify intended scale.
        NodeUtil.cpu_req_per = int(
            node_info["cpu_req_per"].to_base_units().magnitude * 1000)
        NodeUtil.cpu_lmt_per = int(
            node_info["cpu_lmt_per"].to_base_units().magnitude * 1000)
        # Memory converted from bytes to MiB.
        NodeUtil.mem_alloc = int(
            node_info["mem_alloc"].to_base_units().magnitude // 1024 // 1024)
        NodeUtil.mem_req = int(
            node_info["mem_req"].to_base_units().magnitude // 1024 // 1024)
        NodeUtil.mem_lmt = int(
            node_info["mem_lmt"].to_base_units().magnitude // 1024 // 1024)
        NodeUtil.mem_req_per = int(
            node_info["mem_req_per"].to_base_units().magnitude)
        NodeUtil.mem_lmt_per = int(
            node_info["mem_lmt_per"].to_base_units().magnitude)
        NodeUtil.gpu_dev_count = int(node_info["gpu_dev_count"])
        NodeUtil.gpu_dev_free = int(node_info["gpu_dev_free"])
        NodeUtil.memory_pressure = node_info["memory_pressure"]
        NodeUtil.disk_pressure = node_info["disk_pressure"]
        NodeUtil.pid_pressure = node_info["pid_pressure"]
        # Keep the GPU_COUNT Airflow pool in sync with the detected device count.
        gpu_count_pool = pool_api.get_pool(name="GPU_COUNT")
        if gpu_count_pool is None or gpu_count_pool.slots != NodeUtil.gpu_dev_count:
            pool_api.create_pool(name="GPU_COUNT",
                                 slots=NodeUtil.gpu_dev_count,
                                 description="Count of GPUs of the node")
        if NodeUtil.gpu_dev_count > 0:
            # GPU memory figures come from Prometheus; a falsy return_code
            # means the query failed.
            NodeUtil.gpu_mem_alloc, return_code = NodeUtil.get_node_info(
                query=NodeUtil.gpu_mem_available_query, logger=logger)
            # NOTE(review): gpu_mem_alloc is only reset to None when a logger
            # is supplied — on failure with logger=None the stale/failed value
            # is kept. Possibly a bug; confirm intent.
            if not return_code and logger is not None:
                logger.warning(
                    "############################################# Could not fetch gpu_alloc utilization from prometheus!!"
                )
                NodeUtil.gpu_mem_alloc = None
            NodeUtil.gpu_mem_used, return_code = NodeUtil.get_node_info(
                query=NodeUtil.gpu_mem_used_query, logger=logger)
            if not return_code and logger is not None:
                logger.warning(
                    "############################################# Could not fetch gpu_used utilization from prometheus!!"
                )
                NodeUtil.gpu_mem_used = None
        else:
            # No GPUs on the node: report zero rather than None.
            NodeUtil.gpu_mem_alloc = 0
            NodeUtil.gpu_mem_used = 0
        # Derived headroom figures.
        NodeUtil.cpu_available_req = NodeUtil.cpu_alloc - NodeUtil.cpu_req
        NodeUtil.cpu_available_limit = NodeUtil.cpu_alloc - NodeUtil.cpu_lmt
        NodeUtil.memory_available_req = NodeUtil.mem_alloc - NodeUtil.mem_req
        NodeUtil.memory_available_limit = NodeUtil.mem_alloc - NodeUtil.mem_lmt
        # GPU headroom is unknown (None) if either Prometheus query failed.
        NodeUtil.gpu_memory_available = None if (
            NodeUtil.gpu_mem_alloc is None or NodeUtil.gpu_mem_used is None
        ) else (NodeUtil.gpu_mem_alloc - NodeUtil.gpu_mem_used)
        # Publish the headline numbers as Airflow Variables for dashboards/DAGs.
        Variable.set("CPU_NODE",
                     "{}/{}".format(NodeUtil.cpu_lmt, NodeUtil.cpu_alloc))
        Variable.set("CPU_FREE", "{}".format(NodeUtil.cpu_available_req))
        Variable.set("RAM_NODE",
                     "{}/{}".format(NodeUtil.mem_req, NodeUtil.mem_alloc))
        Variable.set("RAM_FREE", "{}".format(NodeUtil.memory_available_req))
        Variable.set(
            "GPU_DEV",
            "{}/{}".format(NodeUtil.gpu_dev_free, NodeUtil.gpu_dev_count))
        Variable.set("GPU_DEV_FREE", "{}".format(NodeUtil.gpu_dev_free))
        Variable.set(
            "GPU_MEM",
            "{}/{}".format(NodeUtil.gpu_mem_used, NodeUtil.gpu_mem_alloc))
        Variable.set("GPU_MEM_FREE", "{}".format(NodeUtil.gpu_memory_available))
        Variable.set("UPDATED", datetime.utcnow())
        return True
    except Exception as e:
        # Broad catch: this runs as a best-effort poller; report and signal failure.
        print(
            "+++++++++++++++++++++++++++++++++++++++++ COULD NOT FETCH NODES!"
        )
        print(e)
        return False
def create_pool(self, name, slots, description):
    """Create a pool; return its name, slot count, and description."""
    new_pool = pool.create_pool(name=name, slots=slots, description=description)
    return new_pool.pool, new_pool.slots, new_pool.description
def create_pools():
    """Provision the standard task pools."""
    pool_specs = (
        ("default_pool", 128, "default pool for tasks"),
        ("large_pool", 1024, "large pool to handle tasks parallelism surgically"),
    )
    for pool_name, pool_slots, pool_description in pool_specs:
        create_pool(pool_name, pool_slots, pool_description)