def get_cluster_management_object(ml_cluster_name, kube_cluster=None):
    if kube_cluster is not None:
        cluster_name = kube_cluster.name
    else:
        record = MLClusterInfo.select().where(
            MLClusterInfo.namespace == ml_cluster_name).get()
        kube_cluster = KubeCluster.select().where(
            KubeCluster.id == record.clusterId).get()
        cluster_name = kube_cluster.name
    if cluster_name is not None:
        # Build executors for the cluster's master and every worker.
        cluster_master = MachinePool.select().where(
            (MachinePool.clusterId == kube_cluster.id) &
            (MachinePool.role == ClusterRole.getString(
                ClusterRole.MASTER.value))).get()
        master_machine = machine_executor(cluster_master)
        records = MachinePool.select().where(
            (MachinePool.clusterId == kube_cluster.id) &
            (MachinePool.role == ClusterRole.getString(
                ClusterRole.WORKER.value)))
        worker_machines = []
        for record in records:
            executor = machine_executor(record)
            worker_machines.append(executor)
        # Reuse an existing management object for this cluster if one is
        # available; otherwise build one from the master machine.
        cluster_management = get_cluster_management_object(kube_cluster.id)
        if cluster_management is None:
            cluster_management = ClusterManagement.get_cluster(
                kube_cluster.id, master_machine)
        return cluster_management
def get_kube_cluster_info(self, cluster_name):
    cluster = KubeCluster.select().where(
        KubeCluster.name == cluster_name)[0]
    records = MachinePool.select().where(
        MachinePool.clusterId == cluster.id)
    master_ip = None
    worker_ip_list = []
    no_of_gpus = 0
    for record in records:
        if record.role == ClusterRole.getString(ClusterRole.MASTER.value):
            master_ip = record.ipAddress
        elif record.role == ClusterRole.getString(ClusterRole.WORKER.value):
            worker_ip_list.append(record.ipAddress)
            machine_pool_gpu = MachineGPUResourceInfo.select().where(
                MachineGPUResourceInfo.machinePool == record).get()
            no_of_gpus = no_of_gpus + int(machine_pool_gpu.dpuCount)
    return {
        NAME: cluster_name,
        MASTER_IP: master_ip,
        CLUSTER_WORKER_IP_LIST: worker_ip_list,
        GPU_COUNT: no_of_gpus,
        DATASET_VOLUME_MOUNT_POINT: DEFAULT_DATASET_MOUNT_PATH
    }
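# Usage sketch (added for illustration; not part of the original module).
# Logs the summary produced by get_kube_cluster_info. The helper name and
# the `cluster_management` argument (a ClusterManagement instance) are
# assumptions for the example.
def log_kube_cluster_summary(cluster_management, cluster_name):
    info = cluster_management.get_kube_cluster_info(cluster_name)
    log.debug("Cluster {0}: master {1}, workers {2}, {3} GPUs".format(
        info[NAME], info[MASTER_IP], info[CLUSTER_WORKER_IP_LIST],
        info[GPU_COUNT]))
    return info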
def delete_cluster(self, cluster_name, user_id=1):
    machines = MachinePool.select().where(MachinePool.name == cluster_name)
    cluster_management_object = ClusterManagement.get_cluster_management_object(
        cluster_name)
    # Tear down kubeadm on the master and on every worker node.
    executor = cluster_management_object.master_machine[EXECUTOR]
    executor.executeRemoteCommand("kubeadm reset --force")
    for worker in cluster_management_object.worker_machines:
        executor = worker[EXECUTOR]
        executor.executeRemoteCommand("kubeadm reset --force")
    # Remove the ML cluster records, their GlusterFS volumes and the
    # KubeCluster row itself.
    cluster = KubeCluster.select().where(
        KubeCluster.name == cluster_name).get()
    records = MLClusterInfo.select().where(
        MLClusterInfo.clusterId == cluster.id)
    for record in records:
        GlusterFSVolume.delete().where(
            GlusterFSVolume.id == record.id).execute()
        # delete_instance() removes this row; calling delete() on the
        # instance would only build an unexecuted query.
        record.delete_instance()
    KubeCluster.delete().where(KubeCluster.name == cluster_name).execute()
    cluster_obj = GetObject.get_ml_object(cluster.frameworkType)
def get_cluster_object(payload):
    cluster_id = None
    if FRAMEWORK_RESOURCES in payload:
        # The caller specified a DPU type and count: find a worker with
        # enough free DPUs and reserve them.
        dpu_type = payload[FRAMEWORK_RESOURCES][FRAMEWORK_ASSIGN_DPU_TYPE]
        dpu_count = int(payload[FRAMEWORK_RESOURCES][FRAMEWORK_DPU_COUNT])
        dpu = DPU.select().where(DPU.name == dpu_type).get()
        records = MachinePool.select().where(
            MachinePool.role == ClusterRole.getString(
                ClusterRole.WORKER.value))
        for record in records:
            machine_gpu_record = MachineGPUResourceInfo.select().where(
                MachineGPUResourceInfo.machinePool == record).get()
            available_dpus = int(machine_gpu_record.dpuCount) - int(
                machine_gpu_record.allocatedDPUCount)
            if available_dpus >= dpu_count:
                machine_gpu_record.allocatedDPUCount = (
                    machine_gpu_record.allocatedDPUCount + dpu_count)
                cluster_id = record.clusterId
                machine_gpu_record.save()
                break
    else:
        # No resources requested: allocate all DPUs of the first worker and
        # record that allocation in the payload.
        record = MachinePool.select().where(
            MachinePool.role == ClusterRole.getString(
                ClusterRole.WORKER.value)).get()
        machine_gpu_record = MachineGPUResourceInfo.select().where(
            MachineGPUResourceInfo.machinePool == record).get()
        dpu = DPU.select().where(DPU.id == machine_gpu_record.dpuId).get()
        payload[FRAMEWORK_RESOURCES] = {
            FRAMEWORK_ASSIGN_DPU_TYPE: dpu.name,
            FRAMEWORK_DPU_COUNT: machine_gpu_record.dpuCount
        }
        cluster_id = record.clusterId
        machine_gpu_record.allocatedDPUCount = machine_gpu_record.dpuCount
        machine_gpu_record.save()
    if cluster_id is not None:
        if CLUSTER_NAME not in payload:
            record = KubeCluster.select().where(
                KubeCluster.id == cluster_id).get()
            payload[CLUSTER_NAME] = record.name
        payload[CLUSTER_ID] = cluster_id
        cluster_master = MachinePool.select().where(
            (MachinePool.clusterId == cluster_id) &
            (MachinePool.role == ClusterRole.getString(
                ClusterRole.MASTER.value))).get()
        cluster_management = get_cluster_management_object(cluster_id)
        if cluster_management is None:
            machine = machine_executor(cluster_master)
            cluster_management = ClusterManagement.get_cluster(
                cluster_id, machine)
        return cluster_management, payload
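# Usage sketch (illustrative only): build the resources payload that
# get_cluster_object expects and let it reserve DPUs on a suitable worker.
# The helper name is an assumption, not part of the original code.
def request_cluster_with_dpus(dpu_type_name, dpu_count):
    payload = {
        FRAMEWORK_RESOURCES: {
            FRAMEWORK_ASSIGN_DPU_TYPE: dpu_type_name,
            FRAMEWORK_DPU_COUNT: dpu_count
        }
    }
    result = get_cluster_object(payload)
    if result is None:
        # No worker currently has dpu_count free DPUs.
        return None, payload
    return result  # (cluster_management, enriched payload)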
def get_cluster_info(self, ml_cluster_name, user_id):
    record = MLClusterInfo.select().where(
        (MLClusterInfo.namespace == ml_cluster_name) &
        (MLClusterInfo.userId == user_id)).get()
    cluster_management = ClusterManagement.get_cluster_management_object(
        ml_cluster_name)
    kube_cluster_name = KubeCluster.select().where(
        KubeCluster.id == record.clusterId).get().name
    kube_cluster_info = cluster_management.get_kube_cluster_info(
        kube_cluster_name)
    framework = GetObject.get_ml_object(record.type)
    mlClusterInfo = framework.get_cluster_info(ml_cluster_name,
                                               cluster_management, user_id)
    # mlClusterInfo.update({DATASET_VOLUME_MOUNT_POINT: DEFAULT_DATASET_MOUNT_PATH})
    # DATASET_VOLUME_MOUNT_POINT: DEFAULT_DATASET_MOUNT_PATH,
    # DATASET_VOLUME_MOUNT_PATH_IN_POD_REST: DATASET_VOLUME_MOUNT_PATH_IN_POD
    return {
        KUBE_CLUSTER_INFO: kube_cluster_info,
        ML_CLUSTER_INFO: mlClusterInfo
    }
def create_cluster(payload):
    if USER_ID not in payload:
        payload[USER_ID] = DEFAULT_USER_ID
    cluster_management = ClusterManagement()
    cluster_name = payload[CLUSTER_NAME]
    master_ip = payload[CLUSTER_MASTER_IP]
    # Record the cluster as in-progress before provisioning starts.
    cluster = KubeCluster(name=cluster_name,
                          userId=payload[USER_ID],
                          status=StackStatus.getString(
                              StackStatus.INPROGRESS.value),
                          startTime=datetime.datetime.now())
    cluster.save()
    master_machine = get_machine(master_ip, cluster.id)
    worker_ip_list = payload[CLUSTER_WORKER_IP_LIST]
    worker_machines = []
    for worker_ip in worker_ip_list:
        machine = get_worker_machine(worker_ip, cluster.id)
        worker_machines.append(machine)
    log.debug("Worker machines {0}, master machine {1}".format(
        worker_machines, master_machine))
    # Bootstrap Kubernetes: init the master, fetch tokens, set up storage
    # and join the workers.
    cluster_management.set_master_machines(master_machine)
    cluster_management.set_worker_machines(worker_machines)
    cluster_management.init_kube_cluster()
    cluster_management.get_kube_auth_token()
    log.debug("The auth token is {0}".format(
        cluster_management.kube_auth_token))
    cluster_management.get_joining_token()
    log.debug("The joining token is {0}".format(
        cluster_management.joining_token))
    cluster_management.get_kube_object()
    # self.start_heketi()
    cluster_management.create_storage_class()
    cluster_management.add_worker_nodes()
    cluster.status = StackStatus.getString(StackStatus.DONE.value)
    cluster.save()
    mount_gf_volume(cluster_management, DEFAULT_DATASET_VOLUME_NAME,
                    DEFAULT_DATASET_MOUNT_PATH)
    push_cluster_management_object(cluster_name, cluster_management)
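# Usage sketch (illustrative only): the minimal payload that create_cluster
# reads. The cluster name and IP addresses are placeholders; USER_ID may be
# omitted because create_cluster falls back to DEFAULT_USER_ID.
def create_demo_cluster():
    payload = {
        CLUSTER_NAME: "demo-cluster",
        CLUSTER_MASTER_IP: "10.0.0.10",
        CLUSTER_WORKER_IP_LIST: ["10.0.0.11", "10.0.0.12"]
    }
    create_cluster(payload)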
def get_cluster_id(cluster_name, user_id):
    # Returns the matching KubeCluster record (not just its id).
    cluster = KubeCluster.select().where(
        (KubeCluster.name == cluster_name) &
        (KubeCluster.userId == user_id)).get()
    return cluster
def get_cluster_info(self, cluster_name, user_id=1):
    # Combine query conditions with `&`; Python's `and` would silently
    # drop the first condition.
    cluster = KubeCluster.select().where(
        (KubeCluster.name == cluster_name) &
        (KubeCluster.userId == user_id))[0]
    cluster_obj = GetObject.get_ml_object(cluster.frameworkType)
    return cluster_obj.get_cluster_info(cluster_name)