def get_cluster_management_object(ml_cluster_name, kube_cluster=None):
    if kube_cluster is not None:
        cluster_name = kube_cluster.name
    else:
        record = MLClusterInfo.select().where(
            MLClusterInfo.namespace == ml_cluster_name).get()
        kube_cluster = KubeCluster.select().where(
            KubeCluster.id == record.clusterId).get()
        cluster_name = kube_cluster.name

    # master_machine stays None if no cluster name could be resolved
    master_machine = None
    if cluster_name is not None:
        cluster_master = MachinePool.select().where(
            (MachinePool.clusterId == kube_cluster.id)
            & (MachinePool.role == ClusterRole.getString(
                ClusterRole.MASTER.value))).get()
        master_machine = machine_executor(cluster_master)

    records = MachinePool.select().where(
        (MachinePool.clusterId == kube_cluster.id)
        & (MachinePool.role == ClusterRole.getString(
            ClusterRole.WORKER.value)))
    worker_machines = []
    for record in records:
        executor = machine_executor(record)
        worker_machines.append(executor)
    # Reuse a management object for this cluster id if one is already cached
    # (presumably populated via push_cluster_management_object); otherwise
    # build a fresh one from the master machine executor.
    cluster_management = get_cluster_management_object(kube_cluster.id)
    if cluster_management is None:
        cluster_management = ClusterManagement.get_cluster(
            kube_cluster.id, master_machine)
    return cluster_management
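
# Illustrative lookup (hypothetical names, not from the original source): the
# helper above resolves a management object either from an ML cluster
# namespace or from an already-loaded KubeCluster row, e.g.:
#
#     mgmt = get_cluster_management_object("tensorflow-ns")
#     mgmt = get_cluster_management_object(None, kube_cluster=my_kube_cluster)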
def get_kube_cluster_info(self, cluster_name):
    cluster = KubeCluster.select().where(
        KubeCluster.name == cluster_name)[0]
    records = MachinePool.select().where(
        MachinePool.clusterId == cluster.id)
    master_ip = None
    worker_ip_list = []
    no_of_gpus = 0
    for record in records:
        if record.role == ClusterRole.getString(ClusterRole.MASTER.value):
            master_ip = record.ipAddress
        elif record.role == ClusterRole.getString(ClusterRole.WORKER.value):
            worker_ip_list.append(record.ipAddress)
            machine_pool_gpu = MachineGPUResourceInfo.select().where(
                MachineGPUResourceInfo.machinePool == record).get()
            no_of_gpus += int(machine_pool_gpu.dpuCount)
    return {
        NAME: cluster_name,
        MASTER_IP: master_ip,
        CLUSTER_WORKER_IP_LIST: worker_ip_list,
        GPU_COUNT: no_of_gpus,
        DATASET_VOLUME_MOUNT_POINT: DEFAULT_DATASET_MOUNT_PATH
    }
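
# A minimal sketch of how the summary above might be consumed (assumed usage,
# values are hypothetical): the dict maps NAME, MASTER_IP,
# CLUSTER_WORKER_IP_LIST, GPU_COUNT and DATASET_VOLUME_MOUNT_POINT to the
# cluster name, master IP, worker IPs, aggregated DPU count and the default
# dataset mount path.
#
#     info = cluster_management.get_kube_cluster_info("demo-cluster")
#     print(info[MASTER_IP], len(info[CLUSTER_WORKER_IP_LIST]), info[GPU_COUNT])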
    def delete_cluster(self, cluster_name, user_id=1):
        cluster_management_object = ClusterManagement.get_cluster_management_object(
            cluster_name)
        executor = cluster_management_object.master_machine[EXECUTOR]
        executor.executeRemoteCommand("kubeadm reset --force")
        for worker in cluster_management_object.worker_machines:
            executor = worker[EXECUTOR]
            executor.executeRemoteCommand("kubeadm reset --force")

        cluster = KubeCluster.select().where(
            KubeCluster.name == cluster_name).get()
        records = MLClusterInfo.select().where(
            MLClusterInfo.clusterId == cluster.id)
        for record in records:
            GlusterFSVolume.delete().where(
                GlusterFSVolume.id == record.id).execute()
            # delete_instance() removes this specific MLClusterInfo row;
            # Model.delete() alone only builds an unexecuted bulk query
            record.delete_instance()

        KubeCluster.delete().where(KubeCluster.name == cluster_name).execute()
        cluster_obj = GetObject.get_ml_object(cluster.frameworkType)
def get_cluster_object(payload):
    cluster_id = None
    cluster_management = None
    if FRAMEWORK_RESOURCES in payload:
        dpu_type = payload[FRAMEWORK_RESOURCES][FRAMEWORK_ASSIGN_DPU_TYPE]
        dpuCount = int(payload[FRAMEWORK_RESOURCES][FRAMEWORK_DPU_COUNT])
        dpu = DPU.select().where(DPU.name == dpu_type).get()
        records = MachinePool.select().where(
            MachinePool.role == ClusterRole.getString(
                ClusterRole.WORKER.value))
        for record in records:
            machine_gpu_records = MachineGPUResourceInfo.select().where(
                MachineGPUResourceInfo.machinePool == record).get()
            # pick the first worker with enough unallocated DPUs
            available_dpus = int(machine_gpu_records.dpuCount) - int(
                machine_gpu_records.allocatedDPUCount)
            if available_dpus >= dpuCount:
                machine_gpu_records.allocatedDPUCount = (
                    machine_gpu_records.allocatedDPUCount + dpuCount)
                cluster_id = record.clusterId
                machine_gpu_records.save()
                break
    else:
        record = MachinePool.select().where(
            MachinePool.role == ClusterRole.getString(
                ClusterRole.WORKER.value)).get()
        machine_gpu_record = MachineGPUResourceInfo.select().where(
            MachineGPUResourceInfo.machinePool == record).get()
        dpu = DPU.select().where(DPU.id == machine_gpu_record.dpuId).get()
        payload[FRAMEWORK_RESOURCES] = {
            FRAMEWORK_ASSIGN_DPU_TYPE: dpu.name,
            FRAMEWORK_DPU_COUNT: machine_gpu_record.dpuCount
        }
        cluster_id = record.clusterId
        machine_gpu_record.allocatedDPUCount = machine_gpu_record.dpuCount
        machine_gpu_record.save()
    if cluster_id is not None:
        if CLUSTER_NAME not in payload:
            record = KubeCluster.select().where(
                KubeCluster.id == cluster_id).get()
            payload[CLUSTER_NAME] = record.name
        payload[CLUSTER_ID] = cluster_id
        cluster_master = MachinePool.select().where(
            (MachinePool.clusterId == cluster_id)
            & (MachinePool.role == ClusterRole.getString(
                ClusterRole.MASTER.value))).get()
        cluster_management = get_cluster_management_object(cluster_id)
        if cluster_management is None:
            machine = machine_executor(cluster_master)
            cluster_management = ClusterManagement.get_cluster(
                cluster_id, machine)
    return cluster_management, payload
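
# Illustrative payload for get_cluster_object (an assumption about shape; only
# the key constants appear in the code above). FRAMEWORK_RESOURCES requests a
# DPU type and count; when the key is absent, the full capacity of the first
# worker is assigned instead.
#
#     payload = {
#         FRAMEWORK_RESOURCES: {
#             FRAMEWORK_ASSIGN_DPU_TYPE: "dpu-v100",   # hypothetical DPU name
#             FRAMEWORK_DPU_COUNT: 2,
#         }
#     }
#     cluster_management, payload = get_cluster_object(payload)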
Example #5
def get_cluster_info(self, ml_cluster_name, user_id):
    record = MLClusterInfo.select().where(
        (MLClusterInfo.namespace == ml_cluster_name)
        & (MLClusterInfo.userId == user_id)).get()
    cluster_management = ClusterManagement.get_cluster_management_object(
        ml_cluster_name)
    kube_cluster_name = KubeCluster.select().where(
        KubeCluster.id == record.clusterId).get().name
    kube_cluster_info = cluster_management.get_kube_cluster_info(
        kube_cluster_name)
    framework = GetObject.get_ml_object(record.type)
    mlClusterInfo = framework.get_cluster_info(ml_cluster_name,
                                               cluster_management, user_id)
    return {
        KUBE_CLUSTER_INFO: kube_cluster_info,
        ML_CLUSTER_INFO: mlClusterInfo
    }
    def create_cluster(payload):
        if USER_ID not in payload:
            payload[USER_ID] = DEFAULT_USER_ID
        cluster_management = ClusterManagement()
        cluster_name = payload[CLUSTER_NAME]
        master_ip = payload[CLUSTER_MASTER_IP]

        cluster = KubeCluster(name=cluster_name,
                              userId=payload[USER_ID],
                              status=StackStatus.getString(
                                  StackStatus.INPROGRESS.value),
                              startTime=datetime.datetime.now())
        cluster.save()

        master_machine = get_machine(master_ip, cluster.id)
        worker_ip_list = payload[CLUSTER_WORKER_IP_LIST]
        worker_machines = []

        for worker_ip in worker_ip_list:
            machine = get_worker_machine(worker_ip, cluster.id)
            worker_machines.append(machine)

        log.debug("Worker machines {0},Master machines {1}".format(
            worker_machines, master_machine))
        cluster_management.set_master_machines(master_machine)
        cluster_management.set_worker_machines(worker_machines)
        cluster_management.init_kube_cluster()
        cluster_management.get_kube_auth_token()
        log.debug("The auth token is {0}".format(
            cluster_management.kube_auth_token))
        cluster_management.get_joining_token()
        log.debug("The joining token is {0}".format(
            cluster_management.joining_token))
        cluster_management.get_kube_object()
        # self.start_heketi()
        cluster_management.create_storage_class()
        cluster_management.add_worker_nodes()
        cluster.status = StackStatus.getString(StackStatus.DONE.value)
        cluster.save()
        mount_gf_volume(cluster_management, DEFAULT_DATASET_VOLUME_NAME,
                        DEFAULT_DATASET_MOUNT_PATH)
        push_cluster_management_object(cluster_name, cluster_management)
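
# A minimal input sketch for create_cluster (field values are hypothetical;
# only the key constants come from the code above). USER_ID is optional and
# falls back to DEFAULT_USER_ID.
#
#     create_cluster({
#         CLUSTER_NAME: "demo-cluster",
#         CLUSTER_MASTER_IP: "10.0.0.10",
#         CLUSTER_WORKER_IP_LIST: ["10.0.0.11", "10.0.0.12"],
#     })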
Example #7
def get_cluster_id(cluster_name, user_id):
    cluster = KubeCluster.select().where((KubeCluster.name == cluster_name) & (
        KubeCluster.userId == user_id)).get()
    return cluster
def get_cluster_info(self, cluster_name, user_id=1):
    # peewee query conditions must be combined with &; the `and` keyword would
    # silently drop the first condition
    cluster = KubeCluster.select().where(
        (KubeCluster.name == cluster_name)
        & (KubeCluster.userId == user_id))[0]
    cluster_obj = GetObject.get_ml_object(cluster.frameworkType)
    return cluster_obj.get_cluster_info(cluster_name)
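
# Illustrative call (assumed usage, hypothetical names): look up the user's
# KubeCluster row and delegate to the framework-specific object.
#
#     info = manager.get_cluster_info("demo-cluster", user_id=1)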