Exemple #1
0
 def data_path_nfs_mountpoint(self):
     """Build the NFS mountpoint that exposes the job's data path at /data.

     The server comes from ``get_cluster_nfs_server`` and the export path
     from ``get_nfs_path_with_folder`` using the "storage" folder and
     ``self.data_path``.

     Returns:
         The object produced by ``make_mountpoint`` for these parameters.

     Raises:
         AssertionError: if ``self.data_path`` is not a string.
     """
     assert isinstance(self.data_path, str)
     nfs_server = self.get_cluster_nfs_server()
     export_path = self.get_nfs_path_with_folder("storage", self.data_path)
     mount_params = {
         "name": "data",
         "mountPath": "/data",
         "mountType": "nfs",
         "server": nfs_server,
         "path": export_path,
     }
     mountpoint = make_mountpoint(params=mount_params)
     logger.info("job %s has data path nfs mountpoint: %s", self.job_id,
                 mountpoint)
     return mountpoint
Exemple #2
0
 def work_path_nfs_mountpoint(self):
     """Build the NFS mountpoint that exposes the job's work path at /work.

     The server comes from ``get_cluster_nfs_server`` and the export path
     from ``get_nfs_path_with_folder`` using the "work" folder and
     ``self.work_path``.

     Returns:
         The object produced by ``make_mountpoint`` for these parameters.

     Raises:
         AssertionError: if ``self.work_path`` is not a non-empty string.
     """
     assert isinstance(self.work_path, str) and len(self.work_path) > 0
     nfs_server = self.get_cluster_nfs_server()
     export_path = self.get_nfs_path_with_folder("work", self.work_path)
     mount_params = {
         "name": "work",
         "mountPath": "/work",
         "mountType": "nfs",
         "server": nfs_server,
         "path": export_path,
     }
     mountpoint = make_mountpoint(params=mount_params)
     logger.info("job %s has work path nfs mountpoint: %s", self.job_id,
                 mountpoint)
     return mountpoint
Exemple #3
0
 def home_path_nfs_mountpoint(self):
     """Build the NFS mountpoint for the user's home at /home/<alias>.

     The alias comes from ``get_alias``; the export path is obtained from
     ``get_nfs_path_with_folder`` using the "work" folder and that alias.

     Returns:
         The object produced by ``make_mountpoint`` for these parameters.
     """
     user_alias = self.get_alias()
     nfs_server = self.get_cluster_nfs_server()
     export_path = self.get_nfs_path_with_folder("work", user_alias)
     mount_params = {
         "name": "home",
         "mountPath": "/home/%s" % user_alias,
         "mountType": "nfs",
         "server": nfs_server,
         "path": export_path,
     }
     mountpoint = make_mountpoint(params=mount_params)
     logger.info("job %s has home path nfs mountpoint: %s", self.job_id,
                 mountpoint)
     return mountpoint
Exemple #4
0
    def infiniband_mountpoints(self):
        """Build hostPath mountpoints for the configured InfiniBand mounts.

        Returns:
            ``None`` when ``get_infiniband_mounts`` does not return a list;
            otherwise a list with one ``make_mountpoint`` result per entry,
            in the same order as the configuration.
        """
        mount_specs = self.get_infiniband_mounts()
        if not isinstance(mount_specs, list):
            return None

        # Each entry is read by its "name", "containerPath" and "hostPath"
        # keys; the name is lower-cased before use.
        return [
            make_mountpoint(
                params={
                    "name": spec["name"].lower(),
                    "mountPath": spec["containerPath"],
                    "hostPath": spec["hostPath"],
                    "mountType": "hostPath",
                })
            for spec in mount_specs
        ]
Exemple #5
0
 def system_mountpoints(self):
     """Return the system-defined mountpoints that apply to this job.

     A system mountpoint entry applies when it has no "vc" value (it is
     shared by the whole cluster) or when its "vc" equals the job's
     "vcName". Entries for which ``make_mountpoint`` returns ``None`` are
     logged as a warning and left out of the result.
     """
     job_vc = self.params["vcName"]

     def applies_to_job(entry):
         owner_vc = entry.get("vc")
         return owner_vc is None or owner_vc == job_vc

     # Filter first, then build, so no mountpoint is created before the
     # whole configuration has been scanned.
     applicable = [
         entry for entry in self.get_system_mountpoints()
         if applies_to_job(entry)
     ]

     built = []
     for entry in applicable:
         mountpoint = make_mountpoint(entry)
         if mountpoint is None:
             logger.warning("job %s has mountpoint for param %s None",
                            self.job_id, entry)
             continue
         logger.info("job %s has mountpoint: %s", self.job_id, mountpoint)
         built.append(mountpoint)
     return built
    def generate_params(self, job):
        """Fill in and return the parameters of ``job``.

        The job's own ``params`` dict is updated in place with mountpoints,
        node selectors, environment variables and plugins.

        Returns:
            A ``(params, error)`` pair: ``(params, None)`` on success, or
            ``(None, "Missing required parameters!")`` when any required
            field is absent from ``job.params``.
        """
        assert isinstance(job, Job)
        params = job.params

        required_fields = (
            "jobtrainingtype",
            "jobName",
            "jobPath",
            "workPath",
            "dataPath",
            "cmd",
            "userId",
            "resourcegpu",
            "userName",
            "vcName",
            "sku",
        )
        for required_field in required_fields:
            if required_field not in params:
                return None, "Missing required parameters!"

        # Paths backing /job, /work and /data
        job.job_path = params["jobPath"]
        job.work_path = params["workPath"]
        job.data_path = params["dataPath"]

        # /job is always mounted
        job.add_mountpoints(job.job_path_nfs_mountpoint())

        # /home/<alias>, /work and /data are skipped for VCs listed as
        # having no shared storage: some clusters dedicate /data to a
        # single VC and other VCs should not be able to access it.
        vc_without_shared_storage = job.get_vc_without_shared_storage()
        if params["vcName"] not in vc_without_shared_storage:
            shared_mount_builders = (
                job.home_path_nfs_mountpoint,
                job.work_path_nfs_mountpoint,
                job.data_path_nfs_mountpoint,
            )
            for build_mountpoint in shared_mount_builders:
                job.add_mountpoints(build_mountpoint())

        # System provided job mountpoints
        job.add_mountpoints(job.system_mountpoints())

        # User provided job mountpoints
        for user_mount_params in params.get("mountpoints", []):
            job.add_mountpoints(make_mountpoint(user_mount_params))

        params["init-container"] = os.environ["INIT_CONTAINER_IMAGE"]
        params["user_email"] = params["userName"]
        params["pod_ip_range"] = job.get_pod_ip_range()

        node_selector = params.setdefault("nodeSelector", {})
        if "sku" in params:
            node_selector["sku"] = params["sku"]

        # Set up VC dedicated node usage
        hard_assignment = job.get_vc_node_hard_assignment()
        if isinstance(hard_assignment, dict):
            vc_name = params["vcName"]
            # TODO: Fix the case where CPU worker exists in a GPU pool
            is_dedicated = hard_assignment.get(vc_name) is True
            node_selector["vc"] = vc_name if is_dedicated else "default"

        envs = params.setdefault("envs", [])
        # The GPU count is published under both the DLWS and DLTS names.
        for gpu_env_name in ("DLWS_NUM_GPU_PER_WORKER",
                             "DLTS_NUM_GPU_PER_WORKER"):
            envs.append({
                "name": gpu_env_name,
                "value": str(params["resourcegpu"]),
            })

        job.add_plugins(job.get_plugins())
        params["plugins"] = job.plugins

        # Must be after job.get_plugins
        # TODO: Make mountpoints independent of job.get_plugins
        params["mountpoints"] = [
            mountpoint.to_dict() for mountpoint in job.mountpoints
        ]

        # System environment variables, if any
        params["envs"].extend(
            {"name": env_name, "value": env_val}
            for env_name, env_val in job.get_system_envs().items())

        return params, None
Exemple #7
0
    def get_blobfuse_plugins(self, plugins):
        """Build the list of blobfuse plugin dicts from ``plugins``.

        Returns an empty list when ``get_enable_blobfuse`` yields ``None``
        or ``False``. Otherwise every entry with a valid account name,
        account key, container name and mount path is turned into a
        blobfuse dict, merged into the result through ``dedup_add`` and
        also registered as a mountpoint on this job.
        """
        blobfuse_enabled = self.get_enable_blobfuse()
        if blobfuse_enabled is None or blobfuse_enabled is False:
            return []

        def same_mount(left, right):
            # Two entries clash when they share a name or a mount path.
            return left["name"] == right["name"] or \
                left["mountPath"] == right["mountPath"]

        fast_storage = self.get_local_fast_storage()
        if fast_storage is None or fast_storage == "":
            tmp_root = None
        else:
            tmp_root = fast_storage.rstrip("/")

        collected = []
        for index, plugin in enumerate(plugins):
            account_name = plugin.get("accountName")
            account_key = plugin.get("accountKey")
            container_name = plugin.get("containerName")
            mount_path = plugin.get("mountPath")
            mount_options = plugin.get("mountOptions")

            # Ignore Azure blobfuse with incomplete configurations
            mandatory_values = (account_name, account_key, container_name,
                                mount_path)
            if any(invalid_entry(value) for value in mandatory_values):
                continue

            plugin_name = plugin.get("name")
            if plugin_name is None:
                plugin_name = "%s-blobfuse-%d" % (self.job_id, index)

            blobfuse = {
                "enabled": True,
                "name": plugin_name,
                "secreds": "%s-blobfuse-%d-secreds" % (self.job_id, index),
                "accountName": b64encode(account_name),
                "accountKey": b64encode(account_key),
                "containerName": container_name,
                "mountPath": mount_path,
                "jobId": self.job_id,
            }

            if tmp_root is not None:
                # The plugin name keeps tmppath unique per blobfuse mount
                blobfuse["rootTmppath"] = tmp_root
                blobfuse["tmppath"] = plugin_name

            # Mount options may also be given as a list of strings
            if isinstance(mount_options, list):
                mount_options = " ".join(mount_options)
            if not invalid_entry(mount_options):
                blobfuse["mountOptions"] = mount_options

            # TODO: Refactor into mountpoint add
            collected = dedup_add(blobfuse, collected, same_mount)

            # Register the same dict as a mountpoint as well
            blobfuse["mountType"] = "blobfuse"
            blobfuse_mountpoint = make_mountpoint(blobfuse)
            if blobfuse_mountpoint is not None:
                self.add_mountpoints(blobfuse_mountpoint)

        return collected