def get_container(train_op, train_env, train_num_gpus, drive='coco-headset-vol-1'):
    """Apply the shared training-container configuration to ``train_op``.

    Requests/limits 56Gi of memory and 7.5 CPUs, reserves ``train_num_gpus``
    GPUs, mounts the tensorboard / data / shm volumes, and schedules the pod
    onto a ``p3.{2*train_num_gpus}xlarge`` node that tolerates the GPU taint.

    :param train_op: kfp ContainerOp to configure (mutated in place)
    :param train_env: environment dict forwarded to ``add_env``
    :param train_num_gpus: number of GPUs to reserve
    :param drive: name of the persistent volume claim mounted at /data/
    """
    container = train_op.container
    container.set_memory_request('56Gi')
    container.set_memory_limit('56Gi')
    container.set_cpu_request('7.5')
    container.set_cpu_limit('7.5')
    container.set_gpu_limit(str(train_num_gpus))
    container.add_volume_mount(
        V1VolumeMount(name='tensorboard', mount_path='/shared/tensorboard'))
    container.add_volume_mount(
        V1VolumeMount(name='data', mount_path='/data/'))
    container.add_volume_mount(
        V1VolumeMount(name='shm', mount_path='/dev/shm'))

    op = add_env(add_ssh_volume(train_op), train_env)
    op.add_toleration(
        V1Toleration(key='nvidia.com/gpu', operator='Exists',
                     effect='NoSchedule'))
    op.add_node_selector_constraint(
        'beta.kubernetes.io/instance-type', f'p3.{2 * train_num_gpus}xlarge')
    op.add_volume(
        V1Volume(name='tensorboard',
                 persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
                     'tensorboard-research-kf')))
    op.add_volume(
        V1Volume(name='data',
                 persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
                     drive)))
    # A host-path /dev/shm mount was tried previously; a memory-backed
    # emptyDir is used for shared memory instead.
    op.add_volume(
        V1Volume(name='shm', empty_dir=V1EmptyDirVolumeSource(medium='Memory')))
def train_eval_epic(owner, project, experiment, model, git_rev, pretrained_s3,
                    mode, train_additional_args='', eval_additional_args=''):
    """Build the EPIC-Kitchens training pipeline step.

    Loads the ``components/train.yaml`` component, instantiates it with the
    given experiment metadata, then applies the shared container / volume /
    GPU / scheduling configuration.

    :param owner: experiment owner name, passed to the train component
    :param project: project name
    :param experiment: experiment name
    :param model: model identifier
    :param git_rev: git revision to check out for training
    :param pretrained_s3: S3 path of pretrained weights
    :param mode: train component mode string
    :param train_additional_args: extra CLI args forwarded to training
    :param eval_additional_args: reserved for the eval step (currently unused)
    """
    train_env = {}
    train_num_gpus = 1
    train_op = components.load_component_from_file('components/train.yaml')(
        owner=owner,
        project=project,
        experiment=experiment,
        model=model,
        git_rev=git_rev,
        pretrained_s3=pretrained_s3,
        mode=mode,
        additional_args=train_additional_args)
    # The container/volume/toleration setup below was a verbatim copy of
    # get_container's body with only the data PVC name changed; reuse the
    # helper (its `drive` parameter exists exactly for this) instead of
    # duplicating ~20 lines of configuration.
    get_container(train_op, train_env, train_num_gpus,
                  drive='dataset-epic-kitchen')
def add_ssh_volume(op):
    """Mount the SSH deploy-key secret into the op's container at /root/.ssh.

    Returns the same op to allow call chaining.
    """
    ssh_volume = V1Volume(
        name='ssh-v',
        secret=V1SecretVolumeSource(
            secret_name='ssh-secrets-epic-kitchen-kbbbtt9c94',
            default_mode=0o600))
    ssh_mount = V1VolumeMount(name='ssh-v', mount_path='/root/.ssh')
    op.add_volume(ssh_volume)
    op.container.add_volume_mount(ssh_mount)
    return op
def pipeline_mount_pvc():
    """Run the mnist container with the 'kfp-pvc' claim mounted at /mnt/pipeline."""
    claim = "kfp-pvc"
    vol_name = 'pipeline'
    mount_path = '/mnt/pipeline'
    op = dsl.ContainerOp(
        name='mnist_pvc',
        image='kangwoo/kfp-mnist-storage:0.0.1',
        arguments=['--model', '/mnt/pipeline/kfp/mnist/model'])
    op.add_volume(
        V1Volume(name=vol_name,
                 persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
                     claim_name=claim)))
    op.add_volume_mount(V1VolumeMount(mount_path=mount_path, name=vol_name))
def pipeline_gcs():
    """Run the mnist container with GCP service-account credentials from a secret.

    Mounts the 'user-gcp-sa' secret and points GOOGLE_APPLICATION_CREDENTIALS
    at the mounted key file.
    """
    cred_file_name = "user-gcp-sa.json"
    cred_volume_name = "user-gcp-sa"
    cred_mount_path = "/var/secrets/"
    cred_env_key = "GOOGLE_APPLICATION_CREDENTIALS"
    cred_file_path = os.path.join(cred_mount_path, cred_file_name)
    secret_name = 'user-gcp-sa'
    op = dsl.ContainerOp(
        name='mnist-gcs',
        image='kangwoo/kfp-mnist-storage:0.0.1',
        arguments=['--model', 'gs://kfp-bucket/kfp/mnist/model'])
    op.add_volume(
        V1Volume(name=cred_volume_name,
                 secret=V1SecretVolumeSource(secret_name=secret_name)))
    op.add_volume_mount(
        V1VolumeMount(name=cred_volume_name, mount_path=cred_mount_path))
    op.add_env_variable(V1EnvVar(name=cred_env_key, value=cred_file_path))
def make_pod(name,
             cmd,
             port,
             image_spec,
             image_pull_policy,
             image_pull_secret=None,
             node_selector=None,
             run_as_uid=None,
             run_as_gid=None,
             fs_gid=None,
             supplemental_gids=None,
             run_privileged=False,
             env=None,
             working_dir=None,
             volumes=None,
             volume_mounts=None,
             labels=None,
             annotations=None,
             cpu_limit=None,
             cpu_guarantee=None,
             mem_limit=None,
             mem_guarantee=None,
             extra_resource_limits=None,
             extra_resource_guarantees=None,
             lifecycle_hooks=None,
             init_containers=None,
             service_account=None,
             extra_container_config=None,
             extra_pod_config=None,
             extra_containers=None,
             scheduler_name=None):
    """
    Make a k8s pod specification for running a user notebook.

    Parameters
    ----------
    name:
        Name of pod. Must be unique within the namespace the object is going
        to be created in. Must be a valid DNS label.
    cmd:
        The command used to execute the singleuser server.
    port:
        Port the notebook server is going to be listening on.
    image_spec:
        Image specification - usually an image name and tag in the form of
        image_name:tag. Same thing you would use with docker commandline
        arguments.
    image_pull_policy:
        Image pull policy - one of 'Always', 'IfNotPresent' or 'Never'.
        Decides when kubernetes will check for a newer version of image and
        pull it when running a pod.
    image_pull_secret:
        Image pull secret - Default is None -- set to your secret name to
        pull from private docker registry.
    node_selector:
        Dictionary selector to match nodes where to launch the pods.
    run_as_uid:
        The UID used to run single-user pods. The default is to run as the
        user specified in the Dockerfile, if this is set to None.
    run_as_gid:
        The GID used to run single-user pods. The default is to run as the
        primary group of the user specified in the Dockerfile, if this is
        set to None.
    fs_gid:
        The gid that will own any fresh volumes mounted into this pod, if
        using volume types that support this (such as GCE). This should be a
        group that the uid the process is running as should be a member of,
        so that it can read / write to the volumes mounted.
    supplemental_gids:
        A list of GIDs that should be set as additional supplemental groups
        to the user that the container runs as. Needed e.g. for restricted
        SCC environments where the image runs as an assigned user ID.
    run_privileged:
        Whether the container should be run in privileged mode.
    env:
        Dictionary of environment variables.
    working_dir:
        String specifying the working directory for the notebook container.
    volumes:
        List of dictionaries containing the volumes of various types this pod
        will be using. See k8s documentation about volumes on how to specify
        these.
    volume_mounts:
        List of dictionaries mapping paths in the container and the volume
        (specified in volumes) that should be mounted on them.
    labels:
        Labels to add to the spawned pod.
    annotations:
        Annotations to add to the spawned pod.
    cpu_limit:
        Float specifying the max number of CPU cores the user's pod is
        allowed to use.
    cpu_guarantee:
        Float specifying the number of CPU cores the user's pod is guaranteed
        to have access to, by the scheduler.
    mem_limit:
        String specifying the max amount of RAM the user's pod is allowed to
        use. String instead of float/int since common suffixes are allowed.
    mem_guarantee:
        String specifying the amount of RAM the user's pod is guaranteed to
        have access to. String instead of float/int since common suffixes
        are allowed.
    lifecycle_hooks:
        Dictionary of lifecycle hooks.
    init_containers:
        List of initialization containers belonging to the pod.
    service_account:
        Service account to mount on the pod. None disables mounting.
    extra_container_config:
        Extra configuration (e.g. envFrom) for notebook container which is
        not covered by parameters above.
    extra_pod_config:
        Extra configuration (e.g. tolerations) for pod which is not covered
        by parameters above.
    extra_containers:
        Extra containers besides notebook container. Used for some
        housekeeping jobs (e.g. crontab).
    scheduler_name:
        A custom scheduler's name.
    """
    # Normalize optional collection arguments. The previous signature used
    # mutable default arguments ({} / []), which are shared across calls;
    # None-with-normalization is the safe, backward-compatible equivalent.
    env = env if env is not None else {}
    volumes = volumes if volumes is not None else []
    volume_mounts = volume_mounts if volume_mounts is not None else []
    labels = labels if labels is not None else {}
    annotations = annotations if annotations is not None else {}

    pod = V1Pod()
    pod.kind = "Pod"
    pod.api_version = "v1"

    pod.metadata = V1ObjectMeta(name=name,
                                labels=labels.copy(),
                                annotations=annotations.copy())

    pod.spec = V1PodSpec(containers=[])
    # BUG FIX: the Python kubernetes client exposes snake_case attributes;
    # the previous `pod.spec.restartPolicy = 'Never'` only created a dead
    # instance attribute, so the restart policy was silently dropped when
    # the spec was serialized.
    pod.spec.restart_policy = 'Never'

    security_context = V1PodSecurityContext()
    if fs_gid is not None:
        security_context.fs_group = int(fs_gid)
    if supplemental_gids is not None and supplemental_gids:
        security_context.supplemental_groups = [
            int(gid) for gid in supplemental_gids
        ]
    if run_as_uid is not None:
        security_context.run_as_user = int(run_as_uid)
    if run_as_gid is not None:
        security_context.run_as_group = int(run_as_gid)
    pod.spec.security_context = security_context

    if image_pull_secret is not None:
        pod.spec.image_pull_secrets = []
        image_secret = V1LocalObjectReference()
        image_secret.name = image_pull_secret
        pod.spec.image_pull_secrets.append(image_secret)

    if node_selector:
        pod.spec.node_selector = node_selector

    notebook_container = V1Container(
        name='notebook',
        image=image_spec,
        working_dir=working_dir,
        ports=[V1ContainerPort(name='notebook-port', container_port=port)],
        env=[V1EnvVar(name=k, value=v) for k, v in env.items()],
        args=cmd,
        image_pull_policy=image_pull_policy,
        lifecycle=lifecycle_hooks,
        resources=V1ResourceRequirements())

    if service_account is None:
        # Add a hack to ensure that no service accounts are mounted in
        # spawned pods. This makes sure that we don't accidentally give
        # access to the whole kubernetes API to the users in the spawned
        # pods.
        # Note: We don't simply use `automountServiceAccountToken` here
        # since we wanna be compatible with older kubernetes versions too
        # for now.
        hack_volume = V1Volume(name='no-api-access-please', empty_dir={})
        hack_volumes = [hack_volume]
        hack_volume_mount = V1VolumeMount(
            name='no-api-access-please',
            mount_path="/var/run/secrets/kubernetes.io/serviceaccount",
            read_only=True)
        hack_volume_mounts = [hack_volume_mount]
        # Non-hacky way of not mounting service accounts
        pod.spec.automount_service_account_token = False
    else:
        hack_volumes = []
        hack_volume_mounts = []
        pod.spec.service_account_name = service_account

    if run_privileged:
        notebook_container.security_context = V1SecurityContext(
            privileged=True)

    notebook_container.resources.requests = {}
    if cpu_guarantee:
        notebook_container.resources.requests['cpu'] = cpu_guarantee
    if mem_guarantee:
        notebook_container.resources.requests['memory'] = mem_guarantee
    if extra_resource_guarantees:
        for k in extra_resource_guarantees:
            notebook_container.resources.requests[
                k] = extra_resource_guarantees[k]

    notebook_container.resources.limits = {}
    if cpu_limit:
        notebook_container.resources.limits['cpu'] = cpu_limit
    if mem_limit:
        notebook_container.resources.limits['memory'] = mem_limit
    if extra_resource_limits:
        for k in extra_resource_limits:
            notebook_container.resources.limits[k] = extra_resource_limits[k]

    notebook_container.volume_mounts = volume_mounts + hack_volume_mounts
    pod.spec.containers.append(notebook_container)

    if extra_container_config:
        for key, value in extra_container_config.items():
            setattr(notebook_container,
                    _map_attribute(notebook_container.attribute_map, key),
                    value)
    if extra_pod_config:
        for key, value in extra_pod_config.items():
            setattr(pod.spec,
                    _map_attribute(pod.spec.attribute_map, key),
                    value)
    if extra_containers:
        pod.spec.containers.extend(extra_containers)

    pod.spec.init_containers = init_containers
    pod.spec.volumes = volumes + hack_volumes

    if scheduler_name:
        pod.spec.scheduler_name = scheduler_name

    return pod
def __init__(self,
             notebook: str,
             cos_endpoint: str,
             cos_bucket: str,
             cos_directory: str,
             cos_dependencies_archive: str,
             pipeline_outputs: Optional[List[str]] = None,
             pipeline_inputs: Optional[List[str]] = None,
             pipeline_envs: Optional[Dict[str, str]] = None,
             requirements_url: str = None,
             bootstrap_script_url: str = None,
             emptydir_volume_size: str = None,
             **kwargs):
    """Create a new instance of ContainerOp.

    Builds a shell command that downloads a bootstrapper script and a
    requirements file, installs dependencies, and runs the given notebook,
    then delegates the rest of the op construction to the superclass.

    Args:
        notebook: name of the notebook that will be executed per this operation
        cos_endpoint: object storage endpoint e.g weaikish1.fyre.ibm.com:30442
        cos_bucket: bucket to retrieve archive from
        cos_directory: name of the directory in the object storage bucket to pull
        cos_dependencies_archive: archive file name to get from object storage bucket
            e.g archive1.tar.gz
        pipeline_outputs: comma delimited list of files produced by the notebook
        pipeline_inputs: comma delimited list of files to be consumed/are required
            by the notebook
        pipeline_envs: dictionary of environmental variables to set in the container
            prior to execution
        requirements_url: URL to a python requirements.txt file to be installed
            prior to running the notebook
        bootstrap_script_url: URL to a custom python bootstrap script to run
        emptydir_volume_size: Size(GB) of the volume to create for the workspace
            when using CRIO container runtime
        kwargs: additional key value pairs to pass e.g. name, image, sidecars &
            is_exit_handler. See Kubeflow pipelines ContainerOp definition for more
            parameters or how to use
            https://kubeflow-pipelines.readthedocs.io/en/latest/source/kfp.dsl.html#kfp.dsl.ContainerOp
    """
    self.notebook = notebook
    self.notebook_name = self._get_file_name_with_extension(notebook, 'ipynb')
    self.cos_endpoint = cos_endpoint
    self.cos_bucket = cos_bucket
    self.cos_directory = cos_directory
    self.cos_dependencies_archive = cos_dependencies_archive
    # Default working directory, relative to the container's own CWD;
    # overridden below when a CRI-o emptydir workspace is configured.
    self.container_work_dir_root_path = "./"
    self.container_work_dir_name = "jupyter-work-dir/"
    self.container_work_dir = self.container_work_dir_root_path + self.container_work_dir_name
    self.bootstrap_script_url = bootstrap_script_url
    self.requirements_url = requirements_url
    self.pipeline_outputs = pipeline_outputs
    self.pipeline_inputs = pipeline_inputs
    self.pipeline_envs = pipeline_envs

    argument_list = []

    # CRI-o support for kfp pipelines:
    # We need to attach an emptydir volume for each notebook that runs, since
    # the CRI-o runtime does not allow us to write to the base image layer
    # file system, only to volumes.
    self.emptydir_volume_name = "workspace"
    self.emptydir_volume_size = emptydir_volume_size
    self.python_user_lib_path = ''
    self.python_user_lib_path_target = ''
    self.python_pip_config_url = ''

    if self.emptydir_volume_size:
        # CRI-o mode: work under an absolute path backed by the emptydir
        # volume, and install python packages into a user lib dir there.
        self.container_work_dir_root_path = "/opt/app-root/src/"
        self.container_python_dir_name = "python3/"
        self.container_work_dir = self.container_work_dir_root_path + self.container_work_dir_name
        self.python_user_lib_path = self.container_work_dir + self.container_python_dir_name
        self.python_user_lib_path_target = '--target=' + self.python_user_lib_path
        self.python_pip_config_url = 'https://raw.githubusercontent.com/{org}/' \
                                     'kfp-notebook/{branch}/etc/pip.conf'.\
            format(org=KFP_NOTEBOOK_ORG, branch=KFP_NOTEBOOK_BRANCH)

    if not self.bootstrap_script_url:
        self.bootstrap_script_url = 'https://raw.githubusercontent.com/{org}/' \
                                    'kfp-notebook/{branch}/etc/docker-scripts/bootstrapper.py'.\
            format(org=KFP_NOTEBOOK_ORG, branch=KFP_NOTEBOOK_BRANCH)

    if not self.requirements_url:
        self.requirements_url = 'https://raw.githubusercontent.com/{org}/' \
                                'kfp-notebook/{branch}/etc/requirements-elyra.txt'.\
            format(org=KFP_NOTEBOOK_ORG, branch=KFP_NOTEBOOK_BRANCH)

    if 'image' not in kwargs:
        raise ValueError("You need to provide an image.")

    if not notebook:
        raise ValueError("You need to provide a notebook.")

    if 'arguments' not in kwargs:
        # If no arguments are passed, we use our own.
        # If ['arguments'] are set, we assume the container's ENTRYPOINT is
        # set and dependencies are installed.
        # NOTE: Images being pulled must have python3 available on PATH and
        # the cURL utility.
        argument_list.append('mkdir -p {container_work_dir} && cd {container_work_dir} && '
                             'curl -H "Cache-Control: no-cache" -L {bootscript_url} --output bootstrapper.py && '
                             'curl -H "Cache-Control: no-cache" -L {reqs_url} --output requirements-elyra.txt && '
                             .format(container_work_dir=self.container_work_dir,
                                     bootscript_url=self.bootstrap_script_url,
                                     reqs_url=self.requirements_url)
                             )

        if self.emptydir_volume_size:
            # Fetch a pip.conf so pip installs work under CRI-o.
            argument_list.append('mkdir {container_python_dir} && cd {container_python_dir} && '
                                 'curl -H "Cache-Control: no-cache" -L {python_pip_config_url} '
                                 '--output pip.conf && cd .. &&'
                                 .format(python_pip_config_url=self.python_pip_config_url,
                                         container_python_dir=self.container_python_dir_name)
                                 )

        argument_list.append('python3 -m pip install {python_user_lib_path_target} packaging && '
                             'python3 -m pip freeze > requirements-current.txt && '
                             'python3 bootstrapper.py '
                             '--cos-endpoint {cos_endpoint} '
                             '--cos-bucket {cos_bucket} '
                             '--cos-directory "{cos_directory}" '
                             '--cos-dependencies-archive "{cos_dependencies_archive}" '
                             '--file "{notebook}" '
                             .format(cos_endpoint=self.cos_endpoint,
                                     cos_bucket=self.cos_bucket,
                                     cos_directory=self.cos_directory,
                                     cos_dependencies_archive=self.cos_dependencies_archive,
                                     notebook=self.notebook,
                                     python_user_lib_path_target=self.python_user_lib_path_target)
                             )

        if self.pipeline_inputs:
            inputs_str = self._artifact_list_to_str(self.pipeline_inputs)
            argument_list.append('--inputs "{}" '.format(inputs_str))

        if self.pipeline_outputs:
            outputs_str = self._artifact_list_to_str(self.pipeline_outputs)
            argument_list.append('--outputs "{}" '.format(outputs_str))

        if self.emptydir_volume_size:
            argument_list.append('--user-volume-path "{}" '.format(self.python_user_lib_path))

        kwargs['command'] = ['sh', '-c']
        kwargs['arguments'] = "".join(argument_list)

    super().__init__(**kwargs)

    # We must deal with the envs after the superclass initialization since
    # these amend the container attribute that isn't available until now.
    if self.pipeline_envs:
        for key, value in self.pipeline_envs.items():  # Convert dict entries to format kfp needs
            self.container.add_env_variable(V1EnvVar(name=key, value=value))

    # If a crio volume size is found then assume the kubeflow pipelines
    # environment is using CRI-o as its container runtime.
    if self.emptydir_volume_size:
        self.add_volume(V1Volume(empty_dir=V1EmptyDirVolumeSource(
            medium="",
            size_limit=self.emptydir_volume_size),
            name=self.emptydir_volume_name))
        self.container.add_volume_mount(V1VolumeMount(mount_path=self.container_work_dir_root_path,
                                                      name=self.emptydir_volume_name))
        # Append to PYTHONPATH the location of the elyra dependencies
        # installed in the volume.
        self.container.add_env_variable(V1EnvVar(name='PYTHONPATH',
                                                 value=self.python_user_lib_path))