# Shared imports for the snippets in this section (kubernetes Python client
# model classes).
from kubernetes.client.models import (
    V1ObjectMeta,
    V1PersistentVolumeClaim,
    V1PersistentVolumeClaimSpec,
    V1ResourceRequirements,
)


def make_pvc(name, storage_class, access_modes, storage, labels):
    """
    Make a k8s pvc specification for running a user notebook.

    Parameters:
      - name:
        Name of persistent volume claim. Must be unique within the namespace
        the object is going to be created in. Must be a valid DNS label.
      - storage_class:
        String of the name of the k8s Storage Class to use.
      - access_modes:
        A list specifying what access modes the pod should have towards
        the pvc.
      - storage:
        The amount of storage needed for the pvc.
    """
    pvc = V1PersistentVolumeClaim()
    pvc.kind = "PersistentVolumeClaim"
    pvc.api_version = "v1"
    pvc.metadata = V1ObjectMeta()
    pvc.metadata.name = name
    pvc.metadata.annotations = {}
    if storage_class:
        pvc.metadata.annotations.update(
            {"volume.beta.kubernetes.io/storage-class": storage_class})
    pvc.metadata.labels = {}
    pvc.metadata.labels.update(labels)
    pvc.spec = V1PersistentVolumeClaimSpec()
    pvc.spec.access_modes = access_modes
    pvc.spec.resources = V1ResourceRequirements()
    pvc.spec.resources.requests = {"storage": storage}
    return pvc

def make_pvc(
    name,
    storage_class,
    access_modes,
    selector,
    storage,
    labels=None,
    annotations=None,
):
    """
    Make a k8s pvc specification for running a user notebook.

    Parameters
    ----------
    name:
        Name of persistent volume claim. Must be unique within the namespace
        the object is going to be created in. Must be a valid DNS label.
    storage_class:
        String of the name of the k8s Storage Class to use.
    access_modes:
        A list specifying what access modes the pod should have towards
        the pvc.
    selector:
        Dictionary selector to match pvc to pv.
    storage:
        The amount of storage needed for the pvc.
    """
    pvc = V1PersistentVolumeClaim()
    pvc.kind = "PersistentVolumeClaim"
    pvc.api_version = "v1"
    pvc.metadata = V1ObjectMeta()
    pvc.metadata.name = name
    pvc.metadata.annotations = (annotations or {}).copy()
    pvc.metadata.labels = (labels or {}).copy()
    pvc.spec = V1PersistentVolumeClaimSpec()
    pvc.spec.access_modes = access_modes
    pvc.spec.resources = V1ResourceRequirements()
    pvc.spec.resources.requests = {"storage": storage}

    if storage_class is not None:
        pvc.metadata.annotations.update(
            {"volume.beta.kubernetes.io/storage-class": storage_class}
        )
        pvc.spec.storage_class_name = storage_class

    if selector:
        pvc.spec.selector = selector

    return pvc

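# A minimal usage sketch for either make_pvc() variant above, assuming a
# kubeconfig-based client; the namespace, claim name, and size here are
# illustrative assumptions, not values from the original code.
from kubernetes import client, config

config.load_kube_config()  # or config.load_incluster_config() inside a pod
core_v1 = client.CoreV1Api()

claim = make_pvc(
    name="claim-alice",
    storage_class="standard",
    access_modes=["ReadWriteOnce"],
    selector=None,
    storage="10Gi",
    labels={"app": "jupyterhub"},
)
core_v1.create_namespaced_persistent_volume_claim(namespace="jupyterhub",
                                                  body=claim)
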
def training_pipeline():
    # Presumably nested inside a class method in the original project: `self`,
    # the UPPER_CASE constants, and KF_PIPELINES_NAMESPACE are captured from
    # the enclosing scope rather than passed in.
    pvc = V1PersistentVolumeClaim(
        api_version="v1",
        kind="PersistentVolumeClaim",
        metadata={
            'name': f'vol-{self._experiment_id}',
            'namespace': KF_PIPELINES_NAMESPACE,
        },
        spec={
            'accessModes': ['ReadWriteOnce'],
            'resources': {
                'requests': {
                    'storage': '1Gi'
                }
            }
        })

    wrkdirop = dsl.VolumeOp(name=TRAINING_DATASETS_VOLUME_NAME,
                            k8s_resource=pvc,
                            action="apply")

    # Create container_op for all operators
    for _, operator in self._operators.items():
        operator.create_container_op()
        operator.container_op.container \
            .set_memory_request(MEMORY_REQUEST) \
            .set_memory_limit(MEMORY_LIMIT) \
            .set_cpu_request(CPU_REQUEST) \
            .set_cpu_limit(CPU_LIMIT)

    # Define operator volumes and dependencies
    for operator_id, operator in self._operators.items():
        if operator_id not in self._roots:
            dependencies = self._inverted_edges[operator_id]
            dependencies_ops = [
                self._get_operator(d).container_op for d in dependencies
            ]
            operator.container_op.after(*dependencies_ops)

        operator.container_op.add_pvolumes(
            {TRAINING_DATASETS_DIR: wrkdirop.volume})

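# The dependency wiring in training_pipeline() reduces to ContainerOp.after();
# a tiny standalone illustration (pipeline name, images, and commands are
# assumptions for the sketch):
from kfp import dsl

@dsl.pipeline(name="deps-example")
def deps_example():
    a = dsl.ContainerOp(name="a", image="busybox", command=["true"])
    b = dsl.ContainerOp(name="b", image="busybox", command=["true"])
    c = dsl.ContainerOp(name="c", image="busybox", command=["true"])
    # c runs only after both a and b have finished, mirroring
    # operator.container_op.after(*dependencies_ops) above.
    c.after(a, b)
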
def make_pvc(old_pvc, storage_class):
    """
    Rebuild a JupyterHub user PVC from an existing claim's raw JSON
    representation, optionally retargeting it to a new storage class.
    """
    pvc = V1PersistentVolumeClaim()
    pvc.kind = "PersistentVolumeClaim"
    pvc.api_version = "v1"
    pvc.metadata = V1ObjectMeta()
    pvc.metadata.name = old_pvc['metadata']['name']
    username = old_pvc['metadata']['annotations']['hub.jupyter.org/username']
    pvc.metadata.annotations = {'hub.jupyter.org/username': username}
    pvc.metadata.labels = old_pvc['metadata']['labels'].copy()
    pvc.spec = V1PersistentVolumeClaimSpec()
    pvc.spec.access_modes = old_pvc['spec']['accessModes'].copy()
    pvc.spec.resources = V1ResourceRequirements()
    pvc.spec.resources.requests = {
        "storage": old_pvc['spec']['resources']['requests']['storage']
    }
    if storage_class:
        pvc.metadata.annotations.update(
            {"volume.beta.kubernetes.io/storage-class": storage_class})
        pvc.spec.storage_class_name = storage_class
    return pvc

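# Hypothetical driver for make_pvc(old_pvc, storage_class) above: old_pvc is
# a plain dict in raw Kubernetes JSON form (camelCase keys such as
# "accessModes"), e.g. the output of `kubectl get pvc claim-alice -o json`.
# The file name and storage class are illustrative assumptions.
import json

with open("claim-alice.json") as f:
    old_pvc = json.load(f)

new_pvc = make_pvc(old_pvc, storage_class="fast-ssd")
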
def create_volume_op(name):
    """
    Creates a kfp.dsl.VolumeOp that applies a PVC for the pipeline.

    Parameters
    ----------
    name : str

    Returns
    -------
    kfp.dsl.VolumeOp
    """
    pvc = V1PersistentVolumeClaim(
        api_version="v1",
        kind="PersistentVolumeClaim",
        metadata={
            "name": f"vol-{name}",
            "namespace": KF_PIPELINES_NAMESPACE,
        },
        spec={
            "accessModes": ["ReadWriteOnce"],
            "resources": {
                "requests": {
                    "storage": "10Gi",
                },
            },
        },
    )

    volume_op = dsl.VolumeOp(
        name=f"vol-{name}",
        k8s_resource=pvc,
        action="apply",
    )
    return volume_op

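# A short sketch of how create_volume_op() might be used inside a kfp v1
# pipeline; the pipeline name, image, command, and mount path are
# illustrative assumptions.
from kfp import dsl

@dsl.pipeline(name="volume-example")
def volume_example():
    vop = create_volume_op("example")
    dsl.ContainerOp(
        name="consume-volume",
        image="busybox",
        command=["sh", "-c", "ls /mnt/data"],
        # Mount the PVC created (or re-applied) by the VolumeOp.
        pvolumes={"/mnt/data": vop.volume},
    )
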
# Variant of the VolumeOp constructor documented below, extended with a
# volume_name passthrough to the PVC spec.
def __init__(self,
             resource_name: str = None,
             size: str = None,
             storage_class: str = None,
             modes: List[str] = None,
             annotations: Dict[str, str] = None,
             data_source=None,
             volume_name=None,
             **kwargs):
    # Add size to attribute outputs
    self.attribute_outputs = {"size": "{.status.capacity.storage}"}

    if "k8s_resource" in kwargs:
        if resource_name or size or storage_class or modes or annotations:
            raise ValueError("You cannot provide k8s_resource along with "
                             "other arguments.")
        if not isinstance(kwargs["k8s_resource"], V1PersistentVolumeClaim):
            raise ValueError("k8s_resource in VolumeOp must be an instance"
                             " of V1PersistentVolumeClaim")
        super().__init__(**kwargs)
        self.volume = PipelineVolume(name=sanitize_k8s_name(self.name),
                                     pvc=self.outputs["name"])
        return

    if not size:
        raise ValueError("Please provide size")
    elif not match_serialized_pipelineparam(str(size)):
        self._validate_memory_string(size)

    if data_source and not isinstance(
            data_source, (str, PipelineParam, V1TypedLocalObjectReference)):
        raise ValueError("data_source can be one of (str, PipelineParam, "
                         "V1TypedLocalObjectReference).")

    if data_source and isinstance(data_source, (str, PipelineParam)):
        data_source = V1TypedLocalObjectReference(
            api_group="snapshot.storage.k8s.io",
            kind="VolumeSnapshot",
            name=data_source)

    # Set the k8s_resource
    if not match_serialized_pipelineparam(str(resource_name)):
        resource_name = sanitize_k8s_name(resource_name)
    pvc_metadata = V1ObjectMeta(name="{{workflow.name}}-%s" % resource_name,
                                annotations=annotations)
    requested_resources = V1ResourceRequirements(requests={"storage": size})
    pvc_spec = V1PersistentVolumeClaimSpec(
        access_modes=modes or VOLUME_MODE_RWM,
        resources=requested_resources,
        storage_class_name=storage_class,
        data_source=data_source,
        volume_name=volume_name)
    k8s_resource = V1PersistentVolumeClaim(api_version="v1",
                                           kind="PersistentVolumeClaim",
                                           metadata=pvc_metadata,
                                           spec=pvc_spec)

    super().__init__(
        k8s_resource=k8s_resource,
        **kwargs,
    )
    self.volume = PipelineVolume(name=sanitize_k8s_name(self.name),
                                 pvc=self.outputs["name"])

def __init__(self,
             resource_name: str = None,
             size: str = None,
             storage_class: str = None,
             modes: List[str] = VOLUME_MODE_RWM,
             annotations: Dict[str, str] = None,
             data_source=None,
             **kwargs):
    """Create a new instance of VolumeOp.

    Args:
        resource_name: A desired name for the PVC which will be created.
        size: The size of the PVC which will be created.
        storage_class: The storage class to use for the dynamically created
            PVC.
        modes: The access modes for the PVC.
        annotations: Annotations to be patched in the PVC.
        data_source: May be a V1TypedLocalObjectReference, and then it is
            used in the data_source field of the PVC as is. Can also be a
            string/PipelineParam, and in that case it will be used as a
            VolumeSnapshot name (Alpha feature).
        kwargs: See ResourceOp definition.

    Raises:
        ValueError: if k8s_resource is provided along with other arguments,
            if k8s_resource is not a V1PersistentVolumeClaim, if size is
            None, if size is an invalid memory string (when not a
            PipelineParam), or if data_source is not one of (str,
            PipelineParam, V1TypedLocalObjectReference).
    """
    # Add size to attribute outputs
    self.attribute_outputs = {"size": "{.status.capacity.storage}"}

    if "k8s_resource" in kwargs:
        if resource_name or size or storage_class or modes or annotations:
            raise ValueError("You cannot provide k8s_resource along with "
                             "other arguments.")
        if not isinstance(kwargs["k8s_resource"], V1PersistentVolumeClaim):
            raise ValueError("k8s_resource in VolumeOp must be an instance"
                             " of V1PersistentVolumeClaim")
        super().__init__(**kwargs)
        self.volume = PipelineVolume(name=sanitize_k8s_name(self.name),
                                     pvc=self.outputs["name"])
        return

    if not size:
        raise ValueError("Please provide size")
    elif not match_serialized_pipelineparam(str(size)):
        self._validate_memory_string(size)

    if data_source and not isinstance(
            data_source, (str, PipelineParam, V1TypedLocalObjectReference)):
        raise ValueError("data_source can be one of (str, PipelineParam, "
                         "V1TypedLocalObjectReference).")

    if data_source and isinstance(data_source, (str, PipelineParam)):
        data_source = V1TypedLocalObjectReference(
            api_group="snapshot.storage.k8s.io",
            kind="VolumeSnapshot",
            name=data_source)

    # Set the k8s_resource
    if not match_serialized_pipelineparam(str(resource_name)):
        resource_name = sanitize_k8s_name(resource_name)
    pvc_metadata = V1ObjectMeta(name="{{workflow.name}}-%s" % resource_name,
                                annotations=annotations)
    requested_resources = V1ResourceRequirements(requests={"storage": size})
    pvc_spec = V1PersistentVolumeClaimSpec(
        access_modes=modes,
        resources=requested_resources,
        storage_class_name=storage_class,
        data_source=data_source)
    k8s_resource = V1PersistentVolumeClaim(api_version="v1",
                                           kind="PersistentVolumeClaim",
                                           metadata=pvc_metadata,
                                           spec=pvc_spec)

    super().__init__(
        k8s_resource=k8s_resource,
        **kwargs,
    )
    self.volume = PipelineVolume(name=sanitize_k8s_name(self.name),
                                 pvc=self.outputs["name"])

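# Sketch of driving the VolumeOp constructor above the "normal" way, letting
# it assemble the V1PersistentVolumeClaim from size/modes (all names and
# sizes here are assumptions); the k8s_resource path is shown in
# nlp_pipeline() below.
from kfp import dsl

@dsl.pipeline(name="volumeop-example")
def volumeop_example():
    vop = dsl.VolumeOp(
        name="create-volume",
        resource_name="my-pvc",
        size="5Gi",
        modes=dsl.VOLUME_MODE_RWO,  # ["ReadWriteOnce"]
    )
    dsl.ContainerOp(
        name="use-volume",
        image="busybox",
        command=["sh", "-c", "echo hello > /data/out.txt"],
        pvolumes={"/data": vop.volume},
    )
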
def nlp_pipeline(
        csv_url="https://raw.githubusercontent.com/axsauze/reddit-classification-exploration/master/data/reddit_train.csv",
        csv_encoding="ISO-8859-1",
        features_column="BODY",
        labels_column="REMOVED",
        raw_text_path='/mnt/text.data',
        labels_path='/mnt/labels.data',
        clean_text_path='/mnt/clean.data',
        spacy_tokens_path='/mnt/tokens.data',
        tfidf_vectors_path='/mnt/tfidf.data',
        lr_prediction_path='/mnt/prediction.data',
        tfidf_model_path='/mnt/tfidf.model',
        lr_model_path='/mnt/lr.model',
        lr_c_param=0.1,
        tfidf_max_features=10000,
        tfidf_ngram_range=3,
        batch_size='100',
        github_branch='master'):
    """
    NLP model training and deployment pipeline.
    """
    pvc_metadata = V1ObjectMeta(
        name="{{workflow.name}}-my-pvc",
        labels={
            "branch": "{{workflow.parameters.github-branch}}",
            "app": "nlp"
        })
    requested_resources = V1ResourceRequirements(requests={"storage": "1Gi"})
    pvc_spec = V1PersistentVolumeClaimSpec(access_modes=["ReadWriteOnce"],
                                           resources=requested_resources)
    pvc = V1PersistentVolumeClaim(api_version="v1",
                                  kind="PersistentVolumeClaim",
                                  metadata=pvc_metadata,
                                  spec=pvc_spec)

    # modes=None so VolumeOp accepts the prebuilt k8s_resource (see the
    # constructor above: other arguments must not be set alongside it).
    vop = dsl.VolumeOp(name="create-pvc", k8s_resource=pvc, modes=None)

    download_step = dsl.ContainerOp(
        name='data_downloader',
        image='maximmold/data_downloader:0.1',
        command="python",
        arguments=[
            "/microservice/pipeline_step.py",
            "--labels-path", labels_path,
            "--features-path", raw_text_path,
            "--csv-url", csv_url,
            "--csv-encoding", csv_encoding,
            "--features-column", features_column,
            "--labels-column", labels_column
        ],
        pvolumes={"/mnt": vop.volume})

    clean_step = dsl.ContainerOp(
        name='clean_text',
        image='maximmold/clean_text_transformer:0.1',
        command="python",
        arguments=[
            "/microservice/pipeline_step.py",
            "--in-path", raw_text_path,
            "--out-path", clean_text_path,
        ],
        pvolumes={"/mnt": download_step.pvolume})

    tokenize_step = dsl.ContainerOp(
        name='tokenize',
        image='maximmold/spacy_tokenizer:0.1',
        command="python",
        arguments=[
            "/microservice/pipeline_step.py",
            "--in-path", clean_text_path,
            "--out-path", spacy_tokens_path,
        ],
        pvolumes={"/mnt": clean_step.pvolume})

    vectorize_step = dsl.ContainerOp(
        name='vectorize',
        image='maximmold/tfidf_vectorizer:0.1',
        command="python",
        arguments=[
            "/microservice/pipeline_step.py",
            "--in-path", spacy_tokens_path,
            "--out-path", tfidf_vectors_path,
            "--max-features", tfidf_max_features,
            "--ngram-range", tfidf_ngram_range,
            "--action", "train",
            "--model-path", tfidf_model_path,
        ],
        pvolumes={"/mnt": tokenize_step.pvolume})

    predict_step = dsl.ContainerOp(
        name='predictor',
        image='maximmold/lr_text_classifier:0.1',
        command="python",
        arguments=[
            "/microservice/pipeline_step.py",
            "--in-path", tfidf_vectors_path,
            "--labels-path", labels_path,
            "--out-path", lr_prediction_path,
            "--c-param", lr_c_param,
            "--action", "train",
            "--model-path", lr_model_path,
        ],
        pvolumes={"/mnt": vectorize_step.pvolume})

    try:
        seldon_config = yaml.safe_load(
            open("../deploy_pipeline/seldon_production_pipeline.yaml"))
    except FileNotFoundError:
        # If this file is run from the project core directory
        seldon_config = yaml.safe_load(
            open("deploy_pipeline/seldon_production_pipeline.yaml"))

    deploy_step = dsl.ResourceOp(
        action="apply",
        name="seldondeploy",
        k8s_resource=seldon_config,
        attribute_outputs={"name": "{.metadata.name}"})
    deploy_step.after(predict_step)

    delete_previous_pvc = dsl.ContainerOp(
        name="deletepreviouspvc",
        image="bitnami/kubectl",
        command="kubectl",
        arguments=[
            "delete", "pvc", "-l",
            "app=nlp,branch={{workflow.parameters.github-branch}}",
            "--field-selector", "metadata.name!={{workflow.name}}-my-pvc",
            "--grace-period=0", "--force", "--wait=false"
        ])
    delete_previous_pvc.after(deploy_step)
    patch_pvc_finalizer = dsl.ContainerOp(
        name="patchpvcfinalizer",
        image="bitnami/kubectl",
        command=["bash"],
        arguments=[
            "-c",
            # Clear finalizers on the old PVCs so the delete step above does
            # not hang on kubernetes.io/pvc-protection.
            'for j in $(kubectl get pvc -o name '
            '-l app=nlp,branch={{workflow.parameters.github-branch}} '
            '--field-selector metadata.name!={{workflow.name}}-my-pvc '
            '-n kubeflow); do '
            'kubectl patch $j -p \'{"metadata":{"finalizers": []}}\' '
            '-n kubeflow --type=merge; done'
        ])
    patch_pvc_finalizer.after(delete_previous_pvc)

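# nlp_pipeline() above is plain Python until compiled. A minimal sketch of
# compiling it with the kfp v1 SDK, assuming the function is decorated with
# @dsl.pipeline as kfp expects; the output file name is an assumption.
import kfp.compiler

kfp.compiler.Compiler().compile(nlp_pipeline, "nlp_pipeline.tar.gz")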